From e0cc3e4bf6a2ebf6334163622f87fd800f487306 Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Mon, 30 Jun 2025 18:16:46 -0400 Subject: [PATCH 01/14] autoupdate canary support: proto messages (#56259) --- .../teleport/autoupdate/v1/autoupdate.pb.go | 178 ++++++++++++++---- .../teleport/autoupdate/v1/autoupdate.proto | 24 +++ 2 files changed, 169 insertions(+), 33 deletions(-) diff --git a/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go b/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go index cb963a7017a38..acd17cbe9f817 100644 --- a/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go +++ b/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go @@ -55,6 +55,9 @@ const ( // AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK represents that the group has been rolled back. // New agents should run v1, existing agents should update to v1. AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK AutoUpdateAgentGroupState = 4 + // AUTO_UPDATE_AGENT_GROUP_STATE_CANARY represents that the group is updating a few canary nodes, but that most nodes + // have not started updating yet. + AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY AutoUpdateAgentGroupState = 5 ) // Enum value maps for AutoUpdateAgentGroupState. @@ -65,6 +68,7 @@ var ( 2: "AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE", 3: "AUTO_UPDATE_AGENT_GROUP_STATE_DONE", 4: "AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK", + 5: "AUTO_UPDATE_AGENT_GROUP_STATE_CANARY", } AutoUpdateAgentGroupState_value = map[string]int32{ "AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED": 0, @@ -72,6 +76,7 @@ var ( "AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE": 2, "AUTO_UPDATE_AGENT_GROUP_STATE_DONE": 3, "AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK": 4, + "AUTO_UPDATE_AGENT_GROUP_STATE_CANARY": 5, } ) @@ -1105,6 +1110,12 @@ type AutoUpdateAgentRolloutStatusGroup struct { // to the done state if: // - the ratio present_count/initial_count is above 0.9 (no more than 10% of the nodes dropped during update) UpToDateCount uint64 `protobuf:"varint,12,opt,name=up_to_date_count,json=upToDateCount,proto3" json:"up_to_date_count,omitempty"` + // canary_count represents how many canaries this group should have to leave the AUTO_UPDATE_AGENT_GROUP_STATE_CANARY + // state. + CanaryCount uint64 `protobuf:"varint,13,opt,name=canary_count,json=canaryCount,proto3" json:"canary_count,omitempty"` + // canaries is the list of canary agents that should be updated. + // This list is empty until we enter the AUTO_UPDATE_AGENT_GROUP_STATE_CANARY state. + Canaries []*Canary `protobuf:"bytes,14,rep,name=canaries,proto3" json:"canaries,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -1216,6 +1227,20 @@ func (x *AutoUpdateAgentRolloutStatusGroup) GetUpToDateCount() uint64 { return 0 } +func (x *AutoUpdateAgentRolloutStatusGroup) GetCanaryCount() uint64 { + if x != nil { + return x.CanaryCount + } + return 0 +} + +func (x *AutoUpdateAgentRolloutStatusGroup) GetCanaries() []*Canary { + if x != nil { + return x.Canaries + } + return nil +} + // AutoUpdateAgentReport is a report generated by each Teleport Auth service. // The report tracks per group and per version how many agents are running. // The report is used to track which version agents are running. @@ -1504,6 +1529,82 @@ func (x *AutoUpdateAgentReportSpecOmitted) GetReason() string { return "" } +// Canary describes a node that is acting as a canary and being updated before other nodes in its group. +type Canary struct { + state protoimpl.MessageState `protogen:"open.v1"` + // updater_id is reported by the agent in its control stream Hello. This allows us to uniquely identify an updater so + // the proxy can modulate its answer when the request comes from this specific updater. + UpdaterId string `protobuf:"bytes,1,opt,name=updater_id,json=updaterId,proto3" json:"updater_id,omitempty"` + // host_id is the node Host ID, reported by the agent in its control stream Hello. + HostId string `protobuf:"bytes,2,opt,name=host_id,json=hostId,proto3" json:"host_id,omitempty"` + // hostname is the server hostname reported by the agent in its control stream Hello. + // This is purely for debugging purposes: if the agent drops, we won't be able to query the inventory to know which + // agent it was. + Hostname string `protobuf:"bytes,3,opt,name=hostname,proto3" json:"hostname,omitempty"` + // success represents if the agent successfully connected back, running the target version. + Success bool `protobuf:"varint,4,opt,name=success,proto3" json:"success,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *Canary) Reset() { + *x = Canary{} + mi := &file_teleport_autoupdate_v1_autoupdate_proto_msgTypes[19] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *Canary) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Canary) ProtoMessage() {} + +func (x *Canary) ProtoReflect() protoreflect.Message { + mi := &file_teleport_autoupdate_v1_autoupdate_proto_msgTypes[19] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Canary.ProtoReflect.Descriptor instead. +func (*Canary) Descriptor() ([]byte, []int) { + return file_teleport_autoupdate_v1_autoupdate_proto_rawDescGZIP(), []int{19} +} + +func (x *Canary) GetUpdaterId() string { + if x != nil { + return x.UpdaterId + } + return "" +} + +func (x *Canary) GetHostId() string { + if x != nil { + return x.HostId + } + return "" +} + +func (x *Canary) GetHostname() string { + if x != nil { + return x.Hostname + } + return "" +} + +func (x *Canary) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + var File_teleport_autoupdate_v1_autoupdate_proto protoreflect.FileDescriptor const file_teleport_autoupdate_v1_autoupdate_proto_rawDesc = "" + @@ -1569,7 +1670,7 @@ const file_teleport_autoupdate_v1_autoupdate_proto_rawDesc = "" + "\x05state\x18\x02 \x01(\x0e23.teleport.autoupdate.v1.AutoUpdateAgentRolloutStateR\x05state\x129\n" + "\n" + "start_time\x18\x03 \x01(\v2\x1a.google.protobuf.TimestampR\tstartTime\x12?\n" + - "\rtime_override\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\ftimeOverride\"\xb3\x04\n" + + "\rtime_override\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\ftimeOverride\"\x92\x05\n" + "!AutoUpdateAgentRolloutStatusGroup\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x129\n" + "\n" + @@ -1584,7 +1685,9 @@ const file_teleport_autoupdate_v1_autoupdate_proto_rawDesc = "" + "\rinitial_count\x18\n" + " \x01(\x04R\finitialCount\x12#\n" + "\rpresent_count\x18\v \x01(\x04R\fpresentCount\x12'\n" + - "\x10up_to_date_count\x18\f \x01(\x04R\rupToDateCountJ\x04\b\b\x10\tR\x10config_wait_days\"\xe1\x01\n" + + "\x10up_to_date_count\x18\f \x01(\x04R\rupToDateCount\x12!\n" + + "\fcanary_count\x18\r \x01(\x04R\vcanaryCount\x12:\n" + + "\bcanaries\x18\x0e \x03(\v2\x1e.teleport.autoupdate.v1.CanaryR\bcanariesJ\x04\b\b\x10\tR\x10config_wait_days\"\xe1\x01\n" + "\x15AutoUpdateAgentReport\x12\x12\n" + "\x04kind\x18\x01 \x01(\tR\x04kind\x12\x19\n" + "\bsub_kind\x18\x02 \x01(\tR\asubKind\x12\x18\n" + @@ -1607,13 +1710,20 @@ const file_teleport_autoupdate_v1_autoupdate_proto_rawDesc = "" + "\x05count\x18\x01 \x01(\x05R\x05count\"P\n" + " AutoUpdateAgentReportSpecOmitted\x12\x14\n" + "\x05count\x18\x01 \x01(\x03R\x05count\x12\x16\n" + - "\x06reason\x18\x02 \x01(\tR\x06reason*\xf7\x01\n" + + "\x06reason\x18\x02 \x01(\tR\x06reason\"v\n" + + "\x06Canary\x12\x1d\n" + + "\n" + + "updater_id\x18\x01 \x01(\tR\tupdaterId\x12\x17\n" + + "\ahost_id\x18\x02 \x01(\tR\x06hostId\x12\x1a\n" + + "\bhostname\x18\x03 \x01(\tR\bhostname\x12\x18\n" + + "\asuccess\x18\x04 \x01(\bR\asuccess*\xa1\x02\n" + "\x19AutoUpdateAgentGroupState\x12-\n" + ")AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED\x10\x00\x12+\n" + "'AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED\x10\x01\x12(\n" + "$AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE\x10\x02\x12&\n" + "\"AUTO_UPDATE_AGENT_GROUP_STATE_DONE\x10\x03\x12,\n" + - "(AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK\x10\x04*\x83\x02\n" + + "(AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK\x10\x04\x12(\n" + + "$AUTO_UPDATE_AGENT_GROUP_STATE_CANARY\x10\x05*\x83\x02\n" + "\x1bAutoUpdateAgentRolloutState\x12/\n" + "+AUTO_UPDATE_AGENT_ROLLOUT_STATE_UNSPECIFIED\x10\x00\x12-\n" + ")AUTO_UPDATE_AGENT_ROLLOUT_STATE_UNSTARTED\x10\x01\x12*\n" + @@ -1634,7 +1744,7 @@ func file_teleport_autoupdate_v1_autoupdate_proto_rawDescGZIP() []byte { } var file_teleport_autoupdate_v1_autoupdate_proto_enumTypes = make([]protoimpl.EnumInfo, 2) -var file_teleport_autoupdate_v1_autoupdate_proto_msgTypes = make([]protoimpl.MessageInfo, 21) +var file_teleport_autoupdate_v1_autoupdate_proto_msgTypes = make([]protoimpl.MessageInfo, 22) var file_teleport_autoupdate_v1_autoupdate_proto_goTypes = []any{ (AutoUpdateAgentGroupState)(0), // 0: teleport.autoupdate.v1.AutoUpdateAgentGroupState (AutoUpdateAgentRolloutState)(0), // 1: teleport.autoupdate.v1.AutoUpdateAgentRolloutState @@ -1657,48 +1767,50 @@ var file_teleport_autoupdate_v1_autoupdate_proto_goTypes = []any{ (*AutoUpdateAgentReportSpecGroup)(nil), // 18: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup (*AutoUpdateAgentReportSpecGroupVersion)(nil), // 19: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroupVersion (*AutoUpdateAgentReportSpecOmitted)(nil), // 20: teleport.autoupdate.v1.AutoUpdateAgentReportSpecOmitted - nil, // 21: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.GroupsEntry - nil, // 22: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.VersionsEntry - (*v1.Metadata)(nil), // 23: teleport.header.v1.Metadata - (*durationpb.Duration)(nil), // 24: google.protobuf.Duration - (*timestamppb.Timestamp)(nil), // 25: google.protobuf.Timestamp + (*Canary)(nil), // 21: teleport.autoupdate.v1.Canary + nil, // 22: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.GroupsEntry + nil, // 23: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.VersionsEntry + (*v1.Metadata)(nil), // 24: teleport.header.v1.Metadata + (*durationpb.Duration)(nil), // 25: google.protobuf.Duration + (*timestamppb.Timestamp)(nil), // 26: google.protobuf.Timestamp } var file_teleport_autoupdate_v1_autoupdate_proto_depIdxs = []int32{ - 23, // 0: teleport.autoupdate.v1.AutoUpdateConfig.metadata:type_name -> teleport.header.v1.Metadata + 24, // 0: teleport.autoupdate.v1.AutoUpdateConfig.metadata:type_name -> teleport.header.v1.Metadata 3, // 1: teleport.autoupdate.v1.AutoUpdateConfig.spec:type_name -> teleport.autoupdate.v1.AutoUpdateConfigSpec 4, // 2: teleport.autoupdate.v1.AutoUpdateConfigSpec.tools:type_name -> teleport.autoupdate.v1.AutoUpdateConfigSpecTools 5, // 3: teleport.autoupdate.v1.AutoUpdateConfigSpec.agents:type_name -> teleport.autoupdate.v1.AutoUpdateConfigSpecAgents - 24, // 4: teleport.autoupdate.v1.AutoUpdateConfigSpecAgents.maintenance_window_duration:type_name -> google.protobuf.Duration + 25, // 4: teleport.autoupdate.v1.AutoUpdateConfigSpecAgents.maintenance_window_duration:type_name -> google.protobuf.Duration 6, // 5: teleport.autoupdate.v1.AutoUpdateConfigSpecAgents.schedules:type_name -> teleport.autoupdate.v1.AgentAutoUpdateSchedules 7, // 6: teleport.autoupdate.v1.AgentAutoUpdateSchedules.regular:type_name -> teleport.autoupdate.v1.AgentAutoUpdateGroup - 23, // 7: teleport.autoupdate.v1.AutoUpdateVersion.metadata:type_name -> teleport.header.v1.Metadata + 24, // 7: teleport.autoupdate.v1.AutoUpdateVersion.metadata:type_name -> teleport.header.v1.Metadata 9, // 8: teleport.autoupdate.v1.AutoUpdateVersion.spec:type_name -> teleport.autoupdate.v1.AutoUpdateVersionSpec 10, // 9: teleport.autoupdate.v1.AutoUpdateVersionSpec.tools:type_name -> teleport.autoupdate.v1.AutoUpdateVersionSpecTools 11, // 10: teleport.autoupdate.v1.AutoUpdateVersionSpec.agents:type_name -> teleport.autoupdate.v1.AutoUpdateVersionSpecAgents - 23, // 11: teleport.autoupdate.v1.AutoUpdateAgentRollout.metadata:type_name -> teleport.header.v1.Metadata + 24, // 11: teleport.autoupdate.v1.AutoUpdateAgentRollout.metadata:type_name -> teleport.header.v1.Metadata 13, // 12: teleport.autoupdate.v1.AutoUpdateAgentRollout.spec:type_name -> teleport.autoupdate.v1.AutoUpdateAgentRolloutSpec 14, // 13: teleport.autoupdate.v1.AutoUpdateAgentRollout.status:type_name -> teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus - 24, // 14: teleport.autoupdate.v1.AutoUpdateAgentRolloutSpec.maintenance_window_duration:type_name -> google.protobuf.Duration + 25, // 14: teleport.autoupdate.v1.AutoUpdateAgentRolloutSpec.maintenance_window_duration:type_name -> google.protobuf.Duration 15, // 15: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus.groups:type_name -> teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup 1, // 16: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus.state:type_name -> teleport.autoupdate.v1.AutoUpdateAgentRolloutState - 25, // 17: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus.start_time:type_name -> google.protobuf.Timestamp - 25, // 18: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus.time_override:type_name -> google.protobuf.Timestamp - 25, // 19: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup.start_time:type_name -> google.protobuf.Timestamp + 26, // 17: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus.start_time:type_name -> google.protobuf.Timestamp + 26, // 18: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatus.time_override:type_name -> google.protobuf.Timestamp + 26, // 19: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup.start_time:type_name -> google.protobuf.Timestamp 0, // 20: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup.state:type_name -> teleport.autoupdate.v1.AutoUpdateAgentGroupState - 25, // 21: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup.last_update_time:type_name -> google.protobuf.Timestamp - 23, // 22: teleport.autoupdate.v1.AutoUpdateAgentReport.metadata:type_name -> teleport.header.v1.Metadata - 17, // 23: teleport.autoupdate.v1.AutoUpdateAgentReport.spec:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpec - 25, // 24: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.timestamp:type_name -> google.protobuf.Timestamp - 21, // 25: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.groups:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpec.GroupsEntry - 20, // 26: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.omitted:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecOmitted - 22, // 27: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.versions:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.VersionsEntry - 18, // 28: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.GroupsEntry.value:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup - 19, // 29: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.VersionsEntry.value:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroupVersion - 30, // [30:30] is the sub-list for method output_type - 30, // [30:30] is the sub-list for method input_type - 30, // [30:30] is the sub-list for extension type_name - 30, // [30:30] is the sub-list for extension extendee - 0, // [0:30] is the sub-list for field type_name + 26, // 21: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup.last_update_time:type_name -> google.protobuf.Timestamp + 21, // 22: teleport.autoupdate.v1.AutoUpdateAgentRolloutStatusGroup.canaries:type_name -> teleport.autoupdate.v1.Canary + 24, // 23: teleport.autoupdate.v1.AutoUpdateAgentReport.metadata:type_name -> teleport.header.v1.Metadata + 17, // 24: teleport.autoupdate.v1.AutoUpdateAgentReport.spec:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpec + 26, // 25: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.timestamp:type_name -> google.protobuf.Timestamp + 22, // 26: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.groups:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpec.GroupsEntry + 20, // 27: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.omitted:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecOmitted + 23, // 28: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.versions:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.VersionsEntry + 18, // 29: teleport.autoupdate.v1.AutoUpdateAgentReportSpec.GroupsEntry.value:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup + 19, // 30: teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroup.VersionsEntry.value:type_name -> teleport.autoupdate.v1.AutoUpdateAgentReportSpecGroupVersion + 31, // [31:31] is the sub-list for method output_type + 31, // [31:31] is the sub-list for method input_type + 31, // [31:31] is the sub-list for extension type_name + 31, // [31:31] is the sub-list for extension extendee + 0, // [0:31] is the sub-list for field type_name } func init() { file_teleport_autoupdate_v1_autoupdate_proto_init() } @@ -1712,7 +1824,7 @@ func file_teleport_autoupdate_v1_autoupdate_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_teleport_autoupdate_v1_autoupdate_proto_rawDesc), len(file_teleport_autoupdate_v1_autoupdate_proto_rawDesc)), NumEnums: 2, - NumMessages: 21, + NumMessages: 22, NumExtensions: 0, NumServices: 0, }, diff --git a/api/proto/teleport/autoupdate/v1/autoupdate.proto b/api/proto/teleport/autoupdate/v1/autoupdate.proto index 0a1a6a0cf2d62..ff42592c6fb8c 100644 --- a/api/proto/teleport/autoupdate/v1/autoupdate.proto +++ b/api/proto/teleport/autoupdate/v1/autoupdate.proto @@ -230,6 +230,12 @@ message AutoUpdateAgentRolloutStatusGroup { // to the done state if: // - the ratio present_count/initial_count is above 0.9 (no more than 10% of the nodes dropped during update) uint64 up_to_date_count = 12; + // canary_count represents how many canaries this group should have to leave the AUTO_UPDATE_AGENT_GROUP_STATE_CANARY + // state. + uint64 canary_count = 13; + // canaries is the list of canary agents that should be updated. + // This list is empty until we enter the AUTO_UPDATE_AGENT_GROUP_STATE_CANARY state. + repeated Canary canaries = 14; } // AutoUpdateAgentGroupState represents the agent group state. This state controls whether the agents from this group @@ -247,6 +253,9 @@ enum AutoUpdateAgentGroupState { // AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK represents that the group has been rolled back. // New agents should run v1, existing agents should update to v1. AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK = 4; + // AUTO_UPDATE_AGENT_GROUP_STATE_CANARY represents that the group is updating a few canary nodes, but that most nodes + // have not started updating yet. + AUTO_UPDATE_AGENT_GROUP_STATE_CANARY = 5; } // AutoUpdateAgentRolloutState represents the rollout state. This tells if Teleport started updating agents from the @@ -305,3 +314,18 @@ message AutoUpdateAgentReportSpecOmitted { int64 count = 1; string reason = 2; } + +// Canary describes a node that is acting as a canary and being updated before other nodes in its group. +message Canary { + // updater_id is reported by the agent in its control stream Hello. This allows us to uniquely identify an updater so + // the proxy can modulate its answer when the request comes from this specific updater. + string updater_id = 1; + // host_id is the node Host ID, reported by the agent in its control stream Hello. + string host_id = 2; + // hostname is the server hostname reported by the agent in its control stream Hello. + // This is purely for debugging purposes: if the agent drops, we won't be able to query the inventory to know which + // agent it was. + string hostname = 3; + // success represents if the agent successfully connected back, running the target version. + bool success = 4; +} From 1ed6ffcfe439d6a256a0d746c53522995015092b Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Thu, 3 Jul 2025 15:36:35 -0400 Subject: [PATCH 02/14] autoupdate canary support: inventory and auth primitives (#56261) --- api/types/autoupdate/constants.go | 5 + api/types/autoupdate/rollout.go | 6 + api/types/autoupdate/rollout_test.go | 17 + ....go => autoupdate_agent_version_report.go} | 100 ++++-- ...> autoupdate_agent_version_report_test.go} | 39 ++- lib/auth/autoupdate_rollout.go | 168 ++++++++++ lib/auth/autoupdate_rollout_test.go | 312 ++++++++++++++++++ lib/inventory/controller.go | 6 + lib/inventory/store.go | 27 ++ 9 files changed, 647 insertions(+), 33 deletions(-) rename lib/auth/{agent_version_report.go => autoupdate_agent_version_report.go} (70%) rename lib/auth/{agent_version_report_test.go => autoupdate_agent_version_report_test.go} (92%) create mode 100644 lib/auth/autoupdate_rollout.go create mode 100644 lib/auth/autoupdate_rollout_test.go diff --git a/api/types/autoupdate/constants.go b/api/types/autoupdate/constants.go index deed5168fb21f..36f75a82f9f7a 100644 --- a/api/types/autoupdate/constants.go +++ b/api/types/autoupdate/constants.go @@ -43,4 +43,9 @@ const ( // maintenance window. There is no dependency between groups. Agents won't be instructed to update // if the window is over. AgentsStrategyTimeBased = "time-based" + + // MaxCanaryCount is the maximum number of canaries allowed for a single group. + // This value is arbitrarily low to avoid XXL rollouts to grow over the max backend + // item size. + MaxCanaryCount = 10 ) diff --git a/api/types/autoupdate/rollout.go b/api/types/autoupdate/rollout.go index 111c9a65e0095..ff7d9294d080d 100644 --- a/api/types/autoupdate/rollout.go +++ b/api/types/autoupdate/rollout.go @@ -93,6 +93,12 @@ func ValidateAutoUpdateAgentRollout(v *autoupdate.AutoUpdateAgentRollout) error if v.Spec.Strategy == AgentsStrategyHaltOnError && i == 0 && group.ConfigWaitHours != 0 { return trace.BadParameter("status.schedules.groups[0].config_wait_hours must be zero as it's the first group") } + if group.CanaryCount > MaxCanaryCount { + return trace.BadParameter("status.schedules.groups[%d].canary_count must be less than %d", i, MaxCanaryCount) + } + if len(group.Canaries) > MaxCanaryCount { + return trace.BadParameter("status.schedules.groups[%d].canaries must be contain less than %d elements", i, MaxCanaryCount) + } if conflictingGroup, ok := seenGroups[group.Name]; ok { return trace.BadParameter("spec.agents.schedules.regular contains groups with the same name %q at indices %d and %d", group.Name, conflictingGroup, i) } diff --git a/api/types/autoupdate/rollout_test.go b/api/types/autoupdate/rollout_test.go index d95ba9ef890fd..8a8e054f4ef1c 100644 --- a/api/types/autoupdate/rollout_test.go +++ b/api/types/autoupdate/rollout_test.go @@ -349,6 +349,23 @@ func TestValidateAutoUpdateAgentRollout(t *testing.T) { }, assertErr: require.Error, }, + { + name: "group with too high canary count", + rollout: &autoupdate.AutoUpdateAgentRollout{ + Kind: types.KindAutoUpdateAgentRollout, + Version: types.V1, + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &haltOnErrorRolloutSpec, + Status: &autoupdate.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + {Name: "g1", ConfigDays: []string{"*"}, CanaryCount: 15}, + }, + }, + }, + assertErr: require.Error, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/lib/auth/agent_version_report.go b/lib/auth/autoupdate_agent_version_report.go similarity index 70% rename from lib/auth/agent_version_report.go rename to lib/auth/autoupdate_agent_version_report.go index 6dc3c4fc7f7af..1991ea851387d 100644 --- a/lib/auth/agent_version_report.go +++ b/lib/auth/autoupdate_agent_version_report.go @@ -48,51 +48,88 @@ type instanceReport struct { omissions map[string]int } -func (ir instanceReport) collectInstance(handle inventory.UpstreamHandle) { +type autoUpdateFilterResult int + +const ( + // The zero value should not be used, but if it is, it should not match anything. + autoUpdateFilterResultUnknown autoUpdateFilterResult = iota + autoUpdateFilterResultMatching + autoUpdateFilterResultTerminating + autoUpdateFilterResultRecentlyConnected + autoUpdaterFilterResultIsControlPlane + autoUpdateFilterResultNoUpdater + autoUpdateFilterResultUpdaterTooOld + autoUpdateFilterResultUpdaterDisabled + autoUpdateFilterResultUpdaterPinned + autoUpdateFilterResultUpdaterUnreadable + autoUpdateFilterResultUpdaterUnknown + autoUpdateFilterResultUpdaterV1 +) + +// String returns a user-facing message explaining the filter decision. +// If the filter result reason is not actionable, it returns an empty string. +func (r autoUpdateFilterResult) String() string { + switch r { + case autoUpdateFilterResultNoUpdater: + return omissionReasonNoUpdater + case autoUpdateFilterResultUpdaterTooOld: + return omissionReasonUpdaterTooOld + case autoUpdateFilterResultUpdaterDisabled: + return omissionReasonUpdaterDisabled + case autoUpdateFilterResultUpdaterPinned: + return omissionReasonUpdaterPinned + case autoUpdateFilterResultUpdaterUnreadable: + return omissionReasonUpdaterUnreadable + case autoUpdateFilterResultUpdaterUnknown: + return omissionReasonUpdaterUnknown + case autoUpdateFilterResultUpdaterV1: + return omissionReasonUpdaterV1 + default: + return "" + } +} + +// filterHandler filters handles than can or cannot be used for automatic update +// purposes. It returns true if the handle can be used, false otherwise. +// If the handle cannot be used, the function might also return a non-empty string +// explaining why. +func filterHandler(handle inventory.UpstreamHandle, now time.Time) autoUpdateFilterResult { // If the instance is being soft-reloaded or shut down, we ignore it. if goodbye := handle.Goodbye(); goodbye.GetSoftReload() || goodbye.GetDeleteResources() { - return + return autoUpdateFilterResultTerminating } // We skip servers that joined less than a minute ago as they might have been // connected to another auth instance a few seconds ago, which would lead to double-counting. - if ir.timestamp.Sub(handle.RegistrationTime()) < constants.AutoUpdateAgentReportPeriod { - return + if now.Sub(handle.RegistrationTime()) < constants.AutoUpdateAgentReportPeriod { + return autoUpdateFilterResultRecentlyConnected } // We skip control planes instances because we don't update them. if handle.HasControlPlaneService() { - return + return autoUpdaterFilterResultIsControlPlane } hello := handle.Hello() + // If the machine has no updater, we skip it switch hello.ExternalUpgrader { case "": - ir.omissions[omissionReasonNoUpdater] += 1 - return + return autoUpdateFilterResultNoUpdater case types.UpgraderKindSystemdUnit: - ir.omissions[omissionReasonUpdaterV1] += 1 - return - } - - // If the machine has no updater, we skip it - if hello.ExternalUpgrader == "" { - return + return autoUpdateFilterResultUpdaterV1 } // Reject instance not advertising updater info updaterInfo := hello.GetUpdaterInfo() if updaterInfo == nil { - ir.omissions[omissionReasonUpdaterTooOld] += 1 - return + return autoUpdateFilterResultUpdaterTooOld } // Reject instances who are not advertising the group properly. // They might be running too old versions. updateGroup := updaterInfo.UpdateGroup if updateGroup == "" { - ir.omissions[omissionReasonUpdaterTooOld] += 1 - return + return autoUpdateFilterResultUpdaterTooOld } // We skip instances whose updater status is not OK. @@ -100,19 +137,31 @@ func (ir instanceReport) collectInstance(handle inventory.UpstreamHandle) { switch status { case types.UpdaterStatus_UPDATER_STATUS_OK: case types.UpdaterStatus_UPDATER_STATUS_DISABLED: - ir.omissions[omissionReasonUpdaterDisabled] += 1 - return + return autoUpdateFilterResultUpdaterDisabled case types.UpdaterStatus_UPDATER_STATUS_PINNED: - ir.omissions[omissionReasonUpdaterPinned] += 1 - return + return autoUpdateFilterResultUpdaterPinned case types.UpdaterStatus_UPDATER_STATUS_UNREADABLE: - ir.omissions[omissionReasonUpdaterUnreadable] += 1 - return + return autoUpdateFilterResultUpdaterUnreadable default: - ir.omissions[omissionReasonUpdaterUnknown] += 1 + return autoUpdateFilterResultUnknown + } + return autoUpdateFilterResultMatching +} + +// collectInstance is meant to be a callback that is passed to inventory.AllHandles(). +func (ir instanceReport) collectInstance(handle inventory.UpstreamHandle) { + result := filterHandler(handle, ir.timestamp) + if result != autoUpdateFilterResultMatching { + if reason := result.String(); reason != "" { + ir.omissions[reason] += 1 + } return } + // No need to check for UpdaterInfo being nil, it would have been filtered + // out by filterHandler(). + updateGroup := handle.Hello().UpdaterInfo.UpdateGroup + if _, ok := ir.data[updateGroup]; !ok { ir.data[updateGroup] = instanceGroupReport{} } @@ -210,5 +259,4 @@ func (a *Server) reportAgentVersions(ctx context.Context) { a.logger.ErrorContext(ctx, "Failed to write agent version report", "error", err) } a.logger.DebugContext(ctx, "Finished exporting the agent version report") - } diff --git a/lib/auth/agent_version_report_test.go b/lib/auth/autoupdate_agent_version_report_test.go similarity index 92% rename from lib/auth/agent_version_report_test.go rename to lib/auth/autoupdate_agent_version_report_test.go index c37ad80446c46..04cb0bc68c5da 100644 --- a/lib/auth/agent_version_report_test.go +++ b/lib/auth/autoupdate_agent_version_report_test.go @@ -70,6 +70,10 @@ func (f fakeControlStream) Done() <-chan struct{} { return f.doneChan } +func (f fakeControlStream) Send(ctx context.Context, msg proto.DownstreamInventoryMessage) error { + return nil +} + func (f fakeControlStream) fakeMsg(msg proto.UpstreamInventoryMessage) { f.msgChan <- msg } @@ -237,10 +241,21 @@ func TestServer_generateAgentVersionReport(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { clock := clockwork.NewFakeClockAt(twoMinutesAgo) + bk, err := memory.New(memory.Config{}) + require.NoError(t, err) auth := &Server{ - logger: logtest.NewLogger(), - ServerID: uuid.NewString(), + cancelFunc: func() {}, + logger: logtest.NewLogger(), + ServerID: uuid.NewString(), + Services: &Services{ + // The inventory is running heartbeats on the background. + // If we don't create a presence service this will cause panics. + PresenceInternal: local.NewPresenceService(bk), + }, } + t.Cleanup(func() { + auth.Close() + }) controller := inventory.NewController(auth, nil, inventory.WithClock(clock)) for _, fixture := range tt.fixtures { clock.Advance(fixture.delay) @@ -297,13 +312,23 @@ func TestServer_reportAgentVersions(t *testing.T) { // Test setup: load fixtures. const testNodeCount = 10 auth := &Server{ - clock: clock, - ServerID: uuid.NewString(), - Services: &Services{AutoUpdateService: svc}, - logger: logtest.NewLogger(), + cancelFunc: func() {}, + clock: clock, + ServerID: uuid.NewString(), + Services: &Services{ + AutoUpdateService: svc, + // The inventory is running heartbeats on the background. + // If we don't create a presence service this will cause panics. + PresenceInternal: local.NewPresenceService(bk), + }, + logger: logtest.NewLogger(), } + t.Cleanup(func() { + auth.Close() + }) auth.Cache = auth.Services - controller := inventory.NewController(auth, nil, inventory.WithClock(clock)) + controller := inventory.NewController(auth, nil, + inventory.WithClock(clock)) auth.inventory = controller for range testNodeCount { diff --git a/lib/auth/autoupdate_rollout.go b/lib/auth/autoupdate_rollout.go new file mode 100644 index 0000000000000..8aa534b57f97b --- /dev/null +++ b/lib/auth/autoupdate_rollout.go @@ -0,0 +1,168 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package auth + +import ( + "context" + "math/rand/v2" + + "github.com/google/uuid" + "github.com/gravitational/trace" + + "github.com/gravitational/teleport/api/client/proto" + "github.com/gravitational/teleport/api/constants" + autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" + "github.com/gravitational/teleport/lib/inventory" +) + +// SampleAgentsFromAutoUpdateGroup iterates over every handle in the inventory to +// build a random sample of agents belonging to a given group. +// The main use-case for this function is to pick canaries that can be updated. +func (a *Server) SampleAgentsFromAutoUpdateGroup(ctx context.Context, groupName string, sampleSize int, groups []string) ([]*autoupdatev1pb.Canary, error) { + if len(groups) == 0 { + return nil, trace.BadParameter("no groups specified") + } + isCatchAll := groupName == groups[len(groups)-1] + var groupSet map[string]struct{} + + // Small optimization, we only need to build the groupSet if we are sampling the catch-all group + if isCatchAll { + groupSet = make(map[string]struct{}) + for _, group := range groups { + groupSet[group] = struct{}{} + } + } + + filter := func(handle inventory.UpstreamHandle) bool { + if result := filterHandler(handle, a.clock.Now()); result != autoUpdateFilterResultMatching { + return false + } + + // If this is not the catch-all group, we can only check if the agent group is the right one. + if !isCatchAll { + // No need to check for UpdaterInfo being nil, it would have been filtered + // out by filterHandler(). + return handle.Hello().UpdaterInfo.UpdateGroup == groupName + } + // This is the catch-call group, it matches agents from every group not in groups. + _, ok := groupSet[groupName] + // If the agent group is not in the group list, it falls into the catch-all. + return !ok + } + + sampler := newHandlerSampler(sampleSize, filter) + + a.inventory.UniqueHandles(sampler.visit) + + sampled := sampler.Sampled() + canaries := make([]*autoupdatev1pb.Canary, 0, len(sampled)) + for _, h := range sampled { + hello := h.Hello() + updaterID, err := uuid.FromBytes(hello.UpdaterInfo.UpdateUUID) + if err != nil { + return nil, trace.Wrap(err) + } + canaries = append(canaries, &autoupdatev1pb.Canary{ + UpdaterId: updaterID.String(), + HostId: hello.ServerID, + Hostname: hello.Hostname, + Success: false, + }) + } + return canaries, nil +} + +// handleSampler randomly samples handles from the inventory. +// It implements Alan Waterman's Reservoir Sampling Algorithm R +// (The Art of Computer Programming Volume 2). +// See https://en.wikipedia.org/wiki/Reservoir_sampling for more details. +type handleSampler struct { + sampleSize int + seenCount int + filter func(handle inventory.UpstreamHandle) bool + sample []inventory.UpstreamHandle +} + +func newHandlerSampler(sampleSize int, filter func(handle inventory.UpstreamHandle) bool) *handleSampler { + return &handleSampler{ + sampleSize: sampleSize, + seenCount: sampleSize, + filter: filter, + sample: make([]inventory.UpstreamHandle, 0, sampleSize), + } +} + +// pass this function to inventory.AllHandles() +func (h *handleSampler) visit(handle inventory.UpstreamHandle) { + // filter out everything we don't want + if !h.filter(handle) { + return + } + + // Fill the reservoir + if len(h.sample) < h.sampleSize { + h.sample = append(h.sample, handle) + h.seenCount++ + return + } + + // Reservoir is already filled, replace existing elements. + if j := rand.N(h.seenCount); j < h.sampleSize { + h.sample[j] = handle + } + h.seenCount++ +} + +func (h *handleSampler) Sampled() []inventory.UpstreamHandle { + return h.sample +} + +// LookupAgentInInventory looks for a specific hostID in the local inventory +// and returns all the Hello messages associated to those agents. +// Agents that joined less than a minute ago or that are terminating/restarting +// are ignored. +func (a *Server) LookupAgentInInventory(ctx context.Context, hostID string) ([]*proto.UpstreamInventoryHello, error) { + handles := a.inventory.Handles(hostID) + now := a.clock.Now() + + var qualifiedHellos []*proto.UpstreamInventoryHello + + for handle := range handles { + // If the instance is being soft-reloaded or shut down, we ignore it. + if goodbye := handle.Goodbye(); goodbye.GetSoftReload() || goodbye.GetDeleteResources() { + continue + } + + // We skip servers that joined less than a minute ago as they might have been + // connected to another auth instance a few seconds ago, which would lead to double-counting. + if now.Sub(handle.RegistrationTime()) < constants.AutoUpdateAgentReportPeriod { + continue + } + // Do don't apply other filtering logic like filterHandle() does because the instance already + // got selected with strict constraints earlier during sampling. We don't want a filtering rule change, or an instance change, to make the lookup fail and block the rollout. + qualifiedHellos = append(qualifiedHellos, handle.Hello()) + } + + if len(qualifiedHellos) == 0 { + return nil, trace.NotFound("no control streams meet requirements for host %v", hostID) + } + + return qualifiedHellos, nil + +} diff --git a/lib/auth/autoupdate_rollout_test.go b/lib/auth/autoupdate_rollout_test.go new file mode 100644 index 0000000000000..37c79e02291f1 --- /dev/null +++ b/lib/auth/autoupdate_rollout_test.go @@ -0,0 +1,312 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package auth + +import ( + "fmt" + "iter" + "slices" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/google/uuid" + "github.com/jonboulle/clockwork" + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/testing/protocmp" + + "github.com/gravitational/teleport/api/client/proto" + autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" + "github.com/gravitational/teleport/api/types" + "github.com/gravitational/teleport/lib/backend/memory" + "github.com/gravitational/teleport/lib/inventory" + "github.com/gravitational/teleport/lib/services/local" + "github.com/gravitational/teleport/lib/utils" +) + +func TestSampleAgentsFromGroup(t *testing.T) { + clock := clockwork.NewFakeClock() + bk, err := memory.New(memory.Config{}) + require.NoError(t, err) + + auth := &Server{ + cancelFunc: func() {}, + clock: clock, + ServerID: uuid.NewString(), + logger: utils.NewSlogLoggerForTests(), + Services: &Services{ + // The inventory is running heartbeats on the background. + // If we don't create a presence service this will cause panics. + PresenceInternal: local.NewPresenceService(bk), + }, + } + controller := inventory.NewController(auth, nil, inventory.WithClock(clock)) + auth.inventory = controller + t.Cleanup(func() { + auth.Close() + }) + + const ( + testNodeCount = 1000 + testGroupName = "my-group" + testCatchAllGroupName = "catch-all-group" + ) + + testGroups := []string{testGroupName, testCatchAllGroupName} + + for range testNodeCount { + updaterID := uuid.New() + stream := newFakeControlStream() + controller.RegisterControlStream(stream, &proto.UpstreamInventoryHello{ + Services: types.SystemRoles{types.RoleNode}.StringSlice(), + Version: "1.2.3", + ServerID: uuid.NewString(), + ExternalUpgrader: types.UpgraderKindTeleportUpdate, + UpdaterInfo: &types.UpdaterV2Info{ + UpdateUUID: updaterID[:], + UpdaterStatus: types.UpdaterStatus_UPDATER_STATUS_OK, + UpdateGroup: testGroupName, + }, + }) + t.Cleanup(stream.close) + } + + // Nodes that just registered are ignored, we advance the clock so our fixtures are not filtered out by filterHandler(). + clock.Advance(2 * time.Minute) + + // Text execution: check that we sample the correct amount of canaries + const sampleSize = 10 + canaries, err := auth.SampleAgentsFromAutoUpdateGroup(t.Context(), testGroupName, sampleSize, testGroups) + require.NoError(t, err) + require.Len(t, canaries, sampleSize) + + // Test execution: check that there were no duplicates in the samples + canarySet := make(map[string]*autoupdatev1pb.Canary) + for _, canary := range canaries { + canarySet[canary.UpdaterId] = canary + } + require.Len(t, canarySet, sampleSize, "some canary got duplicated") + + canaries2, err := auth.SampleAgentsFromAutoUpdateGroup(t.Context(), testGroupName, sampleSize, testGroups) + require.NoError(t, err) + require.Len(t, canaries2, sampleSize) + canarySet = make(map[string]*autoupdatev1pb.Canary) + for _, canary := range canaries2 { + canarySet[canary.UpdaterId] = canary + } + require.Len(t, canarySet, sampleSize, "some canary got duplicated") + + // Text execution: check that the random looks sane + var conflicts int + for i := range sampleSize { + if canaries[i].UpdaterId == canaries2[i].UpdaterId { + conflicts++ + } + } + // The probability of having 4 nodes sampled at the same position twice in + // a row is 2e-10. + require.Less(t, conflicts, 4) + + // Test execution: check that agents not belonging to any group are sampled whe requesting the catch-all group. + canariesCatchAll, err := auth.SampleAgentsFromAutoUpdateGroup(t.Context(), testGroupName, sampleSize, []string{"group-a", testCatchAllGroupName}) + require.NoError(t, err) + require.Len(t, canariesCatchAll, sampleSize) + canarySet = make(map[string]*autoupdatev1pb.Canary) + for _, canary := range canariesCatchAll { + canarySet[canary.UpdaterId] = canary + } + require.Len(t, canarySet, sampleSize, "some canary got duplicated") + +} + +func TestLookupAgentInInventory(t *testing.T) { + clock := clockwork.NewFakeClock() + bk, err := memory.New(memory.Config{}) + require.NoError(t, err) + + auth := &Server{ + cancelFunc: func() {}, + clock: clock, + ServerID: uuid.NewString(), + logger: utils.NewSlogLoggerForTests(), + Services: &Services{ + // The inventory is running heartbeats on the background. + // If we don't create a presence service this will cause panics. + PresenceInternal: local.NewPresenceService(bk), + }, + } + controller := inventory.NewController(auth, nil, inventory.WithClock(clock)) + auth.inventory = controller + t.Cleanup(func() { + auth.Close() + }) + + const testNodeCount = 5 + const testGroupName = "my-group" + + hosts := make(map[string][]*proto.UpstreamInventoryHello, testNodeCount) + + // Test setup: + // We register X nodes, node number 1 has 1 handler, node 2 has 2, ... + for i := range testNodeCount { + hostID := uuid.New().String() + updaterID := uuid.New() + hellos := make([]*proto.UpstreamInventoryHello, i+1) + for j := range i + 1 { + hello := &proto.UpstreamInventoryHello{ + Services: types.SystemRoles{types.RoleNode}.StringSlice(), + Version: fmt.Sprintf("1.2.%d", j), + ServerID: hostID, + ExternalUpgrader: types.UpgraderKindTeleportUpdate, + UpdaterInfo: &types.UpdaterV2Info{ + UpdateUUID: updaterID[:], + UpdaterStatus: types.UpdaterStatus_UPDATER_STATUS_OK, + UpdateGroup: testGroupName, + }, + } + hellos[j] = hello + + stream := newFakeControlStream() + controller.RegisterControlStream(stream, hello) + t.Cleanup(stream.close) + } + hosts[hostID] = hellos + } + + clock.Advance(2 * time.Minute) + + // Test execution: for each hostID registered, we get the handles and make sure we have the right length + // and content matches. + + opts := cmp.Options{ + cmpopts.SortSlices(func(a, b *proto.UpstreamInventoryHello) bool { return a.Version > b.Version }), + protocmp.Transform(), + } + for hostID, handles := range hosts { + result, err := auth.LookupAgentInInventory(t.Context(), hostID) + require.NoError(t, err) + require.Empty(t, cmp.Diff(handles, result, opts)) + } +} + +type fakeHandle struct { + inventory.UpstreamHandle + id int +} + +func (f *fakeHandle) Hello() *proto.UpstreamInventoryHello { + return &proto.UpstreamInventoryHello{ + Hostname: fmt.Sprintf("hostname-%d", f.id), + } +} + +func TestHandlerSampler(t *testing.T) { + filterOK := func(handle inventory.UpstreamHandle) bool { + return true + } + filterKO := func(handle inventory.UpstreamHandle) bool { + return false + } + + generateHandles := func(i int) iter.Seq[inventory.UpstreamHandle] { + return func(yield func(inventory.UpstreamHandle) bool) { + for j := range i { + yield(&fakeHandle{id: j}) + } + } + } + + const testSampleSize = 10 + // no agents to sample + // reservoir not filled yet + // reservoir filled + // filter always rejecting + tests := []struct { + name string + filter func(handle inventory.UpstreamHandle) bool + initialSeenCount int + handleFixtures iter.Seq[inventory.UpstreamHandle] + expectedSeenCount int + assertSample func(t *testing.T, reservoir []inventory.UpstreamHandle) + }{ + { + name: "filter rejects all", + filter: filterKO, + initialSeenCount: 0, + handleFixtures: generateHandles(100), + expectedSeenCount: 0, + assertSample: func(t *testing.T, reservoir []inventory.UpstreamHandle) { + require.Empty(t, reservoir) + }, + }, + { + name: "no agent to sample", + filter: filterOK, + initialSeenCount: 0, + handleFixtures: generateHandles(0), + expectedSeenCount: 0, + assertSample: func(t *testing.T, reservoir []inventory.UpstreamHandle) { + require.Empty(t, reservoir) + }, + }, + { + name: "reservoir not filled entirely", + filter: filterOK, + initialSeenCount: 0, + handleFixtures: generateHandles(5), + expectedSeenCount: 5, + assertSample: func(t *testing.T, reservoir []inventory.UpstreamHandle) { + require.Len(t, reservoir, 5) + for i, h := range reservoir { + expectedHostname := fmt.Sprintf("hostname-%d", i) + require.Equal(t, expectedHostname, h.Hello().Hostname) + } + }, + }, + { + name: "overflowing reservoir", + filter: filterOK, + initialSeenCount: 0, + handleFixtures: generateHandles(110), + expectedSeenCount: 110, + assertSample: func(t *testing.T, reservoir []inventory.UpstreamHandle) { + require.Len(t, reservoir, testSampleSize) + // the probability of no elements being sampled in the last 100 elements is 10e-200 + require.NotElementsMatch(t, slices.Collect(generateHandles(testSampleSize)), reservoir) + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + sampler := handleSampler{ + sampleSize: testSampleSize, + seenCount: test.initialSeenCount, + filter: test.filter, + sample: make([]inventory.UpstreamHandle, 0, testSampleSize), + } + for handle := range test.handleFixtures { + sampler.visit(handle) + } + + require.Equal(t, test.expectedSeenCount, sampler.seenCount) + test.assertSample(t, sampler.sample) + }) + } +} diff --git a/lib/inventory/controller.go b/lib/inventory/controller.go index 67b3e92799401..deac812ec6c69 100644 --- a/lib/inventory/controller.go +++ b/lib/inventory/controller.go @@ -20,6 +20,7 @@ package inventory import ( "context" + "iter" "log/slog" "math/rand/v2" "os" @@ -360,6 +361,11 @@ func (c *Controller) GetControlStream(serverID string) (handle UpstreamHandle, o return } +// Handles gets the handles for the given server ID if they exist. +func (c *Controller) Handles(serverID string) iter.Seq[UpstreamHandle] { + return c.store.Handles(serverID) +} + // UniqueHandles iterates across unique handles registered with this controller. // If multiple handles are registered for a given server, only // one handle is selected pseudorandomly to be observed. diff --git a/lib/inventory/store.go b/lib/inventory/store.go index 56dadb28d78be..a7d140bd117a6 100644 --- a/lib/inventory/store.go +++ b/lib/inventory/store.go @@ -20,6 +20,7 @@ package inventory import ( "hash/maphash" + "iter" "sync" "sync/atomic" ) @@ -63,6 +64,12 @@ func (s *Store) Get(serverID string) (handle UpstreamHandle, ok bool) { return s.getShard(serverID).get(serverID) } +// Handles attempts to load all known handlers for the given server ID. +// If you only need one handler, use Get instead. +func (s *Store) Handles(serverID string) iter.Seq[UpstreamHandle] { + return s.getShard(serverID).handles(serverID) +} + // Insert adds a new handle to the store. func (s *Store) Insert(handle UpstreamHandle) { s.getShard(handle.Hello().ServerID).insert(handle) @@ -144,6 +151,26 @@ func (s *shard) get(serverID string) (handle UpstreamHandle, ok bool) { return handle, true } +// handles gets all handles registered for a given hostID. +// To get a random handle, use get() instead. +func (s *shard) handles(serverID string) iter.Seq[UpstreamHandle] { + return func(yield func(UpstreamHandle) bool) { + s.rw.RLock() + defer s.rw.RUnlock() + + entry, ok := s.m[serverID] + if !ok { + return + } + + for _, handle := range entry.handles { + if !yield(handle) { + return + } + } + } +} + func (s *shard) iter(fn func(UpstreamHandle)) { s.rw.RLock() defer s.rw.RUnlock() From 3f3193ca35644daf2393b2d1ee0108f8db54e661 Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Mon, 7 Jul 2025 16:04:27 -0400 Subject: [PATCH 03/14] autoupdate canary support: tctl (#56473) * autoupdate canary support: tctl support This commits makes `tctl autoupdate agents status` display groups in the canary state properly. * add `--force` flag to `tctl autoupdate agents start-update` --- tool/tctl/common/autoupdate_command.go | 25 ++++++- tool/tctl/common/autoupdate_command_test.go | 83 +++++++++++++++++++++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/tool/tctl/common/autoupdate_command.go b/tool/tctl/common/autoupdate_command.go index 70f5707ac27bc..839dd18a6da62 100644 --- a/tool/tctl/common/autoupdate_command.go +++ b/tool/tctl/common/autoupdate_command.go @@ -69,6 +69,7 @@ type AutoUpdateCommand struct { groups []string clear bool + force bool // used for testing purposes now func() time.Time @@ -101,6 +102,7 @@ func (c *AutoUpdateCommand) Initialize(app *kingpin.Application, ccf *tctlcfg.Gl c.agentsReportCmd = agentsCmd.Command("report", "Aggregates the agent autoupdate reports and displays agent count per version and per update group.") c.agentsStartUpdateCmd = agentsCmd.Command("start-update", "Starts updating one or many groups.") c.agentsStartUpdateCmd.Arg("groups", "Groups to start updating.").StringsVar(&c.groups) + c.agentsStartUpdateCmd.Flag("force", "Skips progressive deployment mechanism such as canaries or backpressure.").BoolVar(&c.force) c.agentsMarkDoneCmd = agentsCmd.Command("mark-done", "Marks one or many groups as done updating.") c.agentsMarkDoneCmd.Arg("groups", "Groups to mark as done updating.").StringsVar(&c.groups) c.agentsRollbackCmd = agentsCmd.Command("rollback", "Rolls back one or many groups.") @@ -401,9 +403,22 @@ func rolloutGroupTable(rollout *autoupdatev1pb.AutoUpdateAgentRollout, writer io if i == len(groups)-1 { groupName = groupName + " (catch-all)" } + state := userFriendlyState(group.GetState()) + + // If this is the canary state, we annotate the group state with the canary progress + if group.GetState() == autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY { + successCount := 0 + for _, canary := range group.Canaries { + if canary.Success { + successCount++ + } + } + state = fmt.Sprintf("%s (%d/%d)", state, successCount, group.CanaryCount) + } + table.AddRow([]string{ groupName, - userFriendlyState(group.GetState()), + state, formatTimeIfNotEmpty(group.GetStartTime().AsTime(), time.DateTime), group.GetLastUpdateReason(), strconv.FormatUint(groupCount, 10), @@ -440,7 +455,11 @@ func (c *AutoUpdateCommand) agentsStartUpdateCommand(ctx context.Context, client return trace.BadParameter("no groups specified") } - rollout, err := client.TriggerAutoUpdateAgentGroup(ctx, groups, autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED) + state := autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED + if c.force { + state = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE + } + rollout, err := client.TriggerAutoUpdateAgentGroup(ctx, groups, state) if err != nil { return trace.Wrap(err) } @@ -521,6 +540,8 @@ func userFriendlyState[T autoupdatev1pb.AutoUpdateAgentGroupState | autoupdatev1 return "Done" case 4: return "Rolledback" + case 5: + return "Canary" default: // If we don't know anything about this state, we display its integer return fmt.Sprintf("Unknown state (%d)", state) diff --git a/tool/tctl/common/autoupdate_command_test.go b/tool/tctl/common/autoupdate_command_test.go index f95d9a180059e..74fc7bbee3253 100644 --- a/tool/tctl/common/autoupdate_command_test.go +++ b/tool/tctl/common/autoupdate_command_test.go @@ -24,6 +24,7 @@ import ( "testing" "time" + "github.com/google/uuid" "github.com/gravitational/trace" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -347,6 +348,88 @@ Group Name State Start Time State Reason Agent Count Up-to- dev Done 2025-01-15 12:00:00 outside_window 1023 567 stage Active 2025-01-15 14:00:00 in_window 0 0 prod (catch-all) Unstarted outside_window 789 0 +`, + }, + { + name: "rollout regular schedule halt-on-error with progress, with canary", + fixture: &autoupdatepb.AutoUpdateAgentRollout{ + Spec: &autoupdatepb.AutoUpdateAgentRolloutSpec{ + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + Schedule: autoupdate.AgentsScheduleRegular, + AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + }, + Status: &autoupdatepb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatepb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: "dev", + StartTime: timestamppb.New(time.Date(2025, 1, 15, 12, 00, 0, 0, time.UTC)), + State: autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE, + LastUpdateTime: nil, + LastUpdateReason: "outside_window", + ConfigDays: []string{"Mon", "Tue", "Wed", "Thu", "Fri"}, + ConfigStartHour: 8, + PresentCount: 1023, + UpToDateCount: 567, + InitialCount: 1012, + }, + { + Name: "stage", + StartTime: timestamppb.New(time.Date(2025, 1, 15, 14, 00, 0, 0, time.UTC)), + State: autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateReason: "in_window", + ConfigDays: []string{"Mon", "Tue", "Wed", "Thu", "Fri"}, + ConfigStartHour: 14, + CanaryCount: 5, + Canaries: []*autoupdatepb.Canary{ + { + UpdaterId: uuid.NewString(), + HostId: uuid.NewString(), + Hostname: "host-1", + Success: true, + }, + { + UpdaterId: uuid.NewString(), + HostId: uuid.NewString(), + Hostname: "host-2", + Success: false, + }, + { + UpdaterId: uuid.NewString(), + HostId: uuid.NewString(), + Hostname: "host-3", + Success: true, + }, + }, + }, + { + Name: "prod", + StartTime: nil, + State: autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateReason: "outside_window", + ConfigDays: []string{"Mon", "Tue", "Wed", "Thu", "Fri"}, + ConfigStartHour: 18, + PresentCount: 789, + }, + }, + State: autoupdatepb.AutoUpdateAgentRolloutState_AUTO_UPDATE_AGENT_ROLLOUT_STATE_ACTIVE, + StartTime: timestamppb.New(time.Date(2025, 1, 15, 2, 0, 0, 0, time.UTC)), + TimeOverride: nil, + }, + }, + expectedOutput: `Agent autoupdate mode: enabled +Rollout creation date: 2025-01-15 02:00:00 +Start version: 1.2.3 +Target version: 1.2.4 +Rollout state: Active +Strategy: halt-on-error + +Group Name State Start Time State Reason Agent Count Up-to-date +---------------- ------------ ------------------- -------------- ----------- ---------- +dev Done 2025-01-15 12:00:00 outside_window 1023 567 +stage Canary (2/5) 2025-01-15 14:00:00 in_window 0 0 +prod (catch-all) Unstarted outside_window 789 0 `, }, } From 0b114cb46959a6e3b619452c023e93f8353ddd8d Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Tue, 8 Jul 2025 15:40:53 -0400 Subject: [PATCH 04/14] autoupdate canary support: modulate proxy response (#56468) This commit makes the TEleport Proxcy service find and pind endpoints fetch the updater ID from the request parameters and lookup if the requestor is a canary. If it is, the requestor will be told to update. --- lib/web/apiserver.go | 11 +- lib/web/apiserver_ping_test.go | 227 ++++++++++++++++++++++++++---- lib/web/autoupdate_common.go | 19 +++ lib/web/autoupdate_common_test.go | 160 ++++++++++++++++++++- 4 files changed, 390 insertions(+), 27 deletions(-) diff --git a/lib/web/apiserver.go b/lib/web/apiserver.go index f3dea4adf9718..285f79e868dbf 100644 --- a/lib/web/apiserver.go +++ b/lib/web/apiserver.go @@ -1627,6 +1627,7 @@ func (h *Handler) ping(w http.ResponseWriter, r *http.Request, p httprouter.Para } group := r.URL.Query().Get(webclient.AgentUpdateGroupParameter) + updaterID := r.URL.Query().Get(webclient.AgentUpdateIDParameter) return webclient.PingResponse{ Auth: authSettings, @@ -1635,7 +1636,7 @@ func (h *Handler) ping(w http.ResponseWriter, r *http.Request, p httprouter.Para MinClientVersion: teleport.MinClientSemVer().String(), ClusterName: h.auth.clusterName, AutomaticUpgrades: pr.ServerFeatures.GetAutomaticUpgrades(), - AutoUpdate: h.automaticUpdateSettings184(r.Context(), group, "" /* updater UUID */), + AutoUpdate: h.automaticUpdateSettings184(r.Context(), group, updaterID), Edition: modules.GetModules().BuildType(), FIPS: modules.IsBoringBinary(), }, nil @@ -1674,6 +1675,14 @@ func (h *Handler) find(w http.ResponseWriter, r *http.Request, p httprouter.Para if err != nil { return nil, trace.Wrap(err) } + + // Now we modulate the autoupdate answer on a per-request basis. + // We don't want to cache one answer per updater UUID, so we take the + // cached result and just override what we must. + updaterID := r.URL.Query().Get(webclient.AgentUpdateIDParameter) + if updaterID != "" { + resp.AutoUpdate = h.automaticUpdateSettings184(r.Context(), group, updaterID) + } return resp, nil } diff --git a/lib/web/apiserver_ping_test.go b/lib/web/apiserver_ping_test.go index 4b5ea96a0e0eb..b9e55e936a9c3 100644 --- a/lib/web/apiserver_ping_test.go +++ b/lib/web/apiserver_ping_test.go @@ -25,6 +25,7 @@ import ( "net/url" "testing" + "github.com/google/uuid" "github.com/gravitational/roundtrip" "github.com/gravitational/trace" "github.com/stretchr/testify/assert" @@ -34,6 +35,7 @@ import ( "github.com/gravitational/teleport/api/client/webclient" "github.com/gravitational/teleport/api/constants" autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" + headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/api/types/autoupdate" "github.com/gravitational/teleport/lib/client" @@ -284,23 +286,23 @@ func TestPing_minimalAPI(t *testing.T) { // TestPing_autoUpdateResources tests that find endpoint return correct data related to auto updates. func TestPing_autoUpdateResources(t *testing.T) { - env := newWebPack(t, 1, func(cfg *proxyConfig) { - cfg.minimalHandler = true - }) + env := newWebPack(t, 1) proxy := env.proxies[0] ctx, cancel := context.WithCancel(context.Background()) defer cancel() - req, err := http.NewRequest(http.MethodGet, proxy.newClient(t).Endpoint("webapi", "find"), nil) - require.NoError(t, err) - req.Host = proxy.handler.handler.cfg.ProxyPublicAddrs[0].Host() + testGroup := "test-group" + testUpdaterID := uuid.NewString() + // Note: we are not using webclient.Ping() here because we don't have a valid certificate for 127.0.0.1 and + // The webclient doesn't support being passed custom transport. + clt := proxy.newClient(t) tests := []struct { name string config *autoupdatev1pb.AutoUpdateConfigSpec version *autoupdatev1pb.AutoUpdateVersionSpec - rollout *autoupdatev1pb.AutoUpdateAgentRolloutSpec + rollout *autoupdatev1pb.AutoUpdateAgentRollout cleanup bool expected webclient.AutoUpdateSettings }{ @@ -332,12 +334,17 @@ func TestPing_autoUpdateResources(t *testing.T) { }, { name: "enable agent auto update, immediate schedule", - rollout: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ - AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, - Strategy: autoupdate.AgentsStrategyHaltOnError, - Schedule: autoupdate.AgentsScheduleImmediate, - StartVersion: "1.2.3", - TargetVersion: "1.2.4", + rollout: &autoupdatev1pb.AutoUpdateAgentRollout{ + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ + AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + Schedule: autoupdate.AgentsScheduleImmediate, + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + }, }, expected: webclient.AutoUpdateSettings{ ToolsVersion: api.Version, @@ -350,12 +357,17 @@ func TestPing_autoUpdateResources(t *testing.T) { }, { name: "agent rollout present but AU mode is disabled", - rollout: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ - AutoupdateMode: autoupdate.AgentsUpdateModeDisabled, - Strategy: autoupdate.AgentsStrategyHaltOnError, - Schedule: autoupdate.AgentsScheduleImmediate, - StartVersion: "1.2.3", - TargetVersion: "1.2.4", + rollout: &autoupdatev1pb.AutoUpdateAgentRollout{ + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ + AutoupdateMode: autoupdate.AgentsUpdateModeDisabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + Schedule: autoupdate.AgentsScheduleImmediate, + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + }, }, expected: webclient.AutoUpdateSettings{ ToolsVersion: api.Version, @@ -434,6 +446,171 @@ func TestPing_autoUpdateResources(t *testing.T) { AgentAutoUpdate: false, AgentVersion: api.Version, }, + cleanup: true, + }, + { + name: "group must be updated", + rollout: &autoupdatev1pb.AutoUpdateAgentRollout{ + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ + AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + Schedule: autoupdate.AgentsScheduleRegular, + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + }, + Status: &autoupdatev1pb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatev1pb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: testGroup, + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + ConfigDays: []string{"*"}, + }, + { + Name: "unstarted", + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + ConfigDays: []string{"*"}, + }, + }, + }, + }, + expected: webclient.AutoUpdateSettings{ + ToolsVersion: api.Version, + ToolsAutoUpdate: false, + AgentVersion: "1.2.4", + AgentAutoUpdate: true, + AgentUpdateJitterSeconds: DefaultAgentUpdateJitterSeconds, + }, + cleanup: true, + }, + { + name: "group must not be updated", + rollout: &autoupdatev1pb.AutoUpdateAgentRollout{ + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ + AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + Schedule: autoupdate.AgentsScheduleRegular, + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + }, + Status: &autoupdatev1pb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatev1pb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: "done", + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + ConfigDays: []string{"*"}, + }, + { + Name: testGroup, + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + ConfigDays: []string{"*"}, + }, + }, + }, + }, + expected: webclient.AutoUpdateSettings{ + ToolsVersion: api.Version, + ToolsAutoUpdate: false, + AgentVersion: "1.2.3", + AgentAutoUpdate: false, + AgentUpdateJitterSeconds: DefaultAgentUpdateJitterSeconds, + }, + cleanup: true, + }, + { + name: "canary must be updated", + rollout: &autoupdatev1pb.AutoUpdateAgentRollout{ + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ + AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + Schedule: autoupdate.AgentsScheduleRegular, + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + }, + Status: &autoupdatev1pb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatev1pb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: testGroup, + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + ConfigDays: []string{"*"}, + Canaries: []*autoupdatev1pb.Canary{ + { + UpdaterId: testUpdaterID, + HostId: uuid.NewString(), + Hostname: "test-host", + Success: false, + }, + }, + }, + { + Name: "unstarted", + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + ConfigDays: []string{"*"}, + }, + }, + }, + }, + expected: webclient.AutoUpdateSettings{ + ToolsVersion: api.Version, + ToolsAutoUpdate: false, + AgentVersion: "1.2.4", + AgentAutoUpdate: true, + AgentUpdateJitterSeconds: DefaultAgentUpdateJitterSeconds, + }, + cleanup: true, + }, + { + name: "canary must not be updated", + rollout: &autoupdatev1pb.AutoUpdateAgentRollout{ + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateAgentRollout, + }, + Spec: &autoupdatev1pb.AutoUpdateAgentRolloutSpec{ + AutoupdateMode: autoupdate.AgentsUpdateModeEnabled, + Strategy: autoupdate.AgentsStrategyHaltOnError, + Schedule: autoupdate.AgentsScheduleRegular, + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + }, + Status: &autoupdatev1pb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatev1pb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: testGroup, + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + Canaries: []*autoupdatev1pb.Canary{ + { + UpdaterId: uuid.NewString(), + HostId: uuid.NewString(), + Hostname: "test-host", + Success: false, + }, + }, + ConfigDays: []string{"*"}, + }, + { + Name: "unstarted", + State: autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + ConfigDays: []string{"*"}, + }, + }, + }, + }, + expected: webclient.AutoUpdateSettings{ + ToolsVersion: api.Version, + ToolsAutoUpdate: false, + AgentVersion: "1.2.3", + AgentAutoUpdate: false, + AgentUpdateJitterSeconds: DefaultAgentUpdateJitterSeconds, + }, + cleanup: true, }, } for _, tc := range tests { @@ -451,9 +628,7 @@ func TestPing_autoUpdateResources(t *testing.T) { require.NoError(t, err) } if tc.rollout != nil { - rollout, err := autoupdate.NewAutoUpdateAgentRollout(tc.rollout) - require.NoError(t, err) - _, err = env.server.Auth().UpsertAutoUpdateAgentRollout(ctx, rollout) + _, err := env.server.Auth().UpsertAutoUpdateAgentRollout(ctx, tc.rollout) require.NoError(t, err) } @@ -462,12 +637,14 @@ func TestPing_autoUpdateResources(t *testing.T) { proxy.clock.Advance(2 * findEndpointCacheTTL) } - resp, err := client.NewInsecureWebClient().Do(req) + resp, err := clt.Get(ctx, clt.Endpoint("webapi", "ping"), url.Values{ + webclient.AgentUpdateIDParameter: []string{testUpdaterID}, + webclient.AgentUpdateGroupParameter: []string{testGroup}, + }) require.NoError(t, err) pr := &webclient.PingResponse{} - require.NoError(t, json.NewDecoder(resp.Body).Decode(pr)) - require.NoError(t, resp.Body.Close()) + require.NoError(t, json.NewDecoder(resp.Reader()).Decode(pr)) assert.Equal(t, tc.expected, pr.AutoUpdate) diff --git a/lib/web/autoupdate_common.go b/lib/web/autoupdate_common.go index c469aefb52034..0d55a62842f7c 100644 --- a/lib/web/autoupdate_common.go +++ b/lib/web/autoupdate_common.go @@ -125,11 +125,28 @@ func getVersionFromRollout( case autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: return version.EnsureSemver(rollout.GetSpec().GetTargetVersion()) + case autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: + if updaterIsCanary(group, updaterUUID) { + return version.EnsureSemver(rollout.GetSpec().GetTargetVersion()) + } + return version.EnsureSemver(rollout.GetSpec().GetStartVersion()) default: return nil, trace.NotImplemented("unsupported group state %q", group.GetState()) } } +func updaterIsCanary(group *autoupdatepb.AutoUpdateAgentRolloutStatusGroup, updaterUUID string) bool { + if updaterUUID == "" { + return false + } + for _, canary := range group.GetCanaries() { + if canary.UpdaterId == updaterUUID { + return true + } + } + return false +} + // getTriggerFromRollout returns the version we should serve to the agent based // on the RFD184 agent rollout, the agent group name, and its UUID. // This logic is pretty complex and described in RFD 184. @@ -166,6 +183,8 @@ func getTriggerFromRollout(rollout *autoupdatepb.AutoUpdateAgentRollout, groupNa return true, nil case autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: return rollout.GetSpec().GetStrategy() == autoupdate.AgentsStrategyHaltOnError, nil + case autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: + return updaterIsCanary(group, updaterUUID), nil default: return false, trace.NotImplemented("Unsupported group state %q", group.GetState()) } diff --git a/lib/web/autoupdate_common_test.go b/lib/web/autoupdate_common_test.go index b84070d115f1d..b65d0ba36a3ce 100644 --- a/lib/web/autoupdate_common_test.go +++ b/lib/web/autoupdate_common_test.go @@ -23,11 +23,13 @@ import ( "fmt" "net/http" "net/http/httptest" + "strconv" "strings" "testing" "time" "github.com/coreos/go-semver/semver" + "github.com/google/uuid" "github.com/gravitational/trace" "github.com/jonboulle/clockwork" "github.com/stretchr/testify/assert" @@ -365,6 +367,7 @@ func TestGetVersionFromRollout(t *testing.T) { autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: testVersionHigh, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: testVersionHigh, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: testVersionHigh, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: testVersionHigh, } activeDoneOnly := map[autoupdatepb.AutoUpdateAgentGroupState]string{ @@ -372,6 +375,15 @@ func TestGetVersionFromRollout(t *testing.T) { autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: testVersionHigh, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: testVersionHigh, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: testVersionLow, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: testVersionLow, + } + + activeCanaryDone := map[autoupdatepb.AutoUpdateAgentGroupState]string{ + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: testVersionLow, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: testVersionHigh, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: testVersionHigh, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: testVersionLow, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: testVersionHigh, } tests := map[string]map[string]map[autoupdatepb.AutoUpdateAgentGroupState]string{ @@ -394,7 +406,6 @@ func TestGetVersionFromRollout(t *testing.T) { t.Run(fmt.Sprintf("%s/%s/%s", mode, schedule, state), func(t *testing.T) { expectedSemVersion, err := version.EnsureSemver(expectedVersion) require.NoError(t, err) - rollout := &autoupdatepb.AutoUpdateAgentRollout{ Spec: &autoupdatepb.AutoUpdateAgentRolloutSpec{ StartVersion: testVersionLow, @@ -420,6 +431,63 @@ func TestGetVersionFromRollout(t *testing.T) { } } } + + canaryTestCases := map[bool]map[autoupdatepb.AutoUpdateAgentGroupState]string{ + true: activeCanaryDone, + false: activeDoneOnly, + } + + for canaryMatching, statesCases := range canaryTestCases { + const ( + schedule = autoupdate.AgentsScheduleRegular + mode = autoupdate.AgentsUpdateModeEnabled + ) + + for state, expectedVersion := range statesCases { + t.Run(fmt.Sprintf("canary(%s)/%s", strconv.FormatBool(canaryMatching), state), func(t *testing.T) { + expectedSemVersion, err := version.EnsureSemver(expectedVersion) + require.NoError(t, err) + + rollout := &autoupdatepb.AutoUpdateAgentRollout{ + Spec: &autoupdatepb.AutoUpdateAgentRolloutSpec{ + StartVersion: testVersionLow, + TargetVersion: testVersionHigh, + Schedule: schedule, + AutoupdateMode: mode, + // Strategy does not affect which version are served + Strategy: autoupdate.AgentsStrategyTimeBased, + }, + Status: &autoupdatepb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatepb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: groupName, + State: state, + Canaries: []*autoupdatepb.Canary{ + { + UpdaterId: uuid.NewString(), + HostId: uuid.NewString(), + Hostname: "test-host", + Success: false, + }, + }, + }, + }, + }, + } + var updaterID string + if canaryMatching { + updaterID = rollout.GetStatus().GetGroups()[0].GetCanaries()[0].GetUpdaterId() + } else { + updaterID = uuid.NewString() + } + version, err := getVersionFromRollout(rollout, groupName, updaterID) + require.NoError(t, err) + require.Equal(t, expectedSemVersion, version) + }) + + } + + } } func TestGetTriggerFromRollout(t *testing.T) { @@ -433,12 +501,14 @@ func TestGetTriggerFromRollout(t *testing.T) { autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: false, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: false, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: false, } alwaysUpdate := map[autoupdatepb.AutoUpdateAgentGroupState]bool{ autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: true, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: true, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: true, } tests := map[string]map[string]map[string]map[autoupdatepb.AutoUpdateAgentGroupState]bool{ @@ -470,6 +540,7 @@ func TestGetTriggerFromRollout(t *testing.T) { autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: false, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: false, }, }, autoupdate.AgentsStrategyHaltOnError: { @@ -479,6 +550,7 @@ func TestGetTriggerFromRollout(t *testing.T) { autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: true, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: false, }, }, }, @@ -513,6 +585,92 @@ func TestGetTriggerFromRollout(t *testing.T) { } } } + + canaryTestCases := map[bool]map[string]map[autoupdatepb.AutoUpdateAgentGroupState]bool{ + true: { + autoupdate.AgentsStrategyTimeBased: { + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: true, + }, + autoupdate.AgentsStrategyHaltOnError: { + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: true, + }, + }, + false: { + autoupdate.AgentsStrategyTimeBased: { + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: false, + }, + autoupdate.AgentsStrategyHaltOnError: { + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: false, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: true, + autoupdatepb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: false, + }, + }, + } + + for canaryMatching, strategyCases := range canaryTestCases { + const ( + schedule = autoupdate.AgentsScheduleRegular + mode = autoupdate.AgentsUpdateModeEnabled + ) + + for strategy, statesCases := range strategyCases { + for state, expectedTrigger := range statesCases { + t.Run(fmt.Sprintf("canary(%s)/%s/%s", strconv.FormatBool(canaryMatching), strategy, state), func(t *testing.T) { + rollout := &autoupdatepb.AutoUpdateAgentRollout{ + Spec: &autoupdatepb.AutoUpdateAgentRolloutSpec{ + StartVersion: testVersionLow, + TargetVersion: testVersionHigh, + Schedule: schedule, + AutoupdateMode: mode, + Strategy: strategy, + }, + Status: &autoupdatepb.AutoUpdateAgentRolloutStatus{ + Groups: []*autoupdatepb.AutoUpdateAgentRolloutStatusGroup{ + { + Name: groupName, + State: state, + Canaries: []*autoupdatepb.Canary{ + { + UpdaterId: uuid.NewString(), + HostId: uuid.NewString(), + Hostname: "test-host", + Success: false, + }, + }, + }, + }, + }, + } + var updaterID string + if canaryMatching { + updaterID = rollout.GetStatus().GetGroups()[0].GetCanaries()[0].GetUpdaterId() + } else { + updaterID = uuid.NewString() + } + trigger, err := getTriggerFromRollout(rollout, groupName, updaterID) + require.NoError(t, err) + require.Equal(t, expectedTrigger, trigger) + }) + + } + + } + + } } func TestGetGroup(t *testing.T) { From 53151a586e1deba5ce75e5e440da1517368235b5 Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Thu, 17 Jul 2025 17:52:39 -0400 Subject: [PATCH 05/14] autoupdate canary support: rollout controller (#56467) * autoupdate canary support: rollout controller This commit adds canary support to the autoupdate_agent_rollout controller when the strategy is "halt-on-error". * Apply suggestions from code review --- lib/autoupdate/rollout/client.go | 8 + lib/autoupdate/rollout/client_test.go | 54 ++- lib/autoupdate/rollout/controller.go | 1 - lib/autoupdate/rollout/strategy.go | 3 +- .../rollout/strategy_haltonerror.go | 169 ++++++- .../rollout/strategy_haltonerror_test.go | 414 +++++++++++++++++- lib/autoupdate/rollout/transitions.go | 56 ++- 7 files changed, 661 insertions(+), 44 deletions(-) diff --git a/lib/autoupdate/rollout/client.go b/lib/autoupdate/rollout/client.go index d0fbbfaeae427..b79c08e3c0db0 100644 --- a/lib/autoupdate/rollout/client.go +++ b/lib/autoupdate/rollout/client.go @@ -24,6 +24,7 @@ import ( "github.com/gravitational/trace" + "github.com/gravitational/teleport/api/client/proto" "github.com/gravitational/teleport/api/constants" autoupdatepb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" "github.com/gravitational/teleport/api/types" @@ -55,6 +56,13 @@ type Client interface { // ListAutoUpdateAgentReports lists the autoupdate_agent_report resources // so the controller can measure the rollout progress. ListAutoUpdateAgentReports(ctx context.Context, pageSize int, nextKey string) ([]*autoupdatepb.AutoUpdateAgentReport, string, error) + + // SampleAgentsFromAutoUpdateGroup samples agents belonging to a specific autoupdate group. + // This is used to pick canaries. + SampleAgentsFromAutoUpdateGroup(ctx context.Context, groupName string, sampleSize int, groups []string) ([]*autoupdatepb.Canary, error) + + // LookupAgentInInventory looks up a specific HostID in the auth local inventory and returns its Hello message. + LookupAgentInInventory(ctx context.Context, hostID string) ([]*proto.UpstreamInventoryHello, error) } func getAllReports(ctx context.Context, clt Client) ([]*autoupdatepb.AutoUpdateAgentReport, error) { diff --git a/lib/autoupdate/rollout/client_test.go b/lib/autoupdate/rollout/client_test.go index 49a84d85555c1..76b4366c4607a 100644 --- a/lib/autoupdate/rollout/client_test.go +++ b/lib/autoupdate/rollout/client_test.go @@ -26,22 +26,28 @@ import ( "github.com/stretchr/testify/require" "google.golang.org/protobuf/proto" + apiproto "github.com/gravitational/teleport/api/client/proto" "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" "github.com/gravitational/teleport/api/types" apiutils "github.com/gravitational/teleport/api/utils" ) +// make sure our test client implements the Client interface. +var _ Client = &testifyMockClient{} + type mockClientStubs struct { - configAnswers []callAnswer[*autoupdate.AutoUpdateConfig] - versionAnswers []callAnswer[*autoupdate.AutoUpdateVersion] - rolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout] - createRolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout] - createRolloutExpects []require.ValueAssertionFunc - updateRolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout] - updateRolloutExpects []require.ValueAssertionFunc - deleteRolloutAnswers []error - cmcAnswers []callAnswer[*types.ClusterMaintenanceConfigV1] - reportsAnswers []callAnswer[[]*autoupdate.AutoUpdateAgentReport] + configAnswers []callAnswer[*autoupdate.AutoUpdateConfig] + versionAnswers []callAnswer[*autoupdate.AutoUpdateVersion] + rolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout] + createRolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout] + createRolloutExpects []require.ValueAssertionFunc + updateRolloutAnswers []callAnswer[*autoupdate.AutoUpdateAgentRollout] + updateRolloutExpects []require.ValueAssertionFunc + deleteRolloutAnswers []error + cmcAnswers []callAnswer[*types.ClusterMaintenanceConfigV1] + reportsAnswers []callAnswer[[]*autoupdate.AutoUpdateAgentReport] + agentSamples []callAnswer[[]*autoupdate.Canary] + inventoryAgentLookups map[string][]callAnswer[[]*apiproto.UpstreamInventoryHello] } type callAnswer[T any] struct { @@ -83,6 +89,14 @@ func newMockClient(t *testing.T, stubs mockClientStubs) *testifyMockClient { for _, answer := range stubs.reportsAnswers { clt.On("ListAutoUpdateAgentReports", mock.Anything, mock.Anything, mock.Anything).Return(answer.result, answer.err).Once() } + for _, answer := range stubs.agentSamples { + clt.On("SampleAgentsFromAutoUpdateGroup", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(answer.result, answer.err).Once() + } + for host := range stubs.inventoryAgentLookups { + for _, answer := range stubs.inventoryAgentLookups[host] { + clt.On("LookupAgentInInventory", mock.Anything, host).Return(answer.result, answer.err).Once() + } + } return clt } @@ -93,6 +107,20 @@ type testifyMockClient struct { stubs mockClientStubs } +func (n *testifyMockClient) SampleAgentsFromAutoUpdateGroup(ctx context.Context, groupName string, sampleSize int, groups []string) ([]*autoupdate.Canary, error) { + args := n.Called(ctx, groupName, sampleSize, groups) + var canaries []*autoupdate.Canary + for _, canary := range args.Get(0).([]*autoupdate.Canary) { + canaries = append(canaries, proto.CloneOf(canary)) + } + return canaries, args.Error(1) +} + +func (n *testifyMockClient) LookupAgentInInventory(ctx context.Context, hostID string) ([]*apiproto.UpstreamInventoryHello, error) { + args := n.Called(ctx, hostID) + return args.Get(0).([]*apiproto.UpstreamInventoryHello), args.Error(1) +} + func (n *testifyMockClient) GetAutoUpdateConfig(ctx context.Context) (*autoupdate.AutoUpdateConfig, error) { args := n.Called(ctx) result := proto.Clone(args.Get(0).(*autoupdate.AutoUpdateConfig)) @@ -153,4 +181,10 @@ func (n *testifyMockClient) checkIfCallsWereDone(t *testing.T) { n.AssertNumberOfCalls(t, "DeleteAutoUpdateAgentRollout", len(n.stubs.deleteRolloutAnswers)) n.AssertNumberOfCalls(t, "GetClusterMaintenanceConfig", len(n.stubs.cmcAnswers)) n.AssertNumberOfCalls(t, "ListAutoUpdateAgentReports", len(n.stubs.reportsAnswers)) + n.AssertNumberOfCalls(t, "SampleAgentsFromAutoUpdateGroup", len(n.stubs.agentSamples)) + var agentLookupCount int + for _, answers := range n.stubs.inventoryAgentLookups { + agentLookupCount += len(answers) + } + n.AssertNumberOfCalls(t, "LookupAgentInInventory", agentLookupCount) } diff --git a/lib/autoupdate/rollout/controller.go b/lib/autoupdate/rollout/controller.go index 4a460f391b44b..120854cce82c0 100644 --- a/lib/autoupdate/rollout/controller.go +++ b/lib/autoupdate/rollout/controller.go @@ -41,7 +41,6 @@ const ( // We currently wake up every minute, in the future we might decide to also watch for events // (from autoupdate_config and autoupdate_version changefeed) to react faster. type Controller struct { - // TODO(hugoShaka) add prometheus metrics describing the reconciliation status reconciler reconciler clock clockwork.Clock log *slog.Logger diff --git a/lib/autoupdate/rollout/strategy.go b/lib/autoupdate/rollout/strategy.go index d5b8236ce8f90..3b3b524e54275 100644 --- a/lib/autoupdate/rollout/strategy.go +++ b/lib/autoupdate/rollout/strategy.go @@ -98,7 +98,8 @@ func setGroupState(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, newState group.State = newState changed = true // If we just started the group, also update the start time - if newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE { + if newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE || + newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY { group.StartTime = timestamppb.New(now) } } diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go index b05b79827c234..51bbceb87729f 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror.go +++ b/lib/autoupdate/rollout/strategy_haltonerror.go @@ -24,10 +24,13 @@ import ( "slices" "time" + "github.com/coreos/go-semver/semver" "github.com/gravitational/trace" + "github.com/gravitational/teleport/api/client/proto" "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" update "github.com/gravitational/teleport/api/types/autoupdate" + "github.com/gravitational/teleport/lib/automaticupgrades/version" ) const ( @@ -36,6 +39,8 @@ const ( updateReasonPreviousGroupsNotDone = "previous_groups_not_done" updateReasonUpdateComplete = "update_complete" updateReasonUpdateInProgress = "update_in_progress" + updateReasonCanariesAlive = "canaries_are_alive" + updateReasonWaitingForCanaries = "waiting_for_canaries" haltOnErrorWindowDuration = time.Hour ) @@ -128,8 +133,35 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, spec *autoupd default: // All previous groups are DONE and time-related criteria are met. // We can start. - setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) - group.InitialCount = uint64(agentCount) + + // We pass the list of groups to the sampler because it must compute the catch-call group + groups := make([]string, len(status.Groups)) + for j, g := range status.Groups { + groups[j] = g.GetName() + } + h.startGroup(ctx, group, now, agentCount, status) + } + previousGroupsAreDone = false + case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: + // Sample the canaries if they were not sampled yet. + if err := h.sampleCanaries(ctx, group, status); err != nil { + return trace.Wrap(err, "failed to sample canaries") + } + // Check if the canaries are back online and running the right version + targetVersion, err := version.EnsureSemver(spec.GetTargetVersion()) + if err != nil { + return trace.Wrap(err, "failed to parse target version, rollout is malformed") + } + successfulCanaries := h.updateCanariesStatus(ctx, group, *targetVersion) + + // If all canaries are OK, we can transition to the active state + if successfulCanaries == int(group.CanaryCount) { + h.log.DebugContext(ctx, "All canaries came back alive, transitioning to the active state", "group", group, "got", successfulCanaries, "want", int(group.CanaryCount)) + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanariesAlive, now) + } else { + h.log.DebugContext(ctx, "Not all canaries came back yet, staying into canary state", "group", group, "got", successfulCanaries, "want", int(group.CanaryCount)) + setGroupState(group, group.State, updateReasonWaitingForCanaries, now) + } previousGroupsAreDone = false case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: @@ -157,6 +189,139 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, spec *autoupd return nil } +const ( + canaryCount = 5 + canaryThreshold = 20 +) + +func shouldUseCanaries(currentCount int) bool { + // in the future we might change this logic to be a multiple of the required canary count + // and make the canary count dynamic + return currentCount >= canaryThreshold +} + +func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, agentCount int, status *autoupdate.AutoUpdateAgentRolloutStatus) { + group.InitialCount = uint64(agentCount) + + if !shouldUseCanaries(agentCount) { + h.log.DebugContext(ctx, "Skipping canary rollout, transitioning directly to the active state", "group", group.Name) + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) + return + } + + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, updateReasonCanStart, now) + // This is a small optimization, as we just transitioned into the canary state we can sample canaries. + // This will allow us to start updating without having to wait for the next reconciliation cycle. + if err := h.sampleCanaries(ctx, group, status); err != nil { + h.log.WarnContext(ctx, "Failed to sample canaries", "group", group.Name) + } +} + +func (h *haltOnErrorStrategy) sampleCanaries(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, status *autoupdate.AutoUpdateAgentRolloutStatus) error { + if group.CanaryCount == 0 { + group.CanaryCount = canaryCount + } + // Check if we need to pick more canaries + if len(group.Canaries) < int(group.CanaryCount) { + previousLength := len(group.Canaries) + h.log.DebugContext(ctx, "Group is missing canaries, sampling some more", "group", group, "got", previousLength, "want", int(group.CanaryCount)) + + // We pass the list of groups to the sampler because it must compute the catch-all group. + groups := make([]string, len(status.Groups)) + for j, g := range status.Groups { + groups[j] = g.GetName() + } + // We sample as many canaries as possible instead of just the missing ones + // Because we might sample an already sampled canary. + additionalCanaries, err := h.clt.SampleAgentsFromAutoUpdateGroup(ctx, group.Name, int(group.CanaryCount), groups) + if err != nil { + return trace.Wrap(err) + } + injectCanaries(group, additionalCanaries) + h.log.DebugContext(ctx, "Additional canaries sampled", "group", group, "before", previousLength, "after", len(group.Canaries)) + } else { + h.log.DebugContext(ctx, "Canaries already sampled", "group", group.Name, "got", len(group.Canaries)) + } + return nil +} + +func injectCanaries(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, additionalCanaries []*autoupdate.Canary) { + for _, canary := range additionalCanaries { + // We first check if the canary has already been sampled + alreadySampled := false + for _, existingCanary := range group.Canaries { + if existingCanary.UpdaterId == canary.UpdaterId { + alreadySampled = true + break + } + } + + // If it was not, great, we have a new canary. + if !alreadySampled { + group.Canaries = append(group.Canaries, canary) + } + + // Stop adding canaries once we have the right amount + if len(group.Canaries) == int(group.CanaryCount) { + return + } + } +} + +func (h *haltOnErrorStrategy) updateCanariesStatus(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, targetVersion semver.Version) int { + h.log.DebugContext(ctx, "Checking canaries", "group", group.Name) + var successfulCanaries int + for _, canary := range group.Canaries { + // If the canary already came back healthy, nothing to do + if canary.Success { + successfulCanaries++ + continue + } + + canaryLogInfo := slog.Group("canary", "host_id", canary.HostId, "updater_id", canary.UpdaterId, "hostname", canary.Hostname) + log := h.log.With(canaryLogInfo).With("group", group.Name) + + // Check if the canary is connected to our auth + hellos, err := h.clt.LookupAgentInInventory(ctx, canary.HostId) + if err != nil { + if trace.IsNotFound(err) { + // Canary is not registered to our Auth Service. + // Note: One old canary instance might still be connected to the auth, + // be we are ignoring terminating instances. + log.DebugContext(ctx, "Node not connected") + } else { + h.log.WarnContext(ctx, "Failed to lookup agent") + } + continue + } + + if canaryIsRunningTargetVersion(ctx, hellos, targetVersion, log) { + canary.Success = true + successfulCanaries++ + } + } + return successfulCanaries +} + +// canaryIsRunningTargetVersion returns true if at least one of the Hellos indicates +// the canary is running the target version. +func canaryIsRunningTargetVersion(ctx context.Context, hellos []*proto.UpstreamInventoryHello, targetVersion semver.Version, log *slog.Logger) bool { + for _, hello := range hellos { + canaryVersion, err := version.EnsureSemver(hello.Version) + if err != nil { + log.WarnContext(ctx, "Failed to parse canary version", "err", err, "current_version", hello.Version) + continue + } + if !targetVersion.Equal(*canaryVersion) { + log.DebugContext(ctx, "Canary is not running the target version", "current_version", canaryVersion, "expected_version", targetVersion) + continue + } + log.DebugContext(ctx, "Canary is running the target version, marking it healthy", "current_version", canaryVersion, "expected_version", targetVersion) + return true + } + return false +} + func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) { // check wait hours if group.ConfigWaitHours != 0 { diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go index a5bd97d47869b..15a48c4d68f4c 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror_test.go +++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go @@ -20,16 +20,22 @@ package rollout import ( "context" + "fmt" "testing" "time" + "github.com/google/go-cmp/cmp" + "github.com/google/uuid" "github.com/gravitational/trace" "github.com/jonboulle/clockwork" "github.com/stretchr/testify/require" + "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/timestamppb" + "github.com/gravitational/teleport/api/client/proto" "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1" + "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/lib/utils/log/logtest" ) @@ -167,7 +173,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { }, group2Name: { Versions: map[string]*autoupdate.AutoUpdateAgentReportSpecGroupVersion{ - startVersion: {Count: 5}, + startVersion: {Count: 3}, targetVersion: {Count: 5}, }, }, @@ -198,12 +204,121 @@ func Test_progressGroupsHaltOnError(t *testing.T) { }, } + // canaryTestReports contain more agents, so it triggers the canary logic + canaryTestReports := []*autoupdate.AutoUpdateAgentReport{ + { + Metadata: &headerv1.Metadata{Name: "auth1"}, + Spec: &autoupdate.AutoUpdateAgentReportSpec{ + Timestamp: timestamppb.New(fewSecondsAgo), + Groups: map[string]*autoupdate.AutoUpdateAgentReportSpecGroup{ + group1Name: { + Versions: map[string]*autoupdate.AutoUpdateAgentReportSpecGroupVersion{ + startVersion: {Count: 20}, + targetVersion: {Count: 5}, + otherVersion: {Count: 1}, + }, + }, + group2Name: { + Versions: map[string]*autoupdate.AutoUpdateAgentReportSpecGroupVersion{ + startVersion: {Count: 3}, + targetVersion: {Count: 5}, + }, + }, + }, + }, + }, + } + + var ( + testCanaries []*autoupdate.Canary + healthyTestCanaries []*autoupdate.Canary + ) + for i := range 10 { + updaterId := uuid.NewString() + hostID := uuid.NewString() + hostName := fmt.Sprintf("canary-%d", i) + testCanaries = append(testCanaries, &autoupdate.Canary{ + UpdaterId: updaterId, + HostId: hostID, + Hostname: hostName, + Success: false, + }) + healthyTestCanaries = append(healthyTestCanaries, &autoupdate.Canary{ + UpdaterId: updaterId, + HostId: hostID, + Hostname: hostName, + Success: true, + }) + } + + testCanariesLookupNotFound := make(map[string][]callAnswer[[]*proto.UpstreamInventoryHello]) + testCanariesLookupStartVersion := make(map[string][]callAnswer[[]*proto.UpstreamInventoryHello]) + testCanariesLookupTargetVersion := make(map[string][]callAnswer[[]*proto.UpstreamInventoryHello]) + testCanariesLookupTargetVersionDualHandles := make(map[string][]callAnswer[[]*proto.UpstreamInventoryHello]) + + for _, canary := range testCanaries { + testCanariesLookupNotFound[canary.HostId] = []callAnswer[[]*proto.UpstreamInventoryHello]{ + {err: trace.NotFound("handle not found")}, + } + testCanariesLookupStartVersion[canary.HostId] = []callAnswer[[]*proto.UpstreamInventoryHello]{ + { + result: []*proto.UpstreamInventoryHello{ + { + Version: startVersion, + ServerID: canary.HostId, + Hostname: canary.Hostname, + ExternalUpgrader: types.UpgraderKindTeleportUpdate, + ExternalUpgraderVersion: startVersion, + }, + }, + err: nil, + }, + } + testCanariesLookupTargetVersion[canary.HostId] = []callAnswer[[]*proto.UpstreamInventoryHello]{ + { + result: []*proto.UpstreamInventoryHello{ + { + Version: targetVersion, + ServerID: canary.HostId, + Hostname: canary.Hostname, + ExternalUpgrader: types.UpgraderKindTeleportUpdate, + ExternalUpgraderVersion: targetVersion, + }, + }, + err: nil, + }, + } + testCanariesLookupTargetVersionDualHandles[canary.HostId] = []callAnswer[[]*proto.UpstreamInventoryHello]{ + { + result: []*proto.UpstreamInventoryHello{ + { + Version: startVersion, + ServerID: canary.HostId, + Hostname: canary.Hostname, + ExternalUpgrader: types.UpgraderKindTeleportUpdate, + ExternalUpgraderVersion: startVersion, + }, + { + Version: targetVersion, + ServerID: canary.HostId, + Hostname: canary.Hostname, + ExternalUpgrader: types.UpgraderKindTeleportUpdate, + ExternalUpgraderVersion: targetVersion, + }, + }, + err: nil, + }, + } + } + tests := []struct { name string initialState []*autoupdate.AutoUpdateAgentRolloutStatusGroup reports []*autoupdate.AutoUpdateAgentReport rolloutStartTime *timestamppb.Timestamp expectedState []*autoupdate.AutoUpdateAgentRolloutStatusGroup + canarySamples []callAnswer[[]*autoupdate.Canary] + agentLookups map[string][]callAnswer[[]*proto.UpstreamInventoryHello] }{ { name: "single group unstarted -> unstarted", @@ -566,7 +681,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { ConfigDays: cannotStartToday, ConfigStartHour: matchingStartHour, // Group1 is the catch-all group, so it should count group2 agents - PresentCount: 20, + PresentCount: 18, UpToDateCount: 10, }, }, @@ -598,7 +713,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { ConfigDays: canStartToday, ConfigStartHour: matchingStartHour, // Group1 is the catch-all group, so it should count group2 agents - PresentCount: 20, + PresentCount: 18, UpToDateCount: 10, // InitialCount must not be changed during active -> active transitions InitialCount: 25, @@ -606,7 +721,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { }, }, { - name: "single group unstarted -> active", + name: "single group unstarted -> active with reports", initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ { Name: group1Name, @@ -630,8 +745,8 @@ func Test_progressGroupsHaltOnError(t *testing.T) { ConfigDays: canStartToday, ConfigStartHour: matchingStartHour, // InitialCount must be set during unstarted -> active transition - InitialCount: 20, - PresentCount: 20, + InitialCount: 18, + PresentCount: 18, UpToDateCount: 10, }, }, @@ -695,8 +810,8 @@ func Test_progressGroupsHaltOnError(t *testing.T) { ConfigDays: canStartToday, ConfigStartHour: matchingStartHour, ConfigWaitHours: 24, - InitialCount: 10, - PresentCount: 10, + InitialCount: 8, + PresentCount: 8, UpToDateCount: 5, }, { @@ -710,6 +825,271 @@ func Test_progressGroupsHaltOnError(t *testing.T) { }, }, }, + { + name: "single group unstarted -> canary, no canaries sampled", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + PresentCount: 12, + UpToDateCount: 3, + }, + }, + reports: canaryTestReports, + canarySamples: []callAnswer[[]*autoupdate.Canary]{{}}, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + StartTime: timestamppb.New(clock.Now()), + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + }, + }, + }, + { + name: "single group canary -> canary, sampling agents, agents not found", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + // Checking that if CanaryCount is not set/null (e.g. we came from a manual transition) + // We still set it instead of jumping to the active state. + CanaryCount: 0, + }, + }, + reports: canaryTestReports, + canarySamples: mockResponseForCanaries(testCanaries[:5]), + agentLookups: testCanariesLookupNotFound, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonWaitingForCanaries, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: testCanaries[:5], + }, + }, + }, + { + name: "single group canary -> canary, sampling agents, agents running old version", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + }, + }, + reports: canaryTestReports, + canarySamples: mockResponseForCanaries(testCanaries[:5]), + agentLookups: testCanariesLookupStartVersion, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonWaitingForCanaries, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: testCanaries[:5], + }, + }, + }, + { + name: "single group canary -> active, sampling agents, agents running target version", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + }, + }, + reports: canaryTestReports, + canarySamples: mockResponseForCanaries(testCanaries[:5]), + agentLookups: testCanariesLookupTargetVersion, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + LastUpdateTime: timestamppb.New(clock.Now()), + StartTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanariesAlive, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: healthyTestCanaries[:5], + }, + }, + }, + { + name: "single group canary -> active, already sampled agents, agents running target version", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: testCanaries[:5], + }, + }, + reports: canaryTestReports, + // no canarySamples set, we don't expect a sampling call + agentLookups: testCanariesLookupTargetVersion, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + LastUpdateTime: timestamppb.New(clock.Now()), + StartTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanariesAlive, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: healthyTestCanaries[:5], + }, + }, + }, + { + name: "single group canary -> canary, incomplete sampled agents, agents running start version", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + // Only 2 canaries + Canaries: testCanaries[8:10], + }, + }, + reports: canaryTestReports, + canarySamples: mockResponseForCanaries(testCanaries[:5]), + agentLookups: testCanariesLookupStartVersion, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonWaitingForCanaries, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + // We expect the 2 initial agents to stay here, and 3 additional agents + Canaries: append(testCanaries[8:10], testCanaries[0:3]...), + }, + }, + }, + { + name: "single group canary -> active, already sampled agents, agents running target version, several handles", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanStart, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: testCanaries[:5], + }, + }, + reports: canaryTestReports, + // no canarySamples set, we don't expect a sampling call + agentLookups: testCanariesLookupTargetVersionDualHandles, + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + LastUpdateTime: timestamppb.New(clock.Now()), + StartTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonCanariesAlive, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + // InitialCount must be set during unstarted -> active transition + InitialCount: 34, + PresentCount: 34, + UpToDateCount: 10, + CanaryCount: 5, + Canaries: healthyTestCanaries[:5], + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -723,7 +1103,10 @@ func Test_progressGroupsHaltOnError(t *testing.T) { TargetVersion: targetVersion, } - stubs := mockClientStubs{} + stubs := mockClientStubs{ + agentSamples: tt.canarySamples, + inventoryAgentLookups: tt.agentLookups, + } if tt.reports == nil { stubs.reportsAnswers = []callAnswer[[]*autoupdate.AutoUpdateAgentReport]{ { @@ -739,16 +1122,17 @@ func Test_progressGroupsHaltOnError(t *testing.T) { }, } } + clt := newMockClient(t, stubs) strategy, err := newHaltOnErrorStrategy(log, clt) require.NoError(t, err) err = strategy.progressRollout(ctx, spec, status, clock.Now()) require.NoError(t, err) - // We use require.Equal instead of Elements match because group order matters. + // Group order matters. // It's not super important for time-based, but is crucial for halt-on-error. // So it's better to be more conservative and validate order never changes for // both strategies. - require.Equal(t, tt.expectedState, tt.initialState) + require.Empty(t, cmp.Diff(tt.expectedState, tt.initialState, protocmp.Transform())) }) } } @@ -823,3 +1207,11 @@ func TestCountCatchAll(t *testing.T) { }) } } + +func mockResponseForCanaries(canaries []*autoupdate.Canary) []callAnswer[[]*autoupdate.Canary] { + return []callAnswer[[]*autoupdate.Canary]{ + { + result: canaries, + }, + } +} diff --git a/lib/autoupdate/rollout/transitions.go b/lib/autoupdate/rollout/transitions.go index 9bbdbde19cb99..b9b67d183d6bd 100644 --- a/lib/autoupdate/rollout/transitions.go +++ b/lib/autoupdate/rollout/transitions.go @@ -23,6 +23,7 @@ import ( "github.com/gravitational/trace" + "github.com/gravitational/teleport/api/constants" autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" "github.com/gravitational/teleport/api/types/autoupdate" ) @@ -63,20 +64,10 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au return trace.Wrap(err) } - switch desiredState { - case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED: - // When unspecified, we default to active - desiredState = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE - case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: - default: - return trace.BadParameter("unsupported desired state: %s, supported states are 'unspecified' and 'active'", desiredState) - } - // filter out expired reports validReports := make([]*autoupdatev1pb.AutoUpdateAgentReport, len(reports)) for _, report := range reports { - // TODO replace time.minute by the auth periodic operation constant - if now.Sub(report.GetSpec().GetTimestamp().AsTime()) <= time.Minute { + if now.Sub(report.GetSpec().GetTimestamp().AsTime()) <= constants.AutoUpdateAgentReportPeriod { validReports = append(validReports, report) } } @@ -87,6 +78,15 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au if len(groups) == 0 { return trace.BadParameter("rollout has no groups") } + var initialCount, upToDateCount int + + switch desiredState { + case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED, + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: + default: + return trace.BadParameter("unsupported desired state: %s, supported states are 'unspecified', 'canary' and 'active'", desiredState) + } // Part where we do the real work, doing a state transition for every requested group. for groupName := range groupsToTrigger { @@ -95,10 +95,18 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au return trace.Wrap(err) } + if groupName == groups[len(groups)-1].GetName() { + initialCount, upToDateCount = countCatchAll(rollout.GetStatus(), countByGroup, upToDateByGroup) + } else { + initialCount = countByGroup[groupName] + upToDateCount = upToDateByGroup[groupName] + } + // We check if the group state transition is legal. switch group.GetState() { case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED, autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: return trace.AlreadyExists("group %q is already active", groupName) @@ -108,14 +116,22 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au return trace.BadParameter("group %q in unexpected state %s", groupName, group.GetState()) } - var initialCount, upToDateCount int - setGroupState(group, desiredState, updateReasonManualTrigger, now) - if groupName == groups[len(groups)-1].GetName() { - initialCount, upToDateCount = countCatchAll(rollout.GetStatus(), countByGroup, upToDateByGroup) - } else { - initialCount = countByGroup[groupName] - upToDateCount = upToDateByGroup[groupName] + switch desiredState { + case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED: + if shouldUseCanaries(initialCount) { + // We switch to the canary state but we don't sample canaries now. + // Canary sampling will happen during the next reconciliation. + desiredState = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY + } else { + desiredState = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE + } + case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: + default: + return trace.BadParameter("unsupported desired state: %s, supported states are 'unspecified' and 'active'", desiredState) } + + setGroupState(group, desiredState, updateReasonManualTrigger, now) group.UpToDateCount = uint64(upToDateCount) group.InitialCount = uint64(initialCount) group.PresentCount = uint64(initialCount) @@ -158,7 +174,8 @@ func ForceGroupsDone(rollout *autoupdatev1pb.AutoUpdateAgentRollout, groupsToFor case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED, autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK, - autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY: case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: return trace.AlreadyExists("group %q is already done", groupName) default: @@ -179,6 +196,7 @@ func GetStartedGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout) GroupSet { for _, group := range groups { switch group.GetState() { case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, + autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_DONE: startedGroups[group.GetName()] = struct{}{} } From 43d375d89da8c2c1392ed22570f79c58e996b803 Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Fri, 18 Jul 2025 12:28:43 -0400 Subject: [PATCH 06/14] Tune the canary logic (#56926) - Users can now specify how many canaries they want - Instead of looking at the current group size, we rely on user input - max canary 10 -> 5 (I have not done the max message size yet) - fix a bug causing the start date to be reset when doing canary -> active --- .../teleport/autoupdate/v1/autoupdate.pb.go | 18 ++++++-- .../teleport/autoupdate/v1/autoupdate.proto | 4 ++ api/types/autoupdate/config.go | 6 +++ api/types/autoupdate/config_test.go | 22 ++++++++++ api/types/autoupdate/constants.go | 2 +- ...urces-teleport-dev-autoupdateconfigsv1.mdx | 1 + .../data-sources/autoupdate_config.mdx | 1 + .../resources/autoupdate_config.mdx | 1 + ...rces.teleport.dev_autoupdateconfigsv1.yaml | 9 ++++ ...rces.teleport.dev_autoupdateconfigsv1.yaml | 9 ++++ .../autoupdate/v1/autoupdate_terraform.go | 44 +++++++++++++++++++ lib/autoupdate/rollout/reconciler.go | 1 + lib/autoupdate/rollout/strategy.go | 6 ++- .../rollout/strategy_haltonerror.go | 14 ++---- .../rollout/strategy_haltonerror_test.go | 8 ++-- lib/autoupdate/rollout/transitions.go | 2 +- 16 files changed, 127 insertions(+), 21 deletions(-) diff --git a/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go b/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go index acd17cbe9f817..a2aef34867de0 100644 --- a/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go +++ b/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go @@ -481,7 +481,11 @@ type AgentAutoUpdateGroup struct { StartHour int32 `protobuf:"varint,3,opt,name=start_hour,json=startHour,proto3" json:"start_hour,omitempty"` // wait_hours after last group succeeds before this group can run. This can only be used when the strategy is "halt-on-failure". // This field must be positive. - WaitHours int32 `protobuf:"varint,5,opt,name=wait_hours,json=waitHours,proto3" json:"wait_hours,omitempty"` + WaitHours int32 `protobuf:"varint,5,opt,name=wait_hours,json=waitHours,proto3" json:"wait_hours,omitempty"` + // canary_count is the number of canary agents that will be updated before the whole group is updated. + // when set to 0, the group does not enter the canary phase. This number is capped to 5. + // This number must always be lower than the total number of agents in the group, else the rollout will be stuck. + CanaryCount int32 `protobuf:"varint,6,opt,name=canary_count,json=canaryCount,proto3" json:"canary_count,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -544,6 +548,13 @@ func (x *AgentAutoUpdateGroup) GetWaitHours() int32 { return 0 } +func (x *AgentAutoUpdateGroup) GetCanaryCount() int32 { + if x != nil { + return x.CanaryCount + } + return 0 +} + // AutoUpdateVersion is a resource singleton with version required for // tools autoupdate. type AutoUpdateVersion struct { @@ -1627,14 +1638,15 @@ const file_teleport_autoupdate_v1_autoupdate_proto_rawDesc = "" + "\x1bmaintenance_window_duration\x18\x03 \x01(\v2\x19.google.protobuf.DurationR\x19maintenanceWindowDuration\x12N\n" + "\tschedules\x18\x06 \x01(\v20.teleport.autoupdate.v1.AgentAutoUpdateSchedulesR\tschedulesJ\x04\b\x05\x10\x06R\x0fagent_schedules\"b\n" + "\x18AgentAutoUpdateSchedules\x12F\n" + - "\aregular\x18\x01 \x03(\v2,.teleport.autoupdate.v1.AgentAutoUpdateGroupR\aregular\"\x8d\x01\n" + + "\aregular\x18\x01 \x03(\v2,.teleport.autoupdate.v1.AgentAutoUpdateGroupR\aregular\"\xb0\x01\n" + "\x14AgentAutoUpdateGroup\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x12\n" + "\x04days\x18\x02 \x03(\tR\x04days\x12\x1d\n" + "\n" + "start_hour\x18\x03 \x01(\x05R\tstartHour\x12\x1d\n" + "\n" + - "wait_hours\x18\x05 \x01(\x05R\twaitHoursJ\x04\b\x04\x10\x05R\twait_days\"\xd9\x01\n" + + "wait_hours\x18\x05 \x01(\x05R\twaitHours\x12!\n" + + "\fcanary_count\x18\x06 \x01(\x05R\vcanaryCountJ\x04\b\x04\x10\x05R\twait_days\"\xd9\x01\n" + "\x11AutoUpdateVersion\x12\x12\n" + "\x04kind\x18\x01 \x01(\tR\x04kind\x12\x19\n" + "\bsub_kind\x18\x02 \x01(\tR\asubKind\x12\x18\n" + diff --git a/api/proto/teleport/autoupdate/v1/autoupdate.proto b/api/proto/teleport/autoupdate/v1/autoupdate.proto index ff42592c6fb8c..005d29996c8d7 100644 --- a/api/proto/teleport/autoupdate/v1/autoupdate.proto +++ b/api/proto/teleport/autoupdate/v1/autoupdate.proto @@ -83,6 +83,10 @@ message AgentAutoUpdateGroup { // wait_hours after last group succeeds before this group can run. This can only be used when the strategy is "halt-on-failure". // This field must be positive. int32 wait_hours = 5; + // canary_count is the number of canary agents that will be updated before the whole group is updated. + // when set to 0, the group does not enter the canary phase. This number is capped to 5. + // This number must always be lower than the total number of agents in the group, else the rollout will be stuck. + int32 canary_count = 6; } // AutoUpdateVersion is a resource singleton with version required for diff --git a/api/types/autoupdate/config.go b/api/types/autoupdate/config.go index ad79765895c0d..37a949606e470 100644 --- a/api/types/autoupdate/config.go +++ b/api/types/autoupdate/config.go @@ -104,6 +104,12 @@ func checkAgentSchedules(c *autoupdate.AutoUpdateConfig) error { if group.StartHour > 23 || group.StartHour < 0 { return trace.BadParameter("spec.agents.schedules.regular[%d].start_hour must be between 0 and 23", i) } + if group.CanaryCount < 0 || group.CanaryCount > MaxCanaryCount { + return trace.BadParameter("spec.agents.schedule.regular[%d].canary_count must be between 0 and %d", i, MaxCanaryCount) + } + if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.CanaryCount != 0 { + return trace.BadParameter("spec.agents.schedules.regular[%d].canary_count is not zero but the strategy %q doesn't support canaries", i, AgentsStrategyTimeBased) + } if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.WaitHours != 0 { return trace.BadParameter("spec.agents.schedules.regular[%d].wait_hours must be zero when strategy is %s", i, AgentsStrategyTimeBased) } diff --git a/api/types/autoupdate/config_test.go b/api/types/autoupdate/config_test.go index 0981dd7e681c1..6a022f11500f9 100644 --- a/api/types/autoupdate/config_test.go +++ b/api/types/autoupdate/config_test.go @@ -465,6 +465,28 @@ func TestValidateAutoUpdateConfig(t *testing.T) { }, assertErr: require.Error, }, + { + name: "group with too many canaries", + config: &autoupdate.AutoUpdateConfig{ + Kind: types.KindAutoUpdateConfig, + Version: types.V1, + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateConfig, + }, + Spec: &autoupdate.AutoUpdateConfigSpec{ + Agents: &autoupdate.AutoUpdateConfigSpecAgents{ + Mode: AgentsUpdateModeEnabled, + Strategy: AgentsStrategyHaltOnError, + Schedules: &autoupdate.AgentAutoUpdateSchedules{ + Regular: []*autoupdate.AgentAutoUpdateGroup{ + {Name: "g1", Days: []string{"*"}, WaitHours: 0, CanaryCount: 123}, + }, + }, + }, + }, + }, + assertErr: require.Error, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/api/types/autoupdate/constants.go b/api/types/autoupdate/constants.go index 36f75a82f9f7a..0c38c7b190f42 100644 --- a/api/types/autoupdate/constants.go +++ b/api/types/autoupdate/constants.go @@ -47,5 +47,5 @@ const ( // MaxCanaryCount is the maximum number of canaries allowed for a single group. // This value is arbitrarily low to avoid XXL rollouts to grow over the max backend // item size. - MaxCanaryCount = 10 + MaxCanaryCount = 5 ) diff --git a/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx b/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx index dc61e5925073c..ddf0db9e8d221 100644 --- a/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx +++ b/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx @@ -51,6 +51,7 @@ resource, which you can apply after installing the Teleport Kubernetes operator. |Field|Type|Description| |---|---|---| +|canary_count|integer|canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.| |days|[]string|days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"| |name|string|name of the group| |start_hour|integer|start_hour to initiate update| diff --git a/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx b/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx index 39a01d7553d7e..94336eb4cff48 100644 --- a/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx +++ b/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx @@ -53,6 +53,7 @@ Optional: Optional: +- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck. - `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*" - `name` (String) name of the group - `start_hour` (Number) start_hour to initiate update diff --git a/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx b/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx index 57e67d7dba440..414c896f9c5fe 100644 --- a/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx +++ b/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx @@ -91,6 +91,7 @@ Optional: Optional: +- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck. - `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*" - `name` (String) name of the group - `start_hour` (Number) start_hour to initiate update diff --git a/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml b/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml index f56da0ee7c415..8aaeff1d116ca 100644 --- a/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml +++ b/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml @@ -60,6 +60,15 @@ spec: description: regular schedules for non-critical versions. items: properties: + canary_count: + description: canary_count is the number of canary agents + that will be updated before the whole group is updated. + when set to 0, the group does not enter the canary + phase. This number is capped to 5. This number must + always be lower than the total number of agents in + the group, else the rollout will be stuck. + format: int32 + type: integer days: description: days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", diff --git a/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml b/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml index f56da0ee7c415..8aaeff1d116ca 100644 --- a/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml +++ b/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml @@ -60,6 +60,15 @@ spec: description: regular schedules for non-critical versions. items: properties: + canary_count: + description: canary_count is the number of canary agents + that will be updated before the whole group is updated. + when set to 0, the group does not enter the canary + phase. This number is capped to 5. This number must + always be lower than the total number of agents in + the group, else the rollout will be stuck. + format: int32 + type: integer days: description: days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", diff --git a/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go b/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go index fe0cbc478dd30..8ed84a264a197 100644 --- a/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go +++ b/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go @@ -117,6 +117,11 @@ func GenSchemaAutoUpdateConfig(ctx context.Context) (github_com_hashicorp_terraf "schedules": { Attributes: github_com_hashicorp_terraform_plugin_framework_tfsdk.SingleNestedAttributes(map[string]github_com_hashicorp_terraform_plugin_framework_tfsdk.Attribute{"regular": { Attributes: github_com_hashicorp_terraform_plugin_framework_tfsdk.ListNestedAttributes(map[string]github_com_hashicorp_terraform_plugin_framework_tfsdk.Attribute{ + "canary_count": { + Description: "canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.", + Optional: true, + Type: github_com_hashicorp_terraform_plugin_framework_types.Int64Type, + }, "days": { Description: "days when the update can run. Supported values are \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\" and \"*\"", Optional: true, @@ -683,6 +688,23 @@ func CopyAutoUpdateConfigFromTerraform(_ context.Context, tf github_com_hashicor } } } + { + a, ok := tf.Attrs["canary_count"] + if !ok { + diags.Append(attrReadMissingDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count"}) + } else { + v, ok := a.(github_com_hashicorp_terraform_plugin_framework_types.Int64) + if !ok { + diags.Append(attrReadConversionFailureDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count", "github.com/hashicorp/terraform-plugin-framework/types.Int64"}) + } else { + var t int32 + if !v.Null && !v.Unknown { + t = int32(v.Value) + } + obj.CanaryCount = t + } + } + } } obj.Regular[k] = t } @@ -1308,6 +1330,28 @@ func CopyAutoUpdateConfigToTerraform(ctx context.Context, obj *github_com_gravit tf.Attrs["wait_hours"] = v } } + { + t, ok := tf.AttrTypes["canary_count"] + if !ok { + diags.Append(attrWriteMissingDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count"}) + } else { + v, ok := tf.Attrs["canary_count"].(github_com_hashicorp_terraform_plugin_framework_types.Int64) + if !ok { + i, err := t.ValueFromTerraform(ctx, github_com_hashicorp_terraform_plugin_go_tftypes.NewValue(t.TerraformType(ctx), nil)) + if err != nil { + diags.Append(attrWriteGeneralError{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count", err}) + } + v, ok = i.(github_com_hashicorp_terraform_plugin_framework_types.Int64) + if !ok { + diags.Append(attrWriteConversionFailureDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count", "github.com/hashicorp/terraform-plugin-framework/types.Int64"}) + } + v.Null = int64(obj.CanaryCount) == 0 + } + v.Value = int64(obj.CanaryCount) + v.Unknown = false + tf.Attrs["canary_count"] = v + } + } } v.Unknown = false c.Elems[k] = v diff --git a/lib/autoupdate/rollout/reconciler.go b/lib/autoupdate/rollout/reconciler.go index cddd023f4e9b2..0185893035acb 100644 --- a/lib/autoupdate/rollout/reconciler.go +++ b/lib/autoupdate/rollout/reconciler.go @@ -388,6 +388,7 @@ func (r *reconciler) makeGroupsStatus(ctx context.Context, schedules *autoupdate ConfigDays: group.Days, ConfigStartHour: group.StartHour, ConfigWaitHours: group.WaitHours, + CanaryCount: uint64(group.CanaryCount), } } return groups, nil diff --git a/lib/autoupdate/rollout/strategy.go b/lib/autoupdate/rollout/strategy.go index 3b3b524e54275..1bcfe90d4a5a6 100644 --- a/lib/autoupdate/rollout/strategy.go +++ b/lib/autoupdate/rollout/strategy.go @@ -97,8 +97,10 @@ func setGroupState(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, newState if previousState != newState { group.State = newState changed = true - // If we just started the group, also update the start time - if newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE || + // If we just started the group, also update the start time. + // If we are doing a canary -> active transition, we don't override the start date. + if (newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE && + previousState != autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY) || newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY { group.StartTime = timestamppb.New(now) } diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go index 51bbceb87729f..5741bdfe46136 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror.go +++ b/lib/autoupdate/rollout/strategy_haltonerror.go @@ -189,21 +189,16 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, spec *autoupd return nil } -const ( - canaryCount = 5 - canaryThreshold = 20 -) - -func shouldUseCanaries(currentCount int) bool { +func shouldUseCanaries(group *autoupdate.AutoUpdateAgentRolloutStatusGroup) bool { // in the future we might change this logic to be a multiple of the required canary count // and make the canary count dynamic - return currentCount >= canaryThreshold + return group.CanaryCount > 0 } func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, agentCount int, status *autoupdate.AutoUpdateAgentRolloutStatus) { group.InitialCount = uint64(agentCount) - if !shouldUseCanaries(agentCount) { + if !shouldUseCanaries(group) { h.log.DebugContext(ctx, "Skipping canary rollout, transitioning directly to the active state", "group", group.Name) setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) return @@ -218,9 +213,6 @@ func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate. } func (h *haltOnErrorStrategy) sampleCanaries(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, status *autoupdate.AutoUpdateAgentRolloutStatus) error { - if group.CanaryCount == 0 { - group.CanaryCount = canaryCount - } // Check if we need to pick more canaries if len(group.Canaries) < int(group.CanaryCount) { previousLength := len(group.Canaries) diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go index 15a48c4d68f4c..8b7e9ecf284e4 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror_test.go +++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go @@ -837,6 +837,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { ConfigStartHour: matchingStartHour, PresentCount: 12, UpToDateCount: 3, + CanaryCount: 5, }, }, reports: canaryTestReports, @@ -872,9 +873,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { InitialCount: 34, PresentCount: 34, UpToDateCount: 10, - // Checking that if CanaryCount is not set/null (e.g. we came from a manual transition) - // We still set it instead of jumping to the active state. - CanaryCount: 0, + CanaryCount: 5, }, }, reports: canaryTestReports, @@ -940,6 +939,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { { Name: group1Name, State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + StartTime: timestamppb.New(clock.Now()), LastUpdateTime: timestamppb.New(clock.Now()), LastUpdateReason: updateReasonCanStart, ConfigDays: canStartToday, @@ -978,6 +978,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { { Name: group1Name, State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + StartTime: timestamppb.New(clock.Now()), LastUpdateTime: timestamppb.New(clock.Now()), LastUpdateReason: updateReasonCanStart, ConfigDays: canStartToday, @@ -1058,6 +1059,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { Name: group1Name, State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, LastUpdateTime: timestamppb.New(clock.Now()), + StartTime: timestamppb.New(clock.Now()), LastUpdateReason: updateReasonCanStart, ConfigDays: canStartToday, ConfigStartHour: matchingStartHour, diff --git a/lib/autoupdate/rollout/transitions.go b/lib/autoupdate/rollout/transitions.go index b9b67d183d6bd..22a15ac37aabc 100644 --- a/lib/autoupdate/rollout/transitions.go +++ b/lib/autoupdate/rollout/transitions.go @@ -118,7 +118,7 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au switch desiredState { case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED: - if shouldUseCanaries(initialCount) { + if shouldUseCanaries(group) { // We switch to the canary state but we don't sample canaries now. // Canary sampling will happen during the next reconciliation. desiredState = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY From 5a41359aeb061ea4b7d92893ad5a20de7c008da6 Mon Sep 17 00:00:00 2001 From: hugoShaka Date: Tue, 22 Jul 2025 11:54:09 -0400 Subject: [PATCH 07/14] Fix autoupdate canary sampling for the catch-all group --- lib/auth/autoupdate_rollout.go | 14 ++++++++++---- lib/auth/autoupdate_rollout_test.go | 11 ++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/lib/auth/autoupdate_rollout.go b/lib/auth/autoupdate_rollout.go index 8aa534b57f97b..c3371c80f9a1f 100644 --- a/lib/auth/autoupdate_rollout.go +++ b/lib/auth/autoupdate_rollout.go @@ -54,14 +54,20 @@ func (a *Server) SampleAgentsFromAutoUpdateGroup(ctx context.Context, groupName return false } - // If this is not the catch-all group, we can only check if the agent group is the right one. - if !isCatchAll { + // If the agent group matches, we keep it. + if handle.Hello().UpdaterInfo.UpdateGroup == groupName { // No need to check for UpdaterInfo being nil, it would have been filtered // out by filterHandler(). - return handle.Hello().UpdaterInfo.UpdateGroup == groupName + return true + } + + // If this is not the catch-all group, we filter the agent out + if !isCatchAll { + return false } + // This is the catch-call group, it matches agents from every group not in groups. - _, ok := groupSet[groupName] + _, ok := groupSet[handle.Hello().UpdaterInfo.UpdateGroup] // If the agent group is not in the group list, it falls into the catch-all. return !ok } diff --git a/lib/auth/autoupdate_rollout_test.go b/lib/auth/autoupdate_rollout_test.go index 37c79e02291f1..21abcbd2ba3c3 100644 --- a/lib/auth/autoupdate_rollout_test.go +++ b/lib/auth/autoupdate_rollout_test.go @@ -125,7 +125,7 @@ func TestSampleAgentsFromGroup(t *testing.T) { require.Less(t, conflicts, 4) // Test execution: check that agents not belonging to any group are sampled whe requesting the catch-all group. - canariesCatchAll, err := auth.SampleAgentsFromAutoUpdateGroup(t.Context(), testGroupName, sampleSize, []string{"group-a", testCatchAllGroupName}) + canariesCatchAll, err := auth.SampleAgentsFromAutoUpdateGroup(t.Context(), testCatchAllGroupName, sampleSize, []string{"group-a", testCatchAllGroupName}) require.NoError(t, err) require.Len(t, canariesCatchAll, sampleSize) canarySet = make(map[string]*autoupdatev1pb.Canary) @@ -134,6 +134,15 @@ func TestSampleAgentsFromGroup(t *testing.T) { } require.Len(t, canarySet, sampleSize, "some canary got duplicated") + // Test execution: check that agents belonging to the catch-all group are sampled. + canariesCatchAll, err = auth.SampleAgentsFromAutoUpdateGroup(t.Context(), testGroupName, sampleSize, []string{"group-a", testGroupName}) + require.NoError(t, err) + require.Len(t, canariesCatchAll, sampleSize) + canarySet = make(map[string]*autoupdatev1pb.Canary) + for _, canary := range canariesCatchAll { + canarySet[canary.UpdaterId] = canary + } + require.Len(t, canarySet, sampleSize, "some canary got duplicated") } func TestLookupAgentInInventory(t *testing.T) { From 26335760b4baf114281f1b4bd35b27ce3fabb874 Mon Sep 17 00:00:00 2001 From: Stephen Levine Date: Tue, 22 Jul 2025 22:33:27 -0400 Subject: [PATCH 08/14] Reliably detect update.yaml after soft reloads --- lib/autoupdate/agent/integrations.go | 103 ++++++++-------------- lib/autoupdate/agent/integrations_test.go | 98 +++++++++----------- 2 files changed, 80 insertions(+), 121 deletions(-) diff --git a/lib/autoupdate/agent/integrations.go b/lib/autoupdate/agent/integrations.go index 035a671cd3d20..81b489afea660 100644 --- a/lib/autoupdate/agent/integrations.go +++ b/lib/autoupdate/agent/integrations.go @@ -25,7 +25,6 @@ import ( "log/slog" "os" "path/filepath" - "strings" "github.com/google/uuid" "github.com/gravitational/trace" @@ -33,10 +32,7 @@ import ( "github.com/gravitational/teleport/api/types" ) -const ( - installDirEnvVar = "TELEPORT_UPDATE_INSTALL_DIR" - updateConfigFileEnvVar = "TELEPORT_UPDATE_CONFIG_FILE" -) +const updateConfigFileEnvVar = "TELEPORT_UPDATE_CONFIG_FILE" // IsManagedByUpdater returns true if the local Teleport binary is managed by teleport-update. // Note that true may be returned even if auto-updates is disabled or the version is pinned. @@ -54,64 +50,20 @@ func IsManagedByUpdater() (bool, error) { if err != nil { return false, trace.Wrap(err, "cannot find Teleport binary") } - installDir := os.Getenv(installDirEnvVar) - if installDir == "" { - installDir = defaultInstallDir - } - // Check if current binary is under the updater-managed path. - managed, err := hasParentDir(teleportPath, installDir) - if err != nil { - return false, trace.Wrap(err) - } - if !managed { + // Consider all installations that have an update.yaml file to be + // managed by teleport-update (the "binary" updater). + p := findParentMatching(teleportPath, versionsDirName, 4) + if p == "" { return false, nil } - // Return false if the binary is under the updater-managed path, but in the system prefix reserved for the package. - system, err := hasParentDir(teleportPath, packageSystemDir) - return !system, trace.Wrap(err) -} - -// IsManagedAndDefault returns true if the local Teleport binary is both managed by teleport-update -// and the default installation (with teleport.service as the unit file name). -// The binary is considered managed and default if it lives within /opt/teleport/default. -func IsManagedAndDefault() (bool, error) { - systemd, err := hasSystemD() - if err != nil { - return false, trace.Wrap(err) - } - if !systemd { + _, err = os.Stat(filepath.Join(p, updateConfigName)) + if errors.Is(err, os.ErrNotExist) { return false, nil } - teleportPath, err := os.Executable() if err != nil { - return false, trace.Wrap(err, "cannot find Teleport binary") - } - installDir := os.Getenv(installDirEnvVar) - if installDir == "" { - installDir = defaultInstallDir - } - isDefault, err := hasParentDir(teleportPath, filepath.Join(installDir, defaultNamespace)) - return isDefault, trace.Wrap(err) -} - -// hasParentDir returns true if dir is any parent directory of parent. -// hasParentDir does not resolve symlinks, and requires that files be represented the same way in dir and parent. -func hasParentDir(dir, parent string) (bool, error) { - // Note that os.Stat + os.SameFile would be more reliable, - // but does not work well for arbitrarily nested subdirectories. - absDir, err := filepath.Abs(dir) - if err != nil { - return false, trace.Wrap(err, "cannot get absolute path for directory %s", dir) - } - absParent, err := filepath.Abs(parent) - if err != nil { - return false, trace.Wrap(err, "cannot get absolute path for parent directory %s", dir) - } - sep := string(filepath.Separator) - if !strings.HasSuffix(absParent, sep) { - absParent += sep + return false, trace.Wrap(err) } - return strings.HasPrefix(absDir, absParent), nil + return true, nil } // ErrUnstableExecutable is returned by StableExecutable when no stable path can be found. @@ -170,17 +122,39 @@ func stablePathForBinary(origPath, defaultPath string) (string, error) { // findParentMatching returns the directory above name, if name is at rpos. // Otherwise, it returns empty string. -func findParentMatching(dir, name string, rpos int) string { +func findParentMatching(path, parent string, rpos int) string { + if parent == "" { + return "" + } var base string for range rpos { - dir, base = filepath.Split(filepath.Clean(dir)) + path, base = filepath.Split(filepath.Clean(path)) } - if base == name { - return dir + if base == parent { + return filepath.Clean(path) } return "" } +// findConfigFile returns the path to the Teleport installation's update.yaml file. +func findConfigFile() (string, error) { + // Note that if the install dir or install suffix changes before the installation + // is hard restarted, this path will be incorrect. + configPath := os.Getenv(updateConfigFileEnvVar) + if configPath != "" { + return configPath, nil + } + teleportPath, err := os.Executable() + if err != nil { + return "", trace.Wrap(err, "cannot find Teleport binary") + } + p := findParentMatching(teleportPath, versionsDirName, 4) + if p == "" { + return "", trace.Errorf("installation not managed by updater") + } + return filepath.Join(p, updateConfigName), nil +} + // ReadHelloUpdaterInfo reads the updater config and generates a proto.UpdaterV2Info // that can be reported in the inventory hello message. // This function performs io operations, its usage must be cached @@ -188,11 +162,10 @@ func findParentMatching(dir, name string, rpos int) string { func ReadHelloUpdaterInfo(ctx context.Context, log *slog.Logger, hostUUID string) (*types.UpdaterV2Info, error) { info := &types.UpdaterV2Info{} - configPath := os.Getenv(updateConfigFileEnvVar) - if configPath == "" { - return nil, trace.Errorf("config file not specified") + configPath, err := findConfigFile() + if err != nil { + return nil, trace.Wrap(err, "config file not specified") } - cfg, err := readConfig(configPath) if err != nil { return nil, trace.Wrap(err, "reading config file %s", configPath) diff --git a/lib/autoupdate/agent/integrations_test.go b/lib/autoupdate/agent/integrations_test.go index 17ffd4d860cc7..c9fd7d5184673 100644 --- a/lib/autoupdate/agent/integrations_test.go +++ b/lib/autoupdate/agent/integrations_test.go @@ -27,86 +27,72 @@ import ( "github.com/stretchr/testify/require" ) -func TestHasParentDir(t *testing.T) { +func TestFindParentMatching(t *testing.T) { tests := []struct { - name string - path string - parent string - wantResult bool + name string + path string + parent string + rpos int + result string }{ { - name: "Has valid parent directory", - path: "/opt/teleport/dir/test", - parent: "/opt/teleport", - wantResult: true, + name: "Has valid parent directory", + path: "/opt/teleport/default/versions/v1.2.3/bin/teleport", + parent: "versions", + rpos: 4, + result: "/opt/teleport/default", }, { - name: "Has valid parent directory with slash", - path: "/opt/teleport/dir/test", - parent: "/opt/teleport/", - wantResult: true, + name: "Parent too close", + path: "/opt/teleport/default/versions/bin/teleport", + parent: "versions", + rpos: 4, }, { - name: "Parent directory is root", - path: "/opt/teleport/dir", - parent: "/", - wantResult: true, + name: "Parent too far", + path: "/opt/teleport/default/versions/v1.2.3/extra/bin/teleport", + parent: "versions", + rpos: 4, }, { - name: "Parent is the same as the path", - path: "/opt/teleport/dir", - parent: "/opt/teleport/dir", - wantResult: false, + name: "Parent missing", + path: "/opt/teleport/default/version/v1.2.3/bin/teleport", + parent: "versions", + rpos: 4, }, { - name: "Parent the same as the path but without slash", - path: "/opt/teleport/dir/", - parent: "/opt/teleport/dir", - wantResult: false, + name: "Parent at root", + path: "/versions/v1.2.3/bin/teleport", + parent: "versions", + rpos: 4, + result: "/", }, { - name: "Parent the same as the path but with slash", - path: "/opt/teleport/dir", - parent: "/opt/teleport/dir/", - wantResult: false, + name: "Position past root", + path: "/v1.2.3/bin/teleport", + parent: "versions", + rpos: 4, }, { - name: "Parent is substring of the path", - path: "/opt/teleport/dir-place", - parent: "/opt/teleport/dir", - wantResult: false, + name: "Empty", }, { - name: "Parent is in path", - path: "/opt/teleport", - parent: "/opt/teleport/dir", - wantResult: false, + name: "Empty path", + parent: "versions", + rpos: 4, }, { - name: "Empty parent", - path: "/opt/teleport/dir", - parent: "", - wantResult: false, - }, - { - name: "Empty path", - path: "", - parent: "/opt/teleport", - wantResult: false, - }, - { - name: "Both empty", - path: "", - parent: "", - wantResult: false, + name: "Empty parent", + path: "/v1.2.3/bin/teleport", + parent: "", + rpos: 4, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - result, err := hasParentDir(tt.path, tt.parent) - require.NoError(t, err) - require.Equal(t, tt.wantResult, result) + result := findParentMatching(tt.path, tt.parent, tt.rpos) + require.Equal(t, tt.result, result) }) } } From 47a0218998a9d613033db1fb19475dfd0f3900b4 Mon Sep 17 00:00:00 2001 From: Stephen Levine Date: Thu, 24 Jul 2025 18:08:00 -0400 Subject: [PATCH 09/14] Fix detection on initial install --- lib/autoupdate/agent/config.go | 2 +- lib/autoupdate/agent/config_test.go | 2 +- lib/autoupdate/agent/updater.go | 29 +++++++++++++++++++++++------ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/lib/autoupdate/agent/config.go b/lib/autoupdate/agent/config.go index 61f5a91734c72..2b53260bb6012 100644 --- a/lib/autoupdate/agent/config.go +++ b/lib/autoupdate/agent/config.go @@ -207,7 +207,7 @@ func writeConfig(filename string, cfg *UpdateConfig) error { })) } -func validateConfigSpec(spec *UpdateSpec, override OverrideConfig) error { +func updateConfigSpec(spec *UpdateSpec, override OverrideConfig) error { if override.Proxy != "" { spec.Proxy = override.Proxy } diff --git a/lib/autoupdate/agent/config_test.go b/lib/autoupdate/agent/config_test.go index d301a16a0b1fa..0207a779c0e13 100644 --- a/lib/autoupdate/agent/config_test.go +++ b/lib/autoupdate/agent/config_test.go @@ -223,7 +223,7 @@ func TestValidateConfigSpec(t *testing.T) { }, } { t.Run(tt.name, func(t *testing.T) { - err := validateConfigSpec(&tt.config, OverrideConfig{UpdateSpec: tt.override}) + err := updateConfigSpec(&tt.config, OverrideConfig{UpdateSpec: tt.override}) if tt.errMatch != "" { require.ErrorContains(t, err, tt.errMatch) return diff --git a/lib/autoupdate/agent/updater.go b/lib/autoupdate/agent/updater.go index 435a148a41b5d..0ac82b854f374 100644 --- a/lib/autoupdate/agent/updater.go +++ b/lib/autoupdate/agent/updater.go @@ -358,13 +358,14 @@ func toPtr[T any](t T) *T { // If the initial installation succeeds, the override configuration is persisted. // Otherwise, the configuration is not changed. // This function is idempotent. -func (u *Updater) Install(ctx context.Context, override OverrideConfig) error { +func (u *Updater) Install(ctx context.Context, override OverrideConfig) (err error) { // Read configuration from update.yaml and override any new values passed as flags. cfg, err := readConfig(u.UpdateConfigFile) if err != nil { return trace.Wrap(err, "failed to read %s", updateConfigName) } - if err := validateConfigSpec(&cfg.Spec, override); err != nil { + // Set the desired state. + if err := updateConfigSpec(&cfg.Spec, override); err != nil { return trace.Wrap(err) } @@ -407,6 +408,22 @@ func (u *Updater) Install(ctx context.Context, override OverrideConfig) error { u.Log.InfoContext(ctx, "Initiating installation.", targetKey, target, activeKey, active) } + // If update.yaml does not exist, write it before attempting to install. + // This ensures that update.yaml always exists when the agent is started and sends HELLO. + // The written file does not contain any versions, only the desired configuration. + if _, statErr := os.Stat(u.UpdateConfigFile); errors.Is(statErr, fs.ErrNotExist) { + if err := writeConfig(u.UpdateConfigFile, cfg); err != nil { + return trace.Wrap(err, "failed to write %s", updateConfigName) + } + defer func() { + if err != nil { + if err := os.Remove(u.UpdateConfigFile); err != nil { + u.Log.WarnContext(ctx, "Failed to remove stale configuration.", "path", u.UpdateConfigFile) + } + } + }() + } + if err := u.update(ctx, cfg, target, override.AllowOverwrite, resp.AGPL); err != nil { if errors.Is(err, ErrFilePresent) && !override.AllowOverwrite { u.Log.ErrorContext(ctx, "A non-packaged or outdated installation of Teleport was detected on this system.") @@ -458,7 +475,7 @@ func (u *Updater) Remove(ctx context.Context, force bool) error { if err != nil { return trace.Wrap(err, "failed to read %s", updateConfigName) } - if err := validateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { + if err := updateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { return trace.Wrap(err) } active := cfg.Status.Active @@ -697,7 +714,7 @@ func (u *Updater) Status(ctx context.Context) (Status, error) { if err != nil { return out, trace.Wrap(err, "failed to read %s", updateConfigName) } - if err := validateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { + if err := updateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { return out, trace.Wrap(err) } if cfg.Spec.Proxy == "" { @@ -764,7 +781,7 @@ func (u *Updater) Update(ctx context.Context, now bool) error { if err != nil { return trace.Wrap(err, "failed to read %s", updateConfigName) } - if err := validateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { + if err := updateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { return trace.Wrap(err) } @@ -1160,7 +1177,7 @@ func (u *Updater) LinkPackage(ctx context.Context) error { if err != nil { return trace.Wrap(err, "failed to read %s", updateConfigName) } - if err := validateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { + if err := updateConfigSpec(&cfg.Spec, OverrideConfig{}); err != nil { return trace.Wrap(err) } active := cfg.Status.Active From e8c6b98aa89e9a0aafa124797c59c360a3e7ef9b Mon Sep 17 00:00:00 2001 From: Stephen Levine Date: Thu, 24 Jul 2025 18:10:09 -0400 Subject: [PATCH 10/14] fix log --- lib/autoupdate/agent/updater.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/autoupdate/agent/updater.go b/lib/autoupdate/agent/updater.go index 0ac82b854f374..dbb066aff1138 100644 --- a/lib/autoupdate/agent/updater.go +++ b/lib/autoupdate/agent/updater.go @@ -418,7 +418,7 @@ func (u *Updater) Install(ctx context.Context, override OverrideConfig) (err err defer func() { if err != nil { if err := os.Remove(u.UpdateConfigFile); err != nil { - u.Log.WarnContext(ctx, "Failed to remove stale configuration.", "path", u.UpdateConfigFile) + u.Log.ErrorContext(ctx, "Failed to remove invalid partial configuration.", "path", u.UpdateConfigFile, errorKey, err) } } }() From 1e988388e8c122345eb77d95231ad7dd4a4b317e Mon Sep 17 00:00:00 2001 From: hugoShaka Date: Thu, 24 Jul 2025 18:57:07 -0400 Subject: [PATCH 11/14] Fix tests after rebase --- lib/auth/autoupdate_rollout_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/auth/autoupdate_rollout_test.go b/lib/auth/autoupdate_rollout_test.go index 21abcbd2ba3c3..2a0618af08371 100644 --- a/lib/auth/autoupdate_rollout_test.go +++ b/lib/auth/autoupdate_rollout_test.go @@ -38,7 +38,7 @@ import ( "github.com/gravitational/teleport/lib/backend/memory" "github.com/gravitational/teleport/lib/inventory" "github.com/gravitational/teleport/lib/services/local" - "github.com/gravitational/teleport/lib/utils" + "github.com/gravitational/teleport/lib/utils/log/logtest" ) func TestSampleAgentsFromGroup(t *testing.T) { @@ -50,7 +50,7 @@ func TestSampleAgentsFromGroup(t *testing.T) { cancelFunc: func() {}, clock: clock, ServerID: uuid.NewString(), - logger: utils.NewSlogLoggerForTests(), + logger: logtest.NewLogger(), Services: &Services{ // The inventory is running heartbeats on the background. // If we don't create a presence service this will cause panics. @@ -154,7 +154,7 @@ func TestLookupAgentInInventory(t *testing.T) { cancelFunc: func() {}, clock: clock, ServerID: uuid.NewString(), - logger: utils.NewSlogLoggerForTests(), + logger: logtest.NewLogger(), Services: &Services{ // The inventory is running heartbeats on the background. // If we don't create a presence service this will cause panics. From 7c95f698daf0ac5b9bbc35c4a1019ac14f99b70e Mon Sep 17 00:00:00 2001 From: Stephen Levine Date: Thu, 24 Jul 2025 19:51:35 -0400 Subject: [PATCH 12/14] Always persist new configuration --- lib/autoupdate/agent/config.go | 10 ++++++++++ lib/autoupdate/agent/updater.go | 26 +++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/lib/autoupdate/agent/config.go b/lib/autoupdate/agent/config.go index 2b53260bb6012..bd61756ab84ba 100644 --- a/lib/autoupdate/agent/config.go +++ b/lib/autoupdate/agent/config.go @@ -59,6 +59,16 @@ type UpdateConfig struct { Status UpdateStatus `yaml:"status"` } +// Copy an UpdateConfig. Pointers are not copied. +func (cfg *UpdateConfig) Copy() *UpdateConfig { + if cfg == nil { + return nil + } + // All pointer fields use immutable values. + // This may need additional logic if changes. + return toPtr(*cfg) +} + // UpdateSpec describes the spec field in update.yaml. type UpdateSpec struct { // Proxy address diff --git a/lib/autoupdate/agent/updater.go b/lib/autoupdate/agent/updater.go index dbb066aff1138..6b63482b3210c 100644 --- a/lib/autoupdate/agent/updater.go +++ b/lib/autoupdate/agent/updater.go @@ -364,6 +364,7 @@ func (u *Updater) Install(ctx context.Context, override OverrideConfig) (err err if err != nil { return trace.Wrap(err, "failed to read %s", updateConfigName) } + origCfg := cfg.Copy() // Set the desired state. if err := updateConfigSpec(&cfg.Spec, override); err != nil { return trace.Wrap(err) @@ -408,21 +409,20 @@ func (u *Updater) Install(ctx context.Context, override OverrideConfig) (err err u.Log.InfoContext(ctx, "Initiating installation.", targetKey, target, activeKey, active) } - // If update.yaml does not exist, write it before attempting to install. + // Write update.yaml before attempting to install. // This ensures that update.yaml always exists when the agent is started and sends HELLO. - // The written file does not contain any versions, only the desired configuration. - if _, statErr := os.Stat(u.UpdateConfigFile); errors.Is(statErr, fs.ErrNotExist) { - if err := writeConfig(u.UpdateConfigFile, cfg); err != nil { - return trace.Wrap(err, "failed to write %s", updateConfigName) - } - defer func() { - if err != nil { - if err := os.Remove(u.UpdateConfigFile); err != nil { - u.Log.ErrorContext(ctx, "Failed to remove invalid partial configuration.", "path", u.UpdateConfigFile, errorKey, err) - } - } - }() + // The written file does not contain updated versions, only the desired configuration. + // It is reverted if the installation fails to start, so that configuration is not persisted. + if err := writeConfig(u.UpdateConfigFile, cfg); err != nil { + return trace.Wrap(err, "failed to write %s", updateConfigName) } + defer func() { + if err != nil { + if err := writeConfig(u.UpdateConfigFile, origCfg); err != nil { + u.Log.ErrorContext(ctx, "Failed to revert invalid partial configuration.", "path", u.UpdateConfigFile, errorKey, err) + } + } + }() if err := u.update(ctx, cfg, target, override.AllowOverwrite, resp.AGPL); err != nil { if errors.Is(err, ErrFilePresent) && !override.AllowOverwrite { From 4b7654a6fced3b428b8edfc6f5ae6dee7a9f1069 Mon Sep 17 00:00:00 2001 From: Stephen Levine Date: Thu, 24 Jul 2025 20:16:16 -0400 Subject: [PATCH 13/14] cleanup --- lib/autoupdate/agent/config.go | 7 ++++--- lib/autoupdate/agent/integrations.go | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/autoupdate/agent/config.go b/lib/autoupdate/agent/config.go index bd61756ab84ba..32c7b90a52dfa 100644 --- a/lib/autoupdate/agent/config.go +++ b/lib/autoupdate/agent/config.go @@ -48,6 +48,7 @@ const ( ) // UpdateConfig describes the update.yaml file schema. +// Pointer values should be replaced and not mutated, see Copy. type UpdateConfig struct { // Version of the configuration file Version string `yaml:"version"` @@ -62,14 +63,13 @@ type UpdateConfig struct { // Copy an UpdateConfig. Pointers are not copied. func (cfg *UpdateConfig) Copy() *UpdateConfig { if cfg == nil { - return nil + return &UpdateConfig{} } - // All pointer fields use immutable values. - // This may need additional logic if changes. return toPtr(*cfg) } // UpdateSpec describes the spec field in update.yaml. +// Pointer values should be replaced and not mutated, see Copy. type UpdateSpec struct { // Proxy address Proxy string `yaml:"proxy"` @@ -86,6 +86,7 @@ type UpdateSpec struct { } // UpdateStatus describes the status field in update.yaml. +// Pointer values should be replaced and not mutated, see Copy. type UpdateStatus struct { // IDFile is the path to a temporary file containing the updater ID. IDFile string `yaml:"id_file,omitempty"` diff --git a/lib/autoupdate/agent/integrations.go b/lib/autoupdate/agent/integrations.go index 81b489afea660..8f24f598961d2 100644 --- a/lib/autoupdate/agent/integrations.go +++ b/lib/autoupdate/agent/integrations.go @@ -171,6 +171,10 @@ func ReadHelloUpdaterInfo(ctx context.Context, log *slog.Logger, hostUUID string return nil, trace.Wrap(err, "reading config file %s", configPath) } + // Note that only IDFile may be read from Status on the initial HELLO. + // Any fields set after the agent starts (e.g., active version) will be + // outdated until the agent is confirmed healthy. + info.UpdateGroup = cfg.Spec.Group if info.UpdateGroup == "" { info.UpdateGroup = defaultSetting From bbc95fff909dbf881dfea25532faa92a77aee4c1 Mon Sep 17 00:00:00 2001 From: Stephen Levine Date: Thu, 24 Jul 2025 21:13:09 -0400 Subject: [PATCH 14/14] fix tests --- .../testdata/TestUpdater_Install/setup_fails.golden | 10 ++++++++++ lib/autoupdate/agent/updater_test.go | 6 ------ 2 files changed, 10 insertions(+), 6 deletions(-) create mode 100644 lib/autoupdate/agent/testdata/TestUpdater_Install/setup_fails.golden diff --git a/lib/autoupdate/agent/testdata/TestUpdater_Install/setup_fails.golden b/lib/autoupdate/agent/testdata/TestUpdater_Install/setup_fails.golden new file mode 100644 index 0000000000000..6e104086250e3 --- /dev/null +++ b/lib/autoupdate/agent/testdata/TestUpdater_Install/setup_fails.golden @@ -0,0 +1,10 @@ +version: v1 +kind: update_config +spec: + proxy: "" + path: "" + enabled: false + pinned: false +status: + active: + version: "" diff --git a/lib/autoupdate/agent/updater_test.go b/lib/autoupdate/agent/updater_test.go index 05e78f5464ed7..f54a230663cef 100644 --- a/lib/autoupdate/agent/updater_test.go +++ b/lib/autoupdate/agent/updater_test.go @@ -1823,12 +1823,6 @@ func TestUpdater_Install(t *testing.T) { require.Equal(t, tt.setupCalls, setupCalls) require.Equal(t, tt.restarted, restarted) - if tt.cfg == nil && err != nil { - _, err := os.Stat(cfgPath) - require.Error(t, err) - return - } - data, err := os.ReadFile(cfgPath) require.NoError(t, err) data = blankTestAddr(data)