diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 8cc8b8dcc6aea..af1b9c93095cd 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -4,6 +4,7 @@ on: merge_group: env: GOFLAGS: '-buildvcs=false' + GOEXPERIMENT: 'synctest' jobs: changes: diff --git a/Makefile b/Makefile index 8856d0daf090f..99984844a912a 100644 --- a/Makefile +++ b/Makefile @@ -941,7 +941,7 @@ test-go-prepare: ensure-webassets bpf-bytecode $(TEST_LOG_DIR) ensure-gotestsum test-go-unit: FLAGS ?= -race -shuffle on test-go-unit: SUBJECT ?= $(shell go list ./... | grep -vE 'teleport/(e2e|integration|tool/tsh|integrations/operator|integrations/access|integrations/lib)') test-go-unit: - $(CGOFLAG) go test -cover -json -tags "$(PAM_TAG) $(FIPS_TAG) $(BPF_TAG) $(LIBFIDO2_TEST_TAG) $(TOUCHID_TAG) $(PIV_TEST_TAG) $(VNETDAEMON_TAG)" $(PACKAGES) $(SUBJECT) $(FLAGS) $(ADDFLAGS) \ + $(CGOFLAG) GOEXPERIMENT=synctest go test -cover -json -tags "$(PAM_TAG) $(FIPS_TAG) $(BPF_TAG) $(LIBFIDO2_TEST_TAG) $(TOUCHID_TAG) $(PIV_TEST_TAG) $(VNETDAEMON_TAG)" $(PACKAGES) $(SUBJECT) $(FLAGS) $(ADDFLAGS) \ | tee $(TEST_LOG_DIR)/unit.json \ | gotestsum --raw-command -- cat diff --git a/api/gen/proto/go/teleport/machineid/v1/bot_instance.pb.go b/api/gen/proto/go/teleport/machineid/v1/bot_instance.pb.go index fe5b9ca1689f7..1cc8a9b9b58be 100644 --- a/api/gen/proto/go/teleport/machineid/v1/bot_instance.pb.go +++ b/api/gen/proto/go/teleport/machineid/v1/bot_instance.pb.go @@ -103,6 +103,64 @@ func (BotKind) EnumDescriptor() ([]byte, []int) { return file_teleport_machineid_v1_bot_instance_proto_rawDescGZIP(), []int{0} } +// BotInstanceHealthStatus describes the healthiness of a `tbot` service. +type BotInstanceHealthStatus int32 + +const ( + // The enum zero-value, it means no status was included. + BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNSPECIFIED BotInstanceHealthStatus = 0 + // Means the service is still "starting up" and hasn't reported its status. + BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_INITIALIZING BotInstanceHealthStatus = 1 + // Means the service is healthy and ready to serve traffic, or it has + // recently succeeded in generating an output. + BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_HEALTHY BotInstanceHealthStatus = 2 + // Means the service is failing to serve traffic or generate output. + BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY BotInstanceHealthStatus = 3 +) + +// Enum value maps for BotInstanceHealthStatus. +var ( + BotInstanceHealthStatus_name = map[int32]string{ + 0: "BOT_INSTANCE_HEALTH_STATUS_UNSPECIFIED", + 1: "BOT_INSTANCE_HEALTH_STATUS_INITIALIZING", + 2: "BOT_INSTANCE_HEALTH_STATUS_HEALTHY", + 3: "BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY", + } + BotInstanceHealthStatus_value = map[string]int32{ + "BOT_INSTANCE_HEALTH_STATUS_UNSPECIFIED": 0, + "BOT_INSTANCE_HEALTH_STATUS_INITIALIZING": 1, + "BOT_INSTANCE_HEALTH_STATUS_HEALTHY": 2, + "BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY": 3, + } +) + +func (x BotInstanceHealthStatus) Enum() *BotInstanceHealthStatus { + p := new(BotInstanceHealthStatus) + *p = x + return p +} + +func (x BotInstanceHealthStatus) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (BotInstanceHealthStatus) Descriptor() protoreflect.EnumDescriptor { + return file_teleport_machineid_v1_bot_instance_proto_enumTypes[1].Descriptor() +} + +func (BotInstanceHealthStatus) Type() protoreflect.EnumType { + return &file_teleport_machineid_v1_bot_instance_proto_enumTypes[1] +} + +func (x BotInstanceHealthStatus) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use BotInstanceHealthStatus.Descriptor instead. +func (BotInstanceHealthStatus) EnumDescriptor() ([]byte, []int) { + return file_teleport_machineid_v1_bot_instance_proto_rawDescGZIP(), []int{1} +} + // A BotInstance type BotInstance struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -548,8 +606,10 @@ type BotInstanceStatus struct { InitialHeartbeat *BotInstanceStatusHeartbeat `protobuf:"bytes,3,opt,name=initial_heartbeat,json=initialHeartbeat,proto3" json:"initial_heartbeat,omitempty"` // The N most recent heartbeats for this bot instance. LatestHeartbeats []*BotInstanceStatusHeartbeat `protobuf:"bytes,4,rep,name=latest_heartbeats,json=latestHeartbeats,proto3" json:"latest_heartbeats,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + // The health of the services/output `tbot` is running. + ServiceHealth []*BotInstanceServiceHealth `protobuf:"bytes,5,rep,name=service_health,json=serviceHealth,proto3" json:"service_health,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *BotInstanceStatus) Reset() { @@ -610,6 +670,142 @@ func (x *BotInstanceStatus) GetLatestHeartbeats() []*BotInstanceStatusHeartbeat return nil } +func (x *BotInstanceStatus) GetServiceHealth() []*BotInstanceServiceHealth { + if x != nil { + return x.ServiceHealth + } + return nil +} + +// BotInstanceServiceHealth is a snapshot of a `tbot` service's health. +type BotInstanceServiceHealth struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Service identifies the service. + Service *BotInstanceServiceIdentifier `protobuf:"bytes,1,opt,name=service,proto3" json:"service,omitempty"` + // Status describes the service's healthiness. + Status BotInstanceHealthStatus `protobuf:"varint,2,opt,name=status,proto3,enum=teleport.machineid.v1.BotInstanceHealthStatus" json:"status,omitempty"` + // Reason is a human-readable explanation for the service's status. It might + // include an error message. + Reason *string `protobuf:"bytes,3,opt,name=reason,proto3,oneof" json:"reason,omitempty"` + // UpdatedAt is the time at which the service's health last changed. + UpdatedAt *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=updated_at,json=updatedAt,proto3" json:"updated_at,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BotInstanceServiceHealth) Reset() { + *x = BotInstanceServiceHealth{} + mi := &file_teleport_machineid_v1_bot_instance_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BotInstanceServiceHealth) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BotInstanceServiceHealth) ProtoMessage() {} + +func (x *BotInstanceServiceHealth) ProtoReflect() protoreflect.Message { + mi := &file_teleport_machineid_v1_bot_instance_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BotInstanceServiceHealth.ProtoReflect.Descriptor instead. +func (*BotInstanceServiceHealth) Descriptor() ([]byte, []int) { + return file_teleport_machineid_v1_bot_instance_proto_rawDescGZIP(), []int{5} +} + +func (x *BotInstanceServiceHealth) GetService() *BotInstanceServiceIdentifier { + if x != nil { + return x.Service + } + return nil +} + +func (x *BotInstanceServiceHealth) GetStatus() BotInstanceHealthStatus { + if x != nil { + return x.Status + } + return BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNSPECIFIED +} + +func (x *BotInstanceServiceHealth) GetReason() string { + if x != nil && x.Reason != nil { + return *x.Reason + } + return "" +} + +func (x *BotInstanceServiceHealth) GetUpdatedAt() *timestamppb.Timestamp { + if x != nil { + return x.UpdatedAt + } + return nil +} + +// BotInstanceServiceIdentifier uniquely identifies a `tbot` service. +type BotInstanceServiceIdentifier struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Type of service (e.g. database-tunnel, ssh-multiplexer). + Type string `protobuf:"bytes,1,opt,name=type,proto3" json:"type,omitempty"` + // Name of the service, either given by the user or auto-generated. + Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BotInstanceServiceIdentifier) Reset() { + *x = BotInstanceServiceIdentifier{} + mi := &file_teleport_machineid_v1_bot_instance_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BotInstanceServiceIdentifier) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BotInstanceServiceIdentifier) ProtoMessage() {} + +func (x *BotInstanceServiceIdentifier) ProtoReflect() protoreflect.Message { + mi := &file_teleport_machineid_v1_bot_instance_proto_msgTypes[6] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BotInstanceServiceIdentifier.ProtoReflect.Descriptor instead. +func (*BotInstanceServiceIdentifier) Descriptor() ([]byte, []int) { + return file_teleport_machineid_v1_bot_instance_proto_rawDescGZIP(), []int{6} +} + +func (x *BotInstanceServiceIdentifier) GetType() string { + if x != nil { + return x.Type + } + return "" +} + +func (x *BotInstanceServiceIdentifier) GetName() string { + if x != nil { + return x.Name + } + return "" +} + var File_teleport_machineid_v1_bot_instance_proto protoreflect.FileDescriptor const file_teleport_machineid_v1_bot_instance_proto_rawDesc = "" + @@ -658,18 +854,34 @@ const file_teleport_machineid_v1_bot_instance_proto_rawDesc = "" + "\n" + "public_key\x18\x06 \x01(\fR\tpublicKey\x12F\n" + "\n" + - "join_attrs\x18\b \x01(\v2'.teleport.workloadidentity.v1.JoinAttrsR\tjoinAttrsJ\x04\b\a\x10\bR\vfingerprint\"\xb1\x03\n" + + "join_attrs\x18\b \x01(\v2'.teleport.workloadidentity.v1.JoinAttrsR\tjoinAttrsJ\x04\b\a\x10\bR\vfingerprint\"\x89\x04\n" + "\x11BotInstanceStatus\x12m\n" + "\x16initial_authentication\x18\x01 \x01(\v26.teleport.machineid.v1.BotInstanceStatusAuthenticationR\x15initialAuthentication\x12m\n" + "\x16latest_authentications\x18\x02 \x03(\v26.teleport.machineid.v1.BotInstanceStatusAuthenticationR\x15latestAuthentications\x12^\n" + "\x11initial_heartbeat\x18\x03 \x01(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\x10initialHeartbeat\x12^\n" + - "\x11latest_heartbeats\x18\x04 \x03(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\x10latestHeartbeats*\x8c\x01\n" + + "\x11latest_heartbeats\x18\x04 \x03(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\x10latestHeartbeats\x12V\n" + + "\x0eservice_health\x18\x05 \x03(\v2/.teleport.machineid.v1.BotInstanceServiceHealthR\rserviceHealth\"\x94\x02\n" + + "\x18BotInstanceServiceHealth\x12M\n" + + "\aservice\x18\x01 \x01(\v23.teleport.machineid.v1.BotInstanceServiceIdentifierR\aservice\x12F\n" + + "\x06status\x18\x02 \x01(\x0e2..teleport.machineid.v1.BotInstanceHealthStatusR\x06status\x12\x1b\n" + + "\x06reason\x18\x03 \x01(\tH\x00R\x06reason\x88\x01\x01\x129\n" + + "\n" + + "updated_at\x18\x04 \x01(\v2\x1a.google.protobuf.TimestampR\tupdatedAtB\t\n" + + "\a_reason\"F\n" + + "\x1cBotInstanceServiceIdentifier\x12\x12\n" + + "\x04type\x18\x01 \x01(\tR\x04type\x12\x12\n" + + "\x04name\x18\x02 \x01(\tR\x04name*\x8c\x01\n" + "\aBotKind\x12\x18\n" + "\x14BOT_KIND_UNSPECIFIED\x10\x00\x12\x11\n" + "\rBOT_KIND_TBOT\x10\x01\x12\x1f\n" + "\x1bBOT_KIND_TERRAFORM_PROVIDER\x10\x02\x12 \n" + "\x1cBOT_KIND_KUBERNETES_OPERATOR\x10\x03\x12\x11\n" + - "\rBOT_KIND_TCTL\x10\x04BVZTgithub.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1;machineidv1b\x06proto3" + "\rBOT_KIND_TCTL\x10\x04*\xc4\x01\n" + + "\x17BotInstanceHealthStatus\x12*\n" + + "&BOT_INSTANCE_HEALTH_STATUS_UNSPECIFIED\x10\x00\x12+\n" + + "'BOT_INSTANCE_HEALTH_STATUS_INITIALIZING\x10\x01\x12&\n" + + "\"BOT_INSTANCE_HEALTH_STATUS_HEALTHY\x10\x02\x12(\n" + + "$BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY\x10\x03BVZTgithub.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1;machineidv1b\x06proto3" var ( file_teleport_machineid_v1_bot_instance_proto_rawDescOnce sync.Once @@ -683,42 +895,49 @@ func file_teleport_machineid_v1_bot_instance_proto_rawDescGZIP() []byte { return file_teleport_machineid_v1_bot_instance_proto_rawDescData } -var file_teleport_machineid_v1_bot_instance_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_teleport_machineid_v1_bot_instance_proto_msgTypes = make([]protoimpl.MessageInfo, 5) +var file_teleport_machineid_v1_bot_instance_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_teleport_machineid_v1_bot_instance_proto_msgTypes = make([]protoimpl.MessageInfo, 7) var file_teleport_machineid_v1_bot_instance_proto_goTypes = []any{ (BotKind)(0), // 0: teleport.machineid.v1.BotKind - (*BotInstance)(nil), // 1: teleport.machineid.v1.BotInstance - (*BotInstanceSpec)(nil), // 2: teleport.machineid.v1.BotInstanceSpec - (*BotInstanceStatusHeartbeat)(nil), // 3: teleport.machineid.v1.BotInstanceStatusHeartbeat - (*BotInstanceStatusAuthentication)(nil), // 4: teleport.machineid.v1.BotInstanceStatusAuthentication - (*BotInstanceStatus)(nil), // 5: teleport.machineid.v1.BotInstanceStatus - (*v1.Metadata)(nil), // 6: teleport.header.v1.Metadata - (*timestamppb.Timestamp)(nil), // 7: google.protobuf.Timestamp - (*durationpb.Duration)(nil), // 8: google.protobuf.Duration - (*types.UpdaterV2Info)(nil), // 9: types.UpdaterV2Info - (*structpb.Struct)(nil), // 10: google.protobuf.Struct - (*v11.JoinAttrs)(nil), // 11: teleport.workloadidentity.v1.JoinAttrs + (BotInstanceHealthStatus)(0), // 1: teleport.machineid.v1.BotInstanceHealthStatus + (*BotInstance)(nil), // 2: teleport.machineid.v1.BotInstance + (*BotInstanceSpec)(nil), // 3: teleport.machineid.v1.BotInstanceSpec + (*BotInstanceStatusHeartbeat)(nil), // 4: teleport.machineid.v1.BotInstanceStatusHeartbeat + (*BotInstanceStatusAuthentication)(nil), // 5: teleport.machineid.v1.BotInstanceStatusAuthentication + (*BotInstanceStatus)(nil), // 6: teleport.machineid.v1.BotInstanceStatus + (*BotInstanceServiceHealth)(nil), // 7: teleport.machineid.v1.BotInstanceServiceHealth + (*BotInstanceServiceIdentifier)(nil), // 8: teleport.machineid.v1.BotInstanceServiceIdentifier + (*v1.Metadata)(nil), // 9: teleport.header.v1.Metadata + (*timestamppb.Timestamp)(nil), // 10: google.protobuf.Timestamp + (*durationpb.Duration)(nil), // 11: google.protobuf.Duration + (*types.UpdaterV2Info)(nil), // 12: types.UpdaterV2Info + (*structpb.Struct)(nil), // 13: google.protobuf.Struct + (*v11.JoinAttrs)(nil), // 14: teleport.workloadidentity.v1.JoinAttrs } var file_teleport_machineid_v1_bot_instance_proto_depIdxs = []int32{ - 6, // 0: teleport.machineid.v1.BotInstance.metadata:type_name -> teleport.header.v1.Metadata - 2, // 1: teleport.machineid.v1.BotInstance.spec:type_name -> teleport.machineid.v1.BotInstanceSpec - 5, // 2: teleport.machineid.v1.BotInstance.status:type_name -> teleport.machineid.v1.BotInstanceStatus - 7, // 3: teleport.machineid.v1.BotInstanceStatusHeartbeat.recorded_at:type_name -> google.protobuf.Timestamp - 8, // 4: teleport.machineid.v1.BotInstanceStatusHeartbeat.uptime:type_name -> google.protobuf.Duration - 9, // 5: teleport.machineid.v1.BotInstanceStatusHeartbeat.updater_info:type_name -> types.UpdaterV2Info + 9, // 0: teleport.machineid.v1.BotInstance.metadata:type_name -> teleport.header.v1.Metadata + 3, // 1: teleport.machineid.v1.BotInstance.spec:type_name -> teleport.machineid.v1.BotInstanceSpec + 6, // 2: teleport.machineid.v1.BotInstance.status:type_name -> teleport.machineid.v1.BotInstanceStatus + 10, // 3: teleport.machineid.v1.BotInstanceStatusHeartbeat.recorded_at:type_name -> google.protobuf.Timestamp + 11, // 4: teleport.machineid.v1.BotInstanceStatusHeartbeat.uptime:type_name -> google.protobuf.Duration + 12, // 5: teleport.machineid.v1.BotInstanceStatusHeartbeat.updater_info:type_name -> types.UpdaterV2Info 0, // 6: teleport.machineid.v1.BotInstanceStatusHeartbeat.kind:type_name -> teleport.machineid.v1.BotKind - 7, // 7: teleport.machineid.v1.BotInstanceStatusAuthentication.authenticated_at:type_name -> google.protobuf.Timestamp - 10, // 8: teleport.machineid.v1.BotInstanceStatusAuthentication.metadata:type_name -> google.protobuf.Struct - 11, // 9: teleport.machineid.v1.BotInstanceStatusAuthentication.join_attrs:type_name -> teleport.workloadidentity.v1.JoinAttrs - 4, // 10: teleport.machineid.v1.BotInstanceStatus.initial_authentication:type_name -> teleport.machineid.v1.BotInstanceStatusAuthentication - 4, // 11: teleport.machineid.v1.BotInstanceStatus.latest_authentications:type_name -> teleport.machineid.v1.BotInstanceStatusAuthentication - 3, // 12: teleport.machineid.v1.BotInstanceStatus.initial_heartbeat:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat - 3, // 13: teleport.machineid.v1.BotInstanceStatus.latest_heartbeats:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat - 14, // [14:14] is the sub-list for method output_type - 14, // [14:14] is the sub-list for method input_type - 14, // [14:14] is the sub-list for extension type_name - 14, // [14:14] is the sub-list for extension extendee - 0, // [0:14] is the sub-list for field type_name + 10, // 7: teleport.machineid.v1.BotInstanceStatusAuthentication.authenticated_at:type_name -> google.protobuf.Timestamp + 13, // 8: teleport.machineid.v1.BotInstanceStatusAuthentication.metadata:type_name -> google.protobuf.Struct + 14, // 9: teleport.machineid.v1.BotInstanceStatusAuthentication.join_attrs:type_name -> teleport.workloadidentity.v1.JoinAttrs + 5, // 10: teleport.machineid.v1.BotInstanceStatus.initial_authentication:type_name -> teleport.machineid.v1.BotInstanceStatusAuthentication + 5, // 11: teleport.machineid.v1.BotInstanceStatus.latest_authentications:type_name -> teleport.machineid.v1.BotInstanceStatusAuthentication + 4, // 12: teleport.machineid.v1.BotInstanceStatus.initial_heartbeat:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat + 4, // 13: teleport.machineid.v1.BotInstanceStatus.latest_heartbeats:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat + 7, // 14: teleport.machineid.v1.BotInstanceStatus.service_health:type_name -> teleport.machineid.v1.BotInstanceServiceHealth + 8, // 15: teleport.machineid.v1.BotInstanceServiceHealth.service:type_name -> teleport.machineid.v1.BotInstanceServiceIdentifier + 1, // 16: teleport.machineid.v1.BotInstanceServiceHealth.status:type_name -> teleport.machineid.v1.BotInstanceHealthStatus + 10, // 17: teleport.machineid.v1.BotInstanceServiceHealth.updated_at:type_name -> google.protobuf.Timestamp + 18, // [18:18] is the sub-list for method output_type + 18, // [18:18] is the sub-list for method input_type + 18, // [18:18] is the sub-list for extension type_name + 18, // [18:18] is the sub-list for extension extendee + 0, // [0:18] is the sub-list for field type_name } func init() { file_teleport_machineid_v1_bot_instance_proto_init() } @@ -726,13 +945,14 @@ func file_teleport_machineid_v1_bot_instance_proto_init() { if File_teleport_machineid_v1_bot_instance_proto != nil { return } + file_teleport_machineid_v1_bot_instance_proto_msgTypes[5].OneofWrappers = []any{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_teleport_machineid_v1_bot_instance_proto_rawDesc), len(file_teleport_machineid_v1_bot_instance_proto_rawDesc)), - NumEnums: 1, - NumMessages: 5, + NumEnums: 2, + NumMessages: 7, NumExtensions: 0, NumServices: 0, }, diff --git a/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go b/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go index 8fa41fe942e4e..45bad52dfd303 100644 --- a/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go +++ b/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go @@ -295,7 +295,9 @@ func (x *DeleteBotInstanceRequest) GetInstanceId() string { type SubmitHeartbeatRequest struct { state protoimpl.MessageState `protogen:"open.v1"` // The heartbeat data to submit. - Heartbeat *BotInstanceStatusHeartbeat `protobuf:"bytes,1,opt,name=heartbeat,proto3" json:"heartbeat,omitempty"` + Heartbeat *BotInstanceStatusHeartbeat `protobuf:"bytes,1,opt,name=heartbeat,proto3" json:"heartbeat,omitempty"` + // The health of the services/output `tbot` is running. + ServiceHealth []*BotInstanceServiceHealth `protobuf:"bytes,2,rep,name=service_health,json=serviceHealth,proto3" json:"service_health,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -337,6 +339,13 @@ func (x *SubmitHeartbeatRequest) GetHeartbeat() *BotInstanceStatusHeartbeat { return nil } +func (x *SubmitHeartbeatRequest) GetServiceHealth() []*BotInstanceServiceHealth { + if x != nil { + return x.ServiceHealth + } + return nil +} + // The response for SubmitHeartbeat. type SubmitHeartbeatResponse struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -396,9 +405,10 @@ const file_teleport_machineid_v1_bot_instance_service_proto_rawDesc = "" + "\x18DeleteBotInstanceRequest\x12\x19\n" + "\bbot_name\x18\x01 \x01(\tR\abotName\x12\x1f\n" + "\vinstance_id\x18\x02 \x01(\tR\n" + - "instanceId\"i\n" + + "instanceId\"\xc1\x01\n" + "\x16SubmitHeartbeatRequest\x12O\n" + - "\theartbeat\x18\x01 \x01(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\theartbeat\"\x19\n" + + "\theartbeat\x18\x01 \x01(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\theartbeat\x12V\n" + + "\x0eservice_health\x18\x02 \x03(\v2/.teleport.machineid.v1.BotInstanceServiceHealthR\rserviceHealth\"\x19\n" + "\x17SubmitHeartbeatResponse2\xbd\x03\n" + "\x12BotInstanceService\x12b\n" + "\x0eGetBotInstance\x12,.teleport.machineid.v1.GetBotInstanceRequest\x1a\".teleport.machineid.v1.BotInstance\x12s\n" + @@ -429,25 +439,27 @@ var file_teleport_machineid_v1_bot_instance_service_proto_goTypes = []any{ (*types.SortBy)(nil), // 6: types.SortBy (*BotInstance)(nil), // 7: teleport.machineid.v1.BotInstance (*BotInstanceStatusHeartbeat)(nil), // 8: teleport.machineid.v1.BotInstanceStatusHeartbeat - (*emptypb.Empty)(nil), // 9: google.protobuf.Empty + (*BotInstanceServiceHealth)(nil), // 9: teleport.machineid.v1.BotInstanceServiceHealth + (*emptypb.Empty)(nil), // 10: google.protobuf.Empty } var file_teleport_machineid_v1_bot_instance_service_proto_depIdxs = []int32{ - 6, // 0: teleport.machineid.v1.ListBotInstancesRequest.sort:type_name -> types.SortBy - 7, // 1: teleport.machineid.v1.ListBotInstancesResponse.bot_instances:type_name -> teleport.machineid.v1.BotInstance - 8, // 2: teleport.machineid.v1.SubmitHeartbeatRequest.heartbeat:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat - 0, // 3: teleport.machineid.v1.BotInstanceService.GetBotInstance:input_type -> teleport.machineid.v1.GetBotInstanceRequest - 1, // 4: teleport.machineid.v1.BotInstanceService.ListBotInstances:input_type -> teleport.machineid.v1.ListBotInstancesRequest - 3, // 5: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:input_type -> teleport.machineid.v1.DeleteBotInstanceRequest - 4, // 6: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:input_type -> teleport.machineid.v1.SubmitHeartbeatRequest - 7, // 7: teleport.machineid.v1.BotInstanceService.GetBotInstance:output_type -> teleport.machineid.v1.BotInstance - 2, // 8: teleport.machineid.v1.BotInstanceService.ListBotInstances:output_type -> teleport.machineid.v1.ListBotInstancesResponse - 9, // 9: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:output_type -> google.protobuf.Empty - 5, // 10: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:output_type -> teleport.machineid.v1.SubmitHeartbeatResponse - 7, // [7:11] is the sub-list for method output_type - 3, // [3:7] is the sub-list for method input_type - 3, // [3:3] is the sub-list for extension type_name - 3, // [3:3] is the sub-list for extension extendee - 0, // [0:3] is the sub-list for field type_name + 6, // 0: teleport.machineid.v1.ListBotInstancesRequest.sort:type_name -> types.SortBy + 7, // 1: teleport.machineid.v1.ListBotInstancesResponse.bot_instances:type_name -> teleport.machineid.v1.BotInstance + 8, // 2: teleport.machineid.v1.SubmitHeartbeatRequest.heartbeat:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat + 9, // 3: teleport.machineid.v1.SubmitHeartbeatRequest.service_health:type_name -> teleport.machineid.v1.BotInstanceServiceHealth + 0, // 4: teleport.machineid.v1.BotInstanceService.GetBotInstance:input_type -> teleport.machineid.v1.GetBotInstanceRequest + 1, // 5: teleport.machineid.v1.BotInstanceService.ListBotInstances:input_type -> teleport.machineid.v1.ListBotInstancesRequest + 3, // 6: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:input_type -> teleport.machineid.v1.DeleteBotInstanceRequest + 4, // 7: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:input_type -> teleport.machineid.v1.SubmitHeartbeatRequest + 7, // 8: teleport.machineid.v1.BotInstanceService.GetBotInstance:output_type -> teleport.machineid.v1.BotInstance + 2, // 9: teleport.machineid.v1.BotInstanceService.ListBotInstances:output_type -> teleport.machineid.v1.ListBotInstancesResponse + 10, // 10: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:output_type -> google.protobuf.Empty + 5, // 11: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:output_type -> teleport.machineid.v1.SubmitHeartbeatResponse + 8, // [8:12] is the sub-list for method output_type + 4, // [4:8] is the sub-list for method input_type + 4, // [4:4] is the sub-list for extension type_name + 4, // [4:4] is the sub-list for extension extendee + 0, // [0:4] is the sub-list for field type_name } func init() { file_teleport_machineid_v1_bot_instance_service_proto_init() } diff --git a/api/proto/teleport/machineid/v1/bot_instance.proto b/api/proto/teleport/machineid/v1/bot_instance.proto index 1010412b1a6a8..d31b9b4947262 100644 --- a/api/proto/teleport/machineid/v1/bot_instance.proto +++ b/api/proto/teleport/machineid/v1/bot_instance.proto @@ -167,4 +167,47 @@ message BotInstanceStatus { BotInstanceStatusHeartbeat initial_heartbeat = 3; // The N most recent heartbeats for this bot instance. repeated BotInstanceStatusHeartbeat latest_heartbeats = 4; + // The health of the services/output `tbot` is running. + repeated BotInstanceServiceHealth service_health = 5; +} + +// BotInstanceServiceHealth is a snapshot of a `tbot` service's health. +message BotInstanceServiceHealth { + // Service identifies the service. + BotInstanceServiceIdentifier service = 1; + + // Status describes the service's healthiness. + BotInstanceHealthStatus status = 2; + + // Reason is a human-readable explanation for the service's status. It might + // include an error message. + optional string reason = 3; + + // UpdatedAt is the time at which the service's health last changed. + google.protobuf.Timestamp updated_at = 4; +} + +// BotInstanceServiceIdentifier uniquely identifies a `tbot` service. +message BotInstanceServiceIdentifier { + // Type of service (e.g. database-tunnel, ssh-multiplexer). + string type = 1; + + // Name of the service, either given by the user or auto-generated. + string name = 2; +} + +// BotInstanceHealthStatus describes the healthiness of a `tbot` service. +enum BotInstanceHealthStatus { + // The enum zero-value, it means no status was included. + BOT_INSTANCE_HEALTH_STATUS_UNSPECIFIED = 0; + + // Means the service is still "starting up" and hasn't reported its status. + BOT_INSTANCE_HEALTH_STATUS_INITIALIZING = 1; + + // Means the service is healthy and ready to serve traffic, or it has + // recently succeeded in generating an output. + BOT_INSTANCE_HEALTH_STATUS_HEALTHY = 2; + + // Means the service is failing to serve traffic or generate output. + BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY = 3; } diff --git a/api/proto/teleport/machineid/v1/bot_instance_service.proto b/api/proto/teleport/machineid/v1/bot_instance_service.proto index f0802486fccd4..9e68a1697d6b7 100644 --- a/api/proto/teleport/machineid/v1/bot_instance_service.proto +++ b/api/proto/teleport/machineid/v1/bot_instance_service.proto @@ -71,6 +71,9 @@ message DeleteBotInstanceRequest { message SubmitHeartbeatRequest { // The heartbeat data to submit. BotInstanceStatusHeartbeat heartbeat = 1; + + // The health of the services/output `tbot` is running. + repeated BotInstanceServiceHealth service_health = 2; } // The response for SubmitHeartbeat. diff --git a/docs/pages/reference/machine-id/diagnostics-service.mdx b/docs/pages/reference/machine-id/diagnostics-service.mdx index a7a688dea7937..1b828e19f373e 100644 --- a/docs/pages/reference/machine-id/diagnostics-service.mdx +++ b/docs/pages/reference/machine-id/diagnostics-service.mdx @@ -93,8 +93,8 @@ Content-Type: application/json } ``` -By default, `tbot` generates service names based on their configuration such as -the output destination. You can override this by providing your own name in the +By default, `tbot` generates service names based on their type (e.g. +`application-output-1`). You can override this by providing your own name in the `tbot` configuration file. ```yaml diff --git a/lib/auth/machineid/machineidv1/bot_instance_service.go b/lib/auth/machineid/machineidv1/bot_instance_service.go index 24d59081885e2..6d0f60e98f8b4 100644 --- a/lib/auth/machineid/machineidv1/bot_instance_service.go +++ b/lib/auth/machineid/machineidv1/bot_instance_service.go @@ -21,6 +21,8 @@ package machineidv1 import ( "context" "log/slog" + "os" + "strconv" "time" "github.com/gravitational/trace" @@ -46,6 +48,12 @@ const ( // ensure the instance remains accessible until shortly after the last // issued certificate expires. ExpiryMargin = time.Minute * 5 + + // serviceNameLimit is the maximum length in bytes of a bot service name. + serviceNameLimit = 64 + + // statusReasonLimit is the maximum length in bytes of a service status reason. + statusReasonLimit = 256 ) // BotInstancesCache is the subset of the cached resources that the Service queries. @@ -178,6 +186,17 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit return nil, trace.BadParameter("heartbeat: must be non-nil") } + for _, svcHealth := range req.GetServiceHealth() { + name := svcHealth.GetService().GetName() + if len(name) > serviceNameLimit { + return nil, trace.BadParameter("service name %q is longer than %d bytes", name, serviceNameLimit) + } + reason := svcHealth.GetReason() + if len(reason) > statusReasonLimit { + return nil, trace.BadParameter("service %q has a status reason longer than %d bytes", name, statusReasonLimit) + } + } + // Enforce that the connecting client is a bot and has a bot instance ID. botName := authCtx.Identity.GetIdentity().BotName botInstanceID := authCtx.Identity.GetIdentity().BotInstanceID @@ -212,6 +231,11 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit // Append the new heartbeat to the end. instance.Status.LatestHeartbeats = append(instance.Status.LatestHeartbeats, req.Heartbeat) + if storeHeartbeatExtras() { + // Overwrite the service health. + instance.Status.ServiceHealth = req.ServiceHealth + } + return instance, nil }) if err != nil { @@ -220,3 +244,15 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit return &pb.SubmitHeartbeatResponse{}, nil } + +// storeHeartbeatExtras returns whether we should store "extra" data submitted +// with tbot heartbeats, such as the service health. Defaults to true unless the +// TELEPORT_DISABLE_TBOT_HEARTBEAT_EXTRAS environment variable is set to true on +// the auth server. +func storeHeartbeatExtras() bool { + disabled, err := strconv.ParseBool(os.Getenv("TELEPORT_DISABLE_TBOT_HEARTBEAT_EXTRAS")) + if err != nil { + return true + } + return !disabled +} diff --git a/lib/auth/machineid/machineidv1/bot_instance_service_test.go b/lib/auth/machineid/machineidv1/bot_instance_service_test.go index 5de41a878a7e4..9fa7a2dc6f86f 100644 --- a/lib/auth/machineid/machineidv1/bot_instance_service_test.go +++ b/lib/auth/machineid/machineidv1/bot_instance_service_test.go @@ -23,6 +23,7 @@ import ( "fmt" "slices" "strconv" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -257,6 +258,7 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { createBotInstance bool assertErr assert.ErrorAssertionFunc wantHeartbeat bool + wantServiceHealth []*machineidv1.BotInstanceServiceHealth }{ { name: "success", @@ -265,10 +267,30 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{ Hostname: "llama", }, + ServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: "my-application-tunnel", + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr("application is broken"), + }, + }, }, identity: goodIdentity, assertErr: assert.NoError, wantHeartbeat: true, + wantServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: "my-application-tunnel", + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr("application is broken"), + }, + }, }, { name: "missing bot name", @@ -327,6 +349,54 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { }, wantHeartbeat: false, }, + { + name: "service name too long", + createBotInstance: true, + req: &machineidv1.SubmitHeartbeatRequest{ + Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{ + Hostname: "llama", + }, + ServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: strings.Repeat("a", 100), + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr("application is broken"), + }, + }, + }, + identity: goodIdentity, + assertErr: func(t assert.TestingT, err error, i ...any) bool { + return assert.True(t, trace.IsBadParameter(err)) && assert.Contains(t, err.Error(), "is longer than 64 bytes") + }, + wantHeartbeat: false, + }, + { + name: "status reason too long", + createBotInstance: true, + req: &machineidv1.SubmitHeartbeatRequest{ + Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{ + Hostname: "llama", + }, + ServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: "my-application-tunnel", + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr(strings.Repeat("a", 300)), + }, + }, + }, + identity: goodIdentity, + assertErr: func(t assert.TestingT, err error, i ...any) bool { + return assert.True(t, trace.IsBadParameter(err)) && assert.Contains(t, err.Error(), "status reason longer than 256 bytes") + }, + wantHeartbeat: false, + }, } for _, tt := range tests { @@ -377,6 +447,13 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { assert.Nil(t, bi.Status.InitialHeartbeat) assert.Empty(t, bi.Status.LatestHeartbeats) } + assert.Empty(t, + cmp.Diff( + bi.Status.ServiceHealth, + tt.wantServiceHealth, + protocmp.Transform(), + ), + ) } }) } @@ -591,3 +668,5 @@ func newBotInstanceService( return service } + +func ptr[T any](v T) *T { return &v } diff --git a/lib/tbot/bot/bot.go b/lib/tbot/bot/bot.go index b48b7f1731ff0..989019d679469 100644 --- a/lib/tbot/bot/bot.go +++ b/lib/tbot/bot/bot.go @@ -72,26 +72,38 @@ func (b *Bot) Run(ctx context.Context) (err error) { ctx, cancel := context.WithCancel(ctx) defer cancel() - services, closer, err := b.buildServices(ctx) + registry := readyz.NewRegistry() + + services, closer, err := b.buildServices(ctx, registry) defer closer() if err != nil { return trace.Wrap(err) } + for _, handle := range services { + // If the service builder called ServiceDependencies.GetStatusReporter, + // we take that as a promise that the service's Run method will report + // statuses. Otherwise we will not include the service in heartbeats or + // the `/readyz` endpoint. + if handle.statusReporter.used { + handle.statusReporter.reporter = registry.AddService(handle.serviceType, handle.name) + } + } + b.cfg.Logger.InfoContext(ctx, "Initialization complete. Starting services") group, groupCtx := errgroup.WithContext(ctx) - for _, svc := range services { - svc := svc - log := b.cfg.Logger.With("service", svc.String()) + for _, handle := range services { + handle := handle + log := b.cfg.Logger.With("service", handle.name) group.Go(func() error { log.InfoContext(groupCtx, "Starting service") - err := svc.Run(groupCtx) + err := handle.service.Run(groupCtx) if err != nil { log.ErrorContext(ctx, "Service exited with error", "error", err) - return trace.Wrap(err, "service(%s)", svc.String()) + return trace.Wrap(err, "service(%s)", handle.name) } log.InfoContext(groupCtx, "Service exited") @@ -113,32 +125,48 @@ func (b *Bot) OneShot(ctx context.Context) (err error) { ctx, cancel := context.WithCancel(ctx) defer cancel() - services, closer, err := b.buildServices(ctx) + registry := readyz.NewRegistry() + + services, closer, err := b.buildServices(ctx, registry) defer closer() if err != nil { return trace.Wrap(err) } - group, groupCtx := errgroup.WithContext(ctx) - for _, service := range services { - log := b.cfg.Logger.With("service", service.String()) - - svc, ok := service.(OneShotService) - if !ok { - log.InfoContext(ctx, "Service does not support oneshot mode, will not run") + // Filter out the services that don't support oneshot mode. + oneShotServices := make([]*serviceHandle, 0, len(services)) + for _, handle := range services { + handle := handle + if _, ok := handle.service.(OneShotService); !ok { + b.cfg.Logger.InfoContext(ctx, + "Service does not support oneshot mode, will not run", + "service", handle.name, + ) continue } + // Add oneshot services to the registry. + handle.statusReporter.reporter = registry.AddService(handle.serviceType, handle.name) + oneShotServices = append(oneShotServices, handle) + } + + group, groupCtx := errgroup.WithContext(ctx) + for _, handle := range oneShotServices { + handle := handle + log := b.cfg.Logger.With("service", handle.name) + group.Go(func() error { log.DebugContext(groupCtx, "Running service in oneshot mode") - err := svc.OneShot(groupCtx) + err := handle.service.(OneShotService).OneShot(groupCtx) if err != nil { log.ErrorContext(ctx, "Service exited with error", "error", err) - return trace.Wrap(err, "service(%s)", svc.String()) + handle.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) + return trace.Wrap(err, "service(%s)", handle.name) } log.InfoContext(groupCtx, "Service finished") + handle.statusReporter.Report(readyz.Healthy) return nil }) } @@ -152,19 +180,19 @@ func (b *Bot) checkStarted() error { return trace.BadParameter("bot has already been started") } -func (b *Bot) buildServices(ctx context.Context) ([]Service, func(), error) { +func (b *Bot) buildServices(ctx context.Context, registry *readyz.Registry) ([]*serviceHandle, func(), error) { startedAt := time.Now().UTC() - services := make([]Service, 0, len(b.cfg.Services)) - var closers []func() + handles := make([]*serviceHandle, 0, len(b.cfg.Services)) closeFn := func() { - for _, fn := range closers { - fn() + for _, h := range handles { + if h.closeFn != nil { + h.closeFn() + } } } // Build internal services and dependencies. - statusRegistry := readyz.NewRegistry() reloadBroadcaster := b.buildReloadBroadcaster(ctx) resolver, err := b.buildResolver(ctx) @@ -177,37 +205,34 @@ func (b *Bot) buildServices(ctx context.Context) ([]Service, func(), error) { return nil, closeFn, trace.Wrap(err, "building client builder") } - identityService, closeIdentityService, err := b.buildIdentityService( + identityService, identityServiceHandle, err := b.buildIdentityService( ctx, reloadBroadcaster, clientBuilder, - statusRegistry, ) if err != nil { return nil, closeFn, trace.Wrap(err, "building identity service") } - services = append(services, identityService) - closers = append(closers, closeIdentityService) + handles = append(handles, identityServiceHandle) heartbeatService, err := b.buildHeartbeatService( identityService, startedAt, - statusRegistry, + registry, ) if err != nil { return nil, closeFn, trace.Wrap(err, "building heartbeat service") } - services = append(services, heartbeatService) + handles = append(handles, heartbeatService) caRotationService, err := b.buildCARotationService( reloadBroadcaster, identityService, - statusRegistry, ) if err != nil { return nil, closeFn, trace.Wrap(err, "building CA rotation service") } - services = append(services, caRotationService) + handles = append(handles, caRotationService) proxyPinger, err := b.buildProxyPinger(identityService) if err != nil { @@ -220,28 +245,42 @@ func (b *Bot) buildServices(ctx context.Context) ([]Service, func(), error) { } // Build user services. - for idx, buildService := range b.cfg.Services { + for idx, builder := range b.cfg.Services { reloadCh, unsubscribe := reloadBroadcaster.Subscribe() - closers = append(closers, unsubscribe) - service, err := buildService(ServiceDependencies{ + handle := &serviceHandle{ + closeFn: unsubscribe, + statusReporter: &statusReporter{}, + } + handle.serviceType, handle.name = builder.GetTypeAndName() + + var err error + handle.service, err = builder.Build(ServiceDependencies{ Client: identityService.GetClient(), Resolver: resolver, - Logger: b.cfg.Logger, ClientBuilder: clientBuilder, IdentityGenerator: identityGenerator, ProxyPinger: proxyPinger, BotIdentity: identityService.GetIdentity, BotIdentityReadyCh: identityService.Ready(), ReloadCh: reloadCh, - StatusRegistry: statusRegistry, + StatusRegistry: registry, + GetStatusReporter: func() readyz.Reporter { + handle.statusReporter.used = true + return handle.statusReporter + }, + Logger: b.cfg.Logger.With( + teleport.ComponentKey, + teleport.Component(teleport.ComponentTBot, "svc", handle.name), + ), }) if err != nil { return nil, closeFn, trace.Wrap(err, "building service [%d]", idx) } - services = append(services, service) + + handles = append(handles, handle) } - return services, closeFn, nil + return handles, closeFn, nil } func (b *Bot) buildReloadBroadcaster(ctx context.Context) *internal.ChannelBroadcaster { @@ -302,10 +341,14 @@ func (b *Bot) buildIdentityService( ctx context.Context, reloadBroadcaster *internal.ChannelBroadcaster, clientBuilder *client.Builder, - statusRegistry *readyz.Registry, -) (*identity.Service, func(), error) { - reloadCh, unsubscribe := reloadBroadcaster.Subscribe() +) (*identity.Service, *serviceHandle, error) { + handle := &serviceHandle{ + serviceType: "internal/identity", + name: "identity", + statusReporter: &statusReporter{used: true}, + } + reloadCh, unsubscribe := reloadBroadcaster.Subscribe() identityService, err := identity.NewService(identity.Config{ Connection: b.cfg.Connection, Onboarding: b.cfg.Onboarding, @@ -315,18 +358,19 @@ func (b *Bot) buildIdentityService( FIPS: b.cfg.FIPS, Logger: b.cfg.Logger.With( teleport.ComponentKey, - teleport.Component(teleport.ComponentTBot, "identity"), + teleport.Component(teleport.ComponentTBot, handle.name), ), ClientBuilder: clientBuilder, ReloadCh: reloadCh, - StatusReporter: statusRegistry.AddService("identity"), + StatusReporter: handle.statusReporter, }) if err != nil { unsubscribe() return nil, nil, trace.Wrap(err, "building identity service") } - close := func() { + handle.service = identityService + handle.closeFn = func() { if err := identityService.Close(); err != nil { b.cfg.Logger.ErrorContext( ctx, @@ -338,20 +382,25 @@ func (b *Bot) buildIdentityService( } if err := identityService.Initialize(ctx); err != nil { - close() + handle.closeFn() return nil, nil, trace.Wrap(err, "initializing identity service") } - - return identityService, close, nil + return identityService, handle, nil } func (b *Bot) buildHeartbeatService( identityService *identity.Service, startedAt time.Time, statusRegistry *readyz.Registry, -) (*heartbeat.Service, error) { - return heartbeat.NewService(heartbeat.Config{ - BotKind: machineidv1.BotKind(b.cfg.Kind), +) (*serviceHandle, error) { + handle := &serviceHandle{ + serviceType: "internal/heartbeat", + name: "heartbeat", + statusReporter: &statusReporter{used: true}, + } + + var err error + handle.service, err = heartbeat.NewService(heartbeat.Config{ Interval: 30 * time.Minute, RetryLimit: 5, Client: machineidv1.NewBotInstanceServiceClient(identityService.GetClient().GetConnection()), @@ -359,10 +408,16 @@ func (b *Bot) buildHeartbeatService( StartedAt: startedAt, JoinMethod: b.cfg.Onboarding.JoinMethod, Logger: b.cfg.Logger.With( - teleport.ComponentKey, teleport.Component(teleport.ComponentTBot, "heartbeat"), + teleport.ComponentKey, + teleport.Component(teleport.ComponentTBot, handle.name), ), - StatusReporter: statusRegistry.AddService("heartbeat"), + StatusReporter: handle.statusReporter, + StatusRegistry: statusRegistry, }) + if err != nil { + return nil, trace.Wrap(err) + } + return handle, nil } func (b *Bot) buildProxyPinger(identityService *identity.Service) (connection.ProxyPinger, error) { @@ -379,17 +434,65 @@ func (b *Bot) buildProxyPinger(identityService *identity.Service) (connection.Pr func (b *Bot) buildCARotationService( reloadBroadcaster *internal.ChannelBroadcaster, identityService *identity.Service, - statusRegistry *readyz.Registry, -) (*carotation.Service, error) { - return carotation.NewService(carotation.Config{ +) (*serviceHandle, error) { + handle := &serviceHandle{ + serviceType: "internal/ca-rotation", + name: "ca-rotation", + statusReporter: &statusReporter{used: true}, + } + + var err error + handle.service, err = carotation.NewService(carotation.Config{ BroadcastFn: reloadBroadcaster.Broadcast, Client: identityService.GetClient(), GetBotIdentityFn: identityService.GetIdentity, BotIdentityReadyCh: identityService.Ready(), Logger: b.cfg.Logger.With( teleport.ComponentKey, - teleport.Component(teleport.ComponentTBot, "ca-rotation"), + teleport.Component(teleport.ComponentTBot, handle.name), ), - StatusReporter: statusRegistry.AddService("ca-rotation"), + StatusReporter: handle.statusReporter, }) + if err != nil { + return nil, trace.Wrap(err) + } + return handle, nil +} + +// serviceHandle contains a built service, its type/name, close function, and a +// pointer to its status reporter - so we can automatically report statuses for +// oneshot services. +type serviceHandle struct { + serviceType, name string + service Service + statusReporter *statusReporter + closeFn func() +} + +// statusReporter wraps readyz.Reporter to break a circular dependency where: +// +// - We need a status reporter to build a service +// - We must register the service in order to get the status reporter +// - We may not want to register the service in one-shot mode if it does not +// implement OneShotService +// - We can only know whether the service implements OneShotService after +// we've built it +// +// This wrapper allows us to defer the actual registration until we know whether +// the service implements OneShotService. +type statusReporter struct { + used bool + reporter readyz.Reporter +} + +func (r *statusReporter) Report(status readyz.Status) { + if r.reporter != nil { + r.reporter.Report(status) + } +} + +func (r *statusReporter) ReportReason(status readyz.Status, reason string) { + if r.reporter != nil { + r.reporter.ReportReason(status, reason) + } } diff --git a/lib/tbot/bot/service.go b/lib/tbot/bot/service.go index a5ba3c9deb121..4b216117aff7d 100644 --- a/lib/tbot/bot/service.go +++ b/lib/tbot/bot/service.go @@ -20,17 +20,18 @@ package bot import ( "context" + "fmt" "log/slog" "golang.org/x/sync/errgroup" - "github.com/gravitational/teleport" apiclient "github.com/gravitational/teleport/api/client" "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/bot/connection" "github.com/gravitational/teleport/lib/tbot/client" "github.com/gravitational/teleport/lib/tbot/identity" "github.com/gravitational/teleport/lib/tbot/readyz" + "github.com/gravitational/teleport/lib/utils" ) // Service is a long-running sub-component of tbot. @@ -65,8 +66,8 @@ type ServiceDependencies struct { // Resolver that can be used to look up proxy addresses. Resolver reversetunnelclient.Resolver - // Logger to which errors and messages can be written. It's the service's - // responsibility to tag the logger with a component. + // Logger to which errors and messages can be written, with its component + // set to the name of the service. Logger *slog.Logger // ProxyPinger can be used to ping the proxy or auth server to discover @@ -92,21 +93,62 @@ type ServiceDependencies struct { // it's time to reload their certificates (e.g. following a CA rotation). ReloadCh <-chan struct{} - // StatusRegistry is the registry the service can register itself with to - // report service health. - StatusRegistry *readyz.Registry + // GetStatusReporter returns the reporter to which the service should report + // its health. + // + // If a ServiceBuilder calls GetStatusReporter the service's Run method *MUST* + // call Report or ReportReason (or if using internal.RunOnInterval pass it the + // reporter) otherwise it will delay the initial heartbeat and the `/readyz` + // endpoint will return 503. + // + // You do not have to do this in your service's OneShot method as the bot + // will automatically report oneshot service status based on its return value. + GetStatusReporter func() readyz.Reporter + + // StatusRegistry can be used to read the health of the bot's services. + StatusRegistry readyz.ReadOnlyRegistry } -// LoggerForService returns a logger with the service's name as its component. -func (deps ServiceDependencies) LoggerForService(svc Service) *slog.Logger { - return deps.Logger.With( - teleport.ComponentKey, - teleport.Component(teleport.ComponentTBot, "svc", svc.String()), - ) +// ServiceBuilder will be used by the bot to create a service. +type ServiceBuilder interface { + // GetTypeAndName returns the service type and name. + GetTypeAndName() (string, string) + + // Build the service using the given dependencies. + Build(ServiceDependencies) (Service, error) +} + +// NewServiceBuilder creates a ServiceBuilder with the given service type, name +// and build function. +func NewServiceBuilder( + serviceType, name string, + buildFn func(ServiceDependencies) (Service, error), +) ServiceBuilder { + if name == "" { + // The tbot binary will set default service names, so name could only + // realistically be empty if the bot were embedded somewhere else (e.g. + // the Terraform provider) in which case a randomly generated name is + // better than nothing. + // + // We do not handle the error from CryptoRandHex because the underlying + // call to rand.Read will never fail. + suffix, _ := utils.CryptoRandomHex(4) + name = fmt.Sprintf("%s-%s", serviceType, suffix) + } + return &serviceBuilder{ + serviceType: serviceType, + name: name, + buildFn: buildFn, + } } -// ServiceBuilder will be called by the bot to create a service. -type ServiceBuilder func(ServiceDependencies) (Service, error) +type serviceBuilder struct { + serviceType, name string + buildFn func(ServiceDependencies) (Service, error) +} + +func (b *serviceBuilder) GetTypeAndName() (string, string) { return b.serviceType, b.name } +func (b *serviceBuilder) Build(deps ServiceDependencies) (Service, error) { return b.buildFn(deps) } // ServicePair combines two related Services. type ServicePair struct{ primary, secondary Service } @@ -166,13 +208,6 @@ func (s *OneShotServicePair) OneShot(ctx context.Context) error { return group.Wait() } -// LiteralService create a ServiceBuilder that returns the service as-is. -func LiteralService(service Service) ServiceBuilder { - return func(ServiceDependencies) (Service, error) { - return service, nil - } -} - // NewNopService returns a service with the given name that does nothing at all. func NewNopService(name string) NopService { return NopService{name: name} diff --git a/lib/tbot/config/config.go b/lib/tbot/config/config.go index 89873a87fb810..5da45f85f9f06 100644 --- a/lib/tbot/config/config.go +++ b/lib/tbot/config/config.go @@ -27,7 +27,6 @@ import ( "io" "net/url" "os" - "regexp" "slices" "strings" "time" @@ -60,38 +59,6 @@ const ( DefaultRenewInterval = 20 * time.Minute ) -// ReservedServiceNames are the service names reserved for internal use. -var ReservedServiceNames = []string{ - "ca-rotation", - "crl-cache", - "heartbeat", - "identity", - "spiffe-trust-bundle-cache", -} - -var reservedServiceNamesMap = func() map[string]struct{} { - m := make(map[string]struct{}, len(ReservedServiceNames)) - for _, k := range ReservedServiceNames { - m[k] = struct{}{} - } - return m -}() - -var serviceNameRegex = regexp.MustCompile(`\A[a-z\d_\-+]+\z`) - -func validateServiceName(name string) error { - if name == "" { - return nil - } - if _, ok := reservedServiceNamesMap[name]; ok { - return trace.BadParameter("service name %q is reserved for internal use", name) - } - if !serviceNameRegex.MatchString(name) { - return trace.BadParameter("invalid service name: %q, may only contain lowercase letters, numbers, hyphens, underscores, or plus symbols", name) - } - return nil -} - var log = logutils.NewPackageLogger(teleport.ComponentKey, teleport.ComponentTBot) // BotConfig is the bot's root config object. @@ -137,6 +104,10 @@ type BotConfig struct { // renewal. ReloadCh <-chan struct{} `yaml:"-"` + // Testing is set in unit tests to attach a faux service which exposes the + // bot's underlying identity and client so we can make assertions on it. + Testing bool `yaml:"-"` + // Insecure configures the bot to trust the certificates from the Auth Server or Proxy on first connect without verification. // Do not use in production. Insecure bool `yaml:"insecure,omitempty"` @@ -219,9 +190,11 @@ func (conf *BotConfig) CheckAndSetDefaults() error { return trace.Wrap(err) } - // We've migrated Outputs to Services, so copy all Outputs to Services. + // We've migrated Outputs to Services, so move all Outputs to Services. conf.Services = append(conf.Services, conf.Outputs...) - uniqueNames := make(map[string]struct{}, len(conf.Services)) + conf.Outputs = nil + + namer := newServiceNamer() for i, service := range conf.Services { if err := service.CheckAndSetDefaults(); err != nil { return trace.Wrap(err, "validating service[%d]", i) @@ -229,14 +202,13 @@ func (conf *BotConfig) CheckAndSetDefaults() error { if err := service.GetCredentialLifetime().Validate(conf.Oneshot); err != nil { return trace.Wrap(err, "validating service[%d]", i) } - if name := service.GetName(); name != "" { - if err := validateServiceName(name); err != nil { - return trace.Wrap(err, "validating service[%d]", i) - } - if _, seen := uniqueNames[name]; seen { - return trace.BadParameter("validating service[%d]: duplicate name: %q", i, name) - } - uniqueNames[name] = struct{}{} + + name, err := namer.pickName(service.Type(), service.GetName()) + switch { + case err != nil: + return trace.Wrap(err, "validating service[%d]", i) + case name != service.GetName(): + service.SetName(name) } } @@ -332,9 +304,14 @@ type ServiceConfig interface { // support these options should return the zero value. GetCredentialLifetime() bot.CredentialLifetime - // GetName returns the user-given name of the service, used for validation - // purposes. + // GetName returns the service's given name. Initially the name chosen by + // the user, but after BotConfig.CheckAndSetDefaults is called it may be + // our automatically generated name. GetName() string + + // SetName is called by BotConfig.CheckAndSetDefaults to assign the service + // a new name if the user didn't specify one. + SetName(string) } // ServiceConfigs assists polymorphic unmarshaling of a slice of ServiceConfigs. diff --git a/lib/tbot/config/config_test.go b/lib/tbot/config/config_test.go index efec900a31cc5..54cc82899177b 100644 --- a/lib/tbot/config/config_test.go +++ b/lib/tbot/config/config_test.go @@ -553,62 +553,4 @@ func TestBotConfig_Base64(t *testing.T) { } } -func TestBotConfig_NameValidation(t *testing.T) { - t.Parallel() - - testCases := map[string]struct { - cfg *BotConfig - err string - }{ - "duplicate names": { - cfg: &BotConfig{ - Version: V2, - Services: ServiceConfigs{ - &identity.OutputConfig{ - Name: "foo", - Destination: &destination.Memory{}, - }, - &identity.OutputConfig{ - Name: "foo", - Destination: &destination.Memory{}, - }, - }, - }, - err: `duplicate name: "foo`, - }, - "reserved name": { - cfg: &BotConfig{ - Version: V2, - Services: ServiceConfigs{ - &identity.OutputConfig{ - Name: "identity", - Destination: &destination.Memory{}, - }, - }, - }, - err: `service name "identity" is reserved for internal use`, - }, - "invalid name": { - cfg: &BotConfig{ - Version: V2, - Services: ServiceConfigs{ - &identity.OutputConfig{ - Name: "hello, world!", - Destination: &destination.Memory{}, - }, - }, - }, - err: `may only contain lowercase letters`, - }, - } - for desc, tc := range testCases { - t.Run(desc, func(t *testing.T) { - t.Parallel() - require.ErrorContains(t, tc.cfg.CheckAndSetDefaults(), tc.err) - }) - } -} - -func ptr[T any](v T) *T { - return &v -} +func ptr[T any](v T) *T { return &v } diff --git a/lib/tbot/config/naming.go b/lib/tbot/config/naming.go new file mode 100644 index 0000000000000..845b35728a36d --- /dev/null +++ b/lib/tbot/config/naming.go @@ -0,0 +1,80 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package config + +import ( + "fmt" + "regexp" + + "github.com/gravitational/trace" +) + +// reservedServiceNames are the service names reserved for internal use. +var reservedServiceNames = map[string]struct{}{ + "ca-rotation": {}, + "crl-cache": {}, + "heartbeat": {}, + "identity": {}, + "spiffe-trust-bundle-cache": {}, +} + +var invalidServiceNameRegex = regexp.MustCompile(`[^a-z\d_\-+]`) + +type serviceNamer struct { + usedNames map[string]struct{} + countByServiceType map[string]int +} + +func newServiceNamer() *serviceNamer { + return &serviceNamer{ + usedNames: make(map[string]struct{}), + countByServiceType: make(map[string]int), + } +} + +// pickName checks the user-chosen name is valid (e.g. not reserved for internal +// use or containing illegal characters) if one is given. If no name is given, +// it will generate a name based on the service type with a counter suffix. +func (n *serviceNamer) pickName(serviceType, name string) (string, error) { + n.countByServiceType[serviceType]++ + + if name == "" { + name = fmt.Sprintf( + "%s-%d", + invalidServiceNameRegex.ReplaceAllString(serviceType, "-"), + n.countByServiceType[serviceType], + ) + if _, ok := n.usedNames[name]; ok { + return "", trace.BadParameter("service name %q conflicts with an automatically generated service name", name) + } + } else { + if _, ok := n.usedNames[name]; ok { + return "", trace.BadParameter("service name %q used more than once", name) + } + if _, ok := reservedServiceNames[name]; ok { + return "", trace.BadParameter("service name %q is reserved for internal use", name) + } + if invalidServiceNameRegex.MatchString(name) { + return "", trace.BadParameter("invalid service name: %q, may only contain lowercase letters, numbers, hyphens, underscores, or plus symbols", name) + } + } + + n.usedNames[name] = struct{}{} + return name, nil +} diff --git a/lib/tbot/config/naming_test.go b/lib/tbot/config/naming_test.go new file mode 100644 index 0000000000000..b2108a936b52f --- /dev/null +++ b/lib/tbot/config/naming_test.go @@ -0,0 +1,83 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package config + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestServiceNamer(t *testing.T) { + t.Parallel() + + t.Run("success", func(t *testing.T) { + namer := newServiceNamer() + + // Chosen name is used. + name, err := namer.pickName("type/1", "foo") + require.NoError(t, err) + assert.Equal(t, "foo", name) + + // Next name in the "sequence" for that service type is used. + t12, err := namer.pickName("type/1", "") + require.NoError(t, err) + assert.Equal(t, "type-1-2", t12) + + // First name in sequence for new service type is used. + t21, err := namer.pickName("type/2", "") + require.NoError(t, err) + assert.Equal(t, "type-2-1", t21) + }) + + t.Run("reserved name", func(t *testing.T) { + namer := newServiceNamer() + + _, err := namer.pickName("foo", "heartbeat") + require.ErrorContains(t, err, `service name "heartbeat" is reserved for internal use`) + }) + + t.Run("invalid name", func(t *testing.T) { + namer := newServiceNamer() + + _, err := namer.pickName("foo", "hello, world") + require.ErrorContains(t, err, `invalid service name: "hello, world", may only contain lowercase letters, numbers, hyphens, underscores, or plus symbols`) + }) + + t.Run("named used twice", func(t *testing.T) { + namer := newServiceNamer() + + _, err := namer.pickName("foo", "bar") + require.NoError(t, err) + + _, err = namer.pickName("foo", "bar") + require.ErrorContains(t, err, `service name "bar" used more than once`) + }) + + t.Run("chosen name conflicts with generated name", func(t *testing.T) { + namer := newServiceNamer() + + _, err := namer.pickName("foo", "foo-2") + require.NoError(t, err) + + _, err = namer.pickName("foo", "") + require.ErrorContains(t, err, `service name "foo-2" conflicts with an automatically generated service name`) + }) +} diff --git a/lib/tbot/internal/diagnostics/service.go b/lib/tbot/internal/diagnostics/service.go index 95fcb7621e5de..39a46a5e1c137 100644 --- a/lib/tbot/internal/diagnostics/service.go +++ b/lib/tbot/internal/diagnostics/service.go @@ -36,9 +36,10 @@ import ( // ServiceBuilder returns a builder for the diagnostics service. func ServiceBuilder(cfg Config) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { return NewService(cfg, deps.StatusRegistry) } + return bot.NewServiceBuilder("internal/diagnostics", "diagnostics", buildFn) } // Config contains configuration for the diagnostics service. @@ -59,7 +60,7 @@ func (cfg *Config) CheckAndSetDefaults() error { } // NewService creates a new diagnostics service. -func NewService(cfg Config, registry *readyz.Registry) (*Service, error) { +func NewService(cfg Config, registry readyz.ReadOnlyRegistry) (*Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -80,7 +81,7 @@ type Service struct { log *slog.Logger diagAddr string pprofEnabled bool - statusRegistry *readyz.Registry + statusRegistry readyz.ReadOnlyRegistry } func (s *Service) String() string { diff --git a/lib/tbot/internal/heartbeat/service.go b/lib/tbot/internal/heartbeat/service.go index a868ca1e804fe..936065c0f41a7 100644 --- a/lib/tbot/internal/heartbeat/service.go +++ b/lib/tbot/internal/heartbeat/service.go @@ -23,10 +23,10 @@ import ( "log/slog" "os" "runtime" + "sort" "time" "github.com/gravitational/trace" - "github.com/jonboulle/clockwork" "google.golang.org/grpc" "google.golang.org/protobuf/types/known/durationpb" "google.golang.org/protobuf/types/known/timestamppb" @@ -38,6 +38,16 @@ import ( "github.com/gravitational/teleport/lib/tbot/readyz" ) +const ( + // Maximum amount of time we'll wait for services to report their initial + // status before sending the first heartbeat. + serviceHealthMaxWait = 30 * time.Second + + // Maximum amount of time the one-shot heartbeat can take once the bot has + // started shutting down. + shutdownHeartbeatTimeout = 5 * time.Second +) + // Client for the heartbeat service. type Client interface { SubmitHeartbeat( @@ -76,8 +86,9 @@ type Config struct { // StatusReporter is used to report the service's health. StatusReporter readyz.Reporter - // Clock that will be used to determine the current time. - Clock clockwork.Clock + // StatusRegistry is used to fetch the current service statuses when + // submitting a heartbeat. + StatusRegistry *readyz.Registry } // CheckAndSetDefaults checks the service configuration and sets any default values. @@ -91,12 +102,11 @@ func (cfg *Config) CheckAndSetDefaults() error { return trace.BadParameter("Client is required") case cfg.JoinMethod == "": return trace.BadParameter("JoinMethod is required") - } - if cfg.Clock == nil { - cfg.Clock = clockwork.NewRealClock() + case cfg.StatusRegistry == nil: + return trace.BadParameter("StatusRegistry is required") } if cfg.StartedAt.IsZero() { - cfg.StartedAt = cfg.Clock.Now() + cfg.StartedAt = time.Now() } return nil } @@ -114,6 +124,13 @@ type Service struct{ cfg Config } // Run the service in long-running mode, submitting heartbeats periodically. func (s *Service) Run(ctx context.Context) error { + // Wait for service health before sending our first heartbeat. Otherwise, we + // might report all services as "initializing" for the first ~30 minutes our + // bot is running. + if shuttingDown := s.waitForServiceHealth(ctx); shuttingDown { + return nil + } + isStartup := true err := internal.RunOnInterval(ctx, internal.RunOnIntervalConfig{ Service: s.String(), @@ -146,6 +163,21 @@ func (s *Service) Run(ctx context.Context) error { // OneShot submits one heartbeat and then exits. func (s *Service) OneShot(ctx context.Context) error { + // Wait for services to report their health before sending the heartbeat. + shuttingDown := s.waitForServiceHealth(ctx) + + if shuttingDown { + // If the outer context has been canceled (likely because another + // service has return an error) we'll create a new one detached from + // the cancellation to try to send the heartbeat. + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout( + context.WithoutCancel(ctx), + shutdownHeartbeatTimeout, + ) + defer cancel() + } + err := s.heartbeat(ctx, true, true) // Ignore not implemented as this is likely confusing. // TODO(noah): Remove NotImplemented check at V18 assuming V17 first major @@ -159,6 +191,24 @@ func (s *Service) OneShot(ctx context.Context) error { // String implements fmt.Stringer. func (s *Service) String() string { return "heartbeat" } +func (s *Service) waitForServiceHealth(ctx context.Context) (shuttingDown bool) { + // We must report our own status to avoid blocking ourselves! + s.cfg.StatusReporter.Report(readyz.Healthy) + + select { + case <-s.cfg.StatusRegistry.AllServicesReported(): + // All services have reported their status, we're ready! + return false + case <-time.After(serviceHealthMaxWait): + // It's taking too long, give up and start sending heartbeats. + return false + case <-ctx.Done(): + // The outer context has been canceled (e.g. another service has exited + // or the process has received SIGINT). + return true + } +} + func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) error { s.cfg.Logger.DebugContext(ctx, "Sending heartbeat") hostName, err := os.Hostname() @@ -166,7 +216,7 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro s.cfg.Logger.WarnContext(ctx, "Failed to determine hostname for heartbeat", "error", err) } - now := s.cfg.Clock.Now() + now := time.Now() hb := &machineidv1pb.BotInstanceStatusHeartbeat{ RecordedAt: timestamppb.New(now), Hostname: hostName, @@ -180,8 +230,20 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro Kind: s.cfg.BotKind, } + status := s.cfg.StatusRegistry.OverallStatus() + serviceHealth := make([]*machineidv1pb.BotInstanceServiceHealth, 0, len(status.Services)) + for name, serviceStatus := range status.Services { + serviceHealth = append(serviceHealth, statusToServiceHealth(name, serviceStatus)) + } + + // Sort the services by name to make tests deterministic. + sort.Slice(serviceHealth, func(a, b int) bool { + return serviceHealth[a].Service.Name < serviceHealth[b].Service.Name + }) + _, err = s.cfg.Client.SubmitHeartbeat(ctx, &machineidv1pb.SubmitHeartbeatRequest{ - Heartbeat: hb, + Heartbeat: hb, + ServiceHealth: serviceHealth, }) if err != nil { return trace.Wrap(err, "submitting heartbeat") @@ -190,3 +252,49 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro s.cfg.Logger.InfoContext(ctx, "Sent heartbeat", "data", hb.String()) return nil } + +func statusToServiceHealth(name string, status *readyz.ServiceStatus) *machineidv1pb.BotInstanceServiceHealth { + health := &machineidv1pb.BotInstanceServiceHealth{ + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: trimString(name, 64), + Type: status.ServiceType, + }, + } + + switch status.Status { + case readyz.Initializing: + health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_INITIALIZING + case readyz.Healthy: + health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_HEALTHY + case readyz.Unhealthy: + health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY + } + + if status.Reason != "" { + reason := trimString(status.Reason, 256) + health.Reason = &reason + } + + if status.UpdatedAt != nil { + health.UpdatedAt = timestamppb.New(*status.UpdatedAt) + } + + return health +} + +func trimString(s string, maxBytes int) string { + if len(s) <= maxBytes { + return s + } + + // Trim the string to maxBytes, honoring rune boundaries for non-ASCII text. + byteCount := 0 + for i, r := range s { + runeSize := len(string(r)) + if byteCount+runeSize > maxBytes { + return s[:i] + } + byteCount += runeSize + } + return s +} diff --git a/lib/tbot/internal/heartbeat/service_test.go b/lib/tbot/internal/heartbeat/service_test.go index 8dac3c6f241ee..272116ec554fc 100644 --- a/lib/tbot/internal/heartbeat/service_test.go +++ b/lib/tbot/internal/heartbeat/service_test.go @@ -22,11 +22,12 @@ import ( "context" "os" "runtime" + "strings" "testing" + "testing/synctest" "time" "github.com/google/go-cmp/cmp" - "github.com/jonboulle/clockwork" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "google.golang.org/grpc" @@ -37,6 +38,7 @@ import ( "github.com/gravitational/teleport" machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1" "github.com/gravitational/teleport/api/types" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/utils" ) @@ -54,56 +56,128 @@ func (f *fakeHeartbeatSubmitter) SubmitHeartbeat( func TestHeartbeatService(t *testing.T) { t.Parallel() - log := utils.NewSlogLoggerForTests() - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - - fhs := &fakeHeartbeatSubmitter{ - ch: make(chan *machineidv1pb.SubmitHeartbeatRequest, 2), - } - - now := time.Date(2024, time.April, 1, 12, 0, 0, 0, time.UTC) - svc, err := NewService(Config{ - Interval: time.Second, - RetryLimit: 3, - Client: fhs, - Clock: clockwork.NewFakeClockAt(now), - StartedAt: time.Date(2024, time.April, 1, 11, 0, 0, 0, time.UTC), - Logger: log, - JoinMethod: types.JoinMethodGitHub, - BotKind: machineidv1pb.BotKind_BOT_KIND_TBOT, + synctest.Run(func() { + log := utils.NewSlogLoggerForTests() + ctx, cancel := context.WithCancel(t.Context()) + t.Cleanup(cancel) + + fhs := &fakeHeartbeatSubmitter{ + ch: make(chan *machineidv1pb.SubmitHeartbeatRequest, 2), + } + + reg := readyz.NewRegistry() + + svcA := reg.AddService("a", "a") + svcB := reg.AddService("b", strings.Repeat("b", 200)) + + svc, err := NewService(Config{ + Interval: time.Second, + RetryLimit: 3, + Client: fhs, + StartedAt: time.Now().Add(-1 * time.Hour), + Logger: log, + JoinMethod: types.JoinMethodGitHub, + StatusReporter: reg.AddService("internal/heartbeat", "heartbeat"), + StatusRegistry: reg, + BotKind: machineidv1pb.BotKind_BOT_KIND_TBOT, + }) + require.NoError(t, err) + + hostName, err := os.Hostname() + require.NoError(t, err) + + errCh := make(chan error, 1) + go func() { + errCh <- svc.Run(ctx) + }() + + synctest.Wait() + select { + case <-fhs.ch: + t.Fatal("should not have received a heartbeat until all services have reported their status") + default: + } + + svcA.ReportReason(readyz.Unhealthy, "no more bananas") + svcB.ReportReason(readyz.Unhealthy, strings.Repeat("b", 300)) + + want := &machineidv1pb.SubmitHeartbeatRequest{ + Heartbeat: &machineidv1pb.BotInstanceStatusHeartbeat{ + Hostname: hostName, + IsStartup: true, + OneShot: false, + Uptime: durationpb.New(time.Hour), + Version: teleport.Version, + Architecture: runtime.GOARCH, + Os: runtime.GOOS, + JoinMethod: string(types.JoinMethodGitHub), + Kind: machineidv1pb.BotKind_BOT_KIND_TBOT, + }, + ServiceHealth: []*machineidv1pb.BotInstanceServiceHealth{ + { + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: "a", + Type: "a", + }, + Reason: ptr("no more bananas"), + Status: machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + UpdatedAt: timestamppb.New(time.Now()), + }, + // Check limits were applied on user-controlled or dynamic fields. + { + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: strings.Repeat("b", 64), + Type: "b", + }, + Reason: ptr(strings.Repeat("b", 256)), + Status: machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + UpdatedAt: timestamppb.New(time.Now()), + }, + { + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: "heartbeat", + Type: "internal/heartbeat", + }, + Status: machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_HEALTHY, + UpdatedAt: timestamppb.New(time.Now()), + }, + }, + } + + compare := func(t *testing.T, want, got *machineidv1pb.SubmitHeartbeatRequest) { + t.Helper() + + assert.Empty(t, + cmp.Diff(want, got, + protocmp.Transform(), + protocmp.IgnoreFields(&machineidv1pb.BotInstanceStatusHeartbeat{}, "recorded_at"), + ), + ) + } + + synctest.Wait() + select { + case got := <-fhs.ch: + compare(t, want, got) + default: + t.Fatal("no heartbeat received") + } + + time.Sleep(1 * time.Second) + synctest.Wait() + + select { + case got := <-fhs.ch: + want.Heartbeat.IsStartup = false + want.Heartbeat.Uptime = durationpb.New(time.Hour + time.Second) + compare(t, want, got) + default: + t.Fatal("no heartbeat received") + } + + cancel() + assert.NoError(t, <-errCh) }) - require.NoError(t, err) - - hostName, err := os.Hostname() - require.NoError(t, err) - - errCh := make(chan error, 1) - go func() { - errCh <- svc.Run(ctx) - }() - - got := <-fhs.ch - want := &machineidv1pb.SubmitHeartbeatRequest{ - Heartbeat: &machineidv1pb.BotInstanceStatusHeartbeat{ - RecordedAt: timestamppb.New(now), - Hostname: hostName, - IsStartup: true, - OneShot: false, - Uptime: durationpb.New(time.Hour), - Version: teleport.Version, - Architecture: runtime.GOARCH, - Os: runtime.GOOS, - JoinMethod: string(types.JoinMethodGitHub), - Kind: machineidv1pb.BotKind_BOT_KIND_TBOT, - }, - } - assert.Empty(t, cmp.Diff(want, got, protocmp.Transform())) - - got = <-fhs.ch - want.Heartbeat.IsStartup = false - assert.Empty(t, cmp.Diff(want, got, protocmp.Transform())) - - cancel() - assert.NoError(t, <-errCh) } + +func ptr[T any](v T) *T { return &v } diff --git a/lib/tbot/internal/identity/service.go b/lib/tbot/internal/identity/service.go index f5330e0e21f3d..2282ca53c55b9 100644 --- a/lib/tbot/internal/identity/service.go +++ b/lib/tbot/internal/identity/service.go @@ -372,7 +372,6 @@ func (s *Service) Initialize(ctx context.Context) error { s.mu.Unlock() s.unblockWaiters() - s.cfg.StatusReporter.Report(readyz.Healthy) s.log.InfoContext(ctx, "Identity initialized successfully") return nil @@ -401,8 +400,10 @@ func (s *Service) Run(ctx context.Context) error { // etc) storageDestination := s.cfg.Destination - // Keep retrying renewal if it failed on startup. - if !s.IsReady() { + if s.IsReady() { + s.cfg.StatusReporter.Report(readyz.Healthy) + } else { + // Keep retrying renewal if it failed on startup. retry, err := retryutils.NewRetryV2(retryutils.RetryV2Config{ Driver: retryutils.NewExponentialDriver(1 * time.Second), Max: 1 * time.Minute, diff --git a/lib/tbot/readyz/http.go b/lib/tbot/readyz/http.go index f2a9472beec59..139c88b0c3490 100644 --- a/lib/tbot/readyz/http.go +++ b/lib/tbot/readyz/http.go @@ -26,9 +26,16 @@ import ( "net/http" ) +// ReadOnlyRegistry is a version of the Registry which can only be read from, +// not reported to. +type ReadOnlyRegistry interface { + ServiceStatus(name string) (*ServiceStatus, bool) + OverallStatus() *OverallStatus +} + // HTTPHandler returns an HTTP handler that implements tbot's // /readyz(/{service}) endpoints. -func HTTPHandler(reg *Registry) http.Handler { +func HTTPHandler(reg ReadOnlyRegistry) http.Handler { mux := http.NewServeMux() mux.Handle("/readyz/{service}", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/lib/tbot/readyz/readyz.go b/lib/tbot/readyz/readyz.go index 4462c7c1fe4bc..71881c1dfd3f8 100644 --- a/lib/tbot/readyz/readyz.go +++ b/lib/tbot/readyz/readyz.go @@ -18,36 +18,69 @@ package readyz -import "sync" +import ( + "sync" + "time" + + "github.com/jonboulle/clockwork" +) // NewRegistry returns a Registry to track the health of tbot's services. -func NewRegistry() *Registry { - return &Registry{ +func NewRegistry(opts ...NewRegistryOpt) *Registry { + r := &Registry{ + clock: clockwork.NewRealClock(), services: make(map[string]*ServiceStatus), + notifyCh: make(chan struct{}), + } + for _, opt := range opts { + opt(r) } + return r +} + +// NewRegistryOpt can be passed to NewRegistry to provide optional configuration. +type NewRegistryOpt func(r *Registry) + +// WithClock sets the registry's clock. +func WithClock(clock clockwork.Clock) NewRegistryOpt { + return func(r *Registry) { r.clock = clock } } // Registry tracks the status/health of tbot's services. type Registry struct { + clock clockwork.Clock + mu sync.Mutex services map[string]*ServiceStatus + reported int + notifyCh chan struct{} } // AddService adds a service to the registry so that its health will be reported // from our readyz endpoints. It returns a Reporter the service can use to report // status changes. -func (r *Registry) AddService(name string) Reporter { +// +// Note: you should add all of your services before any service reports its status +// otherwise AllServicesReported will unblock too early. +func (r *Registry) AddService(serviceType, name string) Reporter { r.mu.Lock() defer r.mu.Unlock() + // TODO(boxofrad): If you add the same service multiple times, you could end + // up unblocking AllServicesReported prematurely. The impact is low, it just + // means we'd send a heartbeat sooner than is desirable, but we should panic + // or return an error from this method instead. status, ok := r.services[name] if !ok { - status = &ServiceStatus{} + status = &ServiceStatus{ServiceType: serviceType} r.services[name] = status } + return &reporter{ mu: &r.mu, + clock: r.clock, status: status, + notify: sync.OnceFunc(r.maybeNotifyLocked), } } @@ -87,6 +120,31 @@ func (r *Registry) OverallStatus() *OverallStatus { } } +// AllServicesReported returns a channel you can receive from to be notified +// when all registered services have reported their initial status. It provides +// a way for us to hold the initial heartbeat until after the initial flurry of +// activity. +func (r *Registry) AllServicesReported() <-chan struct{} { return r.notifyCh } + +// maybeNotifyLocked unblocks the AllServicesReported channel if all services +// have reported their initial status. It's called by each of the Reporters the +// first time you report a status. +// +// Caller must be holding r.mu. +func (r *Registry) maybeNotifyLocked() { + r.reported++ + + if r.reported != len(r.services) { + return + } + + select { + case <-r.notifyCh: + default: + close(r.notifyCh) + } +} + // ServiceStatus is a snapshot of the service's status. type ServiceStatus struct { // Status of the service. @@ -94,6 +152,12 @@ type ServiceStatus struct { // Reason string describing why the service has its current status. Reason string `json:"reason,omitempty"` + + // UpdatedAt is the time at which the service's status last changed. + UpdatedAt *time.Time `json:"updated_at"` + + // ServiceType is exposed in bot heartbeats, but not the `/readyz` endpoint. + ServiceType string `json:"-"` } // Clone the status to avoid data races. diff --git a/lib/tbot/readyz/readyz_test.go b/lib/tbot/readyz/readyz_test.go index 40641ab9d4a01..4a43a0fec9126 100644 --- a/lib/tbot/readyz/readyz_test.go +++ b/lib/tbot/readyz/readyz_test.go @@ -23,7 +23,9 @@ import ( "net/http" "net/http/httptest" "testing" + "time" + "github.com/jonboulle/clockwork" "github.com/stretchr/testify/require" "github.com/gravitational/teleport/lib/tbot/readyz" @@ -32,10 +34,13 @@ import ( func TestReadyz(t *testing.T) { t.Parallel() - reg := readyz.NewRegistry() + now := time.Now().UTC().Truncate(time.Second) + clock := clockwork.NewFakeClockAt(now) + + reg := readyz.NewRegistry(readyz.WithClock(clock)) - a := reg.AddService("a") - b := reg.AddService("b") + a := reg.AddService("svc", "a") + b := reg.AddService("svc", "b") srv := httptest.NewServer(readyz.HTTPHandler(reg)) srv.URL = srv.URL + "/readyz" @@ -79,8 +84,9 @@ func TestReadyz(t *testing.T) { require.Equal(t, readyz.ServiceStatus{ - Status: readyz.Unhealthy, - Reason: "database is down", + Status: readyz.Unhealthy, + Reason: "database is down", + UpdatedAt: &now, }, response, ) @@ -104,8 +110,8 @@ func TestReadyz(t *testing.T) { readyz.OverallStatus{ Status: readyz.Unhealthy, Services: map[string]*readyz.ServiceStatus{ - "a": {Status: readyz.Healthy}, - "b": {Status: readyz.Unhealthy, Reason: "database is down"}, + "a": {Status: readyz.Healthy, UpdatedAt: &now}, + "b": {Status: readyz.Unhealthy, Reason: "database is down", UpdatedAt: &now}, }, }, response, @@ -130,8 +136,8 @@ func TestReadyz(t *testing.T) { readyz.OverallStatus{ Status: readyz.Healthy, Services: map[string]*readyz.ServiceStatus{ - "a": {Status: readyz.Healthy}, - "b": {Status: readyz.Healthy}, + "a": {Status: readyz.Healthy, UpdatedAt: &now}, + "b": {Status: readyz.Healthy, UpdatedAt: &now}, }, }, response, @@ -146,3 +152,32 @@ func TestReadyz(t *testing.T) { require.Equal(t, http.StatusNotFound, rsp.StatusCode) }) } + +func TestAllServicesReported(t *testing.T) { + reg := readyz.NewRegistry() + + a := reg.AddService("svc", "a") + b := reg.AddService("svc", "b") + + select { + case <-reg.AllServicesReported(): + t.Fatal("AllServicesReported should be blocked") + default: + } + + a.Report(readyz.Healthy) + + select { + case <-reg.AllServicesReported(): + t.Fatal("AllServicesReported should be blocked") + default: + } + + b.Report(readyz.Unhealthy) + + select { + case <-reg.AllServicesReported(): + default: + t.Fatal("AllServicesReported should not be blocked") + } +} diff --git a/lib/tbot/readyz/reporter.go b/lib/tbot/readyz/reporter.go index 3620bb96ef3fb..7a8d2ff8e7224 100644 --- a/lib/tbot/readyz/reporter.go +++ b/lib/tbot/readyz/reporter.go @@ -18,7 +18,11 @@ package readyz -import "sync" +import ( + "sync" + + "github.com/jonboulle/clockwork" +) // Reporter can be used by a service to report its status. type Reporter interface { @@ -32,6 +36,8 @@ type Reporter interface { type reporter struct { mu *sync.Mutex status *ServiceStatus + clock clockwork.Clock + notify func() } func (r *reporter) Report(status Status) { @@ -44,6 +50,11 @@ func (r *reporter) ReportReason(status Status, reason string) { r.status.Status = status r.status.Reason = reason + + updatedAt := r.clock.Now() + r.status.UpdatedAt = &updatedAt + + r.notify() } // NoopReporter returns a no-op Reporter that can be used when no real reporter diff --git a/lib/tbot/services/application/output_config.go b/lib/tbot/services/application/output_config.go index 773dde76b3729..a7e900b6a4f40 100644 --- a/lib/tbot/services/application/output_config.go +++ b/lib/tbot/services/application/output_config.go @@ -76,6 +76,11 @@ func (o *OutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *OutputConfig) SetName(name string) { + o.Name = name +} + func (o *OutputConfig) GetDestination() destination.Destination { return o.Destination } diff --git a/lib/tbot/services/application/output_service.go b/lib/tbot/services/application/output_service.go index 4e619dca0eef0..77fc6e383c5e8 100644 --- a/lib/tbot/services/application/output_service.go +++ b/lib/tbot/services/application/output_service.go @@ -38,7 +38,7 @@ import ( ) func OutputServiceBuilder(cfg *OutputConfig, defaultCredentialLifetime bot.CredentialLifetime) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -51,11 +51,12 @@ func OutputServiceBuilder(cfg *OutputConfig, defaultCredentialLifetime bot.Crede reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(OutputServiceType, cfg.Name, buildFn) } // OutputService generates the artifacts necessary to connect to a diff --git a/lib/tbot/services/application/proxy_config.go b/lib/tbot/services/application/proxy_config.go index 58382711d5af1..17c4942fb25ca 100644 --- a/lib/tbot/services/application/proxy_config.go +++ b/lib/tbot/services/application/proxy_config.go @@ -56,6 +56,11 @@ func (c *ProxyServiceConfig) GetName() string { return c.Name } +// SetName sets the service's name to an automatically generated one. +func (o *ProxyServiceConfig) SetName(name string) { + o.Name = name +} + // Type returns the type of the service. func (c *ProxyServiceConfig) Type() string { return ProxyServiceType diff --git a/lib/tbot/services/application/proxy_service.go b/lib/tbot/services/application/proxy_service.go index 53c638b0a9168..5a7ae14b41444 100644 --- a/lib/tbot/services/application/proxy_service.go +++ b/lib/tbot/services/application/proxy_service.go @@ -52,7 +52,7 @@ func ProxyServiceBuilder( defaultCredentialLifetime bot.CredentialLifetime, alpnUpgradeCache *internal.ALPNUpgradeCache, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -67,12 +67,12 @@ func ProxyServiceBuilder( identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, alpnUpgradeCache: alpnUpgradeCache, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(ProxyServiceType, cfg.Name, buildFn) } // ProxyService presents a http_proxy compatible proxy on a listener which will diff --git a/lib/tbot/services/application/tunnel_config.go b/lib/tbot/services/application/tunnel_config.go index 4acf6df78fb6f..1882f3ed8969f 100644 --- a/lib/tbot/services/application/tunnel_config.go +++ b/lib/tbot/services/application/tunnel_config.go @@ -61,6 +61,11 @@ func (o *TunnelConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *TunnelConfig) SetName(name string) { + o.Name = name +} + func (s *TunnelConfig) Type() string { return TunnelServiceType } diff --git a/lib/tbot/services/application/tunnel_service.go b/lib/tbot/services/application/tunnel_service.go index 073ab782ff795..8a7ad75c40276 100644 --- a/lib/tbot/services/application/tunnel_service.go +++ b/lib/tbot/services/application/tunnel_service.go @@ -45,7 +45,7 @@ func TunnelServiceBuilder( connCfg connection.Config, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -59,11 +59,12 @@ func TunnelServiceBuilder( cfg: cfg, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(TunnelServiceType, cfg.Name, buildFn) } // TunnelService is a service that listens on a socket and forwards diff --git a/lib/tbot/services/awsra/config.go b/lib/tbot/services/awsra/config.go index c69517f58e1b4..179dc516cc2a5 100644 --- a/lib/tbot/services/awsra/config.go +++ b/lib/tbot/services/awsra/config.go @@ -102,6 +102,11 @@ func (o *Config) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *Config) SetName(name string) { + o.Name = name +} + // Init initializes the destination. func (o *Config) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/services/awsra/service.go b/lib/tbot/services/awsra/service.go index 3fa7dbd5c77d3..fa3df8bf8490d 100644 --- a/lib/tbot/services/awsra/service.go +++ b/lib/tbot/services/awsra/service.go @@ -49,7 +49,7 @@ import ( var tracer = otel.Tracer("github.com/gravitational/teleport/lib/tbot/services/awsra") func ServiceBuilder(cfg *Config) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -60,11 +60,12 @@ func ServiceBuilder(cfg *Config) bot.ServiceBuilder { reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(ServiceType, cfg.Name, buildFn) } // Service is a service that retrieves X.509 certificates and exchanges them for diff --git a/lib/tbot/services/clientcredentials/config.go b/lib/tbot/services/clientcredentials/config.go index 6b039b0187666..aebe229097428 100644 --- a/lib/tbot/services/clientcredentials/config.go +++ b/lib/tbot/services/clientcredentials/config.go @@ -61,6 +61,11 @@ func (o *UnstableConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *UnstableConfig) SetName(name string) { + o.Name = name +} + // Ready returns a channel which closes when the Output is ready to be used // as a client credential. Using this as a credential before Ready closes is // unsupported. diff --git a/lib/tbot/services/clientcredentials/service.go b/lib/tbot/services/clientcredentials/service.go index 79bab6cbcae5f..5e7f66b1c1282 100644 --- a/lib/tbot/services/clientcredentials/service.go +++ b/lib/tbot/services/clientcredentials/service.go @@ -38,7 +38,7 @@ import ( // Note: when using the client credentials service to provide credentials to // another service (e.g. the SPIFFE Workload API service) use NewSidecar instead. func ServiceBuilder(cfg *UnstableConfig, credentialLifetime bot.CredentialLifetime) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -49,11 +49,12 @@ func ServiceBuilder(cfg *UnstableConfig, credentialLifetime bot.CredentialLifeti cfg: cfg, reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(ServiceType, cfg.Name, buildFn) } // NewSidecar creates a client credential service intended to provide credentials @@ -68,8 +69,8 @@ func NewSidecar(deps bot.ServiceDependencies, credentialLifetime bot.CredentialL reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, statusReporter: readyz.NoopReporter(), + log: deps.Logger, } - svc.log = deps.LoggerForService(svc) return svc, cfg } diff --git a/lib/tbot/services/database/output_config.go b/lib/tbot/services/database/output_config.go index d48aba8d2970b..6d506107df829 100644 --- a/lib/tbot/services/database/output_config.go +++ b/lib/tbot/services/database/output_config.go @@ -104,6 +104,11 @@ func (o *OutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *OutputConfig) SetName(name string) { + o.Name = name +} + func (o *OutputConfig) Init(ctx context.Context) error { subDirs := []string{} if o.Format == CockroachDatabaseFormat { diff --git a/lib/tbot/services/database/output_service.go b/lib/tbot/services/database/output_service.go index 1a1c045a9bf0e..fa95c65a303fd 100644 --- a/lib/tbot/services/database/output_service.go +++ b/lib/tbot/services/database/output_service.go @@ -38,7 +38,7 @@ import ( ) func OutputServiceBuilder(cfg *OutputConfig, defaultCredentialLifetime bot.CredentialLifetime) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -50,11 +50,12 @@ func OutputServiceBuilder(cfg *OutputConfig, defaultCredentialLifetime bot.Crede reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(OutputServiceType, cfg.Name, buildFn) } // OutputService generates the artifacts necessary to connect to a diff --git a/lib/tbot/services/database/tunnel_config.go b/lib/tbot/services/database/tunnel_config.go index db1bf39380dc2..6f89611a7fc41 100644 --- a/lib/tbot/services/database/tunnel_config.go +++ b/lib/tbot/services/database/tunnel_config.go @@ -65,6 +65,11 @@ func (o *TunnelConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *TunnelConfig) SetName(name string) { + o.Name = name +} + func (s *TunnelConfig) Type() string { return TunnelServiceType } diff --git a/lib/tbot/services/database/tunnel_service.go b/lib/tbot/services/database/tunnel_service.go index c152718fda570..db44db51b82ab 100644 --- a/lib/tbot/services/database/tunnel_service.go +++ b/lib/tbot/services/database/tunnel_service.go @@ -46,7 +46,7 @@ func TunnelServiceBuilder( connCfg connection.Config, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -60,11 +60,12 @@ func TunnelServiceBuilder( botIdentityReadyCh: deps.BotIdentityReadyCh, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(TunnelServiceType, cfg.Name, buildFn) } // TunnelService is a service that listens on a local port and forwards diff --git a/lib/tbot/services/example/config.go b/lib/tbot/services/example/config.go index c04c1a2d1881e..174234e4f33ee 100644 --- a/lib/tbot/services/example/config.go +++ b/lib/tbot/services/example/config.go @@ -41,6 +41,11 @@ func (s *Config) GetName() string { return s.Name } +// SetName sets the service's name to an automatically generated one. +func (o *Config) SetName(name string) { + o.Name = name +} + func (s *Config) Type() string { return ServiceType } diff --git a/lib/tbot/services/example/service.go b/lib/tbot/services/example/service.go index 5117f1faec1df..4c4a00f3fcf9b 100644 --- a/lib/tbot/services/example/service.go +++ b/lib/tbot/services/example/service.go @@ -31,12 +31,13 @@ import ( // ServiceBuilder returns an example service builder. func ServiceBuilder(cfg *Config) bot.ServiceBuilder { - return func(bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } return &Service{cfg: cfg}, nil } + return bot.NewServiceBuilder(ServiceType, cfg.Name, buildFn) } // Service is a temporary example service for testing purposes. It is not diff --git a/lib/tbot/services/identity/output.go b/lib/tbot/services/identity/output.go index 676a2dd26150e..e728caf8ee3cb 100644 --- a/lib/tbot/services/identity/output.go +++ b/lib/tbot/services/identity/output.go @@ -50,7 +50,7 @@ func OutputServiceBuilder( defaultCredentialLifetime bot.CredentialLifetime, insecure, fips bool, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -67,11 +67,12 @@ func OutputServiceBuilder( proxyPinger: deps.ProxyPinger, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(OutputServiceType, cfg.Name, buildFn) } // OutputService produces credentials which can be used to connect to diff --git a/lib/tbot/services/identity/output_config.go b/lib/tbot/services/identity/output_config.go index d6656d318c4b3..6adb4b48e6b49 100644 --- a/lib/tbot/services/identity/output_config.go +++ b/lib/tbot/services/identity/output_config.go @@ -101,6 +101,11 @@ func (o *OutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *OutputConfig) SetName(name string) { + o.Name = name +} + func (o *OutputConfig) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) } diff --git a/lib/tbot/services/k8s/argocd_output.go b/lib/tbot/services/k8s/argocd_output.go index 3f9fe81a03fe9..ef7089838af19 100644 --- a/lib/tbot/services/k8s/argocd_output.go +++ b/lib/tbot/services/k8s/argocd_output.go @@ -52,7 +52,7 @@ import ( // ArgoCDServiceBuilder builds a new ArgoCDOutput. func ArgoCDServiceBuilder(cfg *ArgoCDOutputConfig, opts ...ArgoCDServiceOption) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -66,6 +66,8 @@ func ArgoCDServiceBuilder(cfg *ArgoCDOutputConfig, opts ...ArgoCDServiceOption) clientBuilder: deps.ClientBuilder, reloadCh: deps.ReloadCh, botIdentityReadyCh: deps.BotIdentityReadyCh, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } for _, opt := range opts { @@ -81,15 +83,13 @@ func ArgoCDServiceBuilder(cfg *ArgoCDOutputConfig, opts ...ArgoCDServiceOption) } } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) - if svc.alpnUpgradeCache == nil { svc.alpnUpgradeCache = internal.NewALPNUpgradeCache(svc.log) } return svc, nil } + return bot.NewServiceBuilder(ArgoCDOutputServiceType, cfg.Name, buildFn) } // ArgoCDServiceOption is an option that can be provided to customize the service. diff --git a/lib/tbot/services/k8s/argocd_output_config.go b/lib/tbot/services/k8s/argocd_output_config.go index e4eeb513b095f..125ac55fa5f17 100644 --- a/lib/tbot/services/k8s/argocd_output_config.go +++ b/lib/tbot/services/k8s/argocd_output_config.go @@ -96,6 +96,11 @@ func (o *ArgoCDOutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *ArgoCDOutputConfig) SetName(name string) { + o.Name = name +} + // CheckAndSetDefaults validates the service configuration and sets any default // values. func (o *ArgoCDOutputConfig) CheckAndSetDefaults() error { diff --git a/lib/tbot/services/k8s/output_v1.go b/lib/tbot/services/k8s/output_v1.go index e847cbf4402e7..086680dc1f337 100644 --- a/lib/tbot/services/k8s/output_v1.go +++ b/lib/tbot/services/k8s/output_v1.go @@ -52,7 +52,7 @@ import ( const defaultKubeconfigPath = "kubeconfig.yaml" func OutputV1ServiceBuilder(cfg *OutputV1Config, opts ...OutputV1Option) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -66,14 +66,15 @@ func OutputV1ServiceBuilder(cfg *OutputV1Config, opts ...OutputV1Option) bot.Ser executablePath: autoupdate.StableExecutable, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } for _, opt := range opts { opt.applyToV1Output(svc) } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(OutputV1ServiceType, cfg.Name, buildFn) } // OutputV1Option is an option that can be provided to customize the service. diff --git a/lib/tbot/services/k8s/output_v1_config.go b/lib/tbot/services/k8s/output_v1_config.go index 3cfd1313f74ba..993ba8029e608 100644 --- a/lib/tbot/services/k8s/output_v1_config.go +++ b/lib/tbot/services/k8s/output_v1_config.go @@ -65,6 +65,11 @@ func (o *OutputV1Config) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *OutputV1Config) SetName(name string) { + o.Name = name +} + func (o *OutputV1Config) CheckAndSetDefaults() error { if o.Destination == nil { return trace.BadParameter("no destination configured for output") diff --git a/lib/tbot/services/k8s/output_v2.go b/lib/tbot/services/k8s/output_v2.go index a06a8dd8cf018..e44c6b4373749 100644 --- a/lib/tbot/services/k8s/output_v2.go +++ b/lib/tbot/services/k8s/output_v2.go @@ -52,7 +52,7 @@ import ( ) func OutputV2ServiceBuilder(cfg *OutputV2Config, opts ...OutputV2Option) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -66,14 +66,15 @@ func OutputV2ServiceBuilder(cfg *OutputV2Config, opts ...OutputV2Option) bot.Ser executablePath: autoupdate.StableExecutable, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } for _, opt := range opts { opt.applyToV2Output(svc) } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(OutputV2ServiceType, cfg.Name, buildFn) } // OutputV1Option is an option that can be provided to customize the service. diff --git a/lib/tbot/services/k8s/output_v2_config.go b/lib/tbot/services/k8s/output_v2_config.go index 10d86cc1b0970..0002957724f9b 100644 --- a/lib/tbot/services/k8s/output_v2_config.go +++ b/lib/tbot/services/k8s/output_v2_config.go @@ -75,6 +75,11 @@ func (o *OutputV2Config) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *OutputV2Config) SetName(name string) { + o.Name = name +} + func (o *OutputV2Config) CheckAndSetDefaults() error { if o.Destination == nil { return trace.BadParameter("no destination configured for output") diff --git a/lib/tbot/services/legacyspiffe/svid_output.go b/lib/tbot/services/legacyspiffe/svid_output.go index 12683ae09bce0..29414f3d28f94 100644 --- a/lib/tbot/services/legacyspiffe/svid_output.go +++ b/lib/tbot/services/legacyspiffe/svid_output.go @@ -49,7 +49,7 @@ func SVIDOutputServiceBuilder( trustBundleCache TrustBundleGetter, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -61,11 +61,12 @@ func SVIDOutputServiceBuilder( identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, trustBundleCache: trustBundleCache, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(SVIDOutputServiceType, cfg.Name, buildFn) } // SVIDOutputService is a service that generates and writes X509 SPIFFE diff --git a/lib/tbot/services/legacyspiffe/svid_output_config.go b/lib/tbot/services/legacyspiffe/svid_output_config.go index 667a835aa21c0..663968bb987ab 100644 --- a/lib/tbot/services/legacyspiffe/svid_output_config.go +++ b/lib/tbot/services/legacyspiffe/svid_output_config.go @@ -126,6 +126,11 @@ func (o *SVIDOutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *SVIDOutputConfig) SetName(name string) { + o.Name = name +} + // Init initializes the destination. func (o *SVIDOutputConfig) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/services/legacyspiffe/workload_api.go b/lib/tbot/services/legacyspiffe/workload_api.go index 6f6c1a4a9301a..159fbefe43839 100644 --- a/lib/tbot/services/legacyspiffe/workload_api.go +++ b/lib/tbot/services/legacyspiffe/workload_api.go @@ -68,7 +68,7 @@ func WorkloadAPIServiceBuilder( trustBundleCache TrustBundleGetter, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -79,11 +79,12 @@ func WorkloadAPIServiceBuilder( cfg: cfg, trustBundleCache: trustBundleCache, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return bot.NewServicePair(svc, sidecar), nil } + return bot.NewServiceBuilder(WorkloadAPIServiceType, cfg.Name, buildFn) } type TrustBundleGetter interface { diff --git a/lib/tbot/services/legacyspiffe/workload_api_config.go b/lib/tbot/services/legacyspiffe/workload_api_config.go index f76f10321fbef..87f7b1b2f2450 100644 --- a/lib/tbot/services/legacyspiffe/workload_api_config.go +++ b/lib/tbot/services/legacyspiffe/workload_api_config.go @@ -140,6 +140,11 @@ func (o *WorkloadAPIConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *WorkloadAPIConfig) SetName(name string) { + o.Name = name +} + func (s *WorkloadAPIConfig) Type() string { return WorkloadAPIServiceType } diff --git a/lib/tbot/services/legacyspiffe/workload_api_test.go b/lib/tbot/services/legacyspiffe/workload_api_test.go index dc86a757aa51c..68fe9e77a91a7 100644 --- a/lib/tbot/services/legacyspiffe/workload_api_test.go +++ b/lib/tbot/services/legacyspiffe/workload_api_test.go @@ -51,6 +51,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot/destination" "github.com/gravitational/teleport/lib/tbot/workloadidentity" "github.com/gravitational/teleport/lib/tbot/workloadidentity/attrs" + "github.com/gravitational/teleport/lib/tbot/workloadidentity/workloadattest" "github.com/gravitational/teleport/lib/utils" libtestutils "github.com/gravitational/teleport/lib/utils/testutils" "github.com/gravitational/teleport/tool/teleport/testenv" @@ -507,7 +508,7 @@ func TestBotSPIFFEWorkloadAPI(t *testing.T) { Onboarding: *onboarding, InternalStorage: destination.NewMemory(), Services: []bot.ServiceBuilder{ - trustBundleCache.BuildService, + trustBundleCache.Builder(), WorkloadAPIServiceBuilder( &WorkloadAPIConfig{ Listen: socketPath, @@ -545,6 +546,11 @@ func TestBotSPIFFEWorkloadAPI(t *testing.T) { }, }, }, + Attestors: workloadattest.Config{ + Unix: workloadattest.UnixAttestorConfig{ + BinaryHashMaxSizeBytes: workloadattest.TestBinaryHashMaxBytes, + }, + }, }, trustBundleCache, bot.DefaultCredentialLifetime, @@ -722,7 +728,7 @@ func Test_E2E_SPIFFE_SDS(t *testing.T) { Onboarding: *onboarding, InternalStorage: destination.NewMemory(), Services: []bot.ServiceBuilder{ - trustBundleCache.BuildService, + trustBundleCache.Builder(), WorkloadAPIServiceBuilder( &WorkloadAPIConfig{ Listen: socketPath, @@ -760,6 +766,11 @@ func Test_E2E_SPIFFE_SDS(t *testing.T) { }, }, }, + Attestors: workloadattest.Config{ + Unix: workloadattest.UnixAttestorConfig{ + BinaryHashMaxSizeBytes: workloadattest.TestBinaryHashMaxBytes, + }, + }, }, trustBundleCache, bot.DefaultCredentialLifetime, diff --git a/lib/tbot/services/ssh/host_output.go b/lib/tbot/services/ssh/host_output.go index 81390571c89ef..429531e5d1ec7 100644 --- a/lib/tbot/services/ssh/host_output.go +++ b/lib/tbot/services/ssh/host_output.go @@ -44,7 +44,7 @@ import ( ) func HostOutputServiceBuilder(cfg *HostOutputConfig, defaultCredentialLifetime bot.CredentialLifetime) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -56,11 +56,12 @@ func HostOutputServiceBuilder(cfg *HostOutputConfig, defaultCredentialLifetime b reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(HostOutputServiceType, cfg.Name, buildFn) } type HostOutputService struct { diff --git a/lib/tbot/services/ssh/host_output_config.go b/lib/tbot/services/ssh/host_output_config.go index 4467a92741229..92f180b30e5db 100644 --- a/lib/tbot/services/ssh/host_output_config.go +++ b/lib/tbot/services/ssh/host_output_config.go @@ -68,6 +68,11 @@ func (o *HostOutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *HostOutputConfig) SetName(name string) { + o.Name = name +} + func (o *HostOutputConfig) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) } diff --git a/lib/tbot/services/ssh/multiplexer.go b/lib/tbot/services/ssh/multiplexer.go index 9db7e1ac12f66..d5a736e9166bb 100644 --- a/lib/tbot/services/ssh/multiplexer.go +++ b/lib/tbot/services/ssh/multiplexer.go @@ -99,7 +99,7 @@ func MultiplexerServiceBuilder( defaultCredentialLifetime bot.CredentialLifetime, clientMetrics *grpcprom.ClientMetrics, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -115,11 +115,12 @@ func MultiplexerServiceBuilder( reloadCh: deps.ReloadCh, identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(MultiplexerServiceType, cfg.Name, buildFn) } // MultiplexerService is a long-lived local SSH proxy. It listens on a local Unix diff --git a/lib/tbot/services/ssh/multiplexer_config.go b/lib/tbot/services/ssh/multiplexer_config.go index 36da4466da037..1aa479f918153 100644 --- a/lib/tbot/services/ssh/multiplexer_config.go +++ b/lib/tbot/services/ssh/multiplexer_config.go @@ -64,6 +64,11 @@ func (o *MultiplexerConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *MultiplexerConfig) SetName(name string) { + o.Name = name +} + func (s *MultiplexerConfig) SessionResumptionEnabled() bool { if s.EnableResumption == nil { return true diff --git a/lib/tbot/services/workloadidentity/jwt_output.go b/lib/tbot/services/workloadidentity/jwt_output.go index 37ff41f125569..1cf4fc75c2ff4 100644 --- a/lib/tbot/services/workloadidentity/jwt_output.go +++ b/lib/tbot/services/workloadidentity/jwt_output.go @@ -42,7 +42,7 @@ func JWTOutputServiceBuilder( trustBundleCache TrustBundleGetter, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -54,11 +54,12 @@ func JWTOutputServiceBuilder( identityGenerator: deps.IdentityGenerator, clientBuilder: deps.ClientBuilder, trustBundleCache: trustBundleCache, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(JWTOutputServiceType, cfg.Name, buildFn) } // JWTOutputService is a service that retrieves JWT workload identity diff --git a/lib/tbot/services/workloadidentity/jwt_output_config.go b/lib/tbot/services/workloadidentity/jwt_output_config.go index 624ebd204cc20..0af5b54c4674e 100644 --- a/lib/tbot/services/workloadidentity/jwt_output_config.go +++ b/lib/tbot/services/workloadidentity/jwt_output_config.go @@ -53,6 +53,11 @@ func (o JWTOutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *JWTOutputConfig) SetName(name string) { + o.Name = name +} + // Init initializes the destination. func (o *JWTOutputConfig) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/services/workloadidentity/workload_api.go b/lib/tbot/services/workloadidentity/workload_api.go index 7e75fd3598c95..23a26f00d936c 100644 --- a/lib/tbot/services/workloadidentity/workload_api.go +++ b/lib/tbot/services/workloadidentity/workload_api.go @@ -64,7 +64,7 @@ func WorkloadAPIServiceBuilder( crlCache CRLGetter, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -76,11 +76,12 @@ func WorkloadAPIServiceBuilder( trustBundleCache: trustBundleCache, crlCache: crlCache, clientBuilder: deps.ClientBuilder, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return bot.NewServicePair(svc, sidecar), nil } + return bot.NewServiceBuilder(WorkloadAPIServiceType, cfg.Name, buildFn) } // WorkloadAPIService implements a gRPC server that fulfills the SPIFFE diff --git a/lib/tbot/services/workloadidentity/workload_api_config.go b/lib/tbot/services/workloadidentity/workload_api_config.go index c438d255d70e9..d3e69fb89fc63 100644 --- a/lib/tbot/services/workloadidentity/workload_api_config.go +++ b/lib/tbot/services/workloadidentity/workload_api_config.go @@ -64,6 +64,11 @@ func (o *WorkloadAPIConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *WorkloadAPIConfig) SetName(name string) { + o.Name = name +} + // Type returns the type of the service. func (o *WorkloadAPIConfig) Type() string { return WorkloadAPIServiceType diff --git a/lib/tbot/services/workloadidentity/workload_api_test.go b/lib/tbot/services/workloadidentity/workload_api_test.go index 64b50d60e1341..866cf4d71fd03 100644 --- a/lib/tbot/services/workloadidentity/workload_api_test.go +++ b/lib/tbot/services/workloadidentity/workload_api_test.go @@ -43,6 +43,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/bot/connection" "github.com/gravitational/teleport/lib/tbot/workloadidentity" + "github.com/gravitational/teleport/lib/tbot/workloadidentity/workloadattest" "github.com/gravitational/teleport/lib/utils" "github.com/gravitational/teleport/tool/teleport/testenv" ) @@ -130,14 +131,19 @@ func TestBotWorkloadIdentityAPI(t *testing.T) { Logger: log, Onboarding: *onboarding, Services: []bot.ServiceBuilder{ - trustBundleCache.BuildService, - crlCache.BuildService, + trustBundleCache.Builder(), + crlCache.Builder(), WorkloadAPIServiceBuilder( &WorkloadAPIConfig{ Selector: bot.WorkloadIdentitySelector{ Name: workloadIdentity.GetMetadata().GetName(), }, Listen: listenAddr.String(), + Attestors: workloadattest.Config{ + Unix: workloadattest.UnixAttestorConfig{ + BinaryHashMaxSizeBytes: workloadattest.TestBinaryHashMaxBytes, + }, + }, }, trustBundleCache, crlCache, diff --git a/lib/tbot/services/workloadidentity/x509_output.go b/lib/tbot/services/workloadidentity/x509_output.go index d3b753e4458b9..bdc012d7ece06 100644 --- a/lib/tbot/services/workloadidentity/x509_output.go +++ b/lib/tbot/services/workloadidentity/x509_output.go @@ -55,7 +55,7 @@ func X509OutputServiceBuilder( crlCache CRLGetter, defaultCredentialLifetime bot.CredentialLifetime, ) bot.ServiceBuilder { - return func(deps bot.ServiceDependencies) (bot.Service, error) { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { if err := cfg.CheckAndSetDefaults(); err != nil { return nil, trace.Wrap(err) } @@ -68,11 +68,12 @@ func X509OutputServiceBuilder( clientBuilder: deps.ClientBuilder, trustBundleCache: trustBundleCache, crlCache: crlCache, + log: deps.Logger, + statusReporter: deps.GetStatusReporter(), } - svc.log = deps.LoggerForService(svc) - svc.statusReporter = deps.StatusRegistry.AddService(svc.String()) return svc, nil } + return bot.NewServiceBuilder(X509OutputServiceType, cfg.Name, buildFn) } // X509OutputService is a service that retrieves X.509 certificates diff --git a/lib/tbot/services/workloadidentity/x509_output_config.go b/lib/tbot/services/workloadidentity/x509_output_config.go index ea32713bf5c83..cc2f7b186b0b2 100644 --- a/lib/tbot/services/workloadidentity/x509_output_config.go +++ b/lib/tbot/services/workloadidentity/x509_output_config.go @@ -54,6 +54,11 @@ func (o X509OutputConfig) GetName() string { return o.Name } +// SetName sets the service's name to an automatically generated one. +func (o *X509OutputConfig) SetName(name string) { + o.Name = name +} + // Init initializes the destination. func (o *X509OutputConfig) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/tbot.go b/lib/tbot/tbot.go index 9e8444f65cb58..e6567d0e150a3 100644 --- a/lib/tbot/tbot.go +++ b/lib/tbot/tbot.go @@ -173,15 +173,20 @@ func (b *Bot) Run(ctx context.Context) (err error) { // This faux service allows us to get the bot's internal identity and client // for tests, without exposing them on the core bot.Bot struct. - services = append(services, func(deps bot.ServiceDependencies) (bot.Service, error) { - b.mu.Lock() - defer b.mu.Unlock() + if b.cfg.Testing { + services = append(services, + bot.NewServiceBuilder("internal/client-fetcher", "client-fetcher", + func(deps bot.ServiceDependencies) (bot.Service, error) { + b.mu.Lock() + defer b.mu.Unlock() - b.identity = deps.BotIdentity - b.client = deps.Client + b.identity = deps.BotIdentity + b.client = deps.Client - return bot.NewNopService("client-fetcher"), nil - }) + return bot.NewNopService("client-fetcher"), nil + }), + ) + } // We only want to create this service if it's needed by a dependent // service. @@ -194,7 +199,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { return trustBundleCache } trustBundleCache = workloadidentity.NewTrustBundleCacheFacade() - services = append(services, trustBundleCache.BuildService) + services = append(services, trustBundleCache.Builder()) return trustBundleCache } @@ -207,7 +212,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { return crlCache } crlCache = workloadidentity.NewCRLCacheFacade() - services = append(services, crlCache.BuildService) + services = append(services, crlCache.Builder()) return crlCache } diff --git a/lib/tbot/tbot_test.go b/lib/tbot/tbot_test.go index 6d7d53a1be100..fd336dad545f3 100644 --- a/lib/tbot/tbot_test.go +++ b/lib/tbot/tbot_test.go @@ -184,6 +184,7 @@ func defaultBotConfig( // certs. Insecure: opts.insecure, Services: serviceConfigs, + Testing: true, } require.NoError(t, cfg.CheckAndSetDefaults()) @@ -1313,6 +1314,7 @@ func TestBotJoiningURI(t *testing.T) { }, Oneshot: true, Insecure: true, + Testing: true, } require.NoError(t, cfg.CheckAndSetDefaults()) diff --git a/lib/tbot/workloadidentity/crl_cache.go b/lib/tbot/workloadidentity/crl_cache.go index 6c7dbd6fa214b..c8bcd22d50915 100644 --- a/lib/tbot/workloadidentity/crl_cache.go +++ b/lib/tbot/workloadidentity/crl_cache.go @@ -28,7 +28,6 @@ import ( "github.com/gravitational/trace" - "github.com/gravitational/teleport" workloadidentityv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/workloadidentity/v1" "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/readyz" @@ -83,29 +82,29 @@ type CRLCacheFacade struct { crlCache *CRLCache } -// BuildService implements bot.ServiceBuilder to build the CRLCache once when the -// bot starts up. -func (f *CRLCacheFacade) BuildService(deps bot.ServiceDependencies) (bot.Service, error) { - f.mu.Lock() - defer f.mu.Unlock() - - if f.crlCache == nil { - var err error - f.crlCache, err = NewCRLCache(CRLCacheConfig{ - RevocationsClient: deps.Client.WorkloadIdentityRevocationServiceClient(), - Logger: deps.Logger.With( - teleport.ComponentKey, - teleport.Component(teleport.ComponentTBot, "crl-cache"), - ), - StatusReporter: deps.StatusRegistry.AddService("crl-cache"), - }) - if err != nil { - return nil, trace.Wrap(err) +// Builder returns a bot.ServiceBuilder to build the CRLCache once when the bot +// starts up. +func (f *CRLCacheFacade) Builder() bot.ServiceBuilder { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { + f.mu.Lock() + defer f.mu.Unlock() + + if f.crlCache == nil { + var err error + f.crlCache, err = NewCRLCache(CRLCacheConfig{ + RevocationsClient: deps.Client.WorkloadIdentityRevocationServiceClient(), + Logger: deps.Logger, + StatusReporter: deps.GetStatusReporter(), + }) + if err != nil { + return nil, trace.Wrap(err) + } + close(f.ready) } - close(f.ready) - } - return f.crlCache, nil + return f.crlCache, nil + } + return bot.NewServiceBuilder("internal/crl-cache", "crl-cache", buildFn) } func (m *CRLCacheFacade) GetCRLSet(ctx context.Context) (*CRLSet, error) { diff --git a/lib/tbot/workloadidentity/trust_bundle_cache.go b/lib/tbot/workloadidentity/trust_bundle_cache.go index 9d4b2cda1e279..608cf3cc52ba3 100644 --- a/lib/tbot/workloadidentity/trust_bundle_cache.go +++ b/lib/tbot/workloadidentity/trust_bundle_cache.go @@ -35,7 +35,6 @@ import ( "github.com/spiffe/go-spiffe/v2/spiffeid" "go.opentelemetry.io/otel" - "github.com/gravitational/teleport" machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1" trustv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/trust/v1" "github.com/gravitational/teleport/api/types" @@ -220,33 +219,37 @@ func NewTrustBundleCacheFacade() *TrustBundleCacheFacade { return &TrustBundleCacheFacade{ready: make(chan struct{})} } -// BuildService implements bot.ServiceBuilder to build the TrustBundleCache once -// when the bot starts up. -func (f *TrustBundleCacheFacade) BuildService(deps bot.ServiceDependencies) (bot.Service, error) { - f.mu.Lock() - defer f.mu.Unlock() - - if f.bundleCache == nil { - var err error - f.bundleCache, err = NewTrustBundleCache(TrustBundleCacheConfig{ - FederationClient: deps.Client.SPIFFEFederationServiceClient(), - TrustClient: deps.Client.TrustClient(), - EventsClient: deps.Client, - ClusterName: deps.BotIdentity().ClusterName, - BotIdentityReadyCh: deps.BotIdentityReadyCh, - Logger: deps.Logger.With( - teleport.ComponentKey, - teleport.Component(teleport.ComponentTBot, "spiffe-trust-bundle-cache"), - ), - StatusReporter: deps.StatusRegistry.AddService("spiffe-trust-bundle-cache"), - }) - if err != nil { - return nil, trace.Wrap(err) +// Builder returns a bot.ServiceBuilder to build the TrustBundleCache when the +// bot starts up. +func (f *TrustBundleCacheFacade) Builder() bot.ServiceBuilder { + buildFn := func(deps bot.ServiceDependencies) (bot.Service, error) { + f.mu.Lock() + defer f.mu.Unlock() + + if f.bundleCache == nil { + var err error + f.bundleCache, err = NewTrustBundleCache(TrustBundleCacheConfig{ + FederationClient: deps.Client.SPIFFEFederationServiceClient(), + TrustClient: deps.Client.TrustClient(), + EventsClient: deps.Client, + ClusterName: deps.BotIdentity().ClusterName, + BotIdentityReadyCh: deps.BotIdentityReadyCh, + Logger: deps.Logger, + StatusReporter: deps.GetStatusReporter(), + }) + if err != nil { + return nil, trace.Wrap(err) + } + close(f.ready) } - close(f.ready) + return f.bundleCache, nil } - return f.bundleCache, nil + return bot.NewServiceBuilder( + "internal/spiffe-trust-bundle-cache", + "spiffe-trust-bundle-cache", + buildFn, + ) } func (f *TrustBundleCacheFacade) GetBundleSet(ctx context.Context) (*BundleSet, error) { diff --git a/lib/tbot/workloadidentity/workloadattest/unix.go b/lib/tbot/workloadidentity/workloadattest/unix.go index a7b0a2aaa8b04..6e12de5d16c5c 100644 --- a/lib/tbot/workloadidentity/workloadattest/unix.go +++ b/lib/tbot/workloadidentity/workloadattest/unix.go @@ -25,6 +25,7 @@ import ( "errors" "io" "log/slog" + "time" "github.com/gravitational/trace" "github.com/shirou/gopsutil/v4/process" @@ -35,6 +36,18 @@ import ( // DefaultBinaryHashMaxBytes is default value for BinaryHashMaxSizeBytes. const DefaultBinaryHashMaxBytes = 1 << 30 // 1GiB +// TestBinaryHashMaxBytes is a more suitable value for BinaryHashMaxSizeBytes +// in tests. Reading `/proc//exe` relies on inode stability, and in GitHub +// Actions we've observed read stalls likely caused by overlayfs or debugfs. +const TestBinaryHashMaxBytes = 1 << 12 // 4KiB + +// BinaryHashReadTimeout is the maximum amount of time we will wait to read +// a process's executable to calculate its checksum. In normal circumstances +// we should never reach this timeout, but reading `/proc//exe` depends +// on inode stability, so it's possible to encounter read stalls if there is +// a network or overlay filesystem involved. +const BinaryHashReadTimeout = 15 * time.Second + // UnixAttestorConfig holds the configuration for the Unix workload attestor. type UnixAttestorConfig struct { // BinaryHashMaxSize is the maximum number of bytes that will be read from @@ -149,22 +162,51 @@ func (a *UnixAttestor) Attest(ctx context.Context, pid int) (*workloadidentityv1 att.BinaryPath = &path } - exe, err := a.os.OpenExe(ctx, p) + if hash := a.hashBinary(ctx, p); hash != "" { + att.BinaryHash = &hash + } + + return att, nil +} + +func (a *UnixAttestor) hashBinary(ctx context.Context, proc *process.Process) string { + exe, err := a.os.OpenExe(ctx, proc) if err != nil { a.log.ErrorContext(ctx, "Failed to open workload executable for hashing", "error", err) - return att, nil + return "" } defer func() { _ = exe.Close() }() - hash := sha256.New() - if _, err := copyAtMost(hash, exe, a.cfg.BinaryHashMaxSizeBytes); err != nil { - a.log.ErrorContext(ctx, "Failed to hash workload executable", "error", err) - return att, nil + // Read the workload executable to calculate a checksum. We do this in a + // goroutine in case of read stalls (e.g. due to the executable being on + // a network or overlay filesystem). If this happens, the goroutine will + // terminate when we close the file descriptor (see defer statement above). + type sumResult struct { + sum string + err error + } + resCh := make(chan sumResult, 1) + go func() { + hash := sha256.New() + if _, err := copyAtMost(hash, exe, a.cfg.BinaryHashMaxSizeBytes); err == nil { + resCh <- sumResult{sum: hex.EncodeToString(hash.Sum(nil))} + } else { + resCh <- sumResult{err: err} + } + }() + + select { + case res := <-resCh: + if res.err != nil { + a.log.ErrorContext(ctx, "Failed to hash workload executable", "error", err) + } + return res.sum + case <-time.After(BinaryHashReadTimeout): + a.log.ErrorContext(ctx, "Timeout reading workload executable. If this happens frequently, it could be due to the workload executable being on a network or overlay filesystem, you may also consider lowering `attestors.unix.binary_hash_max_size_bytes`.") + case <-ctx.Done(): } - sum := hex.EncodeToString(hash.Sum(nil)) - att.BinaryHash = &sum - return att, nil + return "" } // copyAtMost copies at most n bytes from src to dst. If src contains more than