diff --git a/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go b/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go index 9d57db891b1be..22e3319bad62d 100644 --- a/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go +++ b/api/gen/proto/go/teleport/machineid/v1/bot_instance_service.pb.go @@ -384,7 +384,9 @@ func (x *DeleteBotInstanceRequest) GetInstanceId() string { type SubmitHeartbeatRequest struct { state protoimpl.MessageState `protogen:"open.v1"` // The heartbeat data to submit. - Heartbeat *BotInstanceStatusHeartbeat `protobuf:"bytes,1,opt,name=heartbeat,proto3" json:"heartbeat,omitempty"` + Heartbeat *BotInstanceStatusHeartbeat `protobuf:"bytes,1,opt,name=heartbeat,proto3" json:"heartbeat,omitempty"` + // The health of the services/output `tbot` is running. + ServiceHealth []*BotInstanceServiceHealth `protobuf:"bytes,2,rep,name=service_health,json=serviceHealth,proto3" json:"service_health,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -426,6 +428,13 @@ func (x *SubmitHeartbeatRequest) GetHeartbeat() *BotInstanceStatusHeartbeat { return nil } +func (x *SubmitHeartbeatRequest) GetServiceHealth() []*BotInstanceServiceHealth { + if x != nil { + return x.ServiceHealth + } + return nil +} + // The response for SubmitHeartbeat. type SubmitHeartbeatResponse struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -564,9 +573,10 @@ const file_teleport_machineid_v1_bot_instance_service_proto_rawDesc = "" + "\x18DeleteBotInstanceRequest\x12\x19\n" + "\bbot_name\x18\x01 \x01(\tR\abotName\x12\x1f\n" + "\vinstance_id\x18\x02 \x01(\tR\n" + - "instanceId\"i\n" + + "instanceId\"\xc1\x01\n" + "\x16SubmitHeartbeatRequest\x12O\n" + - "\theartbeat\x18\x01 \x01(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\theartbeat\"\x19\n" + + "\theartbeat\x18\x01 \x01(\v21.teleport.machineid.v1.BotInstanceStatusHeartbeatR\theartbeat\x12V\n" + + "\x0eservice_health\x18\x02 \x03(\v2/.teleport.machineid.v1.BotInstanceServiceHealthR\rserviceHealth\"\x19\n" + "\x17SubmitHeartbeatResponse2\xbb\x04\n" + "\x12BotInstanceService\x12b\n" + "\x0eGetBotInstance\x12,.teleport.machineid.v1.GetBotInstanceRequest\x1a\".teleport.machineid.v1.BotInstance\x12x\n" + @@ -600,28 +610,30 @@ var file_teleport_machineid_v1_bot_instance_service_proto_goTypes = []any{ (*types.SortBy)(nil), // 8: types.SortBy (*BotInstance)(nil), // 9: teleport.machineid.v1.BotInstance (*BotInstanceStatusHeartbeat)(nil), // 10: teleport.machineid.v1.BotInstanceStatusHeartbeat - (*emptypb.Empty)(nil), // 11: google.protobuf.Empty + (*BotInstanceServiceHealth)(nil), // 11: teleport.machineid.v1.BotInstanceServiceHealth + (*emptypb.Empty)(nil), // 12: google.protobuf.Empty } var file_teleport_machineid_v1_bot_instance_service_proto_depIdxs = []int32{ 8, // 0: teleport.machineid.v1.ListBotInstancesRequest.sort:type_name -> types.SortBy 7, // 1: teleport.machineid.v1.ListBotInstancesV2Request.filter:type_name -> teleport.machineid.v1.ListBotInstancesV2Request.Filters 9, // 2: teleport.machineid.v1.ListBotInstancesResponse.bot_instances:type_name -> teleport.machineid.v1.BotInstance 10, // 3: teleport.machineid.v1.SubmitHeartbeatRequest.heartbeat:type_name -> teleport.machineid.v1.BotInstanceStatusHeartbeat - 0, // 4: teleport.machineid.v1.BotInstanceService.GetBotInstance:input_type -> teleport.machineid.v1.GetBotInstanceRequest - 1, // 5: teleport.machineid.v1.BotInstanceService.ListBotInstances:input_type -> teleport.machineid.v1.ListBotInstancesRequest - 2, // 6: teleport.machineid.v1.BotInstanceService.ListBotInstancesV2:input_type -> teleport.machineid.v1.ListBotInstancesV2Request - 4, // 7: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:input_type -> teleport.machineid.v1.DeleteBotInstanceRequest - 5, // 8: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:input_type -> teleport.machineid.v1.SubmitHeartbeatRequest - 9, // 9: teleport.machineid.v1.BotInstanceService.GetBotInstance:output_type -> teleport.machineid.v1.BotInstance - 3, // 10: teleport.machineid.v1.BotInstanceService.ListBotInstances:output_type -> teleport.machineid.v1.ListBotInstancesResponse - 3, // 11: teleport.machineid.v1.BotInstanceService.ListBotInstancesV2:output_type -> teleport.machineid.v1.ListBotInstancesResponse - 11, // 12: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:output_type -> google.protobuf.Empty - 6, // 13: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:output_type -> teleport.machineid.v1.SubmitHeartbeatResponse - 9, // [9:14] is the sub-list for method output_type - 4, // [4:9] is the sub-list for method input_type - 4, // [4:4] is the sub-list for extension type_name - 4, // [4:4] is the sub-list for extension extendee - 0, // [0:4] is the sub-list for field type_name + 11, // 4: teleport.machineid.v1.SubmitHeartbeatRequest.service_health:type_name -> teleport.machineid.v1.BotInstanceServiceHealth + 0, // 5: teleport.machineid.v1.BotInstanceService.GetBotInstance:input_type -> teleport.machineid.v1.GetBotInstanceRequest + 1, // 6: teleport.machineid.v1.BotInstanceService.ListBotInstances:input_type -> teleport.machineid.v1.ListBotInstancesRequest + 2, // 7: teleport.machineid.v1.BotInstanceService.ListBotInstancesV2:input_type -> teleport.machineid.v1.ListBotInstancesV2Request + 4, // 8: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:input_type -> teleport.machineid.v1.DeleteBotInstanceRequest + 5, // 9: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:input_type -> teleport.machineid.v1.SubmitHeartbeatRequest + 9, // 10: teleport.machineid.v1.BotInstanceService.GetBotInstance:output_type -> teleport.machineid.v1.BotInstance + 3, // 11: teleport.machineid.v1.BotInstanceService.ListBotInstances:output_type -> teleport.machineid.v1.ListBotInstancesResponse + 3, // 12: teleport.machineid.v1.BotInstanceService.ListBotInstancesV2:output_type -> teleport.machineid.v1.ListBotInstancesResponse + 12, // 13: teleport.machineid.v1.BotInstanceService.DeleteBotInstance:output_type -> google.protobuf.Empty + 6, // 14: teleport.machineid.v1.BotInstanceService.SubmitHeartbeat:output_type -> teleport.machineid.v1.SubmitHeartbeatResponse + 10, // [10:15] is the sub-list for method output_type + 5, // [5:10] is the sub-list for method input_type + 5, // [5:5] is the sub-list for extension type_name + 5, // [5:5] is the sub-list for extension extendee + 0, // [0:5] is the sub-list for field type_name } func init() { file_teleport_machineid_v1_bot_instance_service_proto_init() } diff --git a/api/proto/teleport/machineid/v1/bot_instance_service.proto b/api/proto/teleport/machineid/v1/bot_instance_service.proto index 7f6b18552a1b5..bc3469f895ab6 100644 --- a/api/proto/teleport/machineid/v1/bot_instance_service.proto +++ b/api/proto/teleport/machineid/v1/bot_instance_service.proto @@ -104,6 +104,9 @@ message DeleteBotInstanceRequest { message SubmitHeartbeatRequest { // The heartbeat data to submit. BotInstanceStatusHeartbeat heartbeat = 1; + + // The health of the services/output `tbot` is running. + repeated BotInstanceServiceHealth service_health = 2; } // The response for SubmitHeartbeat. diff --git a/lib/auth/machineid/machineidv1/bot_instance_service.go b/lib/auth/machineid/machineidv1/bot_instance_service.go index 7d9b27115c9dc..3f0270545a4ff 100644 --- a/lib/auth/machineid/machineidv1/bot_instance_service.go +++ b/lib/auth/machineid/machineidv1/bot_instance_service.go @@ -21,6 +21,8 @@ package machineidv1 import ( "context" "log/slog" + "os" + "strconv" "time" "github.com/gravitational/trace" @@ -46,6 +48,12 @@ const ( // ensure the instance remains accessible until shortly after the last // issued certificate expires. ExpiryMargin = time.Minute * 5 + + // serviceNameLimit is the maximum length in bytes of a bot service name. + serviceNameLimit = 64 + + // statusReasonLimit is the maximum length in bytes of a service status reason. + statusReasonLimit = 256 ) // BotInstancesCache is the subset of the cached resources that the Service queries. @@ -204,6 +212,17 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit return nil, trace.BadParameter("heartbeat: must be non-nil") } + for _, svcHealth := range req.GetServiceHealth() { + name := svcHealth.GetService().GetName() + if len(name) > serviceNameLimit { + return nil, trace.BadParameter("service name %q is longer than %d bytes", name, serviceNameLimit) + } + reason := svcHealth.GetReason() + if len(reason) > statusReasonLimit { + return nil, trace.BadParameter("service %q has a status reason longer than %d bytes", name, statusReasonLimit) + } + } + // Enforce that the connecting client is a bot and has a bot instance ID. botName := authCtx.Identity.GetIdentity().BotName botInstanceID := authCtx.Identity.GetIdentity().BotInstanceID @@ -238,6 +257,11 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit // Append the new heartbeat to the end. instance.Status.LatestHeartbeats = append(instance.Status.LatestHeartbeats, req.Heartbeat) + if storeHeartbeatExtras() { + // Overwrite the service health. + instance.Status.ServiceHealth = req.ServiceHealth + } + return instance, nil }) if err != nil { @@ -246,3 +270,15 @@ func (b *BotInstanceService) SubmitHeartbeat(ctx context.Context, req *pb.Submit return &pb.SubmitHeartbeatResponse{}, nil } + +// storeHeartbeatExtras returns whether we should store "extra" data submitted +// with tbot heartbeats, such as the service health. Defaults to true unless the +// TELEPORT_DISABLE_TBOT_HEARTBEAT_EXTRAS environment variable is set to true on +// the auth server. +func storeHeartbeatExtras() bool { + disabled, err := strconv.ParseBool(os.Getenv("TELEPORT_DISABLE_TBOT_HEARTBEAT_EXTRAS")) + if err != nil { + return true + } + return !disabled +} diff --git a/lib/auth/machineid/machineidv1/bot_instance_service_test.go b/lib/auth/machineid/machineidv1/bot_instance_service_test.go index 3c2815b0102da..bf6384a883d26 100644 --- a/lib/auth/machineid/machineidv1/bot_instance_service_test.go +++ b/lib/auth/machineid/machineidv1/bot_instance_service_test.go @@ -23,6 +23,7 @@ import ( "fmt" "slices" "strconv" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -265,6 +266,7 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { createBotInstance bool assertErr assert.ErrorAssertionFunc wantHeartbeat bool + wantServiceHealth []*machineidv1.BotInstanceServiceHealth }{ { name: "success", @@ -273,10 +275,30 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{ Hostname: "llama", }, + ServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: "my-application-tunnel", + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr("application is broken"), + }, + }, }, identity: goodIdentity, assertErr: assert.NoError, wantHeartbeat: true, + wantServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: "my-application-tunnel", + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr("application is broken"), + }, + }, }, { name: "missing bot name", @@ -335,6 +357,54 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { }, wantHeartbeat: false, }, + { + name: "service name too long", + createBotInstance: true, + req: &machineidv1.SubmitHeartbeatRequest{ + Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{ + Hostname: "llama", + }, + ServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: strings.Repeat("a", 100), + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr("application is broken"), + }, + }, + }, + identity: goodIdentity, + assertErr: func(t assert.TestingT, err error, i ...any) bool { + return assert.True(t, trace.IsBadParameter(err)) && assert.Contains(t, err.Error(), "is longer than 64 bytes") + }, + wantHeartbeat: false, + }, + { + name: "status reason too long", + createBotInstance: true, + req: &machineidv1.SubmitHeartbeatRequest{ + Heartbeat: &machineidv1.BotInstanceStatusHeartbeat{ + Hostname: "llama", + }, + ServiceHealth: []*machineidv1.BotInstanceServiceHealth{ + { + Service: &machineidv1.BotInstanceServiceIdentifier{ + Type: "application-tunnel", + Name: "my-application-tunnel", + }, + Status: machineidv1.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + Reason: ptr(strings.Repeat("a", 300)), + }, + }, + }, + identity: goodIdentity, + assertErr: func(t assert.TestingT, err error, i ...any) bool { + return assert.True(t, trace.IsBadParameter(err)) && assert.Contains(t, err.Error(), "status reason longer than 256 bytes") + }, + wantHeartbeat: false, + }, } for _, tt := range tests { @@ -385,6 +455,13 @@ func TestBotInstanceServiceSubmitHeartbeat(t *testing.T) { assert.Nil(t, bi.Status.InitialHeartbeat) assert.Empty(t, bi.Status.LatestHeartbeats) } + assert.Empty(t, + cmp.Diff( + bi.Status.ServiceHealth, + tt.wantServiceHealth, + protocmp.Transform(), + ), + ) } }) } @@ -597,3 +674,5 @@ func newBotInstanceService( return service } + +func ptr[T any](v T) *T { return &v } diff --git a/lib/tbot/bot/bot.go b/lib/tbot/bot/bot.go index 3eaa7e9aea5ad..989019d679469 100644 --- a/lib/tbot/bot/bot.go +++ b/lib/tbot/bot/bot.go @@ -86,7 +86,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { // statuses. Otherwise we will not include the service in heartbeats or // the `/readyz` endpoint. if handle.statusReporter.used { - handle.statusReporter.reporter = registry.AddService(handle.name) + handle.statusReporter.reporter = registry.AddService(handle.serviceType, handle.name) } } @@ -146,7 +146,7 @@ func (b *Bot) OneShot(ctx context.Context) (err error) { } // Add oneshot services to the registry. - handle.statusReporter.reporter = registry.AddService(handle.name) + handle.statusReporter.reporter = registry.AddService(handle.serviceType, handle.name) oneShotServices = append(oneShotServices, handle) } diff --git a/lib/tbot/internal/heartbeat/service.go b/lib/tbot/internal/heartbeat/service.go index a6da45bc3cb9e..936065c0f41a7 100644 --- a/lib/tbot/internal/heartbeat/service.go +++ b/lib/tbot/internal/heartbeat/service.go @@ -23,6 +23,7 @@ import ( "log/slog" "os" "runtime" + "sort" "time" "github.com/gravitational/trace" @@ -229,8 +230,20 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro Kind: s.cfg.BotKind, } + status := s.cfg.StatusRegistry.OverallStatus() + serviceHealth := make([]*machineidv1pb.BotInstanceServiceHealth, 0, len(status.Services)) + for name, serviceStatus := range status.Services { + serviceHealth = append(serviceHealth, statusToServiceHealth(name, serviceStatus)) + } + + // Sort the services by name to make tests deterministic. + sort.Slice(serviceHealth, func(a, b int) bool { + return serviceHealth[a].Service.Name < serviceHealth[b].Service.Name + }) + _, err = s.cfg.Client.SubmitHeartbeat(ctx, &machineidv1pb.SubmitHeartbeatRequest{ - Heartbeat: hb, + Heartbeat: hb, + ServiceHealth: serviceHealth, }) if err != nil { return trace.Wrap(err, "submitting heartbeat") @@ -239,3 +252,49 @@ func (s *Service) heartbeat(ctx context.Context, isOneShot, isStartup bool) erro s.cfg.Logger.InfoContext(ctx, "Sent heartbeat", "data", hb.String()) return nil } + +func statusToServiceHealth(name string, status *readyz.ServiceStatus) *machineidv1pb.BotInstanceServiceHealth { + health := &machineidv1pb.BotInstanceServiceHealth{ + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: trimString(name, 64), + Type: status.ServiceType, + }, + } + + switch status.Status { + case readyz.Initializing: + health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_INITIALIZING + case readyz.Healthy: + health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_HEALTHY + case readyz.Unhealthy: + health.Status = machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY + } + + if status.Reason != "" { + reason := trimString(status.Reason, 256) + health.Reason = &reason + } + + if status.UpdatedAt != nil { + health.UpdatedAt = timestamppb.New(*status.UpdatedAt) + } + + return health +} + +func trimString(s string, maxBytes int) string { + if len(s) <= maxBytes { + return s + } + + // Trim the string to maxBytes, honoring rune boundaries for non-ASCII text. + byteCount := 0 + for i, r := range s { + runeSize := len(string(r)) + if byteCount+runeSize > maxBytes { + return s[:i] + } + byteCount += runeSize + } + return s +} diff --git a/lib/tbot/internal/heartbeat/service_test.go b/lib/tbot/internal/heartbeat/service_test.go index 0eb152de46121..505449ecd8e23 100644 --- a/lib/tbot/internal/heartbeat/service_test.go +++ b/lib/tbot/internal/heartbeat/service_test.go @@ -22,6 +22,7 @@ import ( "context" "os" "runtime" + "strings" "testing" "testing/synctest" "time" @@ -32,6 +33,7 @@ import ( "google.golang.org/grpc" "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/durationpb" + "google.golang.org/protobuf/types/known/timestamppb" "github.com/gravitational/teleport" machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1" @@ -69,7 +71,9 @@ func TestHeartbeatService(t *testing.T) { } reg := readyz.NewRegistry() - svcA := reg.AddService("a") + + svcA := reg.AddService("a", "a") + svcB := reg.AddService("b", strings.Repeat("b", 200)) svc, err := NewService(Config{ Interval: time.Second, @@ -78,7 +82,7 @@ func TestHeartbeatService(t *testing.T) { StartedAt: time.Now().Add(-1 * time.Hour), Logger: log, JoinMethod: types.JoinMethodGitHub, - StatusReporter: reg.AddService("heartbeat"), + StatusReporter: reg.AddService("internal/heartbeat", "heartbeat"), StatusRegistry: reg, BotKind: machineidv1pb.BotKind_BOT_KIND_TBOT, }) @@ -100,6 +104,7 @@ func TestHeartbeatService(t *testing.T) { } svcA.ReportReason(readyz.Unhealthy, "no more bananas") + svcB.ReportReason(readyz.Unhealthy, strings.Repeat("b", 300)) want := &machineidv1pb.SubmitHeartbeatRequest{ Heartbeat: &machineidv1pb.BotInstanceStatusHeartbeat{ @@ -113,6 +118,35 @@ func TestHeartbeatService(t *testing.T) { JoinMethod: string(types.JoinMethodGitHub), Kind: machineidv1pb.BotKind_BOT_KIND_TBOT, }, + ServiceHealth: []*machineidv1pb.BotInstanceServiceHealth{ + { + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: "a", + Type: "a", + }, + Reason: ptr("no more bananas"), + Status: machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + UpdatedAt: timestamppb.New(time.Now()), + }, + // Check limits were applied on user-controlled or dynamic fields. + { + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: strings.Repeat("b", 64), + Type: "b", + }, + Reason: ptr(strings.Repeat("b", 256)), + Status: machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_UNHEALTHY, + UpdatedAt: timestamppb.New(time.Now()), + }, + { + Service: &machineidv1pb.BotInstanceServiceIdentifier{ + Name: "heartbeat", + Type: "internal/heartbeat", + }, + Status: machineidv1pb.BotInstanceHealthStatus_BOT_INSTANCE_HEALTH_STATUS_HEALTHY, + UpdatedAt: timestamppb.New(time.Now()), + }, + }, } compare := func(t *testing.T, want, got *machineidv1pb.SubmitHeartbeatRequest) { @@ -150,3 +184,5 @@ func TestHeartbeatService(t *testing.T) { assert.NoError(t, <-errCh) }) } + +func ptr[T any](v T) *T { return &v } diff --git a/lib/tbot/readyz/readyz.go b/lib/tbot/readyz/readyz.go index f275b9ce06a8b..71881c1dfd3f8 100644 --- a/lib/tbot/readyz/readyz.go +++ b/lib/tbot/readyz/readyz.go @@ -18,18 +18,38 @@ package readyz -import "sync" +import ( + "sync" + "time" + + "github.com/jonboulle/clockwork" +) // NewRegistry returns a Registry to track the health of tbot's services. -func NewRegistry() *Registry { - return &Registry{ +func NewRegistry(opts ...NewRegistryOpt) *Registry { + r := &Registry{ + clock: clockwork.NewRealClock(), services: make(map[string]*ServiceStatus), notifyCh: make(chan struct{}), } + for _, opt := range opts { + opt(r) + } + return r +} + +// NewRegistryOpt can be passed to NewRegistry to provide optional configuration. +type NewRegistryOpt func(r *Registry) + +// WithClock sets the registry's clock. +func WithClock(clock clockwork.Clock) NewRegistryOpt { + return func(r *Registry) { r.clock = clock } } // Registry tracks the status/health of tbot's services. type Registry struct { + clock clockwork.Clock + mu sync.Mutex services map[string]*ServiceStatus reported int @@ -42,18 +62,23 @@ type Registry struct { // // Note: you should add all of your services before any service reports its status // otherwise AllServicesReported will unblock too early. -func (r *Registry) AddService(name string) Reporter { +func (r *Registry) AddService(serviceType, name string) Reporter { r.mu.Lock() defer r.mu.Unlock() + // TODO(boxofrad): If you add the same service multiple times, you could end + // up unblocking AllServicesReported prematurely. The impact is low, it just + // means we'd send a heartbeat sooner than is desirable, but we should panic + // or return an error from this method instead. status, ok := r.services[name] if !ok { - status = &ServiceStatus{} + status = &ServiceStatus{ServiceType: serviceType} r.services[name] = status } return &reporter{ mu: &r.mu, + clock: r.clock, status: status, notify: sync.OnceFunc(r.maybeNotifyLocked), } @@ -127,6 +152,12 @@ type ServiceStatus struct { // Reason string describing why the service has its current status. Reason string `json:"reason,omitempty"` + + // UpdatedAt is the time at which the service's status last changed. + UpdatedAt *time.Time `json:"updated_at"` + + // ServiceType is exposed in bot heartbeats, but not the `/readyz` endpoint. + ServiceType string `json:"-"` } // Clone the status to avoid data races. diff --git a/lib/tbot/readyz/readyz_test.go b/lib/tbot/readyz/readyz_test.go index c37e73c0bc610..4a43a0fec9126 100644 --- a/lib/tbot/readyz/readyz_test.go +++ b/lib/tbot/readyz/readyz_test.go @@ -23,7 +23,9 @@ import ( "net/http" "net/http/httptest" "testing" + "time" + "github.com/jonboulle/clockwork" "github.com/stretchr/testify/require" "github.com/gravitational/teleport/lib/tbot/readyz" @@ -32,10 +34,13 @@ import ( func TestReadyz(t *testing.T) { t.Parallel() - reg := readyz.NewRegistry() + now := time.Now().UTC().Truncate(time.Second) + clock := clockwork.NewFakeClockAt(now) + + reg := readyz.NewRegistry(readyz.WithClock(clock)) - a := reg.AddService("a") - b := reg.AddService("b") + a := reg.AddService("svc", "a") + b := reg.AddService("svc", "b") srv := httptest.NewServer(readyz.HTTPHandler(reg)) srv.URL = srv.URL + "/readyz" @@ -79,8 +84,9 @@ func TestReadyz(t *testing.T) { require.Equal(t, readyz.ServiceStatus{ - Status: readyz.Unhealthy, - Reason: "database is down", + Status: readyz.Unhealthy, + Reason: "database is down", + UpdatedAt: &now, }, response, ) @@ -104,8 +110,8 @@ func TestReadyz(t *testing.T) { readyz.OverallStatus{ Status: readyz.Unhealthy, Services: map[string]*readyz.ServiceStatus{ - "a": {Status: readyz.Healthy}, - "b": {Status: readyz.Unhealthy, Reason: "database is down"}, + "a": {Status: readyz.Healthy, UpdatedAt: &now}, + "b": {Status: readyz.Unhealthy, Reason: "database is down", UpdatedAt: &now}, }, }, response, @@ -130,8 +136,8 @@ func TestReadyz(t *testing.T) { readyz.OverallStatus{ Status: readyz.Healthy, Services: map[string]*readyz.ServiceStatus{ - "a": {Status: readyz.Healthy}, - "b": {Status: readyz.Healthy}, + "a": {Status: readyz.Healthy, UpdatedAt: &now}, + "b": {Status: readyz.Healthy, UpdatedAt: &now}, }, }, response, @@ -150,8 +156,8 @@ func TestReadyz(t *testing.T) { func TestAllServicesReported(t *testing.T) { reg := readyz.NewRegistry() - a := reg.AddService("a") - b := reg.AddService("b") + a := reg.AddService("svc", "a") + b := reg.AddService("svc", "b") select { case <-reg.AllServicesReported(): diff --git a/lib/tbot/readyz/reporter.go b/lib/tbot/readyz/reporter.go index 876f29186e020..7a8d2ff8e7224 100644 --- a/lib/tbot/readyz/reporter.go +++ b/lib/tbot/readyz/reporter.go @@ -18,7 +18,11 @@ package readyz -import "sync" +import ( + "sync" + + "github.com/jonboulle/clockwork" +) // Reporter can be used by a service to report its status. type Reporter interface { @@ -32,6 +36,7 @@ type Reporter interface { type reporter struct { mu *sync.Mutex status *ServiceStatus + clock clockwork.Clock notify func() } @@ -45,6 +50,10 @@ func (r *reporter) ReportReason(status Status, reason string) { r.status.Status = status r.status.Reason = reason + + updatedAt := r.clock.Now() + r.status.UpdatedAt = &updatedAt + r.notify() }