diff --git a/docs/pages/includes/machine-id/common-output-config.yaml b/docs/pages/includes/machine-id/common-output-config.yaml index 37a2f0ec37b8e..df32295a9afcf 100644 --- a/docs/pages/includes/machine-id/common-output-config.yaml +++ b/docs/pages/includes/machine-id/common-output-config.yaml @@ -25,3 +25,8 @@ roles: # on the next invocation, but don't want long-lived workload certificates on-disk. credential_ttl: 30m renewal_interval: 15m + +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name diff --git a/docs/pages/reference/machine-id/configuration.mdx b/docs/pages/reference/machine-id/configuration.mdx index c6b18a8fafe7e..d5fc2310eac50 100644 --- a/docs/pages/reference/machine-id/configuration.mdx +++ b/docs/pages/reference/machine-id/configuration.mdx @@ -366,6 +366,11 @@ renewal_interval: 15m # plugin is used to automatically refresh the credentials within a single # invocation of `kubectl`. Defaults to `false`. disable_exec_plugin: false + +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` Each Kubernetes cluster matching a selector will result in a new context in the @@ -456,6 +461,10 @@ audiences: - foo.example.com (!docs/pages/includes/machine-id/workload-identity-selector-config.yaml!) (!docs/pages/includes/machine-id/common-output-config.yaml!) +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` ### `workload-identity-aws-roles-anywhere` @@ -523,6 +532,10 @@ artifact_name: my-credentials-file # defaults to `false`. overwrite_credential_file: false (!docs/pages/includes/machine-id/workload-identity-selector-config.yaml!) +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` ### `spiffe-svid` @@ -780,6 +793,10 @@ svids: # # If unspecified, the GID is not checked. gid: 50 +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` #### Envoy SDS @@ -947,6 +964,10 @@ database: postgres # username is the name of the user on the specified database server to open a # tunnel for. username: postgres +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` The `database-tunnel` service will not start if `tbot` has been configured @@ -978,6 +999,10 @@ listen: tcp://127.0.0.1:8084 # app_name is the name of the application, as configured in Teleport, that # the service should open a tunnel to. app_name: my-application +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` The `application-tunnel` service will not start if `tbot` has been configured @@ -1036,6 +1061,10 @@ proxy_command: # # If unspecified, proxy templates will not be used. proxy_templates_path: /etc/my-proxy-templates.yaml +# name optionally overrides the name of the service used in logs and the `/readyz` +# endpoint. It must only contain letters, numbers, hyphens, underscores, and plus +# symbols. +name: my-service-name ``` Once configured, `tbot` will create the following artifacts in the specified diff --git a/docs/pages/reference/machine-id/diagnostics-service.mdx b/docs/pages/reference/machine-id/diagnostics-service.mdx index 23859771f51d8..45e3440b2483f 100644 --- a/docs/pages/reference/machine-id/diagnostics-service.mdx +++ b/docs/pages/reference/machine-id/diagnostics-service.mdx @@ -40,13 +40,65 @@ to determine if the `tbot` process is running and has not crashed or hung. If deploying to Kubernetes, we recommend this endpoint is used for your Liveness Probe. -### `/readyz` +### `/readyz` and `/readyz/{service}` -The `/readyz` endpoint currently returns the same information as `/livez`. +The `/readyz` endpoint returns the overall health of `tbot`, including all of +its internal and user-defined services. If all services are healthy, it will +respond with a 200 status code. If any service is unhealthy, it will respond +with a 503 status code. -In the future, this endpoint will be expanded to indicate whether the internal -components of `tbot` have been able to generate certificates and are ready -to serve requests. +```code +$ curl -v http://127.0.0.1:3001/readyz + +HTTP/1.1 503 Service Unavailable +Content-Type: application/json + +{ + "status": "unhealthy", + "services": { + "ca-rotation": { + "status": "healthy" + }, + "heartbeat": { + "status": "healthy" + }, + "identity": { + "status": "healthy" + }, + "aws-roles-anywhere": { + "status": "unhealthy", + "reason": "access denied to perform action \"read\" on \"workload_identity\"" + } + } +} +``` + +If deploying to Kubernetes, we recommend this endpoint is used for your +Readiness Probe. + +You can also use the `/readyz/{service}` endpoint to query the health of a +specific service. + +```code +$ curl -v http://127.0.0.1:3001/readyz/aws-roles-anywhere + +HTTP/1.1 200 OK +Content-Type: application/json + +{ + "status": "healthy" +} +``` + +By default, `tbot` generates service names based on their configuration such as +the output destination. You can override this by providing your own name in the +`tbot` configuration file. + +```yaml +services: + - type: identity + name: my-service-123 +``` ### `/metrics` diff --git a/lib/tbot/config/config.go b/lib/tbot/config/config.go index a9025abb61c4e..eeabf11e32ac9 100644 --- a/lib/tbot/config/config.go +++ b/lib/tbot/config/config.go @@ -26,6 +26,7 @@ import ( "fmt" "io" "net/url" + "regexp" "slices" "strings" "time" @@ -65,6 +66,38 @@ var SupportedJoinMethods = []string{ string(types.JoinMethodTerraformCloud), } +// ReservedServiceNames are the service names reserved for internal use. +var ReservedServiceNames = []string{ + "ca-rotation", + "crl-cache", + "heartbeat", + "identity", + "spiffe-trust-bundle-cache", +} + +var reservedServiceNamesMap = func() map[string]struct{} { + m := make(map[string]struct{}, len(ReservedServiceNames)) + for _, k := range ReservedServiceNames { + m[k] = struct{}{} + } + return m +}() + +var serviceNameRegex = regexp.MustCompile(`\A[a-z\d_\-+]+\z`) + +func validateServiceName(name string) error { + if name == "" { + return nil + } + if _, ok := reservedServiceNamesMap[name]; ok { + return trace.BadParameter("service name %q is reserved for internal use", name) + } + if !serviceNameRegex.MatchString(name) { + return trace.BadParameter("invalid service name: %q, may only contain lowercase letters, numbers, hyphens, underscores, or plus symbols", name) + } + return nil +} + var log = logutils.NewPackageLogger(teleport.ComponentKey, teleport.ComponentTBot) // AzureOnboardingConfig holds configuration relevant to the "azure" join method. @@ -288,6 +321,7 @@ func (conf *BotConfig) CheckAndSetDefaults() error { // We've migrated Outputs to Services, so copy all Outputs to Services. conf.Services = append(conf.Services, conf.Outputs...) + uniqueNames := make(map[string]struct{}, len(conf.Services)) for i, service := range conf.Services { if err := service.CheckAndSetDefaults(); err != nil { return trace.Wrap(err, "validating service[%d]", i) @@ -295,6 +329,15 @@ func (conf *BotConfig) CheckAndSetDefaults() error { if err := service.GetCredentialLifetime().Validate(conf.Oneshot); err != nil { return trace.Wrap(err, "validating service[%d]", i) } + if name := service.GetName(); name != "" { + if err := validateServiceName(name); err != nil { + return trace.Wrap(err, "validating service[%d]", i) + } + if _, seen := uniqueNames[name]; seen { + return trace.BadParameter("validating service[%d]: duplicate name: %q", i, name) + } + uniqueNames[name] = struct{}{} + } } destinationPaths := map[string]int{} @@ -386,6 +429,10 @@ type ServiceConfig interface { // RenewalInterval. It's used for validation purposes; services that do not // support these options should return the zero value. GetCredentialLifetime() CredentialLifetime + + // GetName returns the user-given name of the service, used for validation + // purposes. + GetName() string } // ServiceConfigs assists polymorphic unmarshaling of a slice of ServiceConfigs. diff --git a/lib/tbot/config/config_test.go b/lib/tbot/config/config_test.go index 5d4eff9af406a..21ab98e1f1db7 100644 --- a/lib/tbot/config/config_test.go +++ b/lib/tbot/config/config_test.go @@ -578,3 +578,59 @@ func TestBotConfig_Base64(t *testing.T) { }) } } + +func TestBotConfig_NameValidation(t *testing.T) { + t.Parallel() + + testCases := map[string]struct { + cfg *BotConfig + err string + }{ + "duplicate names": { + cfg: &BotConfig{ + Version: V2, + Services: ServiceConfigs{ + &IdentityOutput{ + Name: "foo", + Destination: &DestinationMemory{}, + }, + &IdentityOutput{ + Name: "foo", + Destination: &DestinationMemory{}, + }, + }, + }, + err: `duplicate name: "foo`, + }, + "reserved name": { + cfg: &BotConfig{ + Version: V2, + Services: ServiceConfigs{ + &IdentityOutput{ + Name: "identity", + Destination: &DestinationMemory{}, + }, + }, + }, + err: `service name "identity" is reserved for internal use`, + }, + "invalid name": { + cfg: &BotConfig{ + Version: V2, + Services: ServiceConfigs{ + &IdentityOutput{ + Name: "hello, world!", + Destination: &DestinationMemory{}, + }, + }, + }, + err: `may only contain lowercase letters`, + }, + } + for desc, tc := range testCases { + t.Run(desc, func(t *testing.T) { + t.Parallel() + require.ErrorContains(t, tc.cfg.CheckAndSetDefaults(), tc.err) + }) + } +} diff --git a/lib/tbot/config/service_application.go b/lib/tbot/config/service_application.go index cd4148d702910..3912f10994cf3 100644 --- a/lib/tbot/config/service_application.go +++ b/lib/tbot/config/service_application.go @@ -35,6 +35,8 @@ var ( const ApplicationOutputType = "application" type ApplicationOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` // Roles is the list of roles to request for the generated credentials. @@ -68,6 +70,11 @@ func (o *ApplicationOutput) CheckAndSetDefaults() error { return nil } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *ApplicationOutput) GetName() string { + return o.Name +} + func (o *ApplicationOutput) GetDestination() bot.Destination { return o.Destination } diff --git a/lib/tbot/config/service_application_tunnel.go b/lib/tbot/config/service_application_tunnel.go index ede5b1a4ac44f..d3da1300053ac 100644 --- a/lib/tbot/config/service_application_tunnel.go +++ b/lib/tbot/config/service_application_tunnel.go @@ -35,6 +35,8 @@ const ApplicationTunnelServiceType = "application-tunnel" // ApplicationTunnelService opens an authenticated tunnel for Application // Access. type ApplicationTunnelService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Listen is the address on which database tunnel should listen. Example: // - "tcp://127.0.0.1:3306" // - "tcp://0.0.0.0:3306 @@ -59,7 +61,12 @@ func (s *ApplicationTunnelService) Type() string { return ApplicationTunnelServiceType } -func (s *ApplicationTunnelService) MarshalYAML() (interface{}, error) { +// GetName returns the user-given name of the service, used for validation purposes. +func (o *ApplicationTunnelService) GetName() string { + return o.Name +} + +func (s *ApplicationTunnelService) MarshalYAML() (any, error) { type raw ApplicationTunnelService return withTypeHeader((*raw)(s), ApplicationTunnelServiceType) } diff --git a/lib/tbot/config/service_client_credential.go b/lib/tbot/config/service_client_credential.go index f685aa7100c99..12ffb9835b4a3 100644 --- a/lib/tbot/config/service_client_credential.go +++ b/lib/tbot/config/service_client_credential.go @@ -47,11 +47,19 @@ var ( // be modified. This output is currently part of an experiment and could be // removed in a future release. type UnstableClientCredentialOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` + mu sync.Mutex facade *identity.Facade ready chan struct{} } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *UnstableClientCredentialOutput) GetName() string { + return o.Name +} + // Ready returns a channel which closes when the Output is ready to be used // as a client credential. Using this as a credential before Ready closes is // unsupported. diff --git a/lib/tbot/config/service_database.go b/lib/tbot/config/service_database.go index 205e5df49f3c7..e7e725cc11ed7 100644 --- a/lib/tbot/config/service_database.go +++ b/lib/tbot/config/service_database.go @@ -76,6 +76,8 @@ var ( // DatabaseOutput produces credentials which can be used to connect to a // database through teleport. type DatabaseOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` // Roles is the list of roles to request for the generated credentials. @@ -101,6 +103,11 @@ type DatabaseOutput struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *DatabaseOutput) GetName() string { + return o.Name +} + func (o *DatabaseOutput) Init(ctx context.Context) error { subDirs := []string{} if o.Format == CockroachDatabaseFormat { diff --git a/lib/tbot/config/service_database_tunnel.go b/lib/tbot/config/service_database_tunnel.go index 0e68563980a8e..9b03e5945f124 100644 --- a/lib/tbot/config/service_database_tunnel.go +++ b/lib/tbot/config/service_database_tunnel.go @@ -30,6 +30,8 @@ const DatabaseTunnelServiceType = "database-tunnel" // DatabaseTunnelService opens an authenticated tunnel for Database Access. type DatabaseTunnelService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Listen is the address on which database tunnel should listen. Example: // - "tcp://127.0.0.1:3306" // - "tcp://0.0.0.0:3306 @@ -55,6 +57,11 @@ type DatabaseTunnelService struct { Listener net.Listener `yaml:"-"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *DatabaseTunnelService) GetName() string { + return o.Name +} + func (s *DatabaseTunnelService) Type() string { return DatabaseTunnelServiceType } diff --git a/lib/tbot/config/service_example.go b/lib/tbot/config/service_example.go index 7a09dd38b9cbd..7eabf9eb3bcd2 100644 --- a/lib/tbot/config/service_example.go +++ b/lib/tbot/config/service_example.go @@ -29,6 +29,9 @@ const ExampleServiceType = "example" // not intended to be used and exists to demonstrate how a user configurable // service integrates with the tbot service manager. type ExampleService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` + Message string `yaml:"message"` } @@ -36,7 +39,12 @@ func (s *ExampleService) Type() string { return ExampleServiceType } -func (s *ExampleService) MarshalYAML() (interface{}, error) { +// GetName returns the user-given name of the service, used for validation purposes. +func (s *ExampleService) GetName() string { + return s.Name +} + +func (s *ExampleService) MarshalYAML() (any, error) { type raw ExampleService return withTypeHeader((*raw)(s), ExampleServiceType) } diff --git a/lib/tbot/config/service_identity.go b/lib/tbot/config/service_identity.go index 8d51fd43564db..0e0827e8fdcf5 100644 --- a/lib/tbot/config/service_identity.go +++ b/lib/tbot/config/service_identity.go @@ -75,6 +75,8 @@ var ( // It cannot be used to connect to Applications, Databases or Kubernetes // Clusters. type IdentityOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` // Roles is the list of roles to request for the generated credentials. @@ -107,6 +109,11 @@ type IdentityOutput struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *IdentityOutput) GetName() string { + return o.Name +} + func (o *IdentityOutput) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) } diff --git a/lib/tbot/config/service_kubernetes.go b/lib/tbot/config/service_kubernetes.go index 09a7572bba49a..c15d5889f1cfc 100644 --- a/lib/tbot/config/service_kubernetes.go +++ b/lib/tbot/config/service_kubernetes.go @@ -37,6 +37,8 @@ const KubernetesOutputType = "kubernetes" // KubernetesOutput produces credentials which can be used to connect to a // Kubernetes Cluster through teleport. type KubernetesOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` // Roles is the list of roles to request for the generated credentials. @@ -60,6 +62,11 @@ type KubernetesOutput struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *KubernetesOutput) GetName() string { + return o.Name +} + func (o *KubernetesOutput) CheckAndSetDefaults() error { if err := validateOutputDestination(o.Destination); err != nil { return trace.Wrap(err) diff --git a/lib/tbot/config/service_kubernetes_v2.go b/lib/tbot/config/service_kubernetes_v2.go index eb0b651dd6ce2..5fd27b8a10479 100644 --- a/lib/tbot/config/service_kubernetes_v2.go +++ b/lib/tbot/config/service_kubernetes_v2.go @@ -37,6 +37,8 @@ const KubernetesV2OutputType = "kubernetes/v2" // KubernetesOutput produces credentials which can be used to connect to a // Kubernetes Cluster through teleport. type KubernetesV2Output struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` @@ -56,6 +58,11 @@ type KubernetesV2Output struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *KubernetesV2Output) GetName() string { + return o.Name +} + func (o *KubernetesV2Output) CheckAndSetDefaults() error { if err := validateOutputDestination(o.Destination); err != nil { return trace.Wrap(err) diff --git a/lib/tbot/config/service_spiffe_svid.go b/lib/tbot/config/service_spiffe_svid.go index 5e208a35074e5..89cfec1d242c3 100644 --- a/lib/tbot/config/service_spiffe_svid.go +++ b/lib/tbot/config/service_spiffe_svid.go @@ -117,6 +117,8 @@ func (o JWTSVID) CheckAndSetDefaults() error { // SPIFFESVIDOutput is the configuration for the SPIFFE SVID output. // Emulates the output of https://github.com/spiffe/spiffe-helper type SPIFFESVIDOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` SVID SVIDRequest `yaml:"svid"` @@ -130,6 +132,11 @@ type SPIFFESVIDOutput struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *SPIFFESVIDOutput) GetName() string { + return o.Name +} + // Init initializes the destination. func (o *SPIFFESVIDOutput) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/config/service_spiffe_workload_api.go b/lib/tbot/config/service_spiffe_workload_api.go index c5e0917bf0666..a1c2e78398641 100644 --- a/lib/tbot/config/service_spiffe_workload_api.go +++ b/lib/tbot/config/service_spiffe_workload_api.go @@ -114,6 +114,8 @@ func (o SVIDRequestRule) LogValue() slog.Value { // SPIFFEWorkloadAPIService is the configuration for the SPIFFE Workload API // service. type SPIFFEWorkloadAPIService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Listen is the address on which the SPIFFE Workload API server should // listen. This should either be prefixed with "unix://" or "tcp://". Listen string `yaml:"listen"` @@ -132,6 +134,11 @@ type SPIFFEWorkloadAPIService struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *SPIFFEWorkloadAPIService) GetName() string { + return o.Name +} + func (s *SPIFFEWorkloadAPIService) Type() string { return SPIFFEWorkloadAPIServiceType } diff --git a/lib/tbot/config/service_ssh_host.go b/lib/tbot/config/service_ssh_host.go index 7729cfc0a8c06..16a36b4a0448b 100644 --- a/lib/tbot/config/service_ssh_host.go +++ b/lib/tbot/config/service_ssh_host.go @@ -49,6 +49,8 @@ var ( // SSHHostOutput generates a host certificate signed by the Teleport CA. This // can be used to allow OpenSSH server to be trusted by Teleport SSH clients. type SSHHostOutput struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the credentials should be written to. Destination bot.Destination `yaml:"destination"` // Roles is the list of roles to request for the generated credentials. @@ -63,6 +65,11 @@ type SSHHostOutput struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *SSHHostOutput) GetName() string { + return o.Name +} + func (o *SSHHostOutput) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) } diff --git a/lib/tbot/config/service_ssh_multiplexer.go b/lib/tbot/config/service_ssh_multiplexer.go index 3d538647518cc..6d98324f2b74a 100644 --- a/lib/tbot/config/service_ssh_multiplexer.go +++ b/lib/tbot/config/service_ssh_multiplexer.go @@ -30,6 +30,8 @@ const SSHMultiplexerServiceType = "ssh-multiplexer" // SSHMultiplexerService is the configuration for the `ssh-proxy` service type SSHMultiplexerService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Destination is where the config and tunnel should be written to. It // should be a DestinationDirectory. Destination bot.Destination `yaml:"destination"` @@ -54,6 +56,11 @@ type SSHMultiplexerService struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *SSHMultiplexerService) GetName() string { + return o.Name +} + func (s *SSHMultiplexerService) SessionResumptionEnabled() bool { if s.EnableResumption == nil { return true diff --git a/lib/tbot/config/service_workload_identity_api.go b/lib/tbot/config/service_workload_identity_api.go index dd0890422e227..1d6b1e6b795e4 100644 --- a/lib/tbot/config/service_workload_identity_api.go +++ b/lib/tbot/config/service_workload_identity_api.go @@ -32,6 +32,8 @@ var ( // WorkloadIdentityAPIService is the configuration for the // WorkloadIdentityAPIService type WorkloadIdentityAPIService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Listen is the address on which the SPIFFE Workload API server should // listen. This should either be prefixed with "unix://" or "tcp://". Listen string `yaml:"listen"` @@ -60,6 +62,11 @@ func (o *WorkloadIdentityAPIService) CheckAndSetDefaults() error { return nil } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *WorkloadIdentityAPIService) GetName() string { + return o.Name +} + // Type returns the type of the service. func (o *WorkloadIdentityAPIService) Type() string { return WorkloadIdentityAPIServiceType diff --git a/lib/tbot/config/service_workload_identity_aws_ra.go b/lib/tbot/config/service_workload_identity_aws_ra.go index aefc1ee00b6b7..6fbc269f28fea 100644 --- a/lib/tbot/config/service_workload_identity_aws_ra.go +++ b/lib/tbot/config/service_workload_identity_aws_ra.go @@ -43,6 +43,8 @@ var ( // WorkloadIdentityAWSRAService is the configuration for the // WorkloadIdentityAWSRAService type WorkloadIdentityAWSRAService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Selector is the selector for the WorkloadIdentity resource that will be // used to issue WICs. Selector WorkloadIdentitySelector `yaml:"selector"` @@ -96,6 +98,11 @@ type WorkloadIdentityAWSRAService struct { EndpointOverride string `yaml:"-"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o *WorkloadIdentityAWSRAService) GetName() string { + return o.Name +} + // Init initializes the destination. func (o *WorkloadIdentityAWSRAService) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/config/service_workload_identity_jwt.go b/lib/tbot/config/service_workload_identity_jwt.go index a46063380d363..2c95a36488a90 100644 --- a/lib/tbot/config/service_workload_identity_jwt.go +++ b/lib/tbot/config/service_workload_identity_jwt.go @@ -34,6 +34,8 @@ var ( // WorkloadIdentityJWTService is the configuration for the WorkloadIdentityJWTService type WorkloadIdentityJWTService struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Selector is the selector for the WorkloadIdentity resource that will be // used to issue WICs. Selector WorkloadIdentitySelector `yaml:"selector"` @@ -47,6 +49,11 @@ type WorkloadIdentityJWTService struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o WorkloadIdentityJWTService) GetName() string { + return o.Name +} + // Init initializes the destination. func (o *WorkloadIdentityJWTService) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/config/service_workload_identity_x509.go b/lib/tbot/config/service_workload_identity_x509.go index 80fd9d9a2885a..0a2469cd09089 100644 --- a/lib/tbot/config/service_workload_identity_x509.go +++ b/lib/tbot/config/service_workload_identity_x509.go @@ -63,6 +63,8 @@ func (s *WorkloadIdentitySelector) CheckAndSetDefaults() error { // WorkloadIdentityX509Service is the configuration for the WorkloadIdentityX509Service // Emulates the output of https://github.com/spiffe/spiffe-helper type WorkloadIdentityX509Service struct { + // Name of the service for logs and the /readyz endpoint. + Name string `yaml:"name,omitempty"` // Selector is the selector for the WorkloadIdentity resource that will be // used to issue WICs. Selector WorkloadIdentitySelector `yaml:"selector"` @@ -77,6 +79,11 @@ type WorkloadIdentityX509Service struct { CredentialLifetime CredentialLifetime `yaml:",inline"` } +// GetName returns the user-given name of the service, used for validation purposes. +func (o WorkloadIdentityX509Service) GetName() string { + return o.Name +} + // Init initializes the destination. func (o *WorkloadIdentityX509Service) Init(ctx context.Context) error { return trace.Wrap(o.Destination.Init(ctx, []string{})) diff --git a/lib/tbot/loop.go b/lib/tbot/loop.go index af9cef8940a0e..c941f27482cfd 100644 --- a/lib/tbot/loop.go +++ b/lib/tbot/loop.go @@ -29,6 +29,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/gravitational/teleport/api/utils/retryutils" + "github.com/gravitational/teleport/lib/tbot/readyz" ) var ( @@ -80,13 +81,10 @@ type runOnIntervalConfig struct { // service doesn't support gracefully degrading when there is no API client // available. identityReadyCh <-chan struct{} + statusReporter readyz.Reporter } -// runOnInterval runs a function on a given interval, with retries and jitter. -// -// TODO(noah): Emit Prometheus metrics for: -// - Time of next attempt -func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { +func (cfg *runOnIntervalConfig) checkAndSetDefaults() error { switch { case cfg.interval <= 0: return trace.BadParameter("interval must be greater than 0") @@ -100,6 +98,25 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { return trace.BadParameter("name is required") } + if cfg.clock == nil { + cfg.clock = clockwork.NewRealClock() + } + if cfg.statusReporter == nil { + cfg.statusReporter = readyz.NoopReporter() + } + + return nil +} + +// runOnInterval runs a function on a given interval, with retries and jitter. +// +// TODO(noah): Emit Prometheus metrics for: +// - Time of next attempt +func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { + if err := cfg.checkAndSetDefaults(); err != nil { + return err + } + log := cfg.log.With("task", cfg.name) if cfg.identityReadyCh != nil { @@ -115,10 +132,6 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { } } - if cfg.clock == nil { - cfg.clock = clockwork.NewRealClock() - } - ticker := cfg.clock.NewTicker(cfg.interval) defer ticker.Stop() jitter := retryutils.DefaultJitter @@ -147,6 +160,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { ) err = cfg.f(ctx) if err == nil { + cfg.statusReporter.Report(readyz.Healthy) loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1)) break } @@ -177,6 +191,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds()) if err != nil { + cfg.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc() if cfg.exitOnRetryExhausted { diff --git a/lib/tbot/readyz/http.go b/lib/tbot/readyz/http.go new file mode 100644 index 0000000000000..f2a9472beec59 --- /dev/null +++ b/lib/tbot/readyz/http.go @@ -0,0 +1,79 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package readyz + +import ( + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" +) + +// HTTPHandler returns an HTTP handler that implements tbot's +// /readyz(/{service}) endpoints. +func HTTPHandler(reg *Registry) http.Handler { + mux := http.NewServeMux() + + mux.Handle("/readyz/{service}", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + status, ok := reg.ServiceStatus(r.PathValue("service")) + if !ok { + w.WriteHeader(http.StatusNotFound) + if err := writeJSON(w, struct { + Error string `json:"error"` + }{ + fmt.Sprintf("Service named %q not found.", r.PathValue("service")), + }); err != nil { + slog.ErrorContext(r.Context(), "Failed to write response", "error", err) + } + return + } + + w.WriteHeader(status.Status.HTTPStatusCode()) + if err := writeJSON(w, status); err != nil { + slog.ErrorContext(r.Context(), "Failed to write response", "error", err) + } + })) + + mux.Handle("/readyz", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + status := reg.OverallStatus() + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status.Status.HTTPStatusCode()) + + if err := writeJSON(w, status); err != nil { + slog.ErrorContext(r.Context(), "Failed to write response", "error", err) + } + })) + + return mux +} + +func writeJSON(w io.Writer, v any) error { + output, err := json.MarshalIndent(v, "", " ") + if err != nil { + return err + } + if _, err := w.Write(output); err != nil { + return err + } + return nil +} diff --git a/lib/tbot/readyz/readyz.go b/lib/tbot/readyz/readyz.go new file mode 100644 index 0000000000000..4462c7c1fe4bc --- /dev/null +++ b/lib/tbot/readyz/readyz.go @@ -0,0 +1,113 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package readyz + +import "sync" + +// NewRegistry returns a Registry to track the health of tbot's services. +func NewRegistry() *Registry { + return &Registry{ + services: make(map[string]*ServiceStatus), + } +} + +// Registry tracks the status/health of tbot's services. +type Registry struct { + mu sync.Mutex + services map[string]*ServiceStatus +} + +// AddService adds a service to the registry so that its health will be reported +// from our readyz endpoints. It returns a Reporter the service can use to report +// status changes. +func (r *Registry) AddService(name string) Reporter { + r.mu.Lock() + defer r.mu.Unlock() + + status, ok := r.services[name] + if !ok { + status = &ServiceStatus{} + r.services[name] = status + } + return &reporter{ + mu: &r.mu, + status: status, + } +} + +// ServiceStatus reads the named service's status. The bool value will be false +// if the service has not been registered. +func (r *Registry) ServiceStatus(name string) (*ServiceStatus, bool) { + r.mu.Lock() + defer r.mu.Unlock() + + if status, ok := r.services[name]; ok { + return status.Clone(), true + } + + return nil, false +} + +// OverallStatus returns tbot's overall status when taking service statuses into +// account. +func (r *Registry) OverallStatus() *OverallStatus { + r.mu.Lock() + defer r.mu.Unlock() + + status := Healthy + services := make(map[string]*ServiceStatus, len(r.services)) + + for name, svc := range r.services { + services[name] = svc.Clone() + + if svc.Status != Healthy { + status = Unhealthy + } + } + + return &OverallStatus{ + Status: status, + Services: services, + } +} + +// ServiceStatus is a snapshot of the service's status. +type ServiceStatus struct { + // Status of the service. + Status Status `json:"status"` + + // Reason string describing why the service has its current status. + Reason string `json:"reason,omitempty"` +} + +// Clone the status to avoid data races. +func (s *ServiceStatus) Clone() *ServiceStatus { + clone := *s + return &clone +} + +// OverallStatus is tbot's overall aggregate status. +type OverallStatus struct { + // Status of tbot overall. If any service isn't Healthy, the overall status + // will be Unhealthy. + Status Status `json:"status"` + + // Services contains the service-specific statuses. + Services map[string]*ServiceStatus `json:"services"` +} diff --git a/lib/tbot/readyz/readyz_test.go b/lib/tbot/readyz/readyz_test.go new file mode 100644 index 0000000000000..40641ab9d4a01 --- /dev/null +++ b/lib/tbot/readyz/readyz_test.go @@ -0,0 +1,148 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package readyz_test + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/gravitational/teleport/lib/tbot/readyz" +) + +func TestReadyz(t *testing.T) { + t.Parallel() + + reg := readyz.NewRegistry() + + a := reg.AddService("a") + b := reg.AddService("b") + + srv := httptest.NewServer(readyz.HTTPHandler(reg)) + srv.URL = srv.URL + "/readyz" + t.Cleanup(srv.Close) + + t.Run("initial state - overall", func(t *testing.T) { + rsp, err := http.Get(srv.URL) + require.NoError(t, err) + defer rsp.Body.Close() + + require.Equal(t, http.StatusServiceUnavailable, rsp.StatusCode) + + var response readyz.OverallStatus + err = json.NewDecoder(rsp.Body).Decode(&response) + require.NoError(t, err) + + require.Equal(t, + readyz.OverallStatus{ + Status: readyz.Unhealthy, + Services: map[string]*readyz.ServiceStatus{ + "a": {Status: readyz.Initializing}, + "b": {Status: readyz.Initializing}, + }, + }, + response, + ) + }) + + t.Run("individual service", func(t *testing.T) { + a.ReportReason(readyz.Unhealthy, "database is down") + + rsp, err := http.Get(srv.URL + "/a") + require.NoError(t, err) + defer rsp.Body.Close() + + require.Equal(t, http.StatusServiceUnavailable, rsp.StatusCode) + + var response readyz.ServiceStatus + err = json.NewDecoder(rsp.Body).Decode(&response) + require.NoError(t, err) + + require.Equal(t, + readyz.ServiceStatus{ + Status: readyz.Unhealthy, + Reason: "database is down", + }, + response, + ) + }) + + t.Run("mixed state", func(t *testing.T) { + a.Report(readyz.Healthy) + b.ReportReason(readyz.Unhealthy, "database is down") + + rsp, err := http.Get(srv.URL) + require.NoError(t, err) + defer rsp.Body.Close() + + require.Equal(t, http.StatusServiceUnavailable, rsp.StatusCode) + + var response readyz.OverallStatus + err = json.NewDecoder(rsp.Body).Decode(&response) + require.NoError(t, err) + + require.Equal(t, + readyz.OverallStatus{ + Status: readyz.Unhealthy, + Services: map[string]*readyz.ServiceStatus{ + "a": {Status: readyz.Healthy}, + "b": {Status: readyz.Unhealthy, Reason: "database is down"}, + }, + }, + response, + ) + }) + + t.Run("all healthy", func(t *testing.T) { + a.Report(readyz.Healthy) + b.Report(readyz.Healthy) + + rsp, err := http.Get(srv.URL) + require.NoError(t, err) + defer rsp.Body.Close() + + require.Equal(t, http.StatusOK, rsp.StatusCode) + + var response readyz.OverallStatus + err = json.NewDecoder(rsp.Body).Decode(&response) + require.NoError(t, err) + + require.Equal(t, + readyz.OverallStatus{ + Status: readyz.Healthy, + Services: map[string]*readyz.ServiceStatus{ + "a": {Status: readyz.Healthy}, + "b": {Status: readyz.Healthy}, + }, + }, + response, + ) + }) + + t.Run("unknown service", func(t *testing.T) { + rsp, err := http.Get(srv.URL + "/foo") + require.NoError(t, err) + defer rsp.Body.Close() + + require.Equal(t, http.StatusNotFound, rsp.StatusCode) + }) +} diff --git a/lib/tbot/readyz/reporter.go b/lib/tbot/readyz/reporter.go new file mode 100644 index 0000000000000..3620bb96ef3fb --- /dev/null +++ b/lib/tbot/readyz/reporter.go @@ -0,0 +1,58 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package readyz + +import "sync" + +// Reporter can be used by a service to report its status. +type Reporter interface { + // Report the service's status. + Report(status Status) + + // ReportReason reports the service's status including reason/description text. + ReportReason(status Status, reason string) +} + +type reporter struct { + mu *sync.Mutex + status *ServiceStatus +} + +func (r *reporter) Report(status Status) { + r.ReportReason(status, "") +} + +func (r *reporter) ReportReason(status Status, reason string) { + r.mu.Lock() + defer r.mu.Unlock() + + r.status.Status = status + r.status.Reason = reason +} + +// NoopReporter returns a no-op Reporter that can be used when no real reporter +// is available (e.g. in tests). +func NoopReporter() Reporter { + return noopReporter{} +} + +type noopReporter struct{} + +func (noopReporter) Report(Status) {} +func (noopReporter) ReportReason(Status, string) {} diff --git a/lib/tbot/readyz/status.go b/lib/tbot/readyz/status.go new file mode 100644 index 0000000000000..3d226c67023b6 --- /dev/null +++ b/lib/tbot/readyz/status.go @@ -0,0 +1,85 @@ +/* + * Teleport + * Copyright (C) 2025 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package readyz + +import ( + "encoding/json" + "net/http" +) + +// Status describes the healthiness of a service or tbot overall. +type Status uint + +const ( + // Initializing means no status has been reported for the service. + Initializing Status = iota + + // Healthy means the service is healthy and ready to serve traffic or it has + // recently succeeded generating an output. + Healthy + + // Unhealthy means the service is failing to serve traffic or generate output. + Unhealthy +) + +// String implements fmt.Stringer. +func (s Status) String() string { + switch s { + case Initializing: + return "initializing" + case Healthy: + return "healthy" + case Unhealthy: + return "unhealthy" + default: + return "" + } +} + +// MarshalJSON implements json.Marshaler. +func (s Status) MarshalJSON() ([]byte, error) { + return json.Marshal(s.String()) +} + +// MarshalJSON implements json.Unmarshaler. +func (s *Status) UnmarshalJSON(j []byte) error { + var str string + if err := json.Unmarshal(j, &str); err != nil { + return err + } + switch str { + case "healthy": + *s = Healthy + case "unhealthy": + *s = Unhealthy + default: + *s = Initializing + } + return nil +} + +// HTTPStatusCode returns the HTTP response code that represents this status. +func (s Status) HTTPStatusCode() int { + switch s { + case Healthy: + return http.StatusOK + default: + return http.StatusServiceUnavailable + } +} diff --git a/lib/tbot/service_application_output.go b/lib/tbot/service_application_output.go index fe9c2a8b569d4..5ac03188955c2 100644 --- a/lib/tbot/service_application_output.go +++ b/lib/tbot/service_application_output.go @@ -33,6 +33,7 @@ import ( "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" ) // ApplicationOutputService generates the artifacts necessary to connect to a @@ -46,10 +47,14 @@ type ApplicationOutputService struct { log *slog.Logger reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter } func (s *ApplicationOutputService) String() string { - return fmt.Sprintf("application-output (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("application-output (%s)", s.cfg.Destination.String()), + ) } func (s *ApplicationOutputService) OneShot(ctx context.Context) error { @@ -69,6 +74,7 @@ func (s *ApplicationOutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_application_tunnel.go b/lib/tbot/service_application_tunnel.go index 854afcb5c5d0e..7b1115a0ecf37 100644 --- a/lib/tbot/service_application_tunnel.go +++ b/lib/tbot/service_application_tunnel.go @@ -35,6 +35,7 @@ import ( "github.com/gravitational/teleport/lib/srv/alpnproxy/common" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/utils" ) @@ -51,6 +52,7 @@ type ApplicationTunnelService struct { botClient *apiclient.Client getBotIdentity getBotIdentityFn botIdentityReadyCh <-chan struct{} + statusReporter readyz.Reporter } func (s *ApplicationTunnelService) Run(ctx context.Context) error { @@ -98,10 +100,16 @@ func (s *ApplicationTunnelService) Run(ctx context.Context) error { }() s.log.InfoContext(ctx, "Listening for connections.", "address", l.Addr().String()) + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + s.statusReporter.Report(readyz.Healthy) + select { case <-ctx.Done(): return nil case err := <-errCh: + s.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) return trace.Wrap(err, "local proxy failed") } } @@ -262,5 +270,8 @@ func (s *ApplicationTunnelService) issueCert( // String returns a human-readable string that can uniquely identify the // service. func (s *ApplicationTunnelService) String() string { - return fmt.Sprintf("%s:%s:%s", config.ApplicationTunnelServiceType, s.cfg.Listen, s.cfg.AppName) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("%s:%s:%s", config.ApplicationTunnelServiceType, s.cfg.Listen, s.cfg.AppName), + ) } diff --git a/lib/tbot/service_bot_identity.go b/lib/tbot/service_bot_identity.go index 96745b6f24224..5828cc9058fbd 100644 --- a/lib/tbot/service_bot_identity.go +++ b/lib/tbot/service_bot_identity.go @@ -43,6 +43,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/utils" ) @@ -58,6 +59,7 @@ type identityService struct { reloadBroadcaster *channelBroadcaster cfg *config.BotConfig resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter mu sync.Mutex client *apiclient.Client @@ -291,10 +293,13 @@ func (s *identityService) Initialize(ctx context.Context) error { s.mu.Lock() s.client = c s.facade = facade + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } s.mu.Unlock() s.unblockWaiters() - + s.statusReporter.Report(readyz.Healthy) s.log.InfoContext(ctx, "Identity initialized successfully") return nil } @@ -371,6 +376,7 @@ func (s *identityService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, waitBeforeFirstRun: true, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_ca_rotation.go b/lib/tbot/service_ca_rotation.go index 462de4d37e59e..4684387806fca 100644 --- a/lib/tbot/service_ca_rotation.go +++ b/lib/tbot/service_ca_rotation.go @@ -34,6 +34,7 @@ import ( apiclient "github.com/gravitational/teleport/api/client" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/api/utils/retryutils" + "github.com/gravitational/teleport/lib/tbot/readyz" ) // debouncer accepts a duration, and a function. When `attempt` is called on @@ -132,6 +133,7 @@ type caRotationService struct { botClient *apiclient.Client getBotIdentity getBotIdentityFn botIdentityReadyCh <-chan struct{} + statusReporter readyz.Reporter } func (s *caRotationService) String() string { @@ -144,6 +146,10 @@ func (s *caRotationService) String() string { // Run also manages debouncing the renewals across multiple watch // attempts. func (s *caRotationService) Run(ctx context.Context) error { + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + rd := debouncer{ f: s.reloadBroadcaster.broadcast, debouncePeriod: time.Second * 10, @@ -183,6 +189,7 @@ func (s *caRotationService) Run(ctx context.Context) error { if isCancelledErr { s.log.DebugContext(ctx, "CA watcher detected client closing. Waiting to rewatch", "wait", backoffPeriod) } else if err != nil { + s.statusReporter.Report(readyz.Unhealthy) s.log.ErrorContext(ctx, "Error occurred whilst watching CA rotations. Waiting to retry", "wait", backoffPeriod, "error", err) } @@ -224,6 +231,7 @@ func (s *caRotationService) watchCARotations(ctx context.Context, queueReload fu // OpInit is a special case omitted by the Watcher when the // connection succeeds. if event.Type == types.OpInit { + s.statusReporter.Report(readyz.Healthy) s.log.InfoContext(ctx, "Started watching for CA rotations") continue } diff --git a/lib/tbot/service_client_credential.go b/lib/tbot/service_client_credential.go index 01dd60e1448ed..7617a1d607e29 100644 --- a/lib/tbot/service_client_credential.go +++ b/lib/tbot/service_client_credential.go @@ -19,6 +19,7 @@ package tbot import ( + "cmp" "context" "log/slog" @@ -26,6 +27,7 @@ import ( apiclient "github.com/gravitational/teleport/api/client" "github.com/gravitational/teleport/lib/tbot/config" + "github.com/gravitational/teleport/lib/tbot/readyz" ) // ClientCredentialOutputService produces credentials which can be used to @@ -41,10 +43,14 @@ type ClientCredentialOutputService struct { getBotIdentity getBotIdentityFn log *slog.Logger reloadBroadcaster *channelBroadcaster + statusReporter readyz.Reporter } func (s *ClientCredentialOutputService) String() string { - return "client-credential-output" + return cmp.Or( + s.cfg.Name, + "client-credential-output", + ) } func (s *ClientCredentialOutputService) OneShot(ctx context.Context) error { @@ -64,6 +70,7 @@ func (s *ClientCredentialOutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_database_output.go b/lib/tbot/service_database_output.go index c9b95a869b6f5..870b63e41c4ee 100644 --- a/lib/tbot/service_database_output.go +++ b/lib/tbot/service_database_output.go @@ -34,6 +34,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" ) // DatabaseOutputService generates the artifacts necessary to connect to a @@ -47,10 +48,14 @@ type DatabaseOutputService struct { log *slog.Logger reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter } func (s *DatabaseOutputService) String() string { - return fmt.Sprintf("database-output (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("database-output (%s)", s.cfg.Destination.String()), + ) } func (s *DatabaseOutputService) OneShot(ctx context.Context) error { @@ -70,6 +75,7 @@ func (s *DatabaseOutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_database_tunnel.go b/lib/tbot/service_database_tunnel.go index 9581b43974272..0f7fe878202fe 100644 --- a/lib/tbot/service_database_tunnel.go +++ b/lib/tbot/service_database_tunnel.go @@ -34,6 +34,7 @@ import ( "github.com/gravitational/teleport/lib/srv/alpnproxy/common" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tlsca" "github.com/gravitational/teleport/lib/utils" ) @@ -71,6 +72,7 @@ type DatabaseTunnelService struct { botClient *apiclient.Client getBotIdentity getBotIdentityFn botIdentityReadyCh <-chan struct{} + statusReporter readyz.Reporter } // buildLocalProxyConfig initializes the service, fetching any initial information and setting @@ -234,10 +236,16 @@ func (s *DatabaseTunnelService) Run(ctx context.Context) error { }() s.log.InfoContext(ctx, "Listening for connections.", "address", l.Addr().String()) + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + s.statusReporter.Report(readyz.Healthy) + select { case <-ctx.Done(): return nil case err := <-errCh: + s.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) return trace.Wrap(err, "local proxy failed") } } @@ -309,5 +317,8 @@ func (s *DatabaseTunnelService) issueCert( // String returns a human-readable string that can uniquely identify the // service. func (s *DatabaseTunnelService) String() string { - return fmt.Sprintf("%s:%s:%s", config.DatabaseTunnelServiceType, s.cfg.Listen, s.cfg.Service) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("%s:%s:%s", config.DatabaseTunnelServiceType, s.cfg.Listen, s.cfg.Service), + ) } diff --git a/lib/tbot/service_diagnostics.go b/lib/tbot/service_diagnostics.go index 2a78e684bbd0a..d941ca5d1eeff 100644 --- a/lib/tbot/service_diagnostics.go +++ b/lib/tbot/service_diagnostics.go @@ -29,14 +29,16 @@ import ( apidefaults "github.com/gravitational/teleport/api/defaults" "github.com/gravitational/teleport/lib/defaults" + "github.com/gravitational/teleport/lib/tbot/readyz" ) // diagnosticsService is a [bot.Service] that exposes diagnostics endpoints. // It's only started when a --diag-addr is provided. type diagnosticsService struct { - log *slog.Logger - diagAddr string - pprofEnabled bool + log *slog.Logger + diagAddr string + pprofEnabled bool + statusRegistry *readyz.Registry } func (s *diagnosticsService) String() string { @@ -69,13 +71,8 @@ func (s *diagnosticsService) Run(ctx context.Context) error { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte("OK")) })) - mux.Handle("/readyz", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // TODO(noah): Eventually this should diverge from /livez and report - // the readiness status from each sub-service, with an error status if - // any of them are not ready. - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte("OK")) - })) + mux.Handle("/readyz", readyz.HTTPHandler(s.statusRegistry)) + mux.Handle("/readyz/", readyz.HTTPHandler(s.statusRegistry)) srv := http.Server{ Addr: s.diagAddr, Handler: mux, diff --git a/lib/tbot/service_example.go b/lib/tbot/service_example.go index c550c52d9f24f..290e612aedcc0 100644 --- a/lib/tbot/service_example.go +++ b/lib/tbot/service_example.go @@ -19,6 +19,7 @@ package tbot import ( + "cmp" "context" "fmt" "time" @@ -46,5 +47,8 @@ func (s *ExampleService) Run(ctx context.Context) error { } func (s *ExampleService) String() string { - return fmt.Sprintf("%s:%s", config.ExampleServiceType, s.Message) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("%s:%s", config.ExampleServiceType, s.Message), + ) } diff --git a/lib/tbot/service_heartbeat.go b/lib/tbot/service_heartbeat.go index 335f593ef1fe5..7047711b8b337 100644 --- a/lib/tbot/service_heartbeat.go +++ b/lib/tbot/service_heartbeat.go @@ -33,6 +33,7 @@ import ( "github.com/gravitational/teleport" machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1" "github.com/gravitational/teleport/lib/tbot/config" + "github.com/gravitational/teleport/lib/tbot/readyz" ) type heartbeatSubmitter interface { @@ -50,6 +51,7 @@ type heartbeatService struct { botIdentityReadyCh <-chan struct{} interval time.Duration retryLimit int + statusReporter readyz.Reporter } func (s *heartbeatService) heartbeat(ctx context.Context, isStartup bool) error { @@ -119,6 +121,7 @@ func (s *heartbeatService) Run(ctx context.Context) error { return nil }, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_identity_output.go b/lib/tbot/service_identity_output.go index 08228470b63a4..3129ae0e36cba 100644 --- a/lib/tbot/service_identity_output.go +++ b/lib/tbot/service_identity_output.go @@ -38,6 +38,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/ssh" "github.com/gravitational/teleport/lib/utils" ) @@ -57,6 +58,7 @@ type IdentityOutputService struct { proxyPingCache *proxyPingCache reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // executablePath is called to get the path to the tbot executable. // Usually this is os.Executable executablePath func() (string, error) @@ -64,7 +66,10 @@ type IdentityOutputService struct { } func (s *IdentityOutputService) String() string { - return fmt.Sprintf("identity-output (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("identity-output (%s)", s.cfg.Destination.String()), + ) } func (s *IdentityOutputService) OneShot(ctx context.Context) error { @@ -84,6 +89,7 @@ func (s *IdentityOutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_kubernetes_output.go b/lib/tbot/service_kubernetes_output.go index 4342ceda26a80..49332c85cc3ff 100644 --- a/lib/tbot/service_kubernetes_output.go +++ b/lib/tbot/service_kubernetes_output.go @@ -42,6 +42,7 @@ import ( "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" logutils "github.com/gravitational/teleport/lib/utils/log" ) @@ -62,13 +63,17 @@ type KubernetesOutputService struct { proxyPingCache *proxyPingCache reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // executablePath is called to get the path to the tbot executable. // Usually this is os.Executable executablePath func() (string, error) } func (s *KubernetesOutputService) String() string { - return fmt.Sprintf("kubernetes-output (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("kubernetes-output (%s)", s.cfg.Destination.String()), + ) } func (s *KubernetesOutputService) OneShot(ctx context.Context) error { @@ -88,6 +93,7 @@ func (s *KubernetesOutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_kubernetes_v2_output.go b/lib/tbot/service_kubernetes_v2_output.go index a045331aea601..9e0bd2ccf3da9 100644 --- a/lib/tbot/service_kubernetes_v2_output.go +++ b/lib/tbot/service_kubernetes_v2_output.go @@ -43,6 +43,7 @@ import ( "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" logutils "github.com/gravitational/teleport/lib/utils/log" ) @@ -61,13 +62,17 @@ type KubernetesV2OutputService struct { proxyPingCache *proxyPingCache reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // executablePath is called to get the path to the tbot executable. // Usually this is os.Executable executablePath func() (string, error) } func (s *KubernetesV2OutputService) String() string { - return fmt.Sprintf("kubernetes-v2-output (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("kubernetes-v2-output (%s)", s.cfg.Destination.String()), + ) } func (s *KubernetesV2OutputService) OneShot(ctx context.Context) error { @@ -87,6 +92,7 @@ func (s *KubernetesV2OutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, })) } diff --git a/lib/tbot/service_spiffe_svid_output.go b/lib/tbot/service_spiffe_svid_output.go index e13bf3db16b42..040607e2a0808 100644 --- a/lib/tbot/service_spiffe_svid_output.go +++ b/lib/tbot/service_spiffe_svid_output.go @@ -39,6 +39,7 @@ import ( "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" ) @@ -59,13 +60,17 @@ type SPIFFESVIDOutputService struct { getBotIdentity getBotIdentityFn log *slog.Logger resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // trustBundleCache is the cache of trust bundles. It only needs to be // provided when running in daemon mode. trustBundleCache *workloadidentity.TrustBundleCache } func (s *SPIFFESVIDOutputService) String() string { - return fmt.Sprintf("spiffe-svid-output (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("spiffe-svid-output (%s)", s.cfg.Destination.String()), + ) } func (s *SPIFFESVIDOutputService) OneShot(ctx context.Context) error { @@ -94,6 +99,10 @@ func (s *SPIFFESVIDOutputService) Run(ctx context.Context) error { return trace.Wrap(err, "getting trust bundle set") } + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + jitter := retryutils.DefaultJitter var res *machineidv1pb.SignX509SVIDsResponse var privateKey crypto.Signer @@ -104,10 +113,8 @@ func (s *SPIFFESVIDOutputService) Run(ctx context.Context) error { for { var retryAfter <-chan time.Time if failures > 0 { - backoffTime := time.Second * time.Duration(math.Pow(2, float64(failures-1))) - if backoffTime > time.Minute { - backoffTime = time.Minute - } + s.statusReporter.Report(readyz.Unhealthy) + backoffTime := min(time.Second*time.Duration(math.Pow(2, float64(failures-1))), time.Minute) backoffTime = jitter(backoffTime) s.log.WarnContext( ctx, @@ -156,6 +163,7 @@ func (s *SPIFFESVIDOutputService) Run(ctx context.Context) error { failures++ continue } + s.statusReporter.Report(readyz.Healthy) failures = 0 } } diff --git a/lib/tbot/service_spiffe_workload_api.go b/lib/tbot/service_spiffe_workload_api.go index 18adfafa0d49d..a71e2727a987c 100644 --- a/lib/tbot/service_spiffe_workload_api.go +++ b/lib/tbot/service_spiffe_workload_api.go @@ -57,6 +57,7 @@ import ( "github.com/gravitational/teleport/lib/observability/metrics" "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" "github.com/gravitational/teleport/lib/tbot/workloadidentity/attrs" "github.com/gravitational/teleport/lib/tbot/workloadidentity/workloadattest" @@ -81,6 +82,7 @@ type SPIFFEWorkloadAPIService struct { log *slog.Logger resolver reversetunnelclient.Resolver trustBundleCache *workloadidentity.TrustBundleCache + statusReporter readyz.Reporter // client holds the impersonated client for the service client *apiclient.Client @@ -133,6 +135,10 @@ func (s *SPIFFEWorkloadAPIService) setup(ctx context.Context) (err error) { return trace.Wrap(err, "setting up workload attestation") } + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + return nil } @@ -283,7 +289,12 @@ func (s *SPIFFEWorkloadAPIService) Run(ctx context.Context) error { return nil }) - return trace.Wrap(eg.Wait()) + s.statusReporter.Report(readyz.Healthy) + if err := eg.Wait(); err != nil { + s.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) + return trace.Wrap(eg.Wait()) + } + return nil } // serialString returns a human-readable colon-separated string of the serial @@ -893,5 +904,8 @@ func (s *SPIFFEWorkloadAPIService) ValidateJWTSVID( // String returns a human-readable string that can uniquely identify the // service. func (s *SPIFFEWorkloadAPIService) String() string { - return fmt.Sprintf("%s:%s", config.SPIFFEWorkloadAPIServiceType, s.cfg.Listen) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("%s:%s", config.SPIFFEWorkloadAPIServiceType, s.cfg.Listen), + ) } diff --git a/lib/tbot/service_ssh_host_output.go b/lib/tbot/service_ssh_host_output.go index 3b219e5f3b769..f5df49f139365 100644 --- a/lib/tbot/service_ssh_host_output.go +++ b/lib/tbot/service_ssh_host_output.go @@ -39,6 +39,7 @@ import ( "github.com/gravitational/teleport/lib/sshutils" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" ) type SSHHostOutputService struct { @@ -50,10 +51,14 @@ type SSHHostOutputService struct { log *slog.Logger reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter } func (s *SSHHostOutputService) String() string { - return fmt.Sprintf("ssh-host (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("ssh-host (%s)", s.cfg.Destination.String()), + ) } func (s *SSHHostOutputService) OneShot(ctx context.Context) error { @@ -73,6 +78,7 @@ func (s *SSHHostOutputService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_ssh_multiplexer.go b/lib/tbot/service_ssh_multiplexer.go index f3157e64ebe5f..1584356304f88 100644 --- a/lib/tbot/service_ssh_multiplexer.go +++ b/lib/tbot/service_ssh_multiplexer.go @@ -57,6 +57,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/ssh" "github.com/gravitational/teleport/lib/utils" "github.com/gravitational/teleport/lib/utils/uds" @@ -105,6 +106,7 @@ type SSHMultiplexerService struct { proxyPingCache *proxyPingCache reloadBroadcaster *channelBroadcaster resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // Fields below here are initialized by the service itself on startup. identity *identity.Facade @@ -563,7 +565,16 @@ func (s *SSHMultiplexerService) Run(ctx context.Context) (err error) { return s.identityRenewalLoop(egCtx, proxyHost, authClient) }) - return eg.Wait() + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + s.statusReporter.Report(readyz.Healthy) + + if err := eg.Wait(); err != nil { + s.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) + return err + } + return nil } func (s *SSHMultiplexerService) handleConn( @@ -796,7 +807,10 @@ func (s *SSHMultiplexerService) handleConn( } func (s *SSHMultiplexerService) String() string { - return config.SSHMultiplexerServiceType + return cmp.Or( + s.cfg.Name, + config.SSHMultiplexerServiceType, + ) } type hostDialer interface { diff --git a/lib/tbot/service_workload_identity_api.go b/lib/tbot/service_workload_identity_api.go index a617cbea28b0e..f71e6cc940e5d 100644 --- a/lib/tbot/service_workload_identity_api.go +++ b/lib/tbot/service_workload_identity_api.go @@ -48,6 +48,7 @@ import ( "github.com/gravitational/teleport/lib/observability/metrics" "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" "github.com/gravitational/teleport/lib/tbot/workloadidentity/attrs" "github.com/gravitational/teleport/lib/tbot/workloadidentity/workloadattest" @@ -73,6 +74,7 @@ type WorkloadIdentityAPIService struct { resolver reversetunnelclient.Resolver trustBundleCache *workloadidentity.TrustBundleCache crlCache *workloadidentity.CRLCache + statusReporter readyz.Reporter // client holds the impersonated client for the service client *apiclient.Client @@ -125,6 +127,10 @@ func (s *WorkloadIdentityAPIService) setup(ctx context.Context) (err error) { return trace.Wrap(err, "setting up workload attestation") } + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + return nil } @@ -220,7 +226,12 @@ func (s *WorkloadIdentityAPIService) Run(ctx context.Context) error { return nil }) - return trace.Wrap(eg.Wait()) + s.statusReporter.Report(readyz.Healthy) + if err := eg.Wait(); err != nil { + s.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) + return trace.Wrap(err) + } + return nil } func (s *WorkloadIdentityAPIService) authenticateClient( @@ -698,5 +709,8 @@ func (s *WorkloadIdentityAPIService) ValidateJWTSVID( // String returns a human-readable string that can uniquely identify the // service. func (s *WorkloadIdentityAPIService) String() string { - return fmt.Sprintf("%s:%s", config.WorkloadIdentityAPIServiceType, s.cfg.Listen) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("%s:%s", config.WorkloadIdentityAPIServiceType, s.cfg.Listen), + ) } diff --git a/lib/tbot/service_workload_identity_aws_ra.go b/lib/tbot/service_workload_identity_aws_ra.go index c91286540a9ec..e1daf83cd09f6 100644 --- a/lib/tbot/service_workload_identity_aws_ra.go +++ b/lib/tbot/service_workload_identity_aws_ra.go @@ -38,6 +38,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/bot" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" ) @@ -52,11 +53,15 @@ type WorkloadIdentityAWSRAService struct { log *slog.Logger resolver reversetunnelclient.Resolver reloadBroadcaster *channelBroadcaster + statusReporter readyz.Reporter } // String returns a human-readable description of the service. func (s *WorkloadIdentityAWSRAService) String() string { - return fmt.Sprintf("workload-identity-aws-roles-anywhere (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("workload-identity-aws-roles-anywhere (%s)", s.cfg.Destination.String()), + ) } // OneShot runs the service once, generating the output and writing it to the @@ -80,6 +85,7 @@ func (s *WorkloadIdentityAWSRAService) Run(ctx context.Context) error { log: s.log, reloadCh: reloadCh, identityReadyCh: s.botIdentityReadyCh, + statusReporter: s.statusReporter, }) return trace.Wrap(err) } diff --git a/lib/tbot/service_workload_identity_jwt.go b/lib/tbot/service_workload_identity_jwt.go index ad5c3ec3d0789..f2e0e5b585cf2 100644 --- a/lib/tbot/service_workload_identity_jwt.go +++ b/lib/tbot/service_workload_identity_jwt.go @@ -32,6 +32,7 @@ import ( "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" ) @@ -44,6 +45,7 @@ type WorkloadIdentityJWTService struct { getBotIdentity getBotIdentityFn log *slog.Logger resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // trustBundleCache is the cache of trust bundles. It only needs to be // provided when running in daemon mode. trustBundleCache *workloadidentity.TrustBundleCache @@ -51,7 +53,10 @@ type WorkloadIdentityJWTService struct { // String returns a human-readable description of the service. func (s *WorkloadIdentityJWTService) String() string { - return fmt.Sprintf("workload-identity-jwt (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("workload-identity-jwt (%s)", s.cfg.Destination.String()), + ) } // OneShot runs the service once, generating the output and writing it to the @@ -72,6 +77,10 @@ func (s *WorkloadIdentityJWTService) Run(ctx context.Context) error { return trace.Wrap(err, "getting trust bundle set") } + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + jitter := retryutils.DefaultJitter var cred *workloadidentityv1pb.Credential var failures int @@ -80,10 +89,8 @@ func (s *WorkloadIdentityJWTService) Run(ctx context.Context) error { for { var retryAfter <-chan time.Time if failures > 0 { - backoffTime := time.Second * time.Duration(math.Pow(2, float64(failures-1))) - if backoffTime > time.Minute { - backoffTime = time.Minute - } + s.statusReporter.Report(readyz.Unhealthy) + backoffTime := min(time.Second*time.Duration(math.Pow(2, float64(failures-1))), time.Minute) backoffTime = jitter(backoffTime) s.log.WarnContext( ctx, @@ -132,6 +139,7 @@ func (s *WorkloadIdentityJWTService) Run(ctx context.Context) error { failures++ continue } + s.statusReporter.Report(readyz.Healthy) failures = 0 } } diff --git a/lib/tbot/service_workload_identity_x509.go b/lib/tbot/service_workload_identity_x509.go index b956c6bf3b184..21c9ef024858e 100644 --- a/lib/tbot/service_workload_identity_x509.go +++ b/lib/tbot/service_workload_identity_x509.go @@ -36,6 +36,7 @@ import ( "github.com/gravitational/teleport/lib/reversetunnelclient" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" ) @@ -48,6 +49,7 @@ type WorkloadIdentityX509Service struct { getBotIdentity getBotIdentityFn log *slog.Logger resolver reversetunnelclient.Resolver + statusReporter readyz.Reporter // trustBundleCache is the cache of trust bundles. It only needs to be // provided when running in daemon mode. trustBundleCache *workloadidentity.TrustBundleCache @@ -56,7 +58,10 @@ type WorkloadIdentityX509Service struct { // String returns a human-readable description of the service. func (s *WorkloadIdentityX509Service) String() string { - return fmt.Sprintf("workload-identity-x509 (%s)", s.cfg.Destination.String()) + return cmp.Or( + s.cfg.Name, + fmt.Sprintf("workload-identity-x509 (%s)", s.cfg.Destination.String()), + ) } // OneShot runs the service once, generating the output and writing it to the @@ -100,6 +105,10 @@ func (s *WorkloadIdentityX509Service) Run(ctx context.Context) error { return trace.Wrap(err, "getting CRL set from cache") } + if s.statusReporter == nil { + s.statusReporter = readyz.NoopReporter() + } + jitter := retryutils.DefaultJitter var x509Cred *workloadidentityv1pb.Credential var privateKey crypto.Signer @@ -109,10 +118,8 @@ func (s *WorkloadIdentityX509Service) Run(ctx context.Context) error { for { var retryAfter <-chan time.Time if failures > 0 { - backoffTime := time.Second * time.Duration(math.Pow(2, float64(failures-1))) - if backoffTime > time.Minute { - backoffTime = time.Minute - } + s.statusReporter.Report(readyz.Unhealthy) + backoffTime := min(time.Second*time.Duration(math.Pow(2, float64(failures-1))), time.Minute) backoffTime = jitter(backoffTime) s.log.WarnContext( ctx, @@ -170,6 +177,7 @@ func (s *WorkloadIdentityX509Service) Run(ctx context.Context) error { failures++ continue } + s.statusReporter.Report(readyz.Healthy) failures = 0 } } diff --git a/lib/tbot/tbot.go b/lib/tbot/tbot.go index 5ed5e31d28422..8c8dc9472a43f 100644 --- a/lib/tbot/tbot.go +++ b/lib/tbot/tbot.go @@ -48,6 +48,7 @@ import ( "github.com/gravitational/teleport/lib/tbot/client" "github.com/gravitational/teleport/lib/tbot/config" "github.com/gravitational/teleport/lib/tbot/identity" + "github.com/gravitational/teleport/lib/tbot/readyz" "github.com/gravitational/teleport/lib/tbot/workloadidentity" "github.com/gravitational/teleport/lib/utils" ) @@ -226,6 +227,8 @@ func (b *Bot) Run(ctx context.Context) (err error) { }() } + statusRegistry := readyz.NewRegistry() + b.mu.Lock() b.botIdentitySvc = &identityService{ cfg: b.cfg, @@ -234,6 +237,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { log: b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "identity"), ), + statusReporter: statusRegistry.AddService("identity"), } b.mu.Unlock() @@ -270,8 +274,9 @@ func (b *Bot) Run(ctx context.Context) (err error) { // Setup all other services if b.cfg.DiagAddr != "" { services = append(services, &diagnosticsService{ - diagAddr: b.cfg.DiagAddr, - pprofEnabled: b.cfg.Debug, + diagAddr: b.cfg.DiagAddr, + pprofEnabled: b.cfg.Debug, + statusRegistry: statusRegistry, log: b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "diagnostics"), ), @@ -291,6 +296,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { botIdentityReadyCh: b.botIdentitySvc.Ready(), interval: time.Minute * 30, retryLimit: 5, + statusReporter: statusRegistry.AddService("heartbeat"), }) services = append(services, &caRotationService{ @@ -301,6 +307,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { teleport.ComponentKey, teleport.Component(componentTBot, "ca-rotation"), ), reloadBroadcaster: reloadBroadcaster, + statusReporter: statusRegistry.AddService("ca-rotation"), }) // We only want to create this service if it's needed by a dependent @@ -321,6 +328,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { Logger: b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "spiffe-trust-bundle-cache"), ), + StatusReporter: statusRegistry.AddService("spiffe-trust-bundle-cache"), }) if err != nil { return nil, trace.Wrap(err) @@ -340,6 +348,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { Logger: b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "crl-cache"), ), + StatusReporter: statusRegistry.AddService("crl-cache"), }) if err != nil { return nil, trace.Wrap(err) @@ -384,6 +393,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.DatabaseTunnelService: svc := &DatabaseTunnelService{ @@ -398,6 +408,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.ExampleService: services = append(services, &ExampleService{ @@ -418,6 +429,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.KubernetesOutput: svc := &KubernetesOutputService{ @@ -434,6 +446,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.KubernetesV2Output: svc := &KubernetesV2OutputService{ @@ -450,6 +463,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.SPIFFESVIDOutput: svc := &SPIFFESVIDOutputService{ @@ -462,6 +476,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) if !b.cfg.Oneshot { tbCache, err := setupTrustBundleCache() if err != nil { @@ -483,6 +498,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.ApplicationOutput: svc := &ApplicationOutputService{ @@ -497,6 +513,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.DatabaseOutput: svc := &DatabaseOutputService{ @@ -511,6 +528,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.IdentityOutput: svc := &IdentityOutputService{ @@ -528,6 +546,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.UnstableClientCredentialOutput: svc := &ClientCredentialOutputService{ @@ -541,6 +560,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.ApplicationTunnelService: svc := &ApplicationTunnelService{ @@ -555,6 +575,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.WorkloadIdentityX509Service: svc := &WorkloadIdentityX509Service{ @@ -567,6 +588,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) if !b.cfg.Oneshot { tbCache, err := setupTrustBundleCache() if err != nil { @@ -591,6 +613,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) if !b.cfg.Oneshot { tbCache, err := setupTrustBundleCache() if err != nil { @@ -636,6 +659,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) case *config.WorkloadIdentityAWSRAService: svc := &WorkloadIdentityAWSRAService{ @@ -650,6 +674,7 @@ func (b *Bot) Run(ctx context.Context) (err error) { svc.log = b.log.With( teleport.ComponentKey, teleport.Component(componentTBot, "svc", svc.String()), ) + svc.statusReporter = statusRegistry.AddService(svc.String()) services = append(services, svc) default: return trace.BadParameter("unknown service type: %T", svcCfg) diff --git a/lib/tbot/workloadidentity/crl_cache.go b/lib/tbot/workloadidentity/crl_cache.go index a5f34165bdb22..3d994ec82bc8f 100644 --- a/lib/tbot/workloadidentity/crl_cache.go +++ b/lib/tbot/workloadidentity/crl_cache.go @@ -29,6 +29,7 @@ import ( "github.com/gravitational/trace" workloadidentityv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/workloadidentity/v1" + "github.com/gravitational/teleport/lib/tbot/readyz" ) // CRLSet is a collection of CRLs. @@ -72,6 +73,7 @@ type CRLCache struct { revocationsClient workloadidentityv1pb.WorkloadIdentityRevocationServiceClient logger *slog.Logger botIdentityReadyCh <-chan struct{} + statusReporter readyz.Reporter mu sync.Mutex crlSet *CRLSet @@ -84,6 +86,7 @@ type CRLCacheConfig struct { RevocationsClient workloadidentityv1pb.WorkloadIdentityRevocationServiceClient Logger *slog.Logger BotIdentityReadyCh <-chan struct{} + StatusReporter readyz.Reporter } // NewCRLCache creates a new CRLCache. @@ -94,10 +97,14 @@ func NewCRLCache(cfg CRLCacheConfig) (*CRLCache, error) { case cfg.Logger == nil: return nil, trace.BadParameter("missing Logger") } + if cfg.StatusReporter == nil { + cfg.StatusReporter = readyz.NoopReporter() + } return &CRLCache{ revocationsClient: cfg.RevocationsClient, logger: cfg.Logger, botIdentityReadyCh: cfg.BotIdentityReadyCh, + statusReporter: cfg.StatusReporter, initialized: make(chan struct{}), }, nil } @@ -147,6 +154,7 @@ func (m *CRLCache) Run(ctx context.Context) error { "error", err, "backoff", trustBundleInitFailureBackoff, ) + m.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) } select { case <-ctx.Done(): @@ -168,6 +176,7 @@ func (m *CRLCache) watch(ctx context.Context) error { return trace.Wrap(err, "opening CRL stream") } + m.statusReporter.Report(readyz.Healthy) for { res, err := stream.Recv() if err != nil { diff --git a/lib/tbot/workloadidentity/trust_bundle_cache.go b/lib/tbot/workloadidentity/trust_bundle_cache.go index 707ce302a811d..8c651b4019e1f 100644 --- a/lib/tbot/workloadidentity/trust_bundle_cache.go +++ b/lib/tbot/workloadidentity/trust_bundle_cache.go @@ -41,6 +41,7 @@ import ( "github.com/gravitational/teleport/api/utils/keys" "github.com/gravitational/teleport/lib/jwt" "github.com/gravitational/teleport/lib/services" + "github.com/gravitational/teleport/lib/tbot/readyz" ) var tracer = otel.Tracer("github.com/gravitational/teleport/lib/spiffe") @@ -177,7 +178,8 @@ type TrustBundleCache struct { clusterName string botIdentityReadyCh <-chan struct{} - logger *slog.Logger + logger *slog.Logger + statusReporter readyz.Reporter mu sync.RWMutex bundleSet *BundleSet @@ -199,6 +201,7 @@ type TrustBundleCacheConfig struct { ClusterName string Logger *slog.Logger BotIdentityReadyCh <-chan struct{} + StatusReporter readyz.Reporter } // NewTrustBundleCache creates a new TrustBundleCache. @@ -215,6 +218,9 @@ func NewTrustBundleCache(cfg TrustBundleCacheConfig) (*TrustBundleCache, error) case cfg.Logger == nil: return nil, trace.BadParameter("missing Logger") } + if cfg.StatusReporter == nil { + cfg.StatusReporter = readyz.NoopReporter() + } return &TrustBundleCache{ federationClient: cfg.FederationClient, trustClient: cfg.TrustClient, @@ -222,6 +228,7 @@ func NewTrustBundleCache(cfg TrustBundleCacheConfig) (*TrustBundleCache, error) clusterName: cfg.ClusterName, logger: cfg.Logger, botIdentityReadyCh: cfg.BotIdentityReadyCh, + statusReporter: cfg.StatusReporter, initialized: make(chan struct{}), }, nil } @@ -260,6 +267,7 @@ func (m *TrustBundleCache) Run(ctx context.Context) error { "error", err, "backoff", trustBundleInitFailureBackoff, ) + m.statusReporter.ReportReason(readyz.Unhealthy, err.Error()) } select { case <-ctx.Done(): @@ -342,6 +350,8 @@ func (m *TrustBundleCache) watch(ctx context.Context) error { return trace.Wrap(watcher.Error(), "watcher closed before initialization") } + m.statusReporter.Report(readyz.Healthy) + // Now that we know our watcher is streaming events, we can fetch the // current point-in-time list of resources. bundleSet, err := FetchInitialBundleSet(