From e9d63563cb7ce3f709a735d7dc56060f64639028 Mon Sep 17 00:00:00 2001 From: Rana Ian Date: Sat, 25 Oct 2025 16:06:35 -0700 Subject: [PATCH] Enable health checks for Kubernetes with virtual defaults Health checks are enabled for all Kubernetes clusters by default. A design of creating one health check config default per resource type is implemented. The choice eases adoption of health checks, supports existing clusters that already have database health checks, and avoids migrating the backend database. A new Kubernetes-specific `default-kube` health check config is added. A database-specific `default` health check config already exists, and is preserved. A virtual default design is implemented by returning health check configs from memory if they don't exist in the backend database. The approach has the benefit of not re-inserting default values to the backend after they're deleted, which a prior approach had. Virtual defaults are added at the local health check service level, and returned from functions `GetHealthCheckConfig` and `ListHealthCheckConfigs`. Virtual defaults may be written, updated, and deleted to and from the backend. While virtual defaults may be deleted, deletion has the net effect of resetting the config to default settings, and matching all resources of that type (db, kube). Virtual defaults are always returned from health check `get` and `list` functions. 
Changes: - Added `default-kube` health check config specific to Kubernetes only - Updated local service functions `GetHealthCheckConfig` and `ListHealthCheckConfigs` to return virtual defaults - Added unit tests - Updated health check documentation with `default-kube` and info about virtual defaults Part of #58413 Co-authored-by: Edoardo Spadolini --- constants.go | 15 +- .../kubernetes-access/health-checks.mdx | 18 +- lib/auth/export_test.go | 4 - .../healthcheckconfigv1/service_test.go | 8 +- lib/auth/init.go | 27 - lib/auth/init_test.go | 39 +- lib/cache/cache_test.go | 5 + lib/cache/health_check_config_test.go | 77 ++- lib/healthcheck/manager_test.go | 258 ++++++++- lib/services/health_check_config_test.go | 16 + lib/services/local/events.go | 14 +- lib/services/local/health_check_config.go | 122 ++++- .../local/health_check_config_test.go | 507 +++++++++++++++--- lib/services/presets.go | 44 +- lib/services/watcher_test.go | 7 +- 15 files changed, 942 insertions(+), 219 deletions(-) diff --git a/constants.go b/constants.go index fc0d4f891a1ef..73365df4a381b 100644 --- a/constants.go +++ b/constants.go @@ -775,9 +775,18 @@ const ( var PresetRoles = []string{PresetEditorRoleName, PresetAccessRoleName, PresetAuditorRoleName} const ( - // PresetDefaultHealthCheckConfigName is the name of a preset - // default health_check_config that enables health checks for all resources. - PresetDefaultHealthCheckConfigName = "default" + // VirtualDefaultHealthCheckConfigDBName is the name of a virtual + // health_check_config that enables health checks for all database + // resources. For historical reasons, its value is "default" even + // though it applies to databases only. + VirtualDefaultHealthCheckConfigDBName = "default" + // VirtualDefaultHealthCheckConfigKubeName is the name of a virtual + // health_check_config that enables health checks for all Kubernetes + // resources. 
+ VirtualDefaultHealthCheckConfigKubeName = "default-kube" + // VirtualDefaultHealthCheckConfigCount is the number of virtual + // health_check_config resources. + VirtualDefaultHealthCheckConfigCount = 2 ) const ( diff --git a/docs/pages/enroll-resources/kubernetes-access/health-checks.mdx b/docs/pages/enroll-resources/kubernetes-access/health-checks.mdx index c28567e13a59c..f5e0d36e66ed3 100644 --- a/docs/pages/enroll-resources/kubernetes-access/health-checks.mdx +++ b/docs/pages/enroll-resources/kubernetes-access/health-checks.mdx @@ -177,14 +177,12 @@ spec: kubernetes_labels_expression: 'labels["owner"] == "platform-team"' ``` -A `default_kube` `health_check_config` is introduced in version (= kubernetes.health_check_min_version =), and enables all Kubernetes clusters to participate in health checks. +A `default-kube` `health_check_config` is introduced in version (= kubernetes.health_check_min_version =), and enables all Kubernetes clusters to participate in health checks. ```yaml kind: health_check_config metadata: description: Enables all health checks by default - labels: - teleport.internal/resource-type: preset - name: default_kube + name: default-kube spec: match: kubernetes_labels: @@ -194,6 +192,8 @@ spec: version: v1 ``` +`default-kube` may be disabled, but not permanently deleted. Deleting with `tctl rm health_check_config/default-kube` has the effect of resetting the config to its default settings and matching all Kubernetes clusters. + A different `default` `health_check_config` also exists, and focuses on matching databases for health checks. Multiple different `health_check_config` resources may be created for different groups of Kubernetes clusters. When multiple `health_check_config` match the same Kubernetes cluster, configs are sorted in ascending order by name, and only the first config applies (e.g., the name "00-my-config" has greater precedence than "10-my-config"). 
@@ -202,14 +202,12 @@ Multiple different `health_check_config` resources may be created for different Set the `match.disabled` field to `true` on any `health_check_config`. -For example, use `tctl edit health_check_config/default_kube` +For example, use `tctl edit health_check_config/default-kube` ```yaml kind: health_check_config metadata: description: Enables all health checks by default - labels: - teleport.internal/resource-type: preset - name: default_kube + name: default-kube spec: match: disable: true @@ -226,7 +224,7 @@ Any defined labels, such as `kubernetes_labels`, are ignored when `disable: true Read the default health check config with `tctl get`: ```bash -$ tctl get health_check_config/default_kube +$ tctl get health_check_config/default-kube ``` Create a new health check config with `tctl create`: @@ -236,7 +234,7 @@ $ tctl create health_check_config.yaml Update an existing config interactively with `tctl edit`: ```bash -$ tctl edit health_check_config/default_kube +$ tctl edit health_check_config/default-kube ``` Delete a health check config with `tctl rm`: diff --git a/lib/auth/export_test.go b/lib/auth/export_test.go index 7571eac1e09bc..5ad518da95f12 100644 --- a/lib/auth/export_test.go +++ b/lib/auth/export_test.go @@ -286,10 +286,6 @@ func CreatePresetRoles(ctx context.Context, um PresetRoleManager) error { return createPresetRoles(ctx, um) } -func CreatePresetHealthCheckConfig(ctx context.Context, svc services.HealthCheckConfig) error { - return createPresetHealthCheckConfig(ctx, svc) -} - func GetPresetUsers() []types.User { return getPresetUsers() } diff --git a/lib/auth/healthcheckconfig/healthcheckconfigv1/service_test.go b/lib/auth/healthcheckconfig/healthcheckconfigv1/service_test.go index 03f80aba67079..381da2a9fa00a 100644 --- a/lib/auth/healthcheckconfig/healthcheckconfigv1/service_test.go +++ b/lib/auth/healthcheckconfig/healthcheckconfigv1/service_test.go @@ -28,6 +28,7 @@ import ( "github.com/gravitational/trace" 
"github.com/stretchr/testify/require" + "github.com/gravitational/teleport" healthcheckconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/healthcheckconfig/v1" labelv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/label/v1" "github.com/gravitational/teleport/api/types" @@ -98,7 +99,8 @@ func TestHealthCheckConfigCRUD(t *testing.T) { resp, err := clt.ServiceUnderTest.ListHealthCheckConfigs(ctx, &healthcheckconfigv1.ListHealthCheckConfigsRequest{}) if err == nil { require.NotNil(t, resp) - require.Len(t, resp.Configs, 2, "the test bootstrapped exactly 2 health_check_config resources") + require.Len(t, resp.Configs, 2+teleport.VirtualDefaultHealthCheckConfigCount, + "expected 2 inserted and virtual defaults") } return err }, @@ -199,7 +201,7 @@ func (c *accessTest) run(t *testing.T) { ctx, clt := c.setup(t, spec) err := c.actionFn(t, ctx, clt) require.Error(t, err) - require.IsType(t, trace.AccessDenied(""), err) + require.True(t, trace.IsAccessDenied(err)) }) t.Run(fmt.Sprintf("%s is denied", c.name), func(t *testing.T) { @@ -210,7 +212,7 @@ func (c *accessTest) run(t *testing.T) { ctx, clt := c.setup(t, spec) err := c.actionFn(t, ctx, clt) require.Error(t, err) - require.IsType(t, trace.AccessDenied(""), err) + require.True(t, trace.IsAccessDenied(err)) }) } diff --git a/lib/auth/init.go b/lib/auth/init.go index 676d0c08b99b9..bd58f1c055dfd 100644 --- a/lib/auth/init.go +++ b/lib/auth/init.go @@ -682,12 +682,6 @@ func initCluster(ctx context.Context, cfg InitConfig, asrv *Server) error { asrv.logger.WarnContext(ctx, "error creating preset database object import rules", "error", err) } span.AddEvent("completed creating database object import rules") - - span.AddEvent("creating preset health check config") - if err := createPresetHealthCheckConfig(ctx, asrv); err != nil { - return trace.Wrap(err) - } - span.AddEvent("completed creating preset health check config") } else { asrv.logger.InfoContext(ctx, "skipping preset role and user 
creation") } @@ -1489,27 +1483,6 @@ func createPresetDatabaseObjectImportRule(ctx context.Context, rules services.Da return nil } -// createPresetHealthCheckConfig creates a default preset health check config -// resource that enables health checks on all resources. -func createPresetHealthCheckConfig(ctx context.Context, svc services.HealthCheckConfig) error { - page, _, err := svc.ListHealthCheckConfigs(ctx, 0, "") - if err != nil { - return trace.Wrap(err, "failed listing available health check configs") - } - if len(page) > 0 { - return nil - } - preset := services.NewPresetHealthCheckConfig() - _, err = svc.CreateHealthCheckConfig(ctx, preset) - if err != nil && !trace.IsAlreadyExists(err) { - return trace.Wrap(err, - "failed creating preset health_check_config %s", - preset.GetMetadata().GetName(), - ) - } - return nil -} - // isFirstStart returns 'true' if the auth server is starting for the 1st time // on this server. func isFirstStart(ctx context.Context, authServer *Server, cfg InitConfig) (bool, error) { diff --git a/lib/auth/init_test.go b/lib/auth/init_test.go index 673c226259b6e..1187298189709 100644 --- a/lib/auth/init_test.go +++ b/lib/auth/init_test.go @@ -45,7 +45,6 @@ import ( "golang.org/x/crypto/ssh" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/testing/protocmp" - "google.golang.org/protobuf/types/known/durationpb" kyaml "k8s.io/apimachinery/pkg/util/yaml" "github.com/gravitational/teleport" @@ -141,7 +140,7 @@ func TestBadIdentity(t *testing.T) { // bad cert type _, err = state.ReadSSHIdentityFromKeyPair(priv, pub) - require.IsType(t, trace.BadParameter(""), err) + require.ErrorAs(t, err, new(*trace.BadParameterError)) // missing authority domain cert, err := a.GenerateHostCert(sshca.HostCertificateRequest{ @@ -158,7 +157,7 @@ func TestBadIdentity(t *testing.T) { require.NoError(t, err) _, err = state.ReadSSHIdentityFromKeyPair(priv, cert) - require.IsType(t, trace.BadParameter(""), err) + require.ErrorAs(t, err, 
new(*trace.BadParameterError)) // missing host uuid cert, err = a.GenerateHostCert(sshca.HostCertificateRequest{ @@ -175,7 +174,7 @@ func TestBadIdentity(t *testing.T) { require.NoError(t, err) _, err = state.ReadSSHIdentityFromKeyPair(priv, cert) - require.IsType(t, trace.BadParameter(""), err) + require.ErrorAs(t, err, new(*trace.BadParameterError)) // unrecognized role cert, err = a.GenerateHostCert(sshca.HostCertificateRequest{ @@ -192,7 +191,7 @@ func TestBadIdentity(t *testing.T) { require.NoError(t, err) _, err = state.ReadSSHIdentityFromKeyPair(priv, cert) - require.IsType(t, trace.BadParameter(""), err) + require.ErrorAs(t, err, new(*trace.BadParameterError)) } func TestSignatureAlgorithmSuite(t *testing.T) { @@ -969,25 +968,15 @@ func TestPresets(t *testing.T) { err := auth.CreatePresetRoles(ctx, as) require.NoError(t, err) - err = auth.CreatePresetHealthCheckConfig(ctx, as) - require.NoError(t, err) - // Second call should not fail err = auth.CreatePresetRoles(ctx, as) require.NoError(t, err) - err = auth.CreatePresetHealthCheckConfig(ctx, as) - require.NoError(t, err) - // Presets were created for _, role := range presetRoleNames { _, err := as.GetRole(ctx, role) require.NoError(t, err) } - - cfg, err := as.GetHealthCheckConfig(ctx, teleport.PresetDefaultHealthCheckConfigName) - require.NoError(t, err) - require.NotNil(t, cfg) }) // Makes sure that existing role with the same name is not modified @@ -1015,26 +1004,6 @@ func TestPresets(t *testing.T) { require.Equal(t, access.GetLogins(types.Allow), out.GetLogins(types.Allow)) }) - t.Run("ExistingHealthCheckConfig", func(t *testing.T) { - as := newTestAuthServer(ctx, t) - clock := clockwork.NewFakeClock() - as.SetClock(clock) - - // an existing health check config should not be modified by init - cfg := services.NewPresetHealthCheckConfig() - cfg.Spec.Interval = durationpb.New(42 * time.Second) - cfg, err := as.CreateHealthCheckConfig(ctx, cfg) - require.NoError(t, err) - - err = 
auth.CreatePresetHealthCheckConfig(ctx, as) - require.NoError(t, err) - - // Preset was created. Ensure it didn't overwrite the existing config - got, err := as.GetHealthCheckConfig(ctx, cfg.GetMetadata().GetName()) - require.NoError(t, err) - require.Equal(t, cfg.Spec.Interval.AsDuration(), got.Spec.Interval.AsDuration()) - }) - // If a default allow condition is not present, ensure it gets added. t.Run("AddDefaultAllowConditions", func(t *testing.T) { as := newTestAuthServer(ctx, t) diff --git a/lib/cache/cache_test.go b/lib/cache/cache_test.go index 472f674235ae1..8d2a343758dc0 100644 --- a/lib/cache/cache_test.go +++ b/lib/cache/cache_test.go @@ -1363,6 +1363,11 @@ func testResources[T types.Resource](t *testing.T, p *testPack, funcs testFuncs[ // testResources153 is a wrapper for testing resources conforming to types.Resource153 func testResources153[T types.Resource153](t *testing.T, p *testPack, funcs testFuncs[T], opts ...optionsFunc) { + // TODO(rana): Add broader support for virtual resources in list operations. + // Virtual resources change the total count returned by list operations, + // and is unexpected for the current test. When updated, we can remove virtual + // resource filtering and paging from lib/cache/health_check_config_test.go. + opts = append(opts, withSkipPaginationTest()) funcs.resource = defaultResource153Ops[T]() testResourcesInternal(t, p, funcs, opts...) 
} diff --git a/lib/cache/health_check_config_test.go b/lib/cache/health_check_config_test.go index c69de523d4c70..0b8dbaa5658bb 100644 --- a/lib/cache/health_check_config_test.go +++ b/lib/cache/health_check_config_test.go @@ -25,6 +25,8 @@ import ( "github.com/gravitational/trace" "github.com/stretchr/testify/require" + "github.com/gravitational/teleport" + "github.com/gravitational/teleport/api/defaults" healthcheckconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/healthcheckconfig/v1" "github.com/gravitational/teleport/api/types/healthcheckconfig" ) @@ -57,13 +59,84 @@ func TestHealthCheckConfig(t *testing.T) { _, err := p.healthCheckConfig.CreateHealthCheckConfig(ctx, cfg) return trace.Wrap(err) }, - list: p.healthCheckConfig.ListHealthCheckConfigs, + list: filterHealthCfgNonVirtual(p.healthCheckConfig.ListHealthCheckConfigs), update: func(ctx context.Context, cfg *healthcheckconfigv1.HealthCheckConfig) error { _, err := p.healthCheckConfig.UpdateHealthCheckConfig(ctx, cfg) return trace.Wrap(err) }, deleteAll: p.healthCheckConfig.DeleteAllHealthCheckConfigs, - cacheList: p.cache.ListHealthCheckConfigs, + cacheList: filterHealthCfgNonVirtual(p.cache.ListHealthCheckConfigs), cacheGet: p.cache.GetHealthCheckConfig, }) } + +type listHealthCfgFunc func(ctx context.Context, pageSize int, pageToken string) ([]*healthcheckconfigv1.HealthCheckConfig, string, error) + +// filterHealthCfgNonVirtual excludes virtual defaults while maintaining pagination. 
+func filterHealthCfgNonVirtual(listFn listHealthCfgFunc) listHealthCfgFunc { + return func(ctx context.Context, pageSize int, pageToken string) ([]*healthcheckconfigv1.HealthCheckConfig, string, error) { + var allNonVirtual []*healthcheckconfigv1.HealthCheckConfig + var token string + for { + items, nextPageToken, err := listFn(ctx, defaults.DefaultChunkSize, token) + if err != nil { + return nil, "", trace.Wrap(err) + } + for _, item := range items { + if !isVirtualDefaultHealthCheckConfig(item.GetMetadata().GetName()) { + allNonVirtual = append(allNonVirtual, item) + } + } + if nextPageToken == "" { + break + } + token = nextPageToken + } + page, nextPageToken := pageHealthCfg(allNonVirtual, pageSize, pageToken) + return page, nextPageToken, nil + } +} + +// pageHealthCfg creates a page from a slice. +func pageHealthCfg( + items []*healthcheckconfigv1.HealthCheckConfig, + pageSize int, + pageToken string, +) ([]*healthcheckconfigv1.HealthCheckConfig, string) { + if len(items) == 0 { + return nil, "" + } + // look for the start index + var idxStart int + if pageToken != "" { + for n, item := range items { + if item.GetMetadata().GetName() == pageToken { + idxStart = n + 1 + break + } + } + } + if idxStart >= len(items) { + return nil, "" + } + // look for the end index + idxEnd := len(items) + if pageSize > 0 && idxStart+pageSize < len(items) { + idxEnd = idxStart + pageSize + } + page := items[idxStart:idxEnd] + var nextPageToken string + if idxEnd < len(items) { + nextPageToken = page[len(page)-1].GetMetadata().GetName() + } + return page, nextPageToken +} + +func isVirtualDefaultHealthCheckConfig(name string) bool { + switch name { + case teleport.VirtualDefaultHealthCheckConfigDBName, + teleport.VirtualDefaultHealthCheckConfigKubeName: + return true + } + return false +} diff --git a/lib/healthcheck/manager_test.go b/lib/healthcheck/manager_test.go index 67012809a4059..66ab3853acf53 100644 --- a/lib/healthcheck/manager_test.go +++ 
b/lib/healthcheck/manager_test.go @@ -126,6 +126,9 @@ func TestManager(t *testing.T) { healthConfigSvc, err := local.NewHealthCheckConfigService(bk) require.NoError(t, err) + // disable virtual defaults to allow testing explicit config matching behavior + disableVirtualHealthCheckDefaults(t, ctx, healthConfigSvc) + // create a health check config that only matches prod databases prodHCC := healthCheckConfigFixture(t, "prod") prodHCC.Spec.Match.DbLabelsExpression = `labels.env == "prod"` @@ -151,7 +154,8 @@ func TestManager(t *testing.T) { mgr.mu.RLock() configs := mgr.configs[:] mgr.mu.RUnlock() - require.Len(t, configs, 1, "starting the manager should have blocked until configs were initialized") + require.Len(t, configs, 1+teleport.VirtualDefaultHealthCheckConfigCount, + "starting the manager should have blocked until configs were initialized") } listener, err := net.Listen("tcp", "localhost:0") @@ -256,14 +260,6 @@ func TestManager(t *testing.T) { require.ErrorIs(t, trace.BadParameter("health check target resource kind \"node\" is not supported"), err) }) - requireTargetHealth := func(t *testing.T, r types.ResourceWithLabels, status types.TargetHealthStatus, reason types.TargetHealthTransitionReason) { - t.Helper() - health, err := mgr.GetTargetHealth(r) - require.NoError(t, err) - require.Equal(t, string(status), health.Status) - require.Equal(t, string(reason), health.TransitionReason) - } - prodPass := lastResultPassTestEvent(prodDB.GetName()) prodFail := lastResultFailTestEvent(prodDB.GetName()) // initially checks should be disabled for dev but enabled for prod @@ -272,8 +268,8 @@ func TestManager(t *testing.T) { deny(prodFail), denyAll(devDB.GetName()), ) - requireTargetHealth(t, devDB, types.TargetHealthStatusUnknown, types.TargetHealthTransitionReasonDisabled) - requireTargetHealth(t, prodDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusUnknown, 
types.TargetHealthTransitionReasonDisabled) + requireTargetHealth(t, mgr, prodDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) // another check should reach the prodHCC configured threshold. awaitTestEvents(t, eventsCh, clock, @@ -282,7 +278,7 @@ func TestManager(t *testing.T) { deny(prodFail), denyAll(devDB.GetName()), ) - requireTargetHealth(t, prodDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, prodDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) // now reject health check connections to simulate an unhealthy endpoint prodDialer.deny() @@ -292,7 +288,7 @@ func TestManager(t *testing.T) { expect(prodFail, prodFail), deny(prodPass), denyAll(devDB.GetName())) - requireTargetHealth(t, prodDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, prodDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) // enable dev health checks for dev db and simulate an unhealthy endpoint on init devDialer.deny() @@ -314,7 +310,7 @@ func TestManager(t *testing.T) { expect(devFail, prodFail), deny(prodPass, devPass), ) - requireTargetHealth(t, devDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) // the next dev check should update health status because the unhealthy threshold was met awaitTestEvents(t, eventsCh, clock, @@ -322,7 +318,7 @@ func TestManager(t *testing.T) { expect(devFail, prodFail), deny(prodPass, devPass), ) - requireTargetHealth(t, devDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) // set the unhealthy threshold high for dev, so we can simulate several 
// failing checks after dev becomes healthy without making it become unhealthy @@ -347,8 +343,8 @@ func TestManager(t *testing.T) { expect(devPass, prodPass), deny(devFail, prodFail), ) - requireTargetHealth(t, devDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) - requireTargetHealth(t, prodDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, prodDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) // now disable the prod health checks err = healthConfigSvc.DeleteHealthCheckConfig(ctx, "prod") @@ -365,9 +361,9 @@ func TestManager(t *testing.T) { deny(devFail, prodFail), ) // prod db should be disabled eventually - requireTargetHealth(t, prodDB, types.TargetHealthStatusUnknown, types.TargetHealthTransitionReasonDisabled) + requireTargetHealth(t, mgr, prodDB, types.TargetHealthStatusUnknown, types.TargetHealthTransitionReasonDisabled) // but dev db should still be healthy - requireTargetHealth(t, devDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) // fail some checks, then update config to lower the unhealthy threshold devDialer.deny() @@ -377,7 +373,7 @@ func TestManager(t *testing.T) { expect(devFail, devFail, devFail), denyAll(prodDB.GetName()), ) - requireTargetHealth(t, devDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusHealthy, types.TargetHealthTransitionReasonThreshold) devHCC.Spec.UnhealthyThreshold = 1 devHCC.Spec.Interval = durationpb.New(time.Second * 100) _, err = healthConfigSvc.UpdateHealthCheckConfig(ctx, devHCC) @@ -389,7 +385,7 @@ func TestManager(t *testing.T) { deny(devPass, 
devFail), ) // config update should set unhealthy status since the new threshold is already met - requireTargetHealth(t, devDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) + requireTargetHealth(t, mgr, devDB, types.TargetHealthStatusUnhealthy, types.TargetHealthTransitionReasonThreshold) // remove a target err = mgr.RemoveTarget(devDB) @@ -404,7 +400,7 @@ func TestManager(t *testing.T) { require.ErrorIs(t, trace.NotFound("health checker \"name=devDB, kind=db\" not found"), err) // prodDB should still be disabled - requireTargetHealth(t, prodDB, types.TargetHealthStatusUnknown, types.TargetHealthTransitionReasonDisabled) + requireTargetHealth(t, mgr, prodDB, types.TargetHealthStatusUnknown, types.TargetHealthTransitionReasonDisabled) err = mgr.RemoveTarget(prodDB) require.NoError(t, err) @@ -415,6 +411,198 @@ func TestManager(t *testing.T) { ) } +func TestManager_VirtualDefaults(t *testing.T) { + t.Parallel() + // Verify that health check Manager correctly loads and uses virtual defaults. + // Exercises Manager + HealthCheckConfigService + Worker coordination with virtual defaults. 
+ tests := []struct { + name string + component string + virtualDefaultName string + createResource func() (types.ResourceWithLabels, error) + setupHealthChecker func(t *testing.T) (HealthChecker, func()) + }{ + { + name: "database", + component: teleport.ComponentDatabase, + virtualDefaultName: teleport.VirtualDefaultHealthCheckConfigDBName, + createResource: func() (types.ResourceWithLabels, error) { + return types.NewDatabaseV3(types.Metadata{ + Name: "test-db", + }, types.DatabaseSpecV3{ + Protocol: defaults.ProtocolPostgres, + URI: "unused", + }) + }, + setupHealthChecker: func(t *testing.T) (HealthChecker, func()) { + listener, err := net.Listen("tcp", "localhost:0") + require.NoError(t, err) + dialer := &fakeDialer{} + checker := &TargetDialer{ + Resolver: func(ctx context.Context) ([]string, error) { + return []string{listener.Addr().String()}, nil + }, + dial: dialer.DialContext, + } + cleanup := func() { _ = listener.Close() } + return checker, cleanup + }, + }, + { + name: "kube", + component: teleport.ComponentKube, + virtualDefaultName: teleport.VirtualDefaultHealthCheckConfigKubeName, + createResource: func() (types.ResourceWithLabels, error) { + return types.NewKubernetesClusterV3(types.Metadata{ + Name: "test-kube", + }, types.KubernetesClusterSpecV3{}) + }, + setupHealthChecker: func(t *testing.T) (HealthChecker, func()) { + checker := &fakeKubeHealthChecker{} + cleanup := func() {} + return checker, cleanup + }, + }, + } + + ctx := t.Context() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + // setup backend and health check config service + bk, err := memory.New(memory.Config{Context: ctx}) + require.NoError(t, err) + healthConfigSvc, err := local.NewHealthCheckConfigService(bk) + require.NoError(t, err) + + clock := clockwork.NewFakeClock() + eventsCh := make(chan testEvent, 1024) + mgr, err := NewManager(ctx, ManagerConfig{ + Component: tt.component, + Events: local.NewEventsService(bk), + 
HealthCheckConfigReader: healthConfigSvc, + Clock: clock, + }) + require.NoError(t, err) + require.NoError(t, mgr.Start(ctx)) + t.Cleanup(func() { + require.NoError(t, mgr.Close()) + }) + + resource, err := tt.createResource() + require.NoError(t, err) + + healthChecker, cleanup := tt.setupHealthChecker(t) + t.Cleanup(cleanup) + + err = mgr.AddTarget(Target{ + HealthChecker: healthChecker, + GetResource: func() types.ResourceWithLabels { + return resource + }, + onHealthCheck: func(lastResultErr error) { + eventsCh <- lastResultTestEvent(resource.GetName(), lastResultErr) + }, + onConfigUpdate: func() { + eventsCh <- configUpdateTestEvent(resource.GetName()) + }, + }) + require.NoError(t, err) + + // virtual defaults start with health checks enabled and matching all resources + passEvent := lastResultPassTestEvent(resource.GetName()) + awaitTestEvents(t, eventsCh, clock, + advanceCount(int(apidefaults.HealthCheckHealthyThreshold)), + expect(passEvent, passEvent), + ) + requireTargetHealth(t, mgr, resource, + types.TargetHealthStatusHealthy, + types.TargetHealthTransitionReasonThreshold) + + // get the virtual default + // modify default to detect change after deletion + virtualDefault, err := healthConfigSvc.GetHealthCheckConfig(ctx, tt.virtualDefaultName) + require.NoError(t, err) + modifiedUnhealthyThreshold := uint32(3) + virtualDefault.Spec.UnhealthyThreshold = modifiedUnhealthyThreshold + virtualDefault.Spec.Match.Disabled = true + + // save modified virtual default to the backend + _, err = healthConfigSvc.CreateHealthCheckConfig(ctx, virtualDefault) + require.NoError(t, err) + awaitTestEvents(t, eventsCh, nil, + expect(configUpdateTestEvent(resource.GetName())), + ) + requireTargetHealth(t, mgr, resource, + types.TargetHealthStatusUnknown, + types.TargetHealthTransitionReasonDisabled) + + // delete persisted virtual default + err = healthConfigSvc.DeleteHealthCheckConfig(ctx, tt.virtualDefaultName) + require.NoError(t, err) + awaitTestEvents(t, eventsCh, 
nil, + expect(configUpdateTestEvent(resource.GetName())), + ) + requireTargetHealth(t, mgr, resource, + types.TargetHealthStatusUnknown, + types.TargetHealthTransitionReasonInit) // init because worker restarted with new config + + // get the new virtual default which has not been persisted + // expect original default values + afterDelete, err := healthConfigSvc.GetHealthCheckConfig(ctx, tt.virtualDefaultName) + require.NoError(t, err) + require.False(t, afterDelete.Spec.Match.Disabled, + "expecting virtual default to revert enabled state") + require.NotEqual(t, modifiedUnhealthyThreshold, afterDelete.Spec.UnhealthyThreshold, + "expecting virtual default should revert to original unhealthy threshold") + awaitTestEvents(t, eventsCh, clock, + advanceCount(int(apidefaults.HealthCheckHealthyThreshold)), + expect(passEvent, passEvent), + ) + requireTargetHealth(t, mgr, resource, + types.TargetHealthStatusHealthy, + types.TargetHealthTransitionReasonThreshold) + + // disable to stop checking health + virtualDefault, err = healthConfigSvc.GetHealthCheckConfig(ctx, tt.virtualDefaultName) + require.NoError(t, err) + virtualDefault.Spec.Match.Disabled = true + _, err = healthConfigSvc.UpdateHealthCheckConfig(ctx, virtualDefault) + require.NoError(t, err) + awaitTestEvents(t, eventsCh, nil, + expect(configUpdateTestEvent(resource.GetName())), + ) + requireTargetHealth(t, mgr, resource, + types.TargetHealthStatusUnknown, + types.TargetHealthTransitionReasonDisabled) + + // re-enabling config matcher starts health checks + virtualDefault.Spec.Match.Disabled = false + _, err = healthConfigSvc.UpdateHealthCheckConfig(ctx, virtualDefault) + require.NoError(t, err) + + awaitTestEvents(t, eventsCh, clock, + expect(configUpdateTestEvent(resource.GetName())), + advanceCount(int(apidefaults.HealthCheckHealthyThreshold)), + expect(passEvent), + ) + requireTargetHealth(t, mgr, resource, + types.TargetHealthStatusHealthy, + types.TargetHealthTransitionReasonThreshold) + + }) + } +} + 
+func requireTargetHealth(t *testing.T, mgr Manager, r types.ResourceWithLabels, status types.TargetHealthStatus, reason types.TargetHealthTransitionReason) { + t.Helper() + health, err := mgr.GetTargetHealth(r) + require.NoError(t, err) + require.Equal(t, string(status), health.Status) + require.Equal(t, string(reason), health.TransitionReason) +} + func healthCheckConfigFixture(t *testing.T, name string) *healthcheckconfigv1.HealthCheckConfig { t.Helper() out, err := healthcheckconfig.NewHealthCheckConfig(name, @@ -569,6 +757,17 @@ func (f *fakeDialer) DialContext(ctx context.Context, network, addr string) (net return f.Dialer.DialContext(ctx, network, addr) } +type fakeKubeHealthChecker struct { +} + +func (m *fakeKubeHealthChecker) CheckHealth(ctx context.Context) ([]string, error) { + return []string{"https://localhost:6443"}, nil +} + +func (m *fakeKubeHealthChecker) GetProtocol() types.TargetHealthProtocol { + return types.TargetHealthProtocolHTTP +} + type testEvent struct { name testEventName target string @@ -617,3 +816,18 @@ func lastResultFailTestEvent(targetName string) testEvent { target: targetName, } } + +func disableVirtualHealthCheckDefaults(t *testing.T, ctx context.Context, svc *local.HealthCheckConfigService) { + t.Helper() + virtualDefaults := []string{ + teleport.VirtualDefaultHealthCheckConfigDBName, + teleport.VirtualDefaultHealthCheckConfigKubeName, + } + for _, name := range virtualDefaults { + cfg, err := svc.GetHealthCheckConfig(ctx, name) + require.NoError(t, err) + cfg.Spec.Match.Disabled = true + _, err = svc.UpdateHealthCheckConfig(ctx, cfg) + require.NoError(t, err) + } +} diff --git a/lib/services/health_check_config_test.go b/lib/services/health_check_config_test.go index 83a13deb1e3a3..d85184f53295d 100644 --- a/lib/services/health_check_config_test.go +++ b/lib/services/health_check_config_test.go @@ -38,6 +38,12 @@ func TestValidateHealthCheckConfig(t *testing.T) { require.ErrorContains(t, err, substr) } } + var noErr = func() 
require.ErrorAssertionFunc { + return func(t require.TestingT, err error, _ ...any) { + t.(*testing.T).Helper() + require.NoError(t, err) + } + } testCases := []struct { name string @@ -387,6 +393,16 @@ func TestValidateHealthCheckConfig(t *testing.T) { }, requireErr: errContains("spec.unhealthy_threshold (11) must not be greater than 10"), }, + { + name: "database virtual default ok", + in: VirtualDefaultHealthCheckConfigDB(), + requireErr: noErr(), + }, + { + name: "kube virtual default ok", + in: VirtualDefaultHealthCheckConfigKube(), + requireErr: noErr(), + }, } for _, tc := range testCases { diff --git a/lib/services/local/events.go b/lib/services/local/events.go index 97403f9c0d2a2..69f60720e55b2 100644 --- a/lib/services/local/events.go +++ b/lib/services/local/events.go @@ -20,6 +20,7 @@ package local import ( "context" + "errors" "log/slog" "slices" "strings" @@ -344,6 +345,10 @@ func (w *watcher) parseEvent(e backend.Event) ([]types.Event, []error) { if p.match(e.Item.Key) { resource, err := p.parse(e) if err != nil { + if eo := parseEventOverrideError(nil); errors.As(err, &eo) { + events = append(events, eo...) + continue + } errs = append(errs, trace.Wrap(err)) continue } @@ -403,7 +408,10 @@ func (w *watcher) Close() error { // resourceParser is an interface // for parsing resource from backend byte event stream type resourceParser interface { - // parse parses resource from the backend event + // parse parses resource from the backend event; if the returned error is a + // parseEventOverrideError then the returned resource is ignored and the + // events in the parseEventOverrideError will be added to the generated + // events instead. 
parse(event backend.Event) (types.Resource, error) // match returns true if event key matches match(key backend.Key) bool @@ -411,6 +419,10 @@ type resourceParser interface { prefixes() []backend.Key } +type parseEventOverrideError []types.Event + +func (p parseEventOverrideError) Error() string { return "parseEventOverrideError" } + // baseParser is a partial implementation of resourceParser for the most common // resource types (stored under a static prefix). type baseParser struct { diff --git a/lib/services/local/health_check_config.go b/lib/services/local/health_check_config.go index 869dec40f7532..8ee062d49d34f 100644 --- a/lib/services/local/health_check_config.go +++ b/lib/services/local/health_check_config.go @@ -20,13 +20,17 @@ package local import ( "context" + "iter" + "strings" "github.com/gravitational/trace" + "github.com/gravitational/teleport" apidefaults "github.com/gravitational/teleport/api/defaults" healthcheckconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/healthcheckconfig/v1" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/lib/backend" + iterstream "github.com/gravitational/teleport/lib/itertools/stream" "github.com/gravitational/teleport/lib/services" "github.com/gravitational/teleport/lib/services/local/generic" ) @@ -58,33 +62,107 @@ func NewHealthCheckConfigService(b backend.Backend) (*HealthCheckConfigService, }, nil } +func (*HealthCheckConfigService) hasVirtualResource(name string) bool { + switch name { + case teleport.VirtualDefaultHealthCheckConfigDBName, + teleport.VirtualDefaultHealthCheckConfigKubeName: + return true + default: + return false + } +} + +func (*HealthCheckConfigService) getVirtualResource(name string) *healthcheckconfigv1.HealthCheckConfig { + switch name { + case teleport.VirtualDefaultHealthCheckConfigDBName: + return services.VirtualDefaultHealthCheckConfigDB() + case teleport.VirtualDefaultHealthCheckConfigKubeName: + return 
services.VirtualDefaultHealthCheckConfigKube() + } + return nil +} + +func (*HealthCheckConfigService) rangeVirtualResources(start string) iter.Seq2[*healthcheckconfigv1.HealthCheckConfig, error] { + return func(yield func(*healthcheckconfigv1.HealthCheckConfig, error) bool) { + switch { + case start <= teleport.VirtualDefaultHealthCheckConfigDBName: + if !yield(services.VirtualDefaultHealthCheckConfigDB(), nil) { + return + } + fallthrough + case start <= teleport.VirtualDefaultHealthCheckConfigKubeName: + if !yield(services.VirtualDefaultHealthCheckConfigKube(), nil) { + return + } + } + } +} + // CreateHealthCheckConfig creates a new HealthCheckConfig resource. func (s *HealthCheckConfigService) CreateHealthCheckConfig(ctx context.Context, config *healthcheckconfigv1.HealthCheckConfig) (*healthcheckconfigv1.HealthCheckConfig, error) { + // we don't need to check if config refers to a potentially virtual resource + // because creating on top of a virtual resource is very convenient so it's + // a break from the semantics of Create that we want to allow created, err := s.svc.CreateResource(ctx, config) return created, trace.Wrap(err) } // GetHealthCheckConfig returns the specified HealthCheckConfig resource. +// A virtual resource, if requested, is always returned even if it doesn't exist in the backend. func (s *HealthCheckConfigService) GetHealthCheckConfig(ctx context.Context, name string) (*healthcheckconfigv1.HealthCheckConfig, error) { item, err := s.svc.GetResource(ctx, name) if err != nil { + if trace.IsNotFound(err) { + if virtualResource := s.getVirtualResource(name); virtualResource != nil { + return virtualResource, nil + } + } return nil, trace.Wrap(err) } return item, nil } // ListHealthCheckConfigs returns a paginated list of HealthCheckConfig resources. +// Virtual resources are always returned even if they don't exist in the backend. 
func (s *HealthCheckConfigService) ListHealthCheckConfigs(ctx context.Context, pageSize int, pageToken string) ([]*healthcheckconfigv1.HealthCheckConfig, string, error) { - items, nextKey, err := s.svc.ListResources(ctx, pageSize, pageToken) + items, nextPageToken, err := generic.CollectPageAndCursor( + iterstream.MergeStreamsWithPriority( + s.svc.Resources(ctx, pageToken, ""), + s.rangeVirtualResources(pageToken), + func(a, b *healthcheckconfigv1.HealthCheckConfig) int { + return strings.Compare(a.GetMetadata().GetName(), b.GetMetadata().GetName()) + }, + ), + pageSize, + func(v *healthcheckconfigv1.HealthCheckConfig) string { + return v.GetMetadata().GetName() + }, + ) if err != nil { return nil, "", trace.Wrap(err) } - return items, nextKey, nil + return items, nextPageToken, nil } // UpdateHealthCheckConfig updates an existing HealthCheckConfig resource. func (s *HealthCheckConfigService) UpdateHealthCheckConfig(ctx context.Context, config *healthcheckconfigv1.HealthCheckConfig) (*healthcheckconfigv1.HealthCheckConfig, error) { + if virtualResource := s.getVirtualResource(config.GetMetadata().GetName()); virtualResource != nil && config.GetMetadata().GetRevision() == virtualResource.GetMetadata().GetRevision() { + // a (successful) conditional update on a virtual resource is a create + // in storage; no real storage item is going to have the same revision + // as the virtual resource, so this must be a conditional update on the + // virtual resource that we know of + created, err := s.svc.CreateResource(ctx, config) + if err != nil { + if trace.IsAlreadyExists(err) { + return nil, trace.Wrap(backend.ErrIncorrectRevision) + } + return nil, trace.Wrap(err) + } + return created, nil + } + // if this was intended to be a conditional update on a virtual resource it + // was not a successful one, but we can let the storage deal with it updated, err := s.svc.ConditionalUpdateResource(ctx, config) return updated, trace.Wrap(err) } @@ -97,7 +175,13 @@ func (s 
*HealthCheckConfigService) UpsertHealthCheckConfig(ctx context.Context, // DeleteHealthCheckConfig removes the specified HealthCheckConfig resource. func (s *HealthCheckConfigService) DeleteHealthCheckConfig(ctx context.Context, name string) error { - return trace.Wrap(s.svc.DeleteResource(ctx, name)) + err := s.svc.DeleteResource(ctx, name) + if trace.IsNotFound(err) && s.hasVirtualResource(name) { + // we want to allow deleting virtual resources as a noop even if it's a + // little break from the standard semantics of Delete + return nil + } + return trace.Wrap(err) } // DeleteAllHealthCheckConfigs removes all HealthCheckConfig resources. @@ -105,29 +189,43 @@ func (s *HealthCheckConfigService) DeleteAllHealthCheckConfigs(ctx context.Conte return trace.Wrap(s.svc.DeleteAllResources(ctx)) } -func newHealthCheckConfigParser() *healthCheckConfigParser { - return &healthCheckConfigParser{ - baseParser: newBaseParser(backend.NewKey(healthCheckConfigPrefix)), +func newHealthCheckConfigParser() resourceParser { + return healthCheckConfigParser{} +} + +type healthCheckConfigParser struct{} + +func (healthCheckConfigParser) prefixes() []backend.Key { + return []backend.Key{ + backend.ExactKey(healthCheckConfigPrefix), } } -type healthCheckConfigParser struct { - baseParser +func (healthCheckConfigParser) match(key backend.Key) bool { + return strings.HasPrefix(key.String(), backend.SeparatorString+healthCheckConfigPrefix+backend.SeparatorString) } func (healthCheckConfigParser) parse(event backend.Event) (types.Resource, error) { switch event.Type { case types.OpDelete: - components := event.Item.Key.Components() - if len(components) < 2 { - return nil, trace.NotFound("failed parsing %s", event.Item.Key) + key := event.Item.Key.String() + name, found := strings.CutPrefix(key, backend.SeparatorString+healthCheckConfigPrefix+backend.SeparatorString) + if !found { + return nil, trace.NotFound("failed parsing "+types.KindHealthCheckConfig+" key %+q", key) + } + + if 
virtualResource := (*HealthCheckConfigService)(nil).getVirtualResource(name); virtualResource != nil { + return nil, parseEventOverrideError{{ + Type: types.OpPut, + Resource: types.Resource153ToLegacy(virtualResource), + }} } return &types.ResourceHeader{ Kind: types.KindHealthCheckConfig, Version: types.V1, Metadata: types.Metadata{ - Name: components[1], + Name: name, Namespace: apidefaults.Namespace, }, }, nil diff --git a/lib/services/local/health_check_config_test.go b/lib/services/local/health_check_config_test.go index b3c3e9c11cf31..c534741ec6216 100644 --- a/lib/services/local/health_check_config_test.go +++ b/lib/services/local/health_check_config_test.go @@ -23,11 +23,13 @@ import ( "testing" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/gravitational/trace" "github.com/jonboulle/clockwork" "github.com/stretchr/testify/require" "google.golang.org/protobuf/testing/protocmp" + "github.com/gravitational/teleport" headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1" healthcheckconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/healthcheckconfig/v1" labelv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/label/v1" @@ -35,103 +37,362 @@ import ( "github.com/gravitational/teleport/api/types/healthcheckconfig" "github.com/gravitational/teleport/lib/backend" "github.com/gravitational/teleport/lib/backend/memory" + "github.com/gravitational/teleport/lib/services" ) func TestHealthCheckConfigService(t *testing.T) { t.Parallel() + ctx := t.Context() - ctx := context.Background() - mem, err := memory.New(memory.Config{ - Context: ctx, - Clock: clockwork.NewFakeClock(), - }) - require.NoError(t, err) - service, err := NewHealthCheckConfigService(backend.NewSanitizer(mem)) - require.NoError(t, err) + tests := []struct { + name string + run func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) + }{ + { + name: "create", + run: func(t *testing.T, ctx 
context.Context, svc *HealthCheckConfigService) { + cfg1 := newHealthCfg(t, "cfg1") + _, err := svc.CreateHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) - cfg1 := newHealthCheckConfig(t, "cfg1") - cfg2 := newHealthCheckConfig(t, "cfg2") + cfg2 := newHealthCfg(t, "cfg2") + _, err = svc.CreateHealthCheckConfig(ctx, cfg2) + require.NoError(t, err) + }, + }, + { + name: "create invalid", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + cfg := newHealthCfg(t, "invalid") + cfg.Spec = nil + _, err := svc.CreateHealthCheckConfig(ctx, cfg) + require.Error(t, err) + }, + }, + { + name: "create virtual defaults not persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for _, cfg := range newHealthCfgDefaults(t) { + t.Run(cfg.GetMetadata().GetName(), func(t *testing.T) { + // Create a virtual default which hasn't been written to the backend. + out, err := svc.CreateHealthCheckConfig(ctx, cfg) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfg, out, protocmp.Transform())) + }) + } + }, + }, + { + name: "create virtual default concurrent", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + errChan := make(chan error, 2) + for i := 0; i < 2; i++ { + go func() { + cfg := services.VirtualDefaultHealthCheckConfigDB() + _, err := svc.CreateHealthCheckConfig(ctx, cfg) + errChan <- err + }() + } + err1 := <-errChan + err2 := <-errChan - t.Run("empty", func(t *testing.T) { - out, _, err := service.ListHealthCheckConfigs(ctx, 10, "") - require.NoError(t, err) - require.Empty(t, out) - }) + if err1 == nil { + require.True(t, trace.IsAlreadyExists(err2)) + } else { + require.NoError(t, err2) + require.True(t, trace.IsAlreadyExists(err1)) + } + }, + }, + { + name: "get", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + cfg1 := newHealthCfg(t, "cfg1") + _, err := svc.CreateHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) - 
t.Run("invalid resource is rejected", func(t *testing.T) { - cfg := newHealthCheckConfig(t, "example") - cfg.Spec = nil - _, err := service.CreateHealthCheckConfig(ctx, cfg) - require.Error(t, err) - }) + out, err := svc.GetHealthCheckConfig(ctx, cfg1.GetMetadata().GetName()) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfg1, out, protocmp.Transform())) + }, + }, + { + name: "get not found", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + _, err := svc.GetHealthCheckConfig(ctx, "not-found") + require.True(t, trace.IsNotFound(err)) + }, + }, + { + name: "get virtual defaults not persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for name, cfg := range newHealthCfgDefaultsMap(t) { + t.Run(name, func(t *testing.T) { + // Get a virtual default which hasn't been written to the backend. + out, err := svc.GetHealthCheckConfig(ctx, name) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfg, out, protocmp.Transform())) + }) + } + }, + }, + { + name: "list with paging", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + cfg1 := newHealthCfg(t, "cfg1") + cfg1, err := svc.CreateHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) + cfg2 := newHealthCfg(t, "cfg2") + cfg2, err = svc.CreateHealthCheckConfig(ctx, cfg2) + require.NoError(t, err) - t.Run("create", func(t *testing.T) { - _, err := service.CreateHealthCheckConfig(ctx, cfg1) - require.NoError(t, err) - _, err = service.CreateHealthCheckConfig(ctx, cfg2) - require.NoError(t, err) - }) + var cfgs []*healthcheckconfigv1.HealthCheckConfig + var token string + for { + out, nextToken, err := svc.ListHealthCheckConfigs(ctx, 1, token) + require.NoError(t, err) + require.Len(t, out, 1) + cfgs = append(cfgs, out...) 
+ if nextToken == "" { + break + } + token = nextToken + } - t.Run("list", func(t *testing.T) { - out, _, err := service.ListHealthCheckConfigs(ctx, 10, "") - require.NoError(t, err) - requireEqualHealthCheckConfigs(t, out, cfg1, cfg2) - }) + require.Len(t, cfgs, teleport.VirtualDefaultHealthCheckConfigCount+2) + requireContainsHealthCheckConfigs(t, cfgs, cfg1, cfg2) + }, + }, + { + name: "list with paging splitting virtual defaults", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + // Create one regular config that sorts between virtual defaults + // (assuming virtual defaults are named alphabetically) + cfg := newHealthCfg(t, "aaa-config") // Sorts before virtual defaults + _, err := svc.CreateHealthCheckConfig(ctx, cfg) + require.NoError(t, err) - t.Run("list with token", func(t *testing.T) { - out1, token, err := service.ListHealthCheckConfigs(ctx, 1, "") - require.NoError(t, err) - require.NotEmpty(t, token) - require.Len(t, out1, 1) - out2, token, err := service.ListHealthCheckConfigs(ctx, 1, token) - require.NoError(t, err) - require.Empty(t, token) - require.Len(t, out2, 1) + pageSize := teleport.VirtualDefaultHealthCheckConfigCount + page1, token, err := svc.ListHealthCheckConfigs(ctx, pageSize, "") + require.NoError(t, err) + require.Len(t, page1, pageSize) + require.NotEmpty(t, token) - combined := append(out1, out2...) 
- require.Contains(t, combined, cfg1) - require.Contains(t, combined, cfg2) - }) + page2, token2, err := svc.ListHealthCheckConfigs(ctx, pageSize, token) + require.NoError(t, err) + require.Len(t, page2, 1) + require.Empty(t, token2) + require.Empty(t, cmp.Diff( + services.VirtualDefaultHealthCheckConfigKube(), + page2[0], protocmp.Transform())) + }, + }, + { + name: "list virtual defaults not persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + out, _, err := svc.ListHealthCheckConfigs(ctx, 10, "") + require.NoError(t, err) + require.Len(t, out, teleport.VirtualDefaultHealthCheckConfigCount) + require.True(t, slices.IsSortedFunc(out, func(a, b *healthcheckconfigv1.HealthCheckConfig) int { + return strings.Compare(a.GetMetadata().GetName(), b.GetMetadata().GetName()) + }), "expected virtual defaults to be sorted") + requireEqualHealthCheckConfigs(t, out, newHealthCfgDefaults(t)...) + }, + }, + { + name: "update", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + cfg1 := newHealthCfg(t, "cfg1") + cfg1, err := svc.CreateHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) - t.Run("get and update", func(t *testing.T) { - out, err := service.GetHealthCheckConfig(ctx, cfg1.Metadata.Name) - require.NoError(t, err) - require.Equal(t, out, cfg1) + cfg1.Spec.HealthyThreshold = 3 + cfgUpd, err := svc.UpdateHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) + require.Equal(t, cfg1.Spec.HealthyThreshold, cfgUpd.Spec.HealthyThreshold) - out.Spec.HealthyThreshold = 3 - out, err = service.UpdateHealthCheckConfig(ctx, out) - require.NoError(t, err) - require.Equal(t, uint32(3), out.Spec.HealthyThreshold) - }) + cfgGet, err := svc.GetHealthCheckConfig(ctx, cfg1.GetMetadata().GetName()) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgUpd, cfgGet, protocmp.Transform())) - t.Run("delete not found", func(t *testing.T) { - err := service.DeleteHealthCheckConfig(ctx, "asdf") - require.IsType(t, 
trace.NotFound(""), err) - }) + cfgGet.Spec.HealthyThreshold = 9 + cfgUpd2, err := svc.UpdateHealthCheckConfig(ctx, cfgGet) + require.NoError(t, err) + require.Equal(t, cfgGet.Spec.HealthyThreshold, cfgUpd2.Spec.HealthyThreshold) + }, + }, + { + name: "update virtual defaults not persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for _, name := range healthCfgDefaultNames { + t.Run(name, func(t *testing.T) { + // Get a virtual default which hasn't been written to the backend, + // then update. + cfg, err := svc.GetHealthCheckConfig(ctx, name) + require.NoError(t, err) + cfg.Spec.Match.Disabled = true + out, err := svc.UpdateHealthCheckConfig(ctx, cfg) + require.NoError(t, err) + require.Equal(t, cfg.Spec.Match.Disabled, out.Spec.Match.Disabled) + }) + } + }, + }, + { + name: "update virtual defaults persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for _, cfg := range newHealthCfgDefaults(t) { + t.Run(cfg.GetMetadata().GetName(), func(t *testing.T) { + // Write a virtual default to the backend, + // then update. 
+ cfgCrt, err := svc.CreateHealthCheckConfig(ctx, cfg) + require.NoError(t, err) + cfgCrt.Spec.Match.Disabled = true + out, err := svc.UpdateHealthCheckConfig(ctx, cfgCrt) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgCrt, out, protocmp.Transform())) + }) + } + }, + }, + { + name: "upsert", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + cfg1 := newHealthCfg(t, "cfg1") + cfgUps, err := svc.UpsertHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgUps, cfg1, protocmp.Transform())) - t.Run("delete", func(t *testing.T) { - require.NoError(t, service.DeleteHealthCheckConfig(ctx, cfg1.Metadata.Name)) - }) + cfgGet, err := svc.GetHealthCheckConfig(ctx, cfgUps.GetMetadata().GetName()) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgUps, cfgGet, protocmp.Transform())) - t.Run("upsert", func(t *testing.T) { - cfg3 := newHealthCheckConfig(t, "cfg3") - _, err := service.UpsertHealthCheckConfig(ctx, cfg3) - require.NoError(t, err) + cfgGet.Spec.HealthyThreshold = 9 + out, err := svc.UpsertHealthCheckConfig(ctx, cfgGet) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgGet, out, protocmp.Transform())) + }, + }, + { + name: "upsert virtual defaults", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for _, cfg := range newHealthCfgDefaults(t) { + t.Run(cfg.GetMetadata().GetName(), func(t *testing.T) { + cfgUps, err := svc.UpsertHealthCheckConfig(ctx, cfg) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgUps, cfg, protocmp.Transform())) - out, _, err := service.ListHealthCheckConfigs(ctx, 10, "") - require.NoError(t, err) - requireEqualHealthCheckConfigs(t, out, cfg2, cfg3) - }) + cfgGet, err := svc.GetHealthCheckConfig(ctx, cfg.GetMetadata().GetName()) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgUps, cfgGet, protocmp.Transform())) - t.Run("delete all", func(t *testing.T) { - require.NoError(t, 
service.DeleteAllHealthCheckConfigs(ctx)) - out, _, err := service.ListHealthCheckConfigs(ctx, 10, "") - require.NoError(t, err) - require.Empty(t, out) + cfgGet.Spec.Match.Disabled = true + out, err := svc.UpsertHealthCheckConfig(ctx, cfgGet) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfgGet, out, protocmp.Transform())) + }) + } + }, + }, + { + name: "delete", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + cfg1 := newHealthCfg(t, "cfg1") + _, err := svc.CreateHealthCheckConfig(ctx, cfg1) + require.NoError(t, err) + + err = svc.DeleteHealthCheckConfig(ctx, cfg1.GetMetadata().GetName()) + require.NoError(t, err) + }, + }, + { + name: "delete not found", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + err := svc.DeleteHealthCheckConfig(ctx, "not-found") + require.True(t, trace.IsNotFound(err)) + }, + }, + { + name: "delete virtual defaults not persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for _, cfg := range newHealthCfgDefaults(t) { + t.Run(cfg.GetMetadata().GetName(), func(t *testing.T) { + // Delete a virtual default which hasn't been written to the backend, + // then get the new virtual default which is unwritten to the backend. + err := svc.DeleteHealthCheckConfig(ctx, cfg.GetMetadata().GetName()) + require.NoError(t, err) + + out, err := svc.GetHealthCheckConfig(ctx, cfg.GetMetadata().GetName()) + require.NoError(t, err) + require.Empty(t, cmp.Diff(cfg, out, protocmp.Transform())) + }) + } + }, + }, + { + name: "delete virtual defaults persisted", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + for _, cfg := range newHealthCfgDefaults(t) { + t.Run(cfg.GetMetadata().GetName(), func(t *testing.T) { + // Write a virtual default to the backend, then delete, + // then get the new virtual default which is unwritten to the backend. 
+ _, err := svc.CreateHealthCheckConfig(ctx, cfg) + require.NoError(t, err) + + err = svc.DeleteHealthCheckConfig(ctx, cfg.GetMetadata().GetName()) + require.NoError(t, err) + + out, err := svc.GetHealthCheckConfig(ctx, cfg.GetMetadata().GetName()) + require.NoError(t, err) + requireEqualHealthCfg(t, cfg, out) + }) + } + }, + }, + { + name: "delete all", + run: func(t *testing.T, ctx context.Context, svc *HealthCheckConfigService) { + _, err := svc.CreateHealthCheckConfig(ctx, newHealthCfg(t, "cfg1")) + require.NoError(t, err) + _, err = svc.CreateHealthCheckConfig(ctx, newHealthCfg(t, "cfg2")) + require.NoError(t, err) + _, err = svc.CreateHealthCheckConfig(ctx, newHealthCfg(t, "cfg3")) + require.NoError(t, err) + + err = svc.DeleteAllHealthCheckConfigs(ctx) + require.NoError(t, err) + + out, _, err := svc.ListHealthCheckConfigs(ctx, 10, "") + require.NoError(t, err) + require.Len(t, out, teleport.VirtualDefaultHealthCheckConfigCount) + requireEqualHealthCheckConfigs(t, out, newHealthCfgDefaults(t)...) 
+ }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + tt.run(t, ctx, newHealthSvc(t, ctx)) + }) + } +} + +func newHealthSvc(t *testing.T, ctx context.Context) *HealthCheckConfigService { + t.Helper() + mem, err := memory.New(memory.Config{ + Context: ctx, + Clock: clockwork.NewFakeClock(), + }) + require.NoError(t, err) + t.Cleanup(func() { + mem.Close() }) + svc, err := NewHealthCheckConfigService(backend.NewSanitizer(mem)) + require.NoError(t, err) + return svc } -func newHealthCheckConfig(t *testing.T, name string) *healthcheckconfigv1.HealthCheckConfig { +func newHealthCfg(t *testing.T, name string) *healthcheckconfigv1.HealthCheckConfig { t.Helper() cfg, err := healthcheckconfig.NewHealthCheckConfig(name, &healthcheckconfigv1.HealthCheckConfigSpec{ @@ -147,19 +408,6 @@ func newHealthCheckConfig(t *testing.T, name string) *healthcheckconfigv1.Health return cfg } -func requireEqualHealthCheckConfigs(t *testing.T, got []*healthcheckconfigv1.HealthCheckConfig, want ...*healthcheckconfigv1.HealthCheckConfig) { - t.Helper() - cmpByName := func(a, b *healthcheckconfigv1.HealthCheckConfig) int { - return strings.Compare(a.Metadata.GetName(), b.Metadata.GetName()) - } - require.Empty(t, cmp.Diff( - slices.SortedFunc(slices.Values(want), cmpByName), - slices.SortedFunc(slices.Values(got), cmpByName), - protocmp.Transform(), - protocmp.IgnoreFields(&headerv1.Metadata{}, "revision"), - )) -} - func TestHealthCheckConfigParser(t *testing.T) { t.Parallel() parser := newHealthCheckConfigParser() @@ -174,7 +422,34 @@ func TestHealthCheckConfigParser(t *testing.T) { resource, err := parser.parse(event) require.NoError(t, err) require.Equal(t, "example", resource.GetMetadata().Name) + require.Equal(t, types.KindHealthCheckConfig, resource.GetKind()) }) + + t.Run("delete virtual default returns override error", func(t *testing.T) { + for _, name := range healthCfgDefaultNames { + t.Run(name, func(t *testing.T) { + event := 
backend.Event{ + Type: types.OpDelete, + Item: backend.Item{ + Key: backend.NewKey(healthCheckConfigPrefix, name), + }, + } + + require.True(t, parser.match(event.Item.Key)) + + _, err := parser.parse(event) + + var overrideErr parseEventOverrideError + require.ErrorAs(t, err, &overrideErr, + "deleting virtual default should return parseEventOverrideError") + require.Len(t, overrideErr, 1) + require.Equal(t, types.OpPut, overrideErr[0].Type) + require.Equal(t, name, overrideErr[0].Resource.GetName()) + require.Equal(t, types.KindHealthCheckConfig, overrideErr[0].Resource.GetKind()) + }) + } + }) + t.Run("put", func(t *testing.T) { event := backend.Event{ Type: types.OpPut, @@ -211,5 +486,61 @@ func TestHealthCheckConfigParser(t *testing.T) { resource, err := parser.parse(event) require.NoError(t, err) require.Equal(t, "example", resource.GetMetadata().Name) + require.Equal(t, types.KindHealthCheckConfig, resource.GetKind()) }) + +} + +func newHealthCfgDefaults(t *testing.T) []*healthcheckconfigv1.HealthCheckConfig { + t.Helper() + return []*healthcheckconfigv1.HealthCheckConfig{ + services.VirtualDefaultHealthCheckConfigDB(), + services.VirtualDefaultHealthCheckConfigKube(), + } +} + +func newHealthCfgDefaultsMap(t *testing.T) map[string]*healthcheckconfigv1.HealthCheckConfig { + t.Helper() + return map[string]*healthcheckconfigv1.HealthCheckConfig{ + teleport.VirtualDefaultHealthCheckConfigDBName: services.VirtualDefaultHealthCheckConfigDB(), + teleport.VirtualDefaultHealthCheckConfigKubeName: services.VirtualDefaultHealthCheckConfigKube(), + } } + +func requireEqualHealthCfg(t *testing.T, expect *healthcheckconfigv1.HealthCheckConfig, actual *healthcheckconfigv1.HealthCheckConfig) { + t.Helper() + require.Empty(t, cmp.Diff(expect, actual, + protocmp.Transform(), + protocmp.IgnoreFields(&headerv1.Metadata{}, "revision"), + )) +} + +func requireEqualHealthCheckConfigs(t *testing.T, got []*healthcheckconfigv1.HealthCheckConfig, want 
...*healthcheckconfigv1.HealthCheckConfig) { + t.Helper() + require.Empty(t, + cmp.Diff(want, got, + cmpopts.SortSlices(func(a, b *healthcheckconfigv1.HealthCheckConfig) bool { + return a.GetMetadata().GetName() < b.GetMetadata().GetName() + }), + protocmp.Transform(), + protocmp.IgnoreFields(&headerv1.Metadata{}, "revision"), + ), + ) +} + +func requireContainsHealthCheckConfigs(t *testing.T, actual []*healthcheckconfigv1.HealthCheckConfig, expected ...*healthcheckconfigv1.HealthCheckConfig) { + t.Helper() + for _, exp := range expected { + found := slices.ContainsFunc(actual, func(cfg *healthcheckconfigv1.HealthCheckConfig) bool { + return cmp.Equal(exp, cfg, protocmp.Transform()) + }) + require.True(t, found, "config %q not found in list", exp.GetMetadata().GetName()) + } +} + +var ( + healthCfgDefaultNames = []string{ + teleport.VirtualDefaultHealthCheckConfigDBName, + teleport.VirtualDefaultHealthCheckConfigKubeName, + } +) diff --git a/lib/services/presets.go b/lib/services/presets.go index cf1c3460fd1e1..064dae12fd174 100644 --- a/lib/services/presets.go +++ b/lib/services/presets.go @@ -855,19 +855,19 @@ func NewPresetMCPUserRole() types.Role { return role } -// NewPresetHealthCheckConfig returns a preset default health_check_config that -// enables health checks for all resources. -func NewPresetHealthCheckConfig() *healthcheckconfigv1.HealthCheckConfig { +// VirtualDefaultHealthCheckConfigDB returns a health_check_config enabling +// health checks for all database resources, and is intended to be used as a +// virtual default resource. Its name is "default" for historical reasons. 
+func VirtualDefaultHealthCheckConfigDB() *healthcheckconfigv1.HealthCheckConfig { return &healthcheckconfigv1.HealthCheckConfig{ Kind: types.KindHealthCheckConfig, Version: types.V1, Metadata: &headerv1.Metadata{ - Name: teleport.PresetDefaultHealthCheckConfigName, - Description: "Enables all health checks by default", - Namespace: apidefaults.Namespace, - Labels: map[string]string{ - types.TeleportInternalResourceType: types.PresetResource, - }, + Name: teleport.VirtualDefaultHealthCheckConfigDBName, + Description: "Enables health checks for all databases by default", + // this revision MUST be changed every time we change the contents + // of the preset so that conditional updates can check against it + Revision: "af391615-1e42-4237-aa2b-155e6abbd41a", }, Spec: &healthcheckconfigv1.HealthCheckConfigSpec{ Match: &healthcheckconfigv1.Matcher{ @@ -881,6 +881,32 @@ func NewPresetHealthCheckConfig() *healthcheckconfigv1.HealthCheckConfig { } } +// VirtualDefaultHealthCheckConfigKube returns a health_check_config enabling +// health checks for all Kubernetes resources. It's intended to be used as a +// virtual default resource. 
+func VirtualDefaultHealthCheckConfigKube() *healthcheckconfigv1.HealthCheckConfig { + return &healthcheckconfigv1.HealthCheckConfig{ + Kind: types.KindHealthCheckConfig, + Version: types.V1, + Metadata: &headerv1.Metadata{ + Name: teleport.VirtualDefaultHealthCheckConfigKubeName, + Description: "Enables health checks for all Kubernetes clusters by default.", + // this revision MUST be changed every time we change the contents + // of the preset so that conditional updates can check against it + Revision: "d796f007-e60c-4747-8dde-f479aff6b743", + }, + Spec: &healthcheckconfigv1.HealthCheckConfigSpec{ + Match: &healthcheckconfigv1.Matcher{ + // match all kubernetes clusters + KubernetesLabels: []*labelv1.Label{{ + Name: types.Wildcard, + Values: []string{types.Wildcard}, + }}, + }, + }, + } +} + // bootstrapRoleMetadataLabels are metadata labels that will be applied to each role. // These are intended to add labels for older roles that didn't previously have them. func bootstrapRoleMetadataLabels() map[string]map[string]string { diff --git a/lib/services/watcher_test.go b/lib/services/watcher_test.go index b0261ce578444..e273634da523d 100644 --- a/lib/services/watcher_test.go +++ b/lib/services/watcher_test.go @@ -36,6 +36,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/gravitational/teleport" "github.com/gravitational/teleport/api/constants" apidefaults "github.com/gravitational/teleport/api/defaults" healthcheckconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/healthcheckconfig/v1" @@ -1495,7 +1496,7 @@ func TestHealthCheckConfigWatcher(t *testing.T) { select { case resources := <-w.ResourcesC: - require.Empty(t, resources) + require.Len(t, resources, teleport.VirtualDefaultHealthCheckConfigCount) case <-w.Done(): require.FailNow(t, "Watcher has unexpectedly exited.") case <-time.After(2 * time.Second): @@ -1514,7 +1515,7 @@ func TestHealthCheckConfigWatcher(t *testing.T) { 
require.EventuallyWithT(t, func(t *assert.CollectT) { filtered, err := w.CurrentResources(ctx) require.NoError(t, err) - require.Len(t, filtered, len(resources)) + require.Len(t, filtered, len(resources)+teleport.VirtualDefaultHealthCheckConfigCount) }, time.Second, 100*time.Millisecond, "Timeout waiting for watcher to receive resources.") filtered, err := w.CurrentResourcesWithFilter(ctx, func(s *healthcheckconfigv1.HealthCheckConfig) bool { @@ -1529,7 +1530,7 @@ func TestHealthCheckConfigWatcher(t *testing.T) { require.EventuallyWithT(t, func(t *assert.CollectT) { filtered, err := w.CurrentResources(ctx) require.NoError(t, err) - require.Len(t, filtered, len(resources)-1) + require.Len(t, filtered, len(resources)-1+teleport.VirtualDefaultHealthCheckConfigCount) }, time.Second, time.Millisecond, "Timeout waiting for watcher to receive resources.") filtered, err = w.CurrentResourcesWithFilter(ctx, func(s *healthcheckconfigv1.HealthCheckConfig) bool {