diff --git a/docs/pages/includes/metrics.mdx b/docs/pages/includes/metrics.mdx index dc5acf08e0166..4b9ed5e3c2bca 100644 --- a/docs/pages/includes/metrics.mdx +++ b/docs/pages/includes/metrics.mdx @@ -70,6 +70,7 @@ | `teleport_registered_servers_by_install_methods` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by install methods. | | `teleport_roles_total` | gauge | Teleport Auth | The number of roles that exist in the cluster. | | `teleport_migrations` | gauge | Teleport Auth | Tracks for each migration if it is active (1) or not (0). | +| `teleport_bot_instances` | gauge | Teleport Auth | The number of bot instances across the entire cluster grouped by version and by whether they use Managed Updates. | | `user_login_total` | counter | Teleport Auth | Number of user logins. | | `watcher_event_sizes` | histogram | cache | Overall size of events emitted. | | `watcher_events` | histogram | cache | Per resource size of events emitted. | diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 88c4d029ea1c0..c915bc8ba8a1c 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -1052,6 +1052,18 @@ var ( []string{teleport.TagPrivateKeyPolicy}, ) + botInstancesMetric = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Name: teleport.MetricBotInstances, + Help: "The number of bot instances across the entire cluster", + }, + []string{ + teleport.TagVersion, + teleport.TagAutomaticUpdates, + }, + ) + prometheusCollectors = []prometheus.Collector{ generateRequestsCount, generateThrottledRequestsCount, generateRequestsCurrent, generateRequestsLatencies, UserLoginCount, heartbeatsMissedByAuth, @@ -1061,6 +1073,7 @@ var ( registeredAgentsInstallMethod, userCertificatesGeneratedMetric, roleCount, + botInstancesMetric, } ) @@ -1610,6 +1623,7 @@ const ( accessListReminderNotificationsKey autoUpdateAgentReportKey autoUpdateBotInstanceReportKey + autoUpdateBotInstanceMetricsKey ) // runPeriodicOperations runs some
periodic bookkeeping operations @@ -1711,6 +1725,12 @@ func (a *Server) runPeriodicOperations() { FirstDuration: retryutils.HalfJitter(10 * time.Second), Jitter: retryutils.SeventhJitter, }) + ticker.Push(interval.SubInterval[periodicIntervalKey]{ + Key: autoUpdateBotInstanceMetricsKey, + Duration: constants.AutoUpdateAgentReportPeriod / 2, + FirstDuration: retryutils.HalfJitter(10 * time.Second), + Jitter: retryutils.SeventhJitter, + }) } if modules.GetModules().IsOSSBuild() { @@ -1839,6 +1859,8 @@ func (a *Server) runPeriodicOperations() { go a.reportAgentVersions(a.closeCtx) case autoUpdateBotInstanceReportKey: go a.botVersionReporter.Report(a.closeCtx) + case autoUpdateBotInstanceMetricsKey: + go a.updateBotInstanceMetrics() } } } @@ -2154,6 +2176,18 @@ func (a *Server) updateAgentMetrics() { } } +func (a *Server) updateBotInstanceMetrics() { + report, err := a.GetAutoUpdateBotInstanceReport(a.closeCtx) + switch { + case trace.IsNotFound(err): + // No report to emit. + case err != nil: + a.logger.ErrorContext(a.closeCtx, "Failed to get bot instance report", "error", err) + default: + machineidv1.EmitInstancesMetric(report, botInstancesMetric) + } +} + var ( // remoteClusterRefreshLimit is the maximum number of backend updates that will be performed // during periodic remote cluster connection status refresh. 
diff --git a/lib/auth/machineid/machineidv1/auto_update_version_reporter.go b/lib/auth/machineid/machineidv1/auto_update_version_reporter.go index 24398e5e548b3..82f9f099d2da2 100644 --- a/lib/auth/machineid/machineidv1/auto_update_version_reporter.go +++ b/lib/auth/machineid/machineidv1/auto_update_version_reporter.go @@ -25,8 +25,10 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" "google.golang.org/protobuf/types/known/timestamppb" + "github.com/gravitational/teleport" "github.com/gravitational/teleport/api/defaults" "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" machineidv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1" @@ -318,3 +320,34 @@ func (r *AutoUpdateVersionReporter) IsLeader() bool { return false } } + +// EmitInstancesMetric updates the given gauge metric based on the instance report. +func EmitInstancesMetric(report *autoupdate.AutoUpdateBotInstanceReport, gauge *prometheus.GaugeVec) { + gauge.Reset() + + byVersion := make(map[string]int32) + + for group, groupMetrics := range report.GetSpec().GetGroups() { + // Empty group means the bot isn't using Managed Updates. 
+ if group == "" { + for version, versionMetrics := range groupMetrics.GetVersions() { + gauge.With(prometheus.Labels{ + teleport.TagVersion: version, + teleport.TagAutomaticUpdates: "false", + }).Set(float64(versionMetrics.Count)) + } + continue + } + + for version, metrics := range groupMetrics.GetVersions() { + byVersion[version] += metrics.Count + } + } + + for version, count := range byVersion { + gauge.With(prometheus.Labels{ + teleport.TagVersion: version, + teleport.TagAutomaticUpdates: "true", + }).Set(float64(count)) + } +} diff --git a/lib/auth/machineid/machineidv1/auto_update_version_reporter_test.go b/lib/auth/machineid/machineidv1/auto_update_version_reporter_test.go index a37acedab64c3..6bb33ca5ff50a 100644 --- a/lib/auth/machineid/machineidv1/auto_update_version_reporter_test.go +++ b/lib/auth/machineid/machineidv1/auto_update_version_reporter_test.go @@ -19,16 +19,21 @@ package machineidv1_test import ( "context" + "fmt" + "strconv" "testing" "time" "github.com/google/go-cmp/cmp" "github.com/google/uuid" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/require" "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/timestamppb" + "github.com/gravitational/teleport" autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1" machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1" @@ -146,6 +151,64 @@ func TestAutoUpdateVersionReporter(t *testing.T) { } } +func TestEmitInstancesMetric(t *testing.T) { + gauge := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Name: teleport.MetricBotInstances, + }, + []string{ + teleport.TagVersion, + teleport.TagAutomaticUpdates, + }, + ) + + machineidv1.EmitInstancesMetric( + 
&autoupdatev1pb.AutoUpdateBotInstanceReport{ + Spec: &autoupdatev1pb.AutoUpdateBotInstanceReportSpec{ + Groups: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroup{ + "prod": { + Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{ + "18.0.0": {Count: 1}, + "19.0.0": {Count: 1}, + }, + }, + "stage": { + Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{ + "18.0.0": {Count: 1}, + "19.0.0": {Count: 1}, + }, + }, + "": { + Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{ + "19.0.0": {Count: 123}, + "20.0.0": {Count: 321}, + }, + }, + }, + }, + }, + gauge, + ) + + for _, tc := range []struct { + version string + automaticUpdates bool + expectedValue float64 + }{ + {version: "18.0.0", automaticUpdates: true, expectedValue: 2}, + {version: "19.0.0", automaticUpdates: true, expectedValue: 2}, + {version: "19.0.0", automaticUpdates: false, expectedValue: 123}, + {version: "20.0.0", automaticUpdates: false, expectedValue: 321}, + } { + t.Run(fmt.Sprintf("%s/%v", tc.version, tc.automaticUpdates), func(t *testing.T) { + metric := gauge.WithLabelValues(tc.version, strconv.FormatBool(tc.automaticUpdates)) + require.InEpsilon(t, tc.expectedValue, testutil.ToFloat64(metric), 0) + }) + } + +} + type testSemaphores struct{ types.Semaphores } func (s *testSemaphores) AcquireSemaphore(ctx context.Context, params types.AcquireSemaphoreRequest) (*types.SemaphoreLease, error) { diff --git a/metrics.go b/metrics.go index 8c788f72f352a..fba042b95a98a 100644 --- a/metrics.go +++ b/metrics.go @@ -263,6 +263,9 @@ const ( // MetricRegisteredServers tracks the number of Teleport servers that have successfully registered with the Teleport cluster and have not reached the end of their ttl MetricRegisteredServers = "registered_servers" + // MetricBotInstances tracks the number of bot instances across the entire cluster, labeled by version and automatic-updates status + MetricBotInstances = "bot_instances" + //
MetricRegisteredServersByInstallMethods tracks the number of Teleport servers, and their installation method, // that have successfully registered with the Teleport cluster and have not reached the end of their ttl MetricRegisteredServersByInstallMethods = "registered_servers_by_install_methods"