diff --git a/docs/pages/includes/metrics.mdx b/docs/pages/includes/metrics.mdx index ff41bedc0c730..07b9385368221 100644 --- a/docs/pages/includes/metrics.mdx +++ b/docs/pages/includes/metrics.mdx @@ -61,6 +61,7 @@ | `teleport_audit_parquetlog_errors_from_collect_count` | counter | Teleport Audit Log | Number of collect failures in Parquet-format audit log. | | `teleport_connected_resources` | gauge | Teleport Auth | Number and type of resources connected via keepalives. | | `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by version. | +| `teleport_registered_servers_by_install_methods` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by install methods. | | `user_login_total` | counter | Teleport Auth | Number of user logins. | | `teleport_migrations` | gauge | Teleport Auth | Tracks for each migration if it is active (1) or not (0). | | `watcher_event_sizes` | histogram | cache | Overall size of events emitted. 
| diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 4231d7eeb120c..c5e5deb6ab45d 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -528,6 +528,15 @@ var ( []string{teleport.TagVersion}, ) + registeredAgentsInstallMethod = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Name: teleport.MetricRegisteredServersByInstallMethods, + Help: "The number of Teleport services that are connected to an auth server by install method.", + }, + []string{teleport.TagInstallMethods}, + ) + migrations = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: teleport.MetricNamespace, @@ -577,6 +586,7 @@ var ( registeredAgents, migrations, totalInstancesMetric, enrolledInUpgradesMetric, upgraderCountsMetric, accessRequestsCreatedMetric, + registeredAgentsInstallMethod, } ) @@ -1009,6 +1019,7 @@ func (a *Server) runPeriodicOperations() { heartbeatsMissedByAuth.Set(float64(missedKeepAliveCount)) case <-promTicker.Next(): a.updateVersionMetrics() + a.updateInstallMethodsMetrics() case <-releaseCheck.Next(): a.syncReleaseAlerts(ctx, true) case <-localReleaseCheck.Next(): @@ -1291,6 +1302,33 @@ func (a *Server) updateVersionMetrics() { } } +// updateInstallMethodsMetrics leverages the inventory control stream to report the install methods +// of all instances that are connected to a single auth server via prometheus metrics. +// To get an accurate representation of install methods in an entire cluster the metric must be aggregated +// with all auth instances. +func (a *Server) updateInstallMethodsMetrics() { + installMethodCount := make(map[string]int) + + // record install methods for all connected resources + a.inventory.Iter(func(handle inventory.UpstreamHandle) { + installMethod := "unknown" + installMethods := append([]string{}, handle.AgentMetadata().InstallMethods...) 
+ + if len(installMethods) > 0 { + slices.Sort(installMethods) + installMethod = strings.Join(installMethods, ",") + } + + installMethodCount[installMethod]++ + }) + + // reset the gauges so that any install methods that fall off are removed from exported metrics + registeredAgentsInstallMethod.Reset() + for installMethod, count := range installMethodCount { + registeredAgentsInstallMethod.WithLabelValues(installMethod).Set(float64(count)) + } +} + var ( // remoteClusterRefreshLimit is the maximum number of backend updates that will be performed // during periodic remote cluster connection status refresh. diff --git a/lib/inventory/controller.go b/lib/inventory/controller.go index ee6d7cac05a96..f6b9d7cf3208b 100644 --- a/lib/inventory/controller.go +++ b/lib/inventory/controller.go @@ -481,6 +481,8 @@ func (c *Controller) handleSSHServerHB(handle *upstreamHandle, sshServer *types. } func (c *Controller) handleAgentMetadata(handle *upstreamHandle, m proto.UpstreamInventoryAgentMetadata) { + handle.SetAgentMetadata(m) + svcs := make([]string, 0, len(handle.Hello().Services)) for _, svc := range handle.Hello().Services { svcs = append(svcs, strings.ToLower(svc.String())) diff --git a/lib/inventory/controller_test.go b/lib/inventory/controller_test.go index 420d7bdd1d3d2..cbfd21f93e85f 100644 --- a/lib/inventory/controller_test.go +++ b/lib/inventory/controller_test.go @@ -18,12 +18,14 @@ package inventory import ( "context" + "runtime" "sync" "testing" "time" "github.com/gravitational/trace" "github.com/stretchr/testify/require" + "golang.org/x/exp/slices" "github.com/gravitational/teleport" "github.com/gravitational/teleport/api/client" @@ -453,6 +455,65 @@ func TestUpdateLabels(t *testing.T) { }, time.Second, 100*time.Millisecond) } +// TestAgentMetadata verifies that an instance's agent metadata is received in +// inventory control stream. 
+func TestAgentMetadata(t *testing.T) { + // set the install method to validate it was returned as agent metadata + t.Setenv("TELEPORT_INSTALL_METHOD_AWSOIDC_DEPLOYSERVICE", "true") + const serverID = "test-instance" + const peerAddr = "1.2.3.4:456" + + events := make(chan testEvent, 1024) + + auth := &fakeAuth{} + + controller := NewController( + auth, + usagereporter.DiscardUsageReporter{}, + withInstanceHBInterval(time.Millisecond*200), + withTestEventsChannel(events), + ) + defer controller.Close() + + // Set up fake in-memory control stream. + upstream, downstream := client.InventoryControlStreamPipe(client.ICSPipePeerAddr(peerAddr)) + upstreamHello := proto.UpstreamInventoryHello{ + ServerID: serverID, + Version: teleport.Version, + Services: []types.SystemRole{types.RoleNode}, + } + downstreamHello := proto.DownstreamInventoryHello{ + Version: teleport.Version, + ServerID: "auth", + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + NewDownstreamHandle(func(ctx context.Context) (client.DownstreamInventoryControlStream, error) { + return downstream, nil + }, upstreamHello) + + // Wait for upstream hello. + select { + case msg := <-upstream.Recv(): + require.Equal(t, upstreamHello, msg) + case <-ctx.Done(): + require.Fail(t, "never got upstream hello") + } + require.NoError(t, upstream.Send(ctx, downstreamHello)) + controller.RegisterControlStream(upstream, upstreamHello) + + // Verify that control stream upstreamHandle is now accessible. + upstreamHandle, ok := controller.GetControlStream(serverID) + require.True(t, ok) + + // Validate that the agent's metadata ends up in the auth server. 
+ require.Eventually(t, func() bool { + return slices.Equal(upstreamHandle.AgentMetadata().InstallMethods, []string{"awsoidc_deployservice"}) && + upstreamHandle.AgentMetadata().OS == runtime.GOOS + }, 5*time.Second, 200*time.Millisecond) +} + type eventOpts struct { expect map[testEvent]int deny map[testEvent]struct{} diff --git a/lib/inventory/inventory.go b/lib/inventory/inventory.go index 1fc71bbb278b4..6dec5fb02e0d1 100644 --- a/lib/inventory/inventory.go +++ b/lib/inventory/inventory.go @@ -327,6 +327,9 @@ type UpstreamHandle interface { // Hello gets the cached upstream hello that was used to initialize the stream. Hello() proto.UpstreamInventoryHello + // AgentMetadata is the service's metadata: OS, glibc version, install methods, ... + AgentMetadata() proto.UpstreamInventoryAgentMetadata + Ping(ctx context.Context, id uint64) (d time.Duration, err error) // HasService is a helper for checking if a given service is associated with this // stream. @@ -496,6 +499,9 @@ type upstreamHandle struct { client.UpstreamInventoryControlStream hello proto.UpstreamInventoryHello + agentMDLock sync.RWMutex + agentMetadata proto.UpstreamInventoryAgentMetadata + ticker *interval.MultiInterval[intervalKey] pingC chan pingRequest @@ -572,6 +578,20 @@ func (h *upstreamHandle) Hello() proto.UpstreamInventoryHello { return h.hello } +// AgentMetadata returns the Agent's metadata (e.g. OS, glibc version, install methods, teleport version). +func (h *upstreamHandle) AgentMetadata() proto.UpstreamInventoryAgentMetadata { + h.agentMDLock.RLock() + defer h.agentMDLock.RUnlock() + return h.agentMetadata +} + +// SetAgentMetadata sets the agent metadata for the current handle. 
+func (h *upstreamHandle) SetAgentMetadata(agentMD proto.UpstreamInventoryAgentMetadata) { + h.agentMDLock.Lock() + defer h.agentMDLock.Unlock() + h.agentMetadata = agentMD +} + func (h *upstreamHandle) HasService(service types.SystemRole) bool { for _, s := range h.hello.Services { if s == service { diff --git a/metrics.go b/metrics.go index 22c71aef03b66..775ee0a2d147a 100644 --- a/metrics.go +++ b/metrics.go @@ -226,6 +226,10 @@ const ( // MetricRegisteredServers tracks the number of Teleport servers that have successfully registered with the Teleport cluster and have not reached the end of their ttl MetricRegisteredServers = "registered_servers" + // MetricRegisteredServersByInstallMethods tracks the number of Teleport servers, and their installation method, + // that have successfully registered with the Teleport cluster and have not reached the end of their ttl + MetricRegisteredServersByInstallMethods = "registered_servers_by_install_methods" + // MetricReverseSSHTunnels defines the number of connected SSH reverse tunnels to the proxy MetricReverseSSHTunnels = "reverse_tunnels_connected" @@ -268,6 +272,11 @@ const ( // TagClient is a prometheus label to indicate what client the metric is tied to TagClient = "client" + + // TagInstallMethods is a prometheus label to indicate what installation methods + // were used for the agent. + // This value comes from UpstreamInventoryAgentMetadata (sourced in lib/inventory/metadata.fetchInstallMethods). + TagInstallMethods = "install_methods" ) const (