Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/pages/includes/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
| `teleport_registered_servers_by_install_methods` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by install methods. |
| `teleport_roles_total` | gauge | Teleport Auth | The number of roles that exist in the cluster. |
| `teleport_migrations` | gauge | Teleport Auth | Tracks for each migration if it is active (1) or not (0). |
| `teleport_bot_instances` | gauge | Teleport Auth | The number of bot instances across the entire cluster grouped by version and by whether they use Managed Updates. |
| `user_login_total` | counter | Teleport Auth | Number of user logins. |
| `watcher_event_sizes` | histogram | cache | Overall size of events emitted. |
| `watcher_events` | histogram | cache | Per resource size of events emitted. |
Expand Down
34 changes: 34 additions & 0 deletions lib/auth/auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,18 @@ var (
[]string{teleport.TagPrivateKeyPolicy},
)

// botInstancesMetric is a gauge vector reporting the number of bot instances,
// with one series per combination of the version and automatic updates labels.
//
// NOTE(review): every distinct version value creates its own series, so do not
// add further labels to this metric without a safeguard against label
// cardinality growth.
botInstancesMetric = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: teleport.MetricNamespace,
Name: teleport.MetricBotInstances,
Help: "The number of bot instances across the entire cluster",
},
[]string{
teleport.TagVersion,
teleport.TagAutomaticUpdates,
},
)

prometheusCollectors = []prometheus.Collector{
generateRequestsCount, generateThrottledRequestsCount,
generateRequestsCurrent, generateRequestsLatencies, UserLoginCount, heartbeatsMissedByAuth,
Expand All @@ -1061,6 +1073,7 @@ var (
registeredAgentsInstallMethod,
userCertificatesGeneratedMetric,
roleCount,
botInstancesMetric,
}
)

Expand Down Expand Up @@ -1610,6 +1623,7 @@ const (
accessListReminderNotificationsKey
autoUpdateAgentReportKey
autoUpdateBotInstanceReportKey
autoUpdateBotInstanceMetricsKey
)

// runPeriodicOperations runs some periodic bookkeeping operations
Expand Down Expand Up @@ -1711,6 +1725,12 @@ func (a *Server) runPeriodicOperations() {
FirstDuration: retryutils.HalfJitter(10 * time.Second),
Jitter: retryutils.SeventhJitter,
})
ticker.Push(interval.SubInterval[periodicIntervalKey]{
Key: autoUpdateBotInstanceMetricsKey,
Duration: constants.AutoUpdateAgentReportPeriod / 2,
FirstDuration: retryutils.HalfJitter(10 * time.Second),
Jitter: retryutils.SeventhJitter,
})
}

if modules.GetModules().IsOSSBuild() {
Expand Down Expand Up @@ -1839,6 +1859,8 @@ func (a *Server) runPeriodicOperations() {
go a.reportAgentVersions(a.closeCtx)
case autoUpdateBotInstanceReportKey:
go a.botVersionReporter.Report(a.closeCtx)
case autoUpdateBotInstanceMetricsKey:
go a.updateBotInstanceMetrics()
}
}
}
Expand Down Expand Up @@ -2154,6 +2176,18 @@ func (a *Server) updateAgentMetrics() {
}
}

// updateBotInstanceMetrics fetches the current bot instance report and, when
// one exists, publishes its contents through botInstancesMetric. A missing
// report is not an error; any other failure is logged and the gauge is left
// untouched.
func (a *Server) updateBotInstanceMetrics() {
	ctx := a.closeCtx

	report, err := a.GetAutoUpdateBotInstanceReport(ctx)
	if trace.IsNotFound(err) {
		// No report to emit.
		return
	}
	if err != nil {
		a.logger.ErrorContext(ctx, "Failed to get bot instance report", "error", err)
		return
	}

	machineidv1.EmitInstancesMetric(report, botInstancesMetric)
}

var (
// remoteClusterRefreshLimit is the maximum number of backend updates that will be performed
// during periodic remote cluster connection status refresh.
Expand Down
33 changes: 33 additions & 0 deletions lib/auth/machineid/machineidv1/auto_update_version_reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/gravitational/teleport"
"github.com/gravitational/teleport/api/defaults"
"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
machineidv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1"
Expand Down Expand Up @@ -318,3 +320,34 @@ func (r *AutoUpdateVersionReporter) IsLeader() bool {
return false
}
}

// EmitInstancesMetric updates the given gauge metric based on the instance report.
//
// The gauge is reset before any value is written, so series that belonged to a
// previous report do not linger. Counts from the empty group are emitted per
// version with the automatic updates label set to "false". Counts from every
// other group are summed per version and emitted with the label set to "true".
func EmitInstancesMetric(report *autoupdate.AutoUpdateBotInstanceReport, gauge *prometheus.GaugeVec) {
// Drop all existing series so versions missing from this report disappear.
// NOTE(review): values are written after the reset, so a concurrent scrape
// could presumably observe a partially populated gauge — confirm whether
// aggregating first and resetting just before the writes is preferable.
gauge.Reset()

// Per-version totals accumulated across all non-empty groups.
byVersion := make(map[string]int32)

for group, groupMetrics := range report.GetSpec().GetGroups() {
// Empty group means the bot isn't using Managed Updates.
if group == "" {
// These counts are written directly; they are not merged into byVersion.
for version, versionMetrics := range groupMetrics.GetVersions() {
gauge.With(prometheus.Labels{
teleport.TagVersion: version,
teleport.TagAutomaticUpdates: "false",
}).Set(float64(versionMetrics.Count))
}
continue
}

// The same version may appear in several groups, so sum rather than set.
for version, metrics := range groupMetrics.GetVersions() {
byVersion[version] += metrics.Count
}
}

// Emit one series per version for the aggregated non-empty groups.
for version, count := range byVersion {
gauge.With(prometheus.Labels{
teleport.TagVersion: version,
teleport.TagAutomaticUpdates: "true",
}).Set(float64(count))
}
Comment on lines +347 to +352
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like an infinitely growing number of timeseries, never reset during the process lifetime. I'm not sure how much of an issue this is but this smells. You might want to leave a comment saying that no other labels should be added to this metric without further safeties to mitigate label cardinality.

For reference I wrote but never had time to merge a metrics RFD: https://github.com/gravitational/teleport/blob/92a08edc66a192a161a90ca5cf10162085d63d9c/rfd/0197-prometheus-metrics.md

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sounds like a problem we should actually address - since if you upgraded all your tbots from one version to another, they'd all still show up under the old version 😓 I think I ran into this in the past year or two - let me see if I can find how I resolved it.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I've handled this two different ways previously:

  1. Just reset the GaugeVec before you write to it. This obviously leaves a very short period where its state is "weird" but often if you've precomputed the values before you start writing into it - this period is very very short.
  2. Just directly implemented the Collector interface myself. I actually feel like I tend to prefer this to (1) since it's less hacky but you also benefit less from the guard rails the SDK provides.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for linking to the RFD! That's a really useful resource.

On the resetting thing, I'm already calling gauge.Reset on L316, which I think is what @strideynet suggested above. Is that enough to drop the stale series?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops - I missed that 🙈

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,21 @@ package machineidv1_test

import (
"context"
"fmt"
"strconv"
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/uuid"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/testing/protocmp"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/gravitational/teleport"
autoupdatev1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
headerv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/header/v1"
machineidv1pb "github.com/gravitational/teleport/api/gen/proto/go/teleport/machineid/v1"
Expand Down Expand Up @@ -146,6 +151,64 @@ func TestAutoUpdateVersionReporter(t *testing.T) {
}
}

// TestEmitInstancesMetric verifies that EmitInstancesMetric sums counts per
// version across named groups, reports the empty group separately with the
// automatic updates label set to "false", emits no extra series, and drops
// series that are absent from a later report.
func TestEmitInstancesMetric(t *testing.T) {
	gauge := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: teleport.MetricNamespace,
			Name:      teleport.MetricBotInstances,
		},
		[]string{
			teleport.TagVersion,
			teleport.TagAutomaticUpdates,
		},
	)

	machineidv1.EmitInstancesMetric(
		&autoupdatev1pb.AutoUpdateBotInstanceReport{
			Spec: &autoupdatev1pb.AutoUpdateBotInstanceReportSpec{
				Groups: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroup{
					"prod": {
						Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
							"18.0.0": {Count: 1},
							"19.0.0": {Count: 1},
						},
					},
					"stage": {
						Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
							"18.0.0": {Count: 1},
							"19.0.0": {Count: 1},
						},
					},
					"": {
						Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
							"19.0.0": {Count: 123},
							"20.0.0": {Count: 321},
						},
					},
				},
			},
		},
		gauge,
	)

	// Count the series before the lookups below: WithLabelValues would create
	// a missing series, which could hide an absent or unexpected one.
	require.Equal(t, 4, testutil.CollectAndCount(gauge))

	for _, tc := range []struct {
		version          string
		automaticUpdates bool
		expectedValue    float64
	}{
		{version: "18.0.0", automaticUpdates: true, expectedValue: 2},
		{version: "19.0.0", automaticUpdates: true, expectedValue: 2},
		{version: "19.0.0", automaticUpdates: false, expectedValue: 123},
		{version: "20.0.0", automaticUpdates: false, expectedValue: 321},
	} {
		t.Run(fmt.Sprintf("%s/%v", tc.version, tc.automaticUpdates), func(t *testing.T) {
			metric := gauge.WithLabelValues(tc.version, strconv.FormatBool(tc.automaticUpdates))
			require.InEpsilon(t, tc.expectedValue, testutil.ToFloat64(metric), 0)
		})
	}

	// A later report must fully replace the earlier one. Otherwise bots that
	// were upgraded would keep being counted under their old version.
	machineidv1.EmitInstancesMetric(
		&autoupdatev1pb.AutoUpdateBotInstanceReport{
			Spec: &autoupdatev1pb.AutoUpdateBotInstanceReportSpec{
				Groups: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroup{
					"": {
						Versions: map[string]*autoupdatev1pb.AutoUpdateBotInstanceReportSpecGroupVersion{
							"20.0.0": {Count: 5},
						},
					},
				},
			},
		},
		gauge,
	)
	require.Equal(t, 1, testutil.CollectAndCount(gauge))
	require.InEpsilon(t, 5.0, testutil.ToFloat64(gauge.WithLabelValues("20.0.0", "false")), 0)
}

type testSemaphores struct{ types.Semaphores }

func (s *testSemaphores) AcquireSemaphore(ctx context.Context, params types.AcquireSemaphoreRequest) (*types.SemaphoreLease, error) {
Expand Down
3 changes: 3 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ const (
// MetricRegisteredServers tracks the number of Teleport servers that have successfully registered with the Teleport cluster and have not reached the end of their ttl
MetricRegisteredServers = "registered_servers"

// MetricBotInstances tracks the number of bot instances across the entire cluster, labeled by version and by whether automatic updates are in use
MetricBotInstances = "bot_instances"

// MetricRegisteredServersByInstallMethods tracks the number of Teleport servers, and their installation method,
// that have successfully registered with the Teleport cluster and have not reached the end of their ttl
MetricRegisteredServersByInstallMethods = "registered_servers_by_install_methods"
Expand Down
Loading