From c5eb5ba1c6e8dbc41343c1c06ff7e6b0222765a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Mon, 12 Jan 2026 12:48:26 +0100 Subject: [PATCH 01/52] Add client metrics --- client/internal/connect.go | 7 +++++-- client/internal/debug/debug.go | 32 ++++++++++++++++++++++++++++++ client/internal/engine.go | 17 +++++++++++++++- client/internal/metrics/metrics.go | 32 ++++++++++++++++++++++++++++++ client/server/server.go | 5 ++++- go.mod | 3 +++ go.sum | 6 ++++++ 7 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 client/internal/metrics/metrics.go diff --git a/client/internal/connect.go b/client/internal/connect.go index 017c8bf10d8..7ca3d8f4933 100644 --- a/client/internal/connect.go +++ b/client/internal/connect.go @@ -22,6 +22,7 @@ import ( "github.com/netbirdio/netbird/client/iface/device" "github.com/netbirdio/netbird/client/internal/dns" "github.com/netbirdio/netbird/client/internal/listener" + "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/peer" "github.com/netbirdio/netbird/client/internal/profilemanager" "github.com/netbirdio/netbird/client/internal/statemanager" @@ -52,6 +53,7 @@ type ConnectClient struct { engineMutex sync.Mutex persistSyncResponse bool + clientMetrics *metrics.ClientMetrics } func NewConnectClient( @@ -59,7 +61,7 @@ func NewConnectClient( config *profilemanager.Config, statusRecorder *peer.Status, doInitalAutoUpdate bool, - + clientMetrics *metrics.ClientMetrics, ) *ConnectClient { return &ConnectClient{ ctx: ctx, @@ -67,6 +69,7 @@ func NewConnectClient( statusRecorder: statusRecorder, doInitialAutoUpdate: doInitalAutoUpdate, engineMutex: sync.Mutex{}, + clientMetrics: clientMetrics, } } @@ -308,7 +311,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan checks := loginResp.GetChecks() c.engineMutex.Lock() - engine := NewEngine(engineCtx, cancel, signalClient, mgmClient, relayManager, engineConfig, mobileDependency, 
c.statusRecorder, checks, stateManager) + engine := NewEngine(engineCtx, cancel, signalClient, mgmClient, relayManager, engineConfig, mobileDependency, c.statusRecorder, checks, stateManager, c.clientMetrics) engine.SetSyncResponsePersistence(c.persistSyncResponse) c.engine = engine c.engineMutex.Unlock() diff --git a/client/internal/debug/debug.go b/client/internal/debug/debug.go index 01a0377a5d2..25390fe6536 100644 --- a/client/internal/debug/debug.go +++ b/client/internal/debug/debug.go @@ -51,6 +51,7 @@ resolved_domains.txt: Anonymized resolved domain IP addresses from the status re config.txt: Anonymized configuration information of the NetBird client. network_map.json: Anonymized sync response containing peer configurations, routes, DNS settings, and firewall rules. state.json: Anonymized client state dump containing netbird states for the active profile. +metrics.txt: Client metrics in Prometheus format including connection statistics, reliability metrics, and performance indicators. mutex.prof: Mutex profiling information. goroutine.prof: Goroutine profiling information. block.prof: Block profiling information. 
@@ -216,6 +217,11 @@ const ( darwinStdoutLogPath = "/var/log/netbird.err.log" ) +// MetricsExporter is an interface for exporting metrics +type MetricsExporter interface { + Export(w io.Writer) error +} + type BundleGenerator struct { anonymizer *anonymize.Anonymizer @@ -224,6 +230,7 @@ type BundleGenerator struct { statusRecorder *peer.Status syncResponse *mgmProto.SyncResponse logFile string + clientMetrics MetricsExporter anonymize bool clientStatus string @@ -245,6 +252,7 @@ type GeneratorDependencies struct { StatusRecorder *peer.Status SyncResponse *mgmProto.SyncResponse LogFile string + ClientMetrics MetricsExporter } func NewBundleGenerator(deps GeneratorDependencies, cfg BundleConfig) *BundleGenerator { @@ -261,6 +269,7 @@ func NewBundleGenerator(deps GeneratorDependencies, cfg BundleConfig) *BundleGen statusRecorder: deps.StatusRecorder, syncResponse: deps.SyncResponse, logFile: deps.LogFile, + clientMetrics: deps.ClientMetrics, anonymize: cfg.Anonymize, clientStatus: cfg.ClientStatus, @@ -348,6 +357,10 @@ func (g *BundleGenerator) createArchive() error { log.Errorf("failed to add corrupted state files to debug bundle: %v", err) } + if err := g.addMetrics(); err != nil { + log.Errorf("failed to add metrics to debug bundle: %v", err) + } + if err := g.addWgShow(); err != nil { log.Errorf("failed to add wg show output: %v", err) } @@ -655,6 +668,25 @@ func (g *BundleGenerator) addStateFile() error { return nil } +func (g *BundleGenerator) addMetrics() error { + if g.clientMetrics == nil { + log.Debugf("skipping metrics in debug bundle: no metrics collector") + return nil + } + + var buf bytes.Buffer + if err := g.clientMetrics.Export(&buf); err != nil { + return fmt.Errorf("export metrics: %w", err) + } + + if err := g.addFileToZip(&buf, "metrics.txt"); err != nil { + return fmt.Errorf("add metrics file to zip: %w", err) + } + + log.Debugf("added metrics to debug bundle") + return nil +} + func (g *BundleGenerator) addUpdateLogs() error { inst := 
installer.New() logFiles := inst.LogFiles() diff --git a/client/internal/engine.go b/client/internal/engine.go index 4f18c3bc898..439058afcf4 100644 --- a/client/internal/engine.go +++ b/client/internal/engine.go @@ -35,6 +35,7 @@ import ( dnsconfig "github.com/netbirdio/netbird/client/internal/dns/config" "github.com/netbirdio/netbird/client/internal/dnsfwd" "github.com/netbirdio/netbird/client/internal/ingressgw" + "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/netflow" nftypes "github.com/netbirdio/netbird/client/internal/netflow/types" "github.com/netbirdio/netbird/client/internal/networkmonitor" @@ -211,6 +212,9 @@ type Engine struct { shutdownWg sync.WaitGroup probeStunTurn *relay.StunTurnProbe + + // clientMetrics collects and pushes metrics + clientMetrics *metrics.ClientMetrics } // Peer is an instance of the Connection Peer @@ -224,7 +228,7 @@ type localIpUpdater interface { } // NewEngine creates a new Connection Engine with probes attached -func NewEngine(clientCtx context.Context, clientCancel context.CancelFunc, signalClient signal.Client, mgmClient mgm.Client, relayManager *relayClient.Manager, config *EngineConfig, mobileDep MobileDependency, statusRecorder *peer.Status, checks []*mgmProto.Checks, stateManager *statemanager.Manager) *Engine { +func NewEngine(clientCtx context.Context, clientCancel context.CancelFunc, signalClient signal.Client, mgmClient mgm.Client, relayManager *relayClient.Manager, config *EngineConfig, mobileDep MobileDependency, statusRecorder *peer.Status, checks []*mgmProto.Checks, stateManager *statemanager.Manager, clientMetrics *metrics.ClientMetrics) *Engine { engine := &Engine{ clientCtx: clientCtx, clientCancel: clientCancel, @@ -244,6 +248,7 @@ func NewEngine(clientCtx context.Context, clientCancel context.CancelFunc, signa checks: checks, connSemaphore: semaphoregroup.NewSemaphoreGroup(connInitLimit), probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL), + 
clientMetrics: clientMetrics, } log.Infof("I am: %s", config.WgPrivateKey.PublicKey().String()) @@ -289,6 +294,11 @@ func (e *Engine) Stop() error { e.updateManager.Stop() } + // Update metrics engine status + if e.clientMetrics != nil { + e.clientMetrics.SetEngineStatus(0) // 0=stopped + } + log.Info("cleaning up status recorder states") e.statusRecorder.ReplaceOfflinePeers([]peer.State{}) e.statusRecorder.UpdateDNSStates([]peer.NSGroupState{}) @@ -519,6 +529,11 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) } }() + // Update metrics engine status + if e.clientMetrics != nil { + e.clientMetrics.SetEngineStatus(1) // 1=running + } + return nil } diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go new file mode 100644 index 00000000000..b1322c53228 --- /dev/null +++ b/client/internal/metrics/metrics.go @@ -0,0 +1,32 @@ +package metrics + +import ( + "io" + + "github.com/VictoriaMetrics/metrics" +) + +// ClientMetrics holds all client-side metrics +type ClientMetrics struct { + // ICE negotiation metrics + iceNegotiationDuration *metrics.Histogram +} + +// NewClientMetrics creates a new ClientMetrics instance +func NewClientMetrics() *ClientMetrics { + return &ClientMetrics{ + // ICE negotiation metrics + iceNegotiationDuration: metrics.NewHistogram(`netbird_client_ice_negotiation_duration_seconds`), + } +} + +// RecordICENegotiationDuration records the time taken for ICE negotiation +func (m *ClientMetrics) RecordICENegotiationDuration(seconds float64) { + m.iceNegotiationDuration.Update(seconds) +} + +// Export writes all metrics in Prometheus format to the provided writer +func (m *ClientMetrics) Export(w io.Writer) error { + metrics.WritePrometheus(w, true) + return nil +} diff --git a/client/server/server.go b/client/server/server.go index 7b6c4e98c11..66dca5cd28b 100644 --- a/client/server/server.go +++ b/client/server/server.go @@ -24,6 +24,7 @@ import ( 
"google.golang.org/protobuf/types/known/timestamppb" "github.com/netbirdio/netbird/client/internal/auth" + "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/profilemanager" "github.com/netbirdio/netbird/client/system" mgm "github.com/netbirdio/netbird/shared/management/client" @@ -76,6 +77,7 @@ type Server struct { statusRecorder *peer.Status sessionWatcher *internal.SessionWatcher + clientMetrics *metrics.ClientMetrics lastProbe time.Time persistSyncResponse bool @@ -109,6 +111,7 @@ func New(ctx context.Context, logFile string, configFile string, profilesDisable profilesDisabled: profilesDisabled, updateSettingsDisabled: updateSettingsDisabled, jwtCache: newJWTCache(), + clientMetrics: metrics.NewClientMetrics(), } } @@ -1524,7 +1527,7 @@ func (s *Server) GetFeatures(ctx context.Context, msg *proto.GetFeaturesRequest) func (s *Server) connect(ctx context.Context, config *profilemanager.Config, statusRecorder *peer.Status, doInitialAutoUpdate bool, runningChan chan struct{}) error { log.Tracef("running client connection") - s.connectClient = internal.NewConnectClient(ctx, config, statusRecorder, doInitialAutoUpdate) + s.connectClient = internal.NewConnectClient(ctx, config, statusRecorder, doInitialAutoUpdate, s.clientMetrics) s.connectClient.SetSyncResponsePersistence(s.persistSyncResponse) if err := s.connectClient.Run(runningChan); err != nil { return err diff --git a/go.mod b/go.mod index cf55b926046..5c1bd1e8aae 100644 --- a/go.mod +++ b/go.mod @@ -140,6 +140,7 @@ require ( github.com/Masterminds/sprig/v3 v3.3.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/Microsoft/hcsshim v0.12.3 // indirect + github.com/VictoriaMetrics/metrics v1.40.2 // indirect github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be // indirect github.com/awnumar/memcall v0.4.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.10 // indirect @@ -261,6 +262,8 @@ require ( 
github.com/stretchr/objx v0.5.2 // indirect github.com/tklauser/go-sysconf v0.3.14 // indirect github.com/tklauser/numcpus v0.8.0 // indirect + github.com/valyala/fastrand v1.1.0 // indirect + github.com/valyala/histogram v1.2.0 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect github.com/wlynxg/anet v0.0.3 // indirect diff --git a/go.sum b/go.sum index e89e0ef125a..0b1377e2b07 100644 --- a/go.sum +++ b/go.sum @@ -37,6 +37,8 @@ github.com/Microsoft/hcsshim v0.12.3 h1:LS9NXqXhMoqNCplK1ApmVSfB4UnVLRDWRapB6EIl github.com/Microsoft/hcsshim v0.12.3/go.mod h1:Iyl1WVpZzr+UkzjekHZbV8o5Z9ZkxNGx6CtY2Qg/JVQ= github.com/TheJumpCloud/jcapi-go v3.0.0+incompatible h1:hqcTK6ZISdip65SR792lwYJTa/axESA0889D3UlZbLo= github.com/TheJumpCloud/jcapi-go v3.0.0+incompatible/go.mod h1:6B1nuc1MUs6c62ODZDl7hVE5Pv7O2XGSkgg2olnq34I= +github.com/VictoriaMetrics/metrics v1.40.2 h1:OVSjKcQEx6JAwGeu8/KQm9Su5qJ72TMEW4xYn5vw3Ac= +github.com/VictoriaMetrics/metrics v1.40.2/go.mod h1:XE4uudAAIRaJE614Tl5HMrtoEU6+GDZO4QTnNSsZRuA= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= @@ -566,6 +568,10 @@ github.com/tklauser/numcpus v0.8.0 h1:Mx4Wwe/FjZLeQsK/6kt2EOepwwSl7SmJrK5bV/dXYg github.com/tklauser/numcpus v0.8.0/go.mod h1:ZJZlAY+dmR4eut8epnzf0u/VwodKmryxR8txiloSqBE= github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= +github.com/valyala/fastrand v1.1.0 h1:f+5HkLW4rsgzdNoleUOB69hyT9IlD2ZQh9GyDMfb5G8= +github.com/valyala/fastrand v1.1.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ= +github.com/valyala/histogram v1.2.0 
h1:wyYGAZZt3CpwUiIb9AU/Zbllg1llXyrtApRS815OLoQ= +github.com/valyala/histogram v1.2.0/go.mod h1:Hb4kBwb4UxsaNbbbh+RRz8ZR6pdodR57tzWUS3BUzXY= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= From e3a5c44d37e2e60560506d637f659d7e5cba1f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 15 Jan 2026 22:16:38 +0100 Subject: [PATCH 02/52] Add client metrics system with OpenTelemetry and VictoriaMetrics support Implements a comprehensive client metrics system to track peer connection stages and performance. The system supports multiple backend implementations (OpenTelemetry, VictoriaMetrics, and no-op) and tracks detailed connection stage durations from creation through WireGuard handshake. Key changes: - Add metrics package with pluggable backend implementations - Implement OpenTelemetry metrics backend - Implement VictoriaMetrics metrics backend - Add no-op metrics implementation for disabled state - Track connection stages: creation, semaphore, signaling, connection ready, and WireGuard handshake - Move WireGuard watcher functionality to conn.go - Refactor engine to integrate metrics tracking - Add metrics export endpoint in debug server --- client/internal/connect.go | 6 +- client/internal/engine.go | 38 ++-- client/internal/metrics/connection_type.go | 17 ++ client/internal/metrics/deployment_type.go | 46 ++++ client/internal/metrics/metrics.go | 62 +++-- client/internal/metrics/noop.go | 22 ++ client/internal/metrics/otel.go | 250 +++++++++++++++++++++ client/internal/metrics/victoria.go | 106 +++++++++ client/internal/peer/conn.go | 144 +++++++++++- client/internal/peer/wg_watcher.go | 38 +++- client/internal/peer/wg_watcher_test.go | 4 +- client/internal/peer/worker_relay.go | 20 -- client/server/debug.go | 8 + 
client/server/server.go | 5 +- shared/management/client/client.go | 1 + shared/management/client/grpc.go | 7 + 16 files changed, 694 insertions(+), 80 deletions(-) create mode 100644 client/internal/metrics/connection_type.go create mode 100644 client/internal/metrics/deployment_type.go create mode 100644 client/internal/metrics/noop.go create mode 100644 client/internal/metrics/otel.go create mode 100644 client/internal/metrics/victoria.go diff --git a/client/internal/connect.go b/client/internal/connect.go index 7ca3d8f4933..dad2e6b6b4c 100644 --- a/client/internal/connect.go +++ b/client/internal/connect.go @@ -22,7 +22,6 @@ import ( "github.com/netbirdio/netbird/client/iface/device" "github.com/netbirdio/netbird/client/internal/dns" "github.com/netbirdio/netbird/client/internal/listener" - "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/peer" "github.com/netbirdio/netbird/client/internal/profilemanager" "github.com/netbirdio/netbird/client/internal/statemanager" @@ -53,7 +52,6 @@ type ConnectClient struct { engineMutex sync.Mutex persistSyncResponse bool - clientMetrics *metrics.ClientMetrics } func NewConnectClient( @@ -61,7 +59,6 @@ func NewConnectClient( config *profilemanager.Config, statusRecorder *peer.Status, doInitalAutoUpdate bool, - clientMetrics *metrics.ClientMetrics, ) *ConnectClient { return &ConnectClient{ ctx: ctx, @@ -69,7 +66,6 @@ func NewConnectClient( statusRecorder: statusRecorder, doInitialAutoUpdate: doInitalAutoUpdate, engineMutex: sync.Mutex{}, - clientMetrics: clientMetrics, } } @@ -311,7 +307,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan checks := loginResp.GetChecks() c.engineMutex.Lock() - engine := NewEngine(engineCtx, cancel, signalClient, mgmClient, relayManager, engineConfig, mobileDependency, c.statusRecorder, checks, stateManager, c.clientMetrics) + engine := NewEngine(engineCtx, cancel, signalClient, mgmClient, relayManager, 
engineConfig, mobileDependency, c.statusRecorder, checks, stateManager) engine.SetSyncResponsePersistence(c.persistSyncResponse) c.engine = engine c.engineMutex.Unlock() diff --git a/client/internal/engine.go b/client/internal/engine.go index 439058afcf4..7e6a933f415 100644 --- a/client/internal/engine.go +++ b/client/internal/engine.go @@ -228,7 +228,13 @@ type localIpUpdater interface { } // NewEngine creates a new Connection Engine with probes attached -func NewEngine(clientCtx context.Context, clientCancel context.CancelFunc, signalClient signal.Client, mgmClient mgm.Client, relayManager *relayClient.Manager, config *EngineConfig, mobileDep MobileDependency, statusRecorder *peer.Status, checks []*mgmProto.Checks, stateManager *statemanager.Manager, clientMetrics *metrics.ClientMetrics) *Engine { +func NewEngine(clientCtx context.Context, clientCancel context.CancelFunc, signalClient signal.Client, mgmClient mgm.Client, relayManager *relayClient.Manager, config *EngineConfig, mobileDep MobileDependency, statusRecorder *peer.Status, checks []*mgmProto.Checks, stateManager *statemanager.Manager) *Engine { + // Initialize metrics based on deployment type + var deploymentType metrics.DeploymentType + if mgmClient != nil { + deploymentType = metrics.DetermineDeploymentType(mgmClient.GetServerURL()) + } + engine := &Engine{ clientCtx: clientCtx, clientCancel: clientCancel, @@ -248,7 +254,7 @@ func NewEngine(clientCtx context.Context, clientCancel context.CancelFunc, signa checks: checks, connSemaphore: semaphoregroup.NewSemaphoreGroup(connInitLimit), probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL), - clientMetrics: clientMetrics, + clientMetrics: metrics.NewClientMetrics(deploymentType, true), } log.Infof("I am: %s", config.WgPrivateKey.PublicKey().String()) @@ -294,11 +300,6 @@ func (e *Engine) Stop() error { e.updateManager.Stop() } - // Update metrics engine status - if e.clientMetrics != nil { - e.clientMetrics.SetEngineStatus(0) // 0=stopped - } - 
log.Info("cleaning up status recorder states") e.statusRecorder.ReplaceOfflinePeers([]peer.State{}) e.statusRecorder.UpdateDNSStates([]peer.NSGroupState{}) @@ -529,11 +530,6 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) } }() - // Update metrics engine status - if e.clientMetrics != nil { - e.clientMetrics.SetEngineStatus(1) // 1=running - } - return nil } @@ -1400,12 +1396,13 @@ func (e *Engine) createPeerConn(pubKey string, allowedIPs []netip.Prefix, agentV } serviceDependencies := peer.ServiceDependencies{ - StatusRecorder: e.statusRecorder, - Signaler: e.signaler, - IFaceDiscover: e.mobileDep.IFaceDiscover, - RelayManager: e.relayManager, - SrWatcher: e.srWatcher, - Semaphore: e.connSemaphore, + StatusRecorder: e.statusRecorder, + Signaler: e.signaler, + IFaceDiscover: e.mobileDep.IFaceDiscover, + RelayManager: e.relayManager, + SrWatcher: e.srWatcher, + Semaphore: e.connSemaphore, + MetricsRecorder: e.clientMetrics, } peerConn, err := peer.NewConn(config, serviceDependencies) if err != nil { @@ -1689,6 +1686,11 @@ func (e *Engine) GetFirewallManager() firewallManager.Manager { return e.firewall } +// GetClientMetrics returns the client metrics +func (e *Engine) GetClientMetrics() *metrics.ClientMetrics { + return e.clientMetrics +} + func findIPFromInterfaceName(ifaceName string) (net.IP, error) { iface, err := net.InterfaceByName(ifaceName) if err != nil { diff --git a/client/internal/metrics/connection_type.go b/client/internal/metrics/connection_type.go new file mode 100644 index 00000000000..a3406a6b89a --- /dev/null +++ b/client/internal/metrics/connection_type.go @@ -0,0 +1,17 @@ +package metrics + +// ConnectionType represents the type of peer connection +type ConnectionType string + +const ( + // ConnectionTypeICE represents a direct peer-to-peer connection using ICE + ConnectionTypeICE ConnectionType = "ice" + + // ConnectionTypeRelay represents a relayed connection + ConnectionTypeRelay ConnectionType = "relay" +) 
+ +// String returns the string representation of the connection type +func (c ConnectionType) String() string { + return string(c) +} diff --git a/client/internal/metrics/deployment_type.go b/client/internal/metrics/deployment_type.go new file mode 100644 index 00000000000..e1d9e8d539e --- /dev/null +++ b/client/internal/metrics/deployment_type.go @@ -0,0 +1,46 @@ +package metrics + +import ( + "strings" +) + +// DeploymentType represents the type of NetBird deployment +type DeploymentType int + +const ( + // DeploymentTypeUnknown represents an unknown or uninitialized deployment type + DeploymentTypeUnknown DeploymentType = iota + + // DeploymentTypeCloud represents a cloud-hosted NetBird deployment + DeploymentTypeCloud + + // DeploymentTypeSelfHosted represents a self-hosted NetBird deployment + DeploymentTypeSelfHosted +) + +// String returns the string representation of the deployment type +func (d DeploymentType) String() string { + switch d { + case DeploymentTypeCloud: + return "cloud" + case DeploymentTypeSelfHosted: + return "selfhosted" + default: + return "selfhosted" + } +} + +// DetermineDeploymentType determines if the deployment is cloud or self-hosted +// based on the management URL string +func DetermineDeploymentType(managementURL string) DeploymentType { + if managementURL == "" { + return DeploymentTypeUnknown + } + + // Check for NetBird cloud API domain + if strings.Contains(strings.ToLower(managementURL), "api.netbird.io") { + return DeploymentTypeCloud + } + + return DeploymentTypeSelfHosted +} diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index b1322c53228..787ffc1ac3f 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -1,32 +1,62 @@ package metrics import ( + "context" "io" - - "github.com/VictoriaMetrics/metrics" + "time" ) -// ClientMetrics holds all client-side metrics +// metricsImplementation defines the internal interface for metrics implementations 
+type metricsImplementation interface { + // RecordConnectionStages records connection stage metrics from timestamps + RecordConnectionStages( + ctx context.Context, + connectionType ConnectionType, + isReconnection bool, + timestamps ConnectionStageTimestamps, + ) + + // Export exports metrics in Prometheus format + Export(w io.Writer) error +} + type ClientMetrics struct { - // ICE negotiation metrics - iceNegotiationDuration *metrics.Histogram + impl metricsImplementation +} + +// ConnectionStageTimestamps holds timestamps for each connection stage +type ConnectionStageTimestamps struct { + Created time.Time + SemaphoreAcquired time.Time + Signaling time.Time // First signal sent (initial) or signal received (reconnection) + ConnectionReady time.Time + WgHandshakeSuccess time.Time } // NewClientMetrics creates a new ClientMetrics instance -func NewClientMetrics() *ClientMetrics { - return &ClientMetrics{ - // ICE negotiation metrics - iceNegotiationDuration: metrics.NewHistogram(`netbird_client_ice_negotiation_duration_seconds`), +// If enabled is true, uses an OpenTelemetry implementation +// If enabled is false, uses a no-op implementation +func NewClientMetrics(deploymentType DeploymentType, enabled bool) *ClientMetrics { + var impl metricsImplementation + if !enabled { + impl = &noopMetrics{} + } else { + impl = newOtelMetrics(deploymentType) } + return &ClientMetrics{impl: impl} } -// RecordICENegotiationDuration records the time taken for ICE negotiation -func (m *ClientMetrics) RecordICENegotiationDuration(seconds float64) { - m.iceNegotiationDuration.Update(seconds) +// RecordConnectionStages calculates stage durations from timestamps and records them +func (c *ClientMetrics) RecordConnectionStages( + ctx context.Context, + connectionType ConnectionType, + isReconnection bool, + timestamps ConnectionStageTimestamps, +) { + c.impl.RecordConnectionStages(ctx, connectionType, isReconnection, timestamps) } -// Export writes all metrics in Prometheus format 
to the provided writer -func (m *ClientMetrics) Export(w io.Writer) error { - metrics.WritePrometheus(w, true) - return nil +// Export exports metrics to the writer +func (c *ClientMetrics) Export(w io.Writer) error { + return c.impl.Export(w) } diff --git a/client/internal/metrics/noop.go b/client/internal/metrics/noop.go new file mode 100644 index 00000000000..bf8aa432007 --- /dev/null +++ b/client/internal/metrics/noop.go @@ -0,0 +1,22 @@ +package metrics + +import ( + "context" + "io" +) + +// noopMetrics is a no-op implementation of metricsImplementation +type noopMetrics struct{} + +func (s *noopMetrics) RecordConnectionStages( + _ context.Context, + _ ConnectionType, + _ bool, + _ ConnectionStageTimestamps, +) { + // No-op +} + +func (s *noopMetrics) Export(_ io.Writer) error { + return nil +} diff --git a/client/internal/metrics/otel.go b/client/internal/metrics/otel.go new file mode 100644 index 00000000000..43bb1223268 --- /dev/null +++ b/client/internal/metrics/otel.go @@ -0,0 +1,250 @@ +package metrics + +import ( + "context" + "fmt" + "io" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +// otelMetrics is the OpenTelemetry implementation of ClientMetrics +type otelMetrics struct { + reader *sdkmetric.ManualReader + meterProvider *sdkmetric.MeterProvider + meter metric.Meter + + // Static attributes applied to all metrics + deploymentType DeploymentType + + // Connection stage duration histograms + stageCreationToSemaphore metric.Float64Histogram + stageSemaphoreToSignaling metric.Float64Histogram + stageSignalingToConnection metric.Float64Histogram + stageConnectionToHandshake metric.Float64Histogram + stageTotalCreationToHandshake metric.Float64Histogram +} + +func newOtelMetrics(deploymentType DeploymentType) metricsImplementation { + reader := sdkmetric.NewManualReader() + meterProvider 
:= sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + otel.SetMeterProvider(meterProvider) + + meter := meterProvider.Meter("netbird.client") + + stageCreationToSemaphore, err := meter.Float64Histogram( + "netbird.peer.connection.stage.creation_to_semaphore", + metric.WithDescription("Duration from connection creation to semaphore acquisition"), + metric.WithUnit("s"), + ) + if err != nil { + return &noopMetrics{} + } + + stageSemaphoreToSignaling, err := meter.Float64Histogram( + "netbird.peer.connection.stage.semaphore_to_signaling", + metric.WithDescription("Duration from semaphore acquisition to signaling start"), + metric.WithUnit("s"), + ) + if err != nil { + return &noopMetrics{} + } + + stageSignalingToConnection, err := meter.Float64Histogram( + "netbird.peer.connection.stage.signaling_to_connection", + metric.WithDescription("Duration from signaling start to connection ready"), + metric.WithUnit("s"), + ) + if err != nil { + return &noopMetrics{} + } + + stageConnectionToHandshake, err := meter.Float64Histogram( + "netbird.peer.connection.stage.connection_to_handshake", + metric.WithDescription("Duration from connection ready to WireGuard handshake success"), + metric.WithUnit("s"), + ) + if err != nil { + return &noopMetrics{} + } + + stageTotalCreationToHandshake, err := meter.Float64Histogram( + "netbird.peer.connection.total.creation_to_handshake", + metric.WithDescription("Total duration from connection creation to WireGuard handshake success"), + metric.WithUnit("s"), + ) + if err != nil { + return &noopMetrics{} + } + + return &otelMetrics{ + reader: reader, + meterProvider: meterProvider, + meter: meter, + deploymentType: deploymentType, + stageCreationToSemaphore: stageCreationToSemaphore, + stageSemaphoreToSignaling: stageSemaphoreToSignaling, + stageSignalingToConnection: stageSignalingToConnection, + stageConnectionToHandshake: stageConnectionToHandshake, + stageTotalCreationToHandshake: stageTotalCreationToHandshake, + } +} + +// 
RecordConnectionStages records the duration of each connection stage from timestamps +func (m *otelMetrics) RecordConnectionStages( + ctx context.Context, + connectionType ConnectionType, + isReconnection bool, + timestamps ConnectionStageTimestamps, +) { + // Calculate stage durations + var creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration float64 + + if !timestamps.Created.IsZero() && !timestamps.SemaphoreAcquired.IsZero() { + creationToSemaphore = timestamps.SemaphoreAcquired.Sub(timestamps.Created).Seconds() + } + + if !timestamps.SemaphoreAcquired.IsZero() && !timestamps.Signaling.IsZero() { + semaphoreToSignaling = timestamps.Signaling.Sub(timestamps.SemaphoreAcquired).Seconds() + } + + if !timestamps.Signaling.IsZero() && !timestamps.ConnectionReady.IsZero() { + signalingToConnection = timestamps.ConnectionReady.Sub(timestamps.Signaling).Seconds() + } + + if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + connectionToHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() + } + + if !timestamps.Created.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Created).Seconds() + } + + // Determine attempt type + attemptType := "initial" + if isReconnection { + attemptType = "reconnection" + } + + // Combine deployment type, connection type, and attempt type attributes + attrs := metric.WithAttributes( + attribute.String("deployment_type", m.deploymentType.String()), + attribute.String("connection_type", connectionType.String()), + attribute.String("attempt_type", attemptType), + ) + + m.stageCreationToSemaphore.Record(ctx, creationToSemaphore, attrs) + m.stageSemaphoreToSignaling.Record(ctx, semaphoreToSignaling, attrs) + m.stageSignalingToConnection.Record(ctx, signalingToConnection, attrs) + m.stageConnectionToHandshake.Record(ctx, connectionToHandshake, attrs) + 
m.stageTotalCreationToHandshake.Record(ctx, totalDuration, attrs) +} + +// Export writes metrics in Prometheus text format +func (m *otelMetrics) Export(w io.Writer) error { + if m.reader == nil { + return fmt.Errorf("metrics reader not initialized") + } + + // Collect current metrics + var rm metricdata.ResourceMetrics + if err := m.reader.Collect(context.Background(), &rm); err != nil { + return fmt.Errorf("failed to collect metrics: %w", err) + } + + // Iterate through scope metrics and write in Prometheus format + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + // Write HELP line + if _, err := fmt.Fprintf(w, "# HELP %s %s\n", m.Name, m.Description); err != nil { + return err + } + + // Write TYPE line + if _, err := fmt.Fprintf(w, "# TYPE %s histogram\n", m.Name); err != nil { + return err + } + + // Handle histogram data + if hist, ok := m.Data.(metricdata.Histogram[float64]); ok { + for _, dp := range hist.DataPoints { + // Build label string from attributes + labelStr := "" + if len(dp.Attributes.ToSlice()) > 0 { + labels := "" + for _, attr := range dp.Attributes.ToSlice() { + if labels != "" { + labels += "," + } + labels += fmt.Sprintf("%s=\"%s\"", attr.Key, attr.Value.AsString()) + } + labelStr = labels + } + + // Write bucket counts + cumulativeCount := uint64(0) + for i, bound := range dp.Bounds { + cumulativeCount += dp.BucketCounts[i] + bucketLabel := labelStr + if bucketLabel != "" { + bucketLabel += "," + } + bucketLabel += fmt.Sprintf("le=\"%g\"", bound) + if _, err := fmt.Fprintf(w, "%s_bucket{%s} %d\n", + m.Name, bucketLabel, cumulativeCount); err != nil { + return err + } + } + + // Write +Inf bucket (last bucket count) + if len(dp.BucketCounts) > len(dp.Bounds) { + cumulativeCount += dp.BucketCounts[len(dp.BucketCounts)-1] + } + bucketLabel := labelStr + if bucketLabel != "" { + bucketLabel += "," + } + bucketLabel += "le=\"+Inf\"" + if _, err := fmt.Fprintf(w, "%s_bucket{%s} %d\n", + m.Name, bucketLabel, 
cumulativeCount); err != nil { + return err + } + + // Write sum + if labelStr != "" { + if _, err := fmt.Fprintf(w, "%s_sum{%s} %g\n", m.Name, labelStr, dp.Sum); err != nil { + return err + } + } else { + if _, err := fmt.Fprintf(w, "%s_sum %g\n", m.Name, dp.Sum); err != nil { + return err + } + } + + // Write count + if labelStr != "" { + if _, err := fmt.Fprintf(w, "%s_count{%s} %d\n", m.Name, labelStr, dp.Count); err != nil { + return err + } + } else { + if _, err := fmt.Fprintf(w, "%s_count %d\n", m.Name, dp.Count); err != nil { + return err + } + } + } + } + + // Empty line between metrics + if _, err := fmt.Fprintf(w, "\n"); err != nil { + return err + } + } + } + + return nil +} diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go new file mode 100644 index 00000000000..0ea953003be --- /dev/null +++ b/client/internal/metrics/victoria.go @@ -0,0 +1,106 @@ +package metrics + +import ( + "context" + "fmt" + "io" + + "github.com/VictoriaMetrics/metrics" +) + +// victoriaMetrics is the VictoriaMetrics implementation of ClientMetrics +type victoriaMetrics struct { + // Static attributes applied to all metrics + deploymentType DeploymentType + + // Metrics set for managing all metrics + set *metrics.Set +} + +func newVictoriaMetrics(deploymentType DeploymentType) metricsImplementation { + return &victoriaMetrics{ + deploymentType: deploymentType, + set: metrics.NewSet(), + } +} + +// RecordConnectionStages records the duration of each connection stage from timestamps +func (m *victoriaMetrics) RecordConnectionStages( + ctx context.Context, + connectionType ConnectionType, + isReconnection bool, + timestamps ConnectionStageTimestamps, +) { + // Calculate stage durations + var creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration float64 + + if !timestamps.Created.IsZero() && !timestamps.SemaphoreAcquired.IsZero() { + creationToSemaphore = 
timestamps.SemaphoreAcquired.Sub(timestamps.Created).Seconds() + } + + if !timestamps.SemaphoreAcquired.IsZero() && !timestamps.Signaling.IsZero() { + semaphoreToSignaling = timestamps.Signaling.Sub(timestamps.SemaphoreAcquired).Seconds() + } + + if !timestamps.Signaling.IsZero() && !timestamps.ConnectionReady.IsZero() { + signalingToConnection = timestamps.ConnectionReady.Sub(timestamps.Signaling).Seconds() + } + + if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + connectionToHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() + } + + if !timestamps.Created.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Created).Seconds() + } + + // Determine attempt type + attemptType := "initial" + if isReconnection { + attemptType = "reconnection" + } + + connTypeStr := connectionType.String() + + // Record observations using histograms + m.set.GetOrCreateHistogram( + m.getMetricName("netbird_peer_connection_stage_creation_to_semaphore", connTypeStr, attemptType), + ).Update(creationToSemaphore) + + m.set.GetOrCreateHistogram( + m.getMetricName("netbird_peer_connection_stage_semaphore_to_signaling", connTypeStr, attemptType), + ).Update(semaphoreToSignaling) + + m.set.GetOrCreateHistogram( + m.getMetricName("netbird_peer_connection_stage_signaling_to_connection", connTypeStr, attemptType), + ).Update(signalingToConnection) + + m.set.GetOrCreateHistogram( + m.getMetricName("netbird_peer_connection_stage_connection_to_handshake", connTypeStr, attemptType), + ).Update(connectionToHandshake) + + m.set.GetOrCreateHistogram( + m.getMetricName("netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), + ).Update(totalDuration) +} + +// getMetricName constructs a metric name with labels +func (m *victoriaMetrics) getMetricName(baseName, connectionType, attemptType string) string { + return 
fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q}`, + baseName, + m.deploymentType.String(), + connectionType, + attemptType, + ) +} + +// Export writes metrics in Prometheus text format +func (m *victoriaMetrics) Export(w io.Writer) error { + if m.set == nil { + return fmt.Errorf("metrics set not initialized") + } + + // Write metrics in Prometheus format + m.set.WritePrometheus(w) + return nil +} diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index 20a2eb34225..0efd95918c2 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -16,6 +16,7 @@ import ( "github.com/netbirdio/netbird/client/iface/configurer" "github.com/netbirdio/netbird/client/iface/wgproxy" + "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/peer/conntype" "github.com/netbirdio/netbird/client/internal/peer/dispatcher" "github.com/netbirdio/netbird/client/internal/peer/guard" @@ -28,6 +29,16 @@ import ( semaphoregroup "github.com/netbirdio/netbird/util/semaphore-group" ) +// MetricsRecorder is an interface for recording peer connection metrics +type MetricsRecorder interface { + RecordConnectionStages( + ctx context.Context, + connectionType metrics.ConnectionType, + isReconnection bool, + timestamps metrics.ConnectionStageTimestamps, + ) +} + type ServiceDependencies struct { StatusRecorder *Status Signaler *Signaler @@ -36,6 +47,7 @@ type ServiceDependencies struct { SrWatcher *guard.SRWatcher Semaphore *semaphoregroup.SemaphoreGroup PeerConnDispatcher *dispatcher.ConnectionDispatcher + MetricsRecorder MetricsRecorder } type WgConfig struct { @@ -98,6 +110,7 @@ type Conn struct { workerICE *WorkerICE workerRelay *WorkerRelay + wgWatcher *WGWatcher wgWatcherWg sync.WaitGroup // used to store the remote Rosenpass key for Relayed connection in case of connection update from ice @@ -115,6 +128,13 @@ type Conn struct { dumpState *stateDump endpointUpdater *EndpointUpdater + + // 
Connection stage timestamps for metrics + metricsRecorder MetricsRecorder + hasBeenConnected bool // Track if we've ever established a successful connection + isReconnectionAttempt bool // Track if current attempt is a reconnection + stageTimestamps metrics.ConnectionStageTimestamps + stagesMutex sync.Mutex } // NewConn creates a new not opened Conn to the remote peer. @@ -126,6 +146,7 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) { connLog := log.WithField("peer", config.Key) + dumpState := newStateDump(config.Key, connLog, services.StatusRecorder) var conn = &Conn{ Log: connLog, config: config, @@ -137,10 +158,13 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) { semaphore: services.Semaphore, statusRelay: worker.NewAtomicStatus(), statusICE: worker.NewAtomicStatus(), - dumpState: newStateDump(config.Key, connLog, services.StatusRecorder), + dumpState: dumpState, endpointUpdater: NewEndpointUpdater(connLog, config.WgConfig, isController(config)), + metricsRecorder: services.MetricsRecorder, } + conn.wgWatcher = NewWGWatcher(connLog, config.WgConfig.WgInterface, config.Key, dumpState, conn.onWGHandshakeSuccess) + return conn, nil } @@ -148,10 +172,21 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) { // It will try to establish a connection using ICE and in parallel with relay. The higher priority connection type will // be used. 
func (conn *Conn) Open(engineCtx context.Context) error { + // Record the start time - beginning of connection attempt + conn.stagesMutex.Lock() + conn.stageTimestamps.Created = time.Now() + conn.stagesMutex.Unlock() + + // Semaphore.Add() blocks here until there's a free slot if err := conn.semaphore.Add(engineCtx); err != nil { return err } + // Record when semaphore was acquired (after the wait) + conn.stagesMutex.Lock() + conn.stageTimestamps.SemaphoreAcquired = time.Now() + conn.stagesMutex.Unlock() + conn.mu.Lock() defer conn.mu.Unlock() @@ -231,7 +266,7 @@ func (conn *Conn) Close(signalToRemote bool) { conn.Log.Infof("close peer connection") conn.ctxCancel() - conn.workerRelay.DisableWgWatcher() + conn.wgWatcher.DisableWgWatcher() conn.workerRelay.CloseConn() conn.workerICE.Close() @@ -260,6 +295,14 @@ func (conn *Conn) Close(signalToRemote bool) { } conn.setStatusToDisconnected() + + // Reset connection metrics state + conn.stagesMutex.Lock() + conn.hasBeenConnected = false + conn.isReconnectionAttempt = false + conn.stageTimestamps = metrics.ConnectionStageTimestamps{} + conn.stagesMutex.Unlock() + conn.opened = false conn.wg.Wait() conn.Log.Infof("peer connection closed") @@ -292,6 +335,22 @@ func (conn *Conn) SetOnDisconnected(handler func(remotePeer string)) { func (conn *Conn) OnRemoteOffer(offer OfferAnswer) { conn.dumpState.RemoteOffer() conn.Log.Infof("OnRemoteOffer, on status ICE: %s, status Relay: %s", conn.statusICE, conn.statusRelay) + + conn.stagesMutex.Lock() + // Detect reconnection: we had been connected before, but now both ICE and Relay are disconnected + if conn.hasBeenConnected && conn.evalStatus() != StatusConnected { + conn.isReconnectionAttempt = true + conn.stageTimestamps = metrics.ConnectionStageTimestamps{} // Reset for new reconnection attempt + now := time.Now() + conn.stageTimestamps.Created = now + conn.stageTimestamps.Signaling = now + conn.Log.Infof("Reconnection triggered by remote offer") + } else if 
conn.stageTimestamps.Signaling.IsZero() { + // First time receiving offer for this connection attempt (signaling start) + conn.stageTimestamps.Signaling = time.Now() + } + conn.stagesMutex.Unlock() + conn.handshaker.OnRemoteOffer(offer) } @@ -366,7 +425,7 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn ep = directEp } - conn.workerRelay.DisableWgWatcher() + conn.wgWatcher.DisableWgWatcher() // todo consider to run conn.wgWatcherWg.Wait() here if conn.wgProxyRelay != nil { @@ -390,6 +449,12 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn conn.wgProxyRelay.RedirectAs(ep) } + conn.wgWatcherWg.Add(1) + go func() { + defer conn.wgWatcherWg.Done() + conn.wgWatcher.EnableWgWatcher(conn.ctx, nil) + }() + conn.currentConnPriority = priority conn.statusICE.SetConnected() conn.updateIceState(iceConnInfo) @@ -423,15 +488,17 @@ func (conn *Conn) onICEStateDisconnected() { conn.Log.Errorf("failed to switch to relay conn: %v", err) } + conn.wgWatcher.DisableWgWatcher() conn.wgWatcherWg.Add(1) go func() { defer conn.wgWatcherWg.Done() - conn.workerRelay.EnableWgWatcher(conn.ctx) + conn.wgWatcher.EnableWgWatcher(conn.ctx, conn.onWGDisconnected) }() conn.wgProxyRelay.Work() conn.currentConnPriority = conntype.Relay } else { conn.Log.Infof("ICE disconnected, do not switch to Relay. 
Reset priority to: %s", conntype.None.String()) + conn.wgWatcher.DisableWgWatcher() conn.currentConnPriority = conntype.None if err := conn.config.WgConfig.WgInterface.RemoveEndpointAddress(conn.config.WgConfig.RemoteKey); err != nil { conn.Log.Errorf("failed to remove wg endpoint: %v", err) @@ -500,10 +567,11 @@ func (conn *Conn) onRelayConnectionIsReady(rci RelayConnInfo) { return } + conn.wgWatcher.DisableWgWatcher() conn.wgWatcherWg.Add(1) go func() { defer conn.wgWatcherWg.Done() - conn.workerRelay.EnableWgWatcher(conn.ctx) + conn.wgWatcher.EnableWgWatcher(conn.ctx, conn.onWGDisconnected) }() wgConfigWorkaround() @@ -560,7 +628,15 @@ func (conn *Conn) onGuardEvent() { conn.dumpState.SendOffer() if err := conn.handshaker.SendOffer(); err != nil { conn.Log.Errorf("failed to send offer: %v", err) + return + } + + // Record signaling start timestamp (first signal sent) + conn.stagesMutex.Lock() + if conn.stageTimestamps.Signaling.IsZero() { + conn.stageTimestamps.Signaling = time.Now() } + conn.stagesMutex.Unlock() } func (conn *Conn) updateRelayStatus(relayServerAddr string, rosenpassPubKey []byte) { @@ -625,6 +701,20 @@ func (conn *Conn) doOnConnected(remoteRosenpassPubKey []byte, remoteRosenpassAdd runtime.GC() } + // Record connection ready timestamp and mark as connected + conn.stagesMutex.Lock() + if conn.stageTimestamps.ConnectionReady.IsZero() { + conn.stageTimestamps.ConnectionReady = time.Now() + } + // Mark that we've established a connection + conn.hasBeenConnected = true + + // todo: remove this when fixed the wireguard watcher + conn.stageTimestamps.WgHandshakeSuccess = time.Now() + conn.recordConnectionMetrics() + + conn.stagesMutex.Unlock() + if conn.onConnected != nil { conn.onConnected(conn.config.Key, remoteRosenpassPubKey, conn.config.WgConfig.AllowedIps[0].Addr().String(), remoteRosenpassAddr) } @@ -734,6 +824,50 @@ func (conn *Conn) setRelayedProxy(proxy wgproxy.Proxy) { conn.wgProxyRelay = proxy } +// onWGDisconnected is called when the 
WireGuard handshake times out +func (conn *Conn) onWGDisconnected() { + conn.workerRelay.CloseConn() + conn.onRelayDisconnected() +} + +// onWGHandshakeSuccess is called when the first WireGuard handshake is detected +func (conn *Conn) onWGHandshakeSuccess() { + conn.stagesMutex.Lock() + defer conn.stagesMutex.Unlock() + + /* + if conn.stageTimestamps.WgHandshakeSuccess.IsZero() { + conn.stageTimestamps.WgHandshakeSuccess = time.Now() + conn.recordConnectionMetrics() + } + + */ +} + +// recordConnectionMetrics records connection stage timestamps as metrics +func (conn *Conn) recordConnectionMetrics() { + if conn.metricsRecorder == nil { + return + } + + // Determine connection type based on current priority + var connType metrics.ConnectionType + switch conn.currentConnPriority { + case conntype.Relay: + connType = metrics.ConnectionTypeRelay + default: + connType = metrics.ConnectionTypeICE + } + + // Record metrics with timestamps - duration calculation happens in metrics package + conn.metricsRecorder.RecordConnectionStages( + context.Background(), + connType, + conn.isReconnectionAttempt, + conn.stageTimestamps, + ) +} + // AllowedIP returns the allowed IP of the remote peer func (conn *Conn) AllowedIP() netip.Addr { return conn.config.WgConfig.AllowedIps[0].Addr() diff --git a/client/internal/peer/wg_watcher.go b/client/internal/peer/wg_watcher.go index 0ed200fdafc..39241d5968c 100644 --- a/client/internal/peer/wg_watcher.go +++ b/client/internal/peer/wg_watcher.go @@ -34,14 +34,17 @@ type WGWatcher struct { ctxCancel context.CancelFunc ctxLock sync.Mutex enabledTime time.Time + + onFirstHandshakeFn func() } -func NewWGWatcher(log *log.Entry, wgIfaceStater WGInterfaceStater, peerKey string, stateDump *stateDump) *WGWatcher { +func NewWGWatcher(log *log.Entry, wgIfaceStater WGInterfaceStater, peerKey string, stateDump *stateDump, onFirstHandshakeFn func()) *WGWatcher { return &WGWatcher{ - log: log, - wgIfaceStater: wgIfaceStater, - peerKey: peerKey, - 
stateDump: stateDump, + log: log, + wgIfaceStater: wgIfaceStater, + peerKey: peerKey, + stateDump: stateDump, + onFirstHandshakeFn: onFirstHandshakeFn, } } @@ -100,12 +103,17 @@ func (w *WGWatcher) periodicHandshakeCheck(ctx context.Context, ctxCancel contex case <-timer.C: handshake, ok := w.handshakeCheck(lastHandshake) if !ok { - onDisconnectedFn() + if onDisconnectedFn != nil { + onDisconnectedFn() + } return } if lastHandshake.IsZero() { - elapsed := handshake.Sub(w.enabledTime).Seconds() + elapsed := w.calcElapsed(handshake) w.log.Infof("first wg handshake detected within: %.2fsec, (%s)", elapsed, handshake) + if w.onFirstHandshakeFn != nil { + w.onFirstHandshakeFn() + } } lastHandshake = *handshake @@ -134,19 +142,19 @@ func (w *WGWatcher) handshakeCheck(lastHandshake time.Time) (*time.Time, bool) { // the current know handshake did not change if handshake.Equal(lastHandshake) { - w.log.Warnf("WireGuard handshake timed out, closing relay connection: %v", handshake) + w.log.Warnf("WireGuard handshake timed out: %v", handshake) return nil, false } // in case if the machine is suspended, the handshake time will be in the past if handshake.Add(checkPeriod).Before(time.Now()) { - w.log.Warnf("WireGuard handshake timed out, closing relay connection: %v", handshake) + w.log.Warnf("WireGuard handshake timed out: %v", handshake) return nil, false } // error handling for handshake time in the future if handshake.After(time.Now()) { - w.log.Warnf("WireGuard handshake is in the future, closing relay connection: %v", handshake) + w.log.Warnf("WireGuard handshake is in the future: %v", handshake) return nil, false } @@ -164,3 +172,13 @@ func (w *WGWatcher) wgState() (time.Time, error) { } return wgState.LastHandshake, nil } + +// calcElapsed calculates elapsed time since watcher was enabled. 
+// The watcher started after the wg configuration happens, because of this need to normalise the negative value +func (w *WGWatcher) calcElapsed(handshake *time.Time) float64 { + elapsed := handshake.Sub(w.enabledTime).Seconds() + if elapsed < 0 { + elapsed = 0 + } + return elapsed +} diff --git a/client/internal/peer/wg_watcher_test.go b/client/internal/peer/wg_watcher_test.go index d7c277eff14..3442fa786cd 100644 --- a/client/internal/peer/wg_watcher_test.go +++ b/client/internal/peer/wg_watcher_test.go @@ -28,7 +28,7 @@ func TestWGWatcher_EnableWgWatcher(t *testing.T) { mlog := log.WithField("peer", "tet") mocWgIface := &MocWgIface{} - watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{})) + watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}), nil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -57,7 +57,7 @@ func TestWGWatcher_ReEnable(t *testing.T) { mlog := log.WithField("peer", "tet") mocWgIface := &MocWgIface{} - watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{})) + watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}), nil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/client/internal/peer/worker_relay.go b/client/internal/peer/worker_relay.go index f584487f53d..a9fb72d2b39 100644 --- a/client/internal/peer/worker_relay.go +++ b/client/internal/peer/worker_relay.go @@ -30,8 +30,6 @@ type WorkerRelay struct { relayLock sync.Mutex relaySupportedOnRemotePeer atomic.Bool - - wgWatcher *WGWatcher } func NewWorkerRelay(ctx context.Context, log *log.Entry, ctrl bool, config ConnConfig, conn *Conn, relayManager *relayClient.Manager, stateDump *stateDump) *WorkerRelay { @@ -42,7 +40,6 @@ func NewWorkerRelay(ctx context.Context, log *log.Entry, ctrl bool, config ConnC config: config, conn: conn, relayManager: relayManager, - wgWatcher: NewWGWatcher(log, 
config.WgConfig.WgInterface, config.Key, stateDump), } return r } @@ -93,14 +90,6 @@ func (w *WorkerRelay) OnNewOffer(remoteOfferAnswer *OfferAnswer) { }) } -func (w *WorkerRelay) EnableWgWatcher(ctx context.Context) { - w.wgWatcher.EnableWgWatcher(ctx, w.onWGDisconnected) -} - -func (w *WorkerRelay) DisableWgWatcher() { - w.wgWatcher.DisableWgWatcher() -} - func (w *WorkerRelay) RelayInstanceAddress() (string, error) { return w.relayManager.RelayInstanceAddress() } @@ -125,14 +114,6 @@ func (w *WorkerRelay) CloseConn() { } } -func (w *WorkerRelay) onWGDisconnected() { - w.relayLock.Lock() - _ = w.relayedConn.Close() - w.relayLock.Unlock() - - w.conn.onRelayDisconnected() -} - func (w *WorkerRelay) isRelaySupported(answer *OfferAnswer) bool { if !w.relayManager.HasRelayAddress() { return false @@ -148,6 +129,5 @@ func (w *WorkerRelay) preferredRelayServer(myRelayAddress, remoteRelayAddress st } func (w *WorkerRelay) onRelayClientDisconnected() { - w.wgWatcher.DisableWgWatcher() go w.conn.onRelayDisconnected() } diff --git a/client/server/debug.go b/client/server/debug.go index 056d9df21c4..a6e5926e9ee 100644 --- a/client/server/debug.go +++ b/client/server/debug.go @@ -32,12 +32,20 @@ func (s *Server) DebugBundle(_ context.Context, req *proto.DebugBundleRequest) ( log.Warnf("failed to get latest sync response: %v", err) } + var clientMetrics debug.MetricsExporter + if s.connectClient != nil { + if engine := s.connectClient.Engine(); engine != nil { + clientMetrics = engine.GetClientMetrics() + } + } + bundleGenerator := debug.NewBundleGenerator( debug.GeneratorDependencies{ InternalConfig: s.config, StatusRecorder: s.statusRecorder, SyncResponse: syncResponse, LogFile: s.logFile, + ClientMetrics: clientMetrics, }, debug.BundleConfig{ Anonymize: req.GetAnonymize(), diff --git a/client/server/server.go b/client/server/server.go index 66dca5cd28b..7b6c4e98c11 100644 --- a/client/server/server.go +++ b/client/server/server.go @@ -24,7 +24,6 @@ import ( 
"google.golang.org/protobuf/types/known/timestamppb" "github.com/netbirdio/netbird/client/internal/auth" - "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/profilemanager" "github.com/netbirdio/netbird/client/system" mgm "github.com/netbirdio/netbird/shared/management/client" @@ -77,7 +76,6 @@ type Server struct { statusRecorder *peer.Status sessionWatcher *internal.SessionWatcher - clientMetrics *metrics.ClientMetrics lastProbe time.Time persistSyncResponse bool @@ -111,7 +109,6 @@ func New(ctx context.Context, logFile string, configFile string, profilesDisable profilesDisabled: profilesDisabled, updateSettingsDisabled: updateSettingsDisabled, jwtCache: newJWTCache(), - clientMetrics: metrics.NewClientMetrics(), } } @@ -1527,7 +1524,7 @@ func (s *Server) GetFeatures(ctx context.Context, msg *proto.GetFeaturesRequest) func (s *Server) connect(ctx context.Context, config *profilemanager.Config, statusRecorder *peer.Status, doInitialAutoUpdate bool, runningChan chan struct{}) error { log.Tracef("running client connection") - s.connectClient = internal.NewConnectClient(ctx, config, statusRecorder, doInitialAutoUpdate, s.clientMetrics) + s.connectClient = internal.NewConnectClient(ctx, config, statusRecorder, doInitialAutoUpdate) s.connectClient.SetSyncResponsePersistence(s.persistSyncResponse) if err := s.connectClient.Run(runningChan); err != nil { return err diff --git a/shared/management/client/client.go b/shared/management/client/client.go index 3126bcd1fc5..8a89010eb4f 100644 --- a/shared/management/client/client.go +++ b/shared/management/client/client.go @@ -20,6 +20,7 @@ type Client interface { GetDeviceAuthorizationFlow(serverKey wgtypes.Key) (*proto.DeviceAuthorizationFlow, error) GetPKCEAuthorizationFlow(serverKey wgtypes.Key) (*proto.PKCEAuthorizationFlow, error) GetNetworkMap(sysInfo *system.Info) (*proto.NetworkMap, error) + GetServerURL() string IsHealthy() bool SyncMeta(sysInfo *system.Info) error 
Logout() error diff --git a/shared/management/client/grpc.go b/shared/management/client/grpc.go index 89860ac9bbb..a030f137149 100644 --- a/shared/management/client/grpc.go +++ b/shared/management/client/grpc.go @@ -45,6 +45,7 @@ type GrpcClient struct { conn *grpc.ClientConn connStateCallback ConnStateNotifier connStateCallbackLock sync.RWMutex + serverURL string } // NewClient creates a new client to Management service @@ -74,9 +75,15 @@ func NewClient(ctx context.Context, addr string, ourPrivateKey wgtypes.Key, tlsE ctx: ctx, conn: conn, connStateCallbackLock: sync.RWMutex{}, + serverURL: addr, }, nil } +// GetServerURL returns the management server URL +func (c *GrpcClient) GetServerURL() string { + return c.serverURL +} + // Close closes connection to the Management Service func (c *GrpcClient) Close() error { return c.conn.Close() From 5169129029f891e2f5c97c1006fbb6e630e6dc0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 28 Jan 2026 15:51:51 +0100 Subject: [PATCH 03/52] Add signaling metrics tracking for initial and reconnection attempts --- client/internal/peer/conn.go | 2 +- client/internal/peer/handshaker.go | 28 ++++++++++++++++++++------- client/internal/peer/metrics_saver.go | 26 +++++++++++++++++++++++-- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index c23d356e04c..c0a2c286284 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -213,7 +213,7 @@ func (conn *Conn) Open(engineCtx context.Context) error { } conn.workerICE = workerICE - conn.handshaker = NewHandshaker(conn.Log, conn.config, conn.signaler, conn.workerICE, conn.workerRelay) + conn.handshaker = NewHandshaker(conn.Log, conn.config, conn.signaler, conn.workerICE, conn.workerRelay, &conn.metricsStages) conn.handshaker.AddRelayListener(conn.workerRelay.OnNewOffer) if !isForceRelayed() { diff --git a/client/internal/peer/handshaker.go 
b/client/internal/peer/handshaker.go index aff26f84722..9b50cecd146 100644 --- a/client/internal/peer/handshaker.go +++ b/client/internal/peer/handshaker.go @@ -44,12 +44,13 @@ type OfferAnswer struct { } type Handshaker struct { - mu sync.Mutex - log *log.Entry - config ConnConfig - signaler *Signaler - ice *WorkerICE - relay *WorkerRelay + mu sync.Mutex + log *log.Entry + config ConnConfig + signaler *Signaler + ice *WorkerICE + relay *WorkerRelay + metricsStages *MetricsStages // relayListener is not blocking because the listener is using a goroutine to process the messages // and it will only keep the latest message if multiple offers are received in a short time // this is to avoid blocking the handshaker if the listener is doing some heavy processing @@ -64,13 +65,14 @@ type Handshaker struct { remoteAnswerCh chan OfferAnswer } -func NewHandshaker(log *log.Entry, config ConnConfig, signaler *Signaler, ice *WorkerICE, relay *WorkerRelay) *Handshaker { +func NewHandshaker(log *log.Entry, config ConnConfig, signaler *Signaler, ice *WorkerICE, relay *WorkerRelay, metricsStages *MetricsStages) *Handshaker { return &Handshaker{ log: log, config: config, signaler: signaler, ice: ice, relay: relay, + metricsStages: metricsStages, remoteOffersCh: make(chan OfferAnswer), remoteAnswerCh: make(chan OfferAnswer), } @@ -89,6 +91,12 @@ func (h *Handshaker) Listen(ctx context.Context) { select { case remoteOfferAnswer := <-h.remoteOffersCh: h.log.Infof("received offer, running version %s, remote WireGuard listen port %d, session id: %s", remoteOfferAnswer.Version, remoteOfferAnswer.WgListenPort, remoteOfferAnswer.SessionIDString()) + + // Record signaling received for reconnection attempts + if h.metricsStages != nil { + h.metricsStages.RecordSignalingReceived() + } + if h.relayListener != nil { h.relayListener.Notify(&remoteOfferAnswer) } @@ -103,6 +111,12 @@ func (h *Handshaker) Listen(ctx context.Context) { } case remoteOfferAnswer := <-h.remoteAnswerCh: 
h.log.Infof("received answer, running version %s, remote WireGuard listen port %d, session id: %s", remoteOfferAnswer.Version, remoteOfferAnswer.WgListenPort, remoteOfferAnswer.SessionIDString()) + + // Record signaling received for reconnection attempts + if h.metricsStages != nil { + h.metricsStages.RecordSignalingReceived() + } + if h.relayListener != nil { h.relayListener.Notify(&remoteOfferAnswer) } diff --git a/client/internal/peer/metrics_saver.go b/client/internal/peer/metrics_saver.go index 6cf00d4ff92..2e78ec58de3 100644 --- a/client/internal/peer/metrics_saver.go +++ b/client/internal/peer/metrics_saver.go @@ -29,13 +29,35 @@ func (s *MetricsStages) RecordSemaphoreAcquired() { s.stageTimestamps.SemaphoreAcquired = time.Now() } +// RecordSignaling records the signaling timestamp when sending offers +// For initial connections: records when we start sending +// For reconnections: does nothing (we wait for RecordSignalingReceived) func (s *MetricsStages) RecordSignaling() { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- RecordSignaling") + log.Infof("--- RecordSignaling (send)") + + if s.isReconnectionAttempt { + return + } if s.stageTimestamps.Signaling.IsZero() { - log.Infof("--- Recorded Signaling") + log.Infof("--- Recorded Signaling (initial connection, sending)") + s.stageTimestamps.Signaling = time.Now() + } +} + +// RecordSignalingReceived records the signaling timestamp when receiving offers/answers +// For reconnections: records when we receive the first signal +// For initial connections: does nothing (already recorded in RecordSignaling) +func (s *MetricsStages) RecordSignalingReceived() { + s.mu.Lock() + defer s.mu.Unlock() + log.Infof("--- RecordSignalingReceived (receive)") + + // Only record for reconnections when we receive a signal + if s.isReconnectionAttempt && s.stageTimestamps.Signaling.IsZero() { + log.Infof("--- Recorded Signaling (reconnection, receiving)") s.stageTimestamps.Signaling = time.Now() } } From 
cbfde79dd8f4dc324c37e031bb354e153cb05910 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 28 Jan 2026 15:59:24 +0100 Subject: [PATCH 04/52] Reset connection stage timestamps during reconnections to exclude unnecessary metrics tracking --- client/internal/metrics/victoria.go | 2 +- client/internal/peer/conn.go | 44 +++++++++++++-------------- client/internal/peer/metrics_saver.go | 40 +++++++++++------------- client/internal/peer/wg_watcher.go | 3 +- 4 files changed, 41 insertions(+), 48 deletions(-) diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 23a967d4504..63fadc76a1f 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -84,7 +84,7 @@ func (m *victoriaMetrics) RecordConnectionStages( m.getMetricName("netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), ).Update(totalDuration) - log.Infof("--- Peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, semaphore→signaling: %.3fs, signaling→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", + log.Tracef("peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, semaphore→signaling: %.3fs, signaling→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", m.deploymentType.String(), connTypeStr, attemptType, creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration) diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index c0a2c286284..9a0966a5742 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -303,7 +303,6 @@ func (conn *Conn) Close(signalToRemote bool) { } conn.setStatusToDisconnected() - conn.opened = false conn.wg.Wait() conn.Log.Infof("peer connection closed") @@ -343,7 +342,6 @@ func (conn *Conn) SetRosenpassInitializedPresharedKeyValidator(handler func(peer func (conn *Conn) OnRemoteOffer(offer OfferAnswer) { conn.dumpState.RemoteOffer() 
conn.Log.Infof("OnRemoteOffer, on status ICE: %s, status Relay: %s", conn.statusICE, conn.statusRelay) - conn.handshaker.OnRemoteOffer(offer) } @@ -387,7 +385,7 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn if conn.currentConnPriority > priority { conn.Log.Infof("current connection priority (%s) is higher than the new one (%s), do not upgrade connection", conn.currentConnPriority, priority) conn.statusICE.SetConnected() - conn.updateIceState(iceConnInfo) + conn.updateIceState(iceConnInfo, time.Now()) return } @@ -428,11 +426,14 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn conn.Log.Infof("configure WireGuard endpoint to: %s", ep.String()) presharedKey := conn.presharedKey(iceConnInfo.RosenpassPubKey) - conn.enableWgWatcherIfNeeded() + updateTime := time.Now() if err = conn.endpointUpdater.ConfigureWGEndpoint(ep, presharedKey); err != nil { conn.handleConfigurationFailure(err, wgProxy) return } + + conn.enableWgWatcherIfNeeded(updateTime) + wgConfigWorkaround() if conn.wgProxyRelay != nil { @@ -442,8 +443,8 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn conn.currentConnPriority = priority conn.statusICE.SetConnected() - conn.updateIceState(iceConnInfo) - conn.doOnConnected(iceConnInfo.RosenpassPubKey, iceConnInfo.RosenpassAddr) + conn.updateIceState(iceConnInfo, updateTime) + conn.doOnConnected(iceConnInfo.RosenpassPubKey, iceConnInfo.RosenpassAddr, updateTime) } func (conn *Conn) onICEStateDisconnected() { @@ -535,13 +536,13 @@ func (conn *Conn) onRelayConnectionIsReady(rci RelayConnInfo) { conn.Log.Debugf("do not switch to relay because current priority is: %s", conn.currentConnPriority.String()) conn.setRelayedProxy(wgProxy) conn.statusRelay.SetConnected() - conn.updateRelayStatus(rci.relayedConn.RemoteAddr().String(), rci.rosenpassPubKey) + conn.updateRelayStatus(rci.relayedConn.RemoteAddr().String(), rci.rosenpassPubKey, time.Now()) return } 
wgProxy.Work() presharedKey := conn.presharedKey(rci.rosenpassPubKey) - conn.enableWgWatcherIfNeeded() + updateTime := time.Now() if err := conn.endpointUpdater.ConfigureWGEndpoint(wgProxy.EndpointAddr(), presharedKey); err != nil { if err := wgProxy.CloseConn(); err != nil { conn.Log.Warnf("Failed to close relay connection: %v", err) @@ -550,14 +551,17 @@ func (conn *Conn) onRelayConnectionIsReady(rci RelayConnInfo) { return } + conn.enableWgWatcherIfNeeded(updateTime) + wgConfigWorkaround() + conn.rosenpassRemoteKey = rci.rosenpassPubKey conn.currentConnPriority = conntype.Relay conn.statusRelay.SetConnected() conn.setRelayedProxy(wgProxy) - conn.updateRelayStatus(rci.relayedConn.RemoteAddr().String(), rci.rosenpassPubKey) + conn.updateRelayStatus(rci.relayedConn.RemoteAddr().String(), rci.rosenpassPubKey, updateTime) conn.Log.Infof("start to communicate with peer via relay") - conn.doOnConnected(rci.rosenpassPubKey, rci.rosenpassAddr) + conn.doOnConnected(rci.rosenpassPubKey, rci.rosenpassAddr, updateTime) } func (conn *Conn) onRelayDisconnected() { @@ -640,10 +644,10 @@ func (conn *Conn) onWGDisconnected() { } } -func (conn *Conn) updateRelayStatus(relayServerAddr string, rosenpassPubKey []byte) { +func (conn *Conn) updateRelayStatus(relayServerAddr string, rosenpassPubKey []byte, updateTime time.Time) { peerState := State{ PubKey: conn.config.Key, - ConnStatusUpdate: time.Now(), + ConnStatusUpdate: updateTime, ConnStatus: conn.evalStatus(), Relayed: conn.isRelayed(), RelayServerAddress: relayServerAddr, @@ -656,10 +660,10 @@ func (conn *Conn) updateRelayStatus(relayServerAddr string, rosenpassPubKey []by } } -func (conn *Conn) updateIceState(iceConnInfo ICEConnInfo) { +func (conn *Conn) updateIceState(iceConnInfo ICEConnInfo, updateTime time.Time) { peerState := State{ PubKey: conn.config.Key, - ConnStatusUpdate: time.Now(), + ConnStatusUpdate: updateTime, ConnStatus: conn.evalStatus(), Relayed: iceConnInfo.Relayed, LocalIceCandidateType: 
iceConnInfo.LocalIceCandidateType, @@ -697,12 +701,12 @@ func (conn *Conn) setStatusToDisconnected() { } } -func (conn *Conn) doOnConnected(remoteRosenpassPubKey []byte, remoteRosenpassAddr string) { +func (conn *Conn) doOnConnected(remoteRosenpassPubKey []byte, remoteRosenpassAddr string, updateTime time.Time) { if runtime.GOOS == "ios" { runtime.GC() } - conn.metricsStages.RecordConnectionReady() + conn.metricsStages.RecordConnectionReady(updateTime) if conn.onConnected != nil { conn.onConnected(conn.config.Key, remoteRosenpassPubKey, conn.config.WgConfig.AllowedIps[0].Addr().String(), remoteRosenpassAddr) @@ -768,15 +772,14 @@ func (conn *Conn) isConnectedOnAllWay() (connected bool) { return true } -func (conn *Conn) enableWgWatcherIfNeeded() { +func (conn *Conn) enableWgWatcherIfNeeded(enabledTime time.Time) { if !conn.wgWatcher.IsEnabled() { wgWatcherCtx, wgWatcherCancel := context.WithCancel(conn.ctx) conn.wgWatcherCancel = wgWatcherCancel conn.wgWatcherWg.Add(1) - now := time.Now() go func() { defer conn.wgWatcherWg.Done() - conn.wgWatcher.EnableWgWatcher(wgWatcherCtx, now, conn.onWGDisconnected, conn.onWGHandshakeSuccess) + conn.wgWatcher.EnableWgWatcher(wgWatcherCtx, enabledTime, conn.onWGDisconnected, conn.onWGHandshakeSuccess) }() } } @@ -848,9 +851,7 @@ func (conn *Conn) onWGHandshakeSuccess(when time.Time) { // recordConnectionMetrics records connection stage timestamps as metrics func (conn *Conn) recordConnectionMetrics() { - log.Infof("--- record Metrics") if conn.metricsRecorder == nil { - log.Infof("--- is nil") return } @@ -863,7 +864,6 @@ func (conn *Conn) recordConnectionMetrics() { connType = metrics.ConnectionTypeICE } - log.Infof("-- record: connType: %v, %v, %v", connType, conn.metricsStages.IsReconnection(), conn.metricsStages.GetTimestamps()) // Record metrics with timestamps - duration calculation happens in metrics package conn.metricsRecorder.RecordConnectionStages( context.Background(), diff --git 
a/client/internal/peer/metrics_saver.go b/client/internal/peer/metrics_saver.go index 2e78ec58de3..39e71a6458a 100644 --- a/client/internal/peer/metrics_saver.go +++ b/client/internal/peer/metrics_saver.go @@ -4,8 +4,6 @@ import ( "sync" "time" - log "github.com/sirupsen/logrus" - "github.com/netbirdio/netbird/client/internal/metrics" ) @@ -18,14 +16,12 @@ type MetricsStages struct { func (s *MetricsStages) RecordCreated() { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- RecordCreated") s.stageTimestamps.Created = time.Now() } func (s *MetricsStages) RecordSemaphoreAcquired() { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- RecordSemaphoreAcquired") s.stageTimestamps.SemaphoreAcquired = time.Now() } @@ -35,14 +31,12 @@ func (s *MetricsStages) RecordSemaphoreAcquired() { func (s *MetricsStages) RecordSignaling() { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- RecordSignaling (send)") if s.isReconnectionAttempt { return } if s.stageTimestamps.Signaling.IsZero() { - log.Infof("--- Recorded Signaling (initial connection, sending)") s.stageTimestamps.Signaling = time.Now() } } @@ -53,47 +47,47 @@ func (s *MetricsStages) RecordSignaling() { func (s *MetricsStages) RecordSignalingReceived() { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- RecordSignalingReceived (receive)") // Only record for reconnections when we receive a signal if s.isReconnectionAttempt && s.stageTimestamps.Signaling.IsZero() { - log.Infof("--- Recorded Signaling (reconnection, receiving)") s.stageTimestamps.Signaling = time.Now() } } -func (s *MetricsStages) RecordConnectionReady() { +func (s *MetricsStages) RecordConnectionReady(when time.Time) { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- RecordConnectionReady") if s.stageTimestamps.ConnectionReady.IsZero() { - log.Infof("--- Recorded ConnectionReady") - s.stageTimestamps.ConnectionReady = time.Now() + s.stageTimestamps.ConnectionReady = when } - } -func (s *MetricsStages) RecordWGHandshakeSuccess(elapsed time.Time) { +func (s 
*MetricsStages) RecordWGHandshakeSuccess(handshakeTime time.Time) { s.mu.Lock() defer s.mu.Unlock() - log.Infof("--- record: %v, %v", s.stageTimestamps.ConnectionReady, elapsed) if !s.stageTimestamps.ConnectionReady.IsZero() { - // todo, check if it is earlier then ConnectionReady - s.stageTimestamps.WgHandshakeSuccess = elapsed + // WireGuard only reports handshake times with second precision, but ConnectionReady + // is captured with microsecond precision. If handshake appears before ConnectionReady + // due to truncation (e.g., handshake at 6.042s truncated to 6.000s), normalize to + // ConnectionReady to avoid negative duration metrics. + if handshakeTime.Before(s.stageTimestamps.ConnectionReady) { + s.stageTimestamps.WgHandshakeSuccess = s.stageTimestamps.ConnectionReady + } else { + s.stageTimestamps.WgHandshakeSuccess = handshakeTime + } } } +// Disconnected sets the mode to reconnection. It is called only when both ICE and Relay have been disconnected at the same time. func (s *MetricsStages) Disconnected() { - log.Infof("--- Disconnected") s.mu.Lock() defer s.mu.Unlock() - now := time.Now() - s.stageTimestamps = metrics.ConnectionStageTimestamps{ - Created: now, - SemaphoreAcquired: now, - } + // Reset all timestamps for reconnection + // For reconnections, we only track from Signaling onwards + // This avoids meaningless creation→semaphore and semaphore→signaling metrics + s.stageTimestamps = metrics.ConnectionStageTimestamps{} s.isReconnectionAttempt = true } diff --git a/client/internal/peer/wg_watcher.go b/client/internal/peer/wg_watcher.go index 94df5c8d265..e83875208bb 100644 --- a/client/internal/peer/wg_watcher.go +++ b/client/internal/peer/wg_watcher.go @@ -94,8 +94,7 @@ func (w *WGWatcher) periodicHandshakeCheck(ctx context.Context, onDisconnectedFn } if lastHandshake.IsZero() { elapsed := calcElapsed(enabledTime, *handshake) - w.log.Infof("--- first wg handshake detected within: %.2fsec, (%s - %s)", elapsed, enabledTime, handshake) - 
onHandshakeSuccessFn(*handshake) + w.log.Infof("first wg handshake detected within: %.2fsec, (%s)", elapsed, handshake) } lastHandshake = *handshake From 08295e511619aeec7f908c1e059d2f6c22cc9969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 28 Jan 2026 17:35:10 +0100 Subject: [PATCH 05/52] Delete otel lib from client --- client/internal/metrics/otel.go | 250 -------------------------------- go.mod | 2 +- 2 files changed, 1 insertion(+), 251 deletions(-) delete mode 100644 client/internal/metrics/otel.go diff --git a/client/internal/metrics/otel.go b/client/internal/metrics/otel.go deleted file mode 100644 index 43bb1223268..00000000000 --- a/client/internal/metrics/otel.go +++ /dev/null @@ -1,250 +0,0 @@ -package metrics - -import ( - "context" - "fmt" - "io" - - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" - sdkmetric "go.opentelemetry.io/otel/sdk/metric" - "go.opentelemetry.io/otel/sdk/metric/metricdata" -) - -// otelMetrics is the OpenTelemetry implementation of ClientMetrics -type otelMetrics struct { - reader *sdkmetric.ManualReader - meterProvider *sdkmetric.MeterProvider - meter metric.Meter - - // Static attributes applied to all metrics - deploymentType DeploymentType - - // Connection stage duration histograms - stageCreationToSemaphore metric.Float64Histogram - stageSemaphoreToSignaling metric.Float64Histogram - stageSignalingToConnection metric.Float64Histogram - stageConnectionToHandshake metric.Float64Histogram - stageTotalCreationToHandshake metric.Float64Histogram -} - -func newOtelMetrics(deploymentType DeploymentType) metricsImplementation { - reader := sdkmetric.NewManualReader() - meterProvider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - - otel.SetMeterProvider(meterProvider) - - meter := meterProvider.Meter("netbird.client") - - stageCreationToSemaphore, err := meter.Float64Histogram( - "netbird.peer.connection.stage.creation_to_semaphore", - 
metric.WithDescription("Duration from connection creation to semaphore acquisition"), - metric.WithUnit("s"), - ) - if err != nil { - return &noopMetrics{} - } - - stageSemaphoreToSignaling, err := meter.Float64Histogram( - "netbird.peer.connection.stage.semaphore_to_signaling", - metric.WithDescription("Duration from semaphore acquisition to signaling start"), - metric.WithUnit("s"), - ) - if err != nil { - return &noopMetrics{} - } - - stageSignalingToConnection, err := meter.Float64Histogram( - "netbird.peer.connection.stage.signaling_to_connection", - metric.WithDescription("Duration from signaling start to connection ready"), - metric.WithUnit("s"), - ) - if err != nil { - return &noopMetrics{} - } - - stageConnectionToHandshake, err := meter.Float64Histogram( - "netbird.peer.connection.stage.connection_to_handshake", - metric.WithDescription("Duration from connection ready to WireGuard handshake success"), - metric.WithUnit("s"), - ) - if err != nil { - return &noopMetrics{} - } - - stageTotalCreationToHandshake, err := meter.Float64Histogram( - "netbird.peer.connection.total.creation_to_handshake", - metric.WithDescription("Total duration from connection creation to WireGuard handshake success"), - metric.WithUnit("s"), - ) - if err != nil { - return &noopMetrics{} - } - - return &otelMetrics{ - reader: reader, - meterProvider: meterProvider, - meter: meter, - deploymentType: deploymentType, - stageCreationToSemaphore: stageCreationToSemaphore, - stageSemaphoreToSignaling: stageSemaphoreToSignaling, - stageSignalingToConnection: stageSignalingToConnection, - stageConnectionToHandshake: stageConnectionToHandshake, - stageTotalCreationToHandshake: stageTotalCreationToHandshake, - } -} - -// RecordConnectionStages records the duration of each connection stage from timestamps -func (m *otelMetrics) RecordConnectionStages( - ctx context.Context, - connectionType ConnectionType, - isReconnection bool, - timestamps ConnectionStageTimestamps, -) { - // Calculate 
stage durations - var creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration float64 - - if !timestamps.Created.IsZero() && !timestamps.SemaphoreAcquired.IsZero() { - creationToSemaphore = timestamps.SemaphoreAcquired.Sub(timestamps.Created).Seconds() - } - - if !timestamps.SemaphoreAcquired.IsZero() && !timestamps.Signaling.IsZero() { - semaphoreToSignaling = timestamps.Signaling.Sub(timestamps.SemaphoreAcquired).Seconds() - } - - if !timestamps.Signaling.IsZero() && !timestamps.ConnectionReady.IsZero() { - signalingToConnection = timestamps.ConnectionReady.Sub(timestamps.Signaling).Seconds() - } - - if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - connectionToHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() - } - - if !timestamps.Created.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Created).Seconds() - } - - // Determine attempt type - attemptType := "initial" - if isReconnection { - attemptType = "reconnection" - } - - // Combine deployment type, connection type, and attempt type attributes - attrs := metric.WithAttributes( - attribute.String("deployment_type", m.deploymentType.String()), - attribute.String("connection_type", connectionType.String()), - attribute.String("attempt_type", attemptType), - ) - - m.stageCreationToSemaphore.Record(ctx, creationToSemaphore, attrs) - m.stageSemaphoreToSignaling.Record(ctx, semaphoreToSignaling, attrs) - m.stageSignalingToConnection.Record(ctx, signalingToConnection, attrs) - m.stageConnectionToHandshake.Record(ctx, connectionToHandshake, attrs) - m.stageTotalCreationToHandshake.Record(ctx, totalDuration, attrs) -} - -// Export writes metrics in Prometheus text format -func (m *otelMetrics) Export(w io.Writer) error { - if m.reader == nil { - return fmt.Errorf("metrics reader not initialized") - } - - // Collect current metrics - 
var rm metricdata.ResourceMetrics - if err := m.reader.Collect(context.Background(), &rm); err != nil { - return fmt.Errorf("failed to collect metrics: %w", err) - } - - // Iterate through scope metrics and write in Prometheus format - for _, sm := range rm.ScopeMetrics { - for _, m := range sm.Metrics { - // Write HELP line - if _, err := fmt.Fprintf(w, "# HELP %s %s\n", m.Name, m.Description); err != nil { - return err - } - - // Write TYPE line - if _, err := fmt.Fprintf(w, "# TYPE %s histogram\n", m.Name); err != nil { - return err - } - - // Handle histogram data - if hist, ok := m.Data.(metricdata.Histogram[float64]); ok { - for _, dp := range hist.DataPoints { - // Build label string from attributes - labelStr := "" - if len(dp.Attributes.ToSlice()) > 0 { - labels := "" - for _, attr := range dp.Attributes.ToSlice() { - if labels != "" { - labels += "," - } - labels += fmt.Sprintf("%s=\"%s\"", attr.Key, attr.Value.AsString()) - } - labelStr = labels - } - - // Write bucket counts - cumulativeCount := uint64(0) - for i, bound := range dp.Bounds { - cumulativeCount += dp.BucketCounts[i] - bucketLabel := labelStr - if bucketLabel != "" { - bucketLabel += "," - } - bucketLabel += fmt.Sprintf("le=\"%g\"", bound) - if _, err := fmt.Fprintf(w, "%s_bucket{%s} %d\n", - m.Name, bucketLabel, cumulativeCount); err != nil { - return err - } - } - - // Write +Inf bucket (last bucket count) - if len(dp.BucketCounts) > len(dp.Bounds) { - cumulativeCount += dp.BucketCounts[len(dp.BucketCounts)-1] - } - bucketLabel := labelStr - if bucketLabel != "" { - bucketLabel += "," - } - bucketLabel += "le=\"+Inf\"" - if _, err := fmt.Fprintf(w, "%s_bucket{%s} %d\n", - m.Name, bucketLabel, cumulativeCount); err != nil { - return err - } - - // Write sum - if labelStr != "" { - if _, err := fmt.Fprintf(w, "%s_sum{%s} %g\n", m.Name, labelStr, dp.Sum); err != nil { - return err - } - } else { - if _, err := fmt.Fprintf(w, "%s_sum %g\n", m.Name, dp.Sum); err != nil { - return err - } - } - 
- // Write count - if labelStr != "" { - if _, err := fmt.Fprintf(w, "%s_count{%s} %d\n", m.Name, labelStr, dp.Count); err != nil { - return err - } - } else { - if _, err := fmt.Fprintf(w, "%s_count %d\n", m.Name, dp.Count); err != nil { - return err - } - } - } - } - - // Empty line between metrics - if _, err := fmt.Fprintf(w, "\n"); err != nil { - return err - } - } - } - - return nil -} diff --git a/go.mod b/go.mod index dc922b2f849..0b8d40470b2 100644 --- a/go.mod +++ b/go.mod @@ -33,6 +33,7 @@ require ( fyne.io/fyne/v2 v2.7.0 fyne.io/systray v1.12.1-0.20260116214250-81f8e1a496f9 github.com/TheJumpCloud/jcapi-go v3.0.0+incompatible + github.com/VictoriaMetrics/metrics v1.40.2 github.com/awnumar/memguard v0.23.0 github.com/aws/aws-sdk-go-v2 v1.36.3 github.com/aws/aws-sdk-go-v2/config v1.29.14 @@ -141,7 +142,6 @@ require ( github.com/Masterminds/sprig/v3 v3.3.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/Microsoft/hcsshim v0.12.3 // indirect - github.com/VictoriaMetrics/metrics v1.40.2 // indirect github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/awnumar/memcall v0.4.0 // indirect From e7283a81985e27b2e7f4045b0b1981270d47b732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 28 Jan 2026 17:39:25 +0100 Subject: [PATCH 06/52] Update unit tests --- client/internal/peer/wg_watcher_test.go | 12 ++++++------ shared/management/client/mock.go | 9 +++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/client/internal/peer/wg_watcher_test.go b/client/internal/peer/wg_watcher_test.go index e73d3566839..3ce91cd466e 100644 --- a/client/internal/peer/wg_watcher_test.go +++ b/client/internal/peer/wg_watcher_test.go @@ -35,11 +35,11 @@ func TestWGWatcher_EnableWgWatcher(t *testing.T) { defer cancel() onDisconnected := make(chan struct{}, 1) - go watcher.EnableWgWatcher(ctx,, func() { + go watcher.EnableWgWatcher(ctx, 
time.Now(), func() { mlog.Infof("onDisconnectedFn") onDisconnected <- struct{}{} - }, func(elapsed float64) { - mlog.Infof("onHandshakeSuccess: %.3fs", elapsed) + }, func(when time.Time) { + mlog.Infof("onHandshakeSuccess: %v", when) }) // wait for initial reading @@ -66,7 +66,7 @@ func TestWGWatcher_ReEnable(t *testing.T) { wg.Add(1) go func() { defer wg.Done() - watcher.EnableWgWatcher(ctx, func() {}, func(elapsed float64) {}) + watcher.EnableWgWatcher(ctx, time.Now(), func() {}, func(when time.Time) {}) }() cancel() @@ -77,9 +77,9 @@ func TestWGWatcher_ReEnable(t *testing.T) { defer cancel() onDisconnected := make(chan struct{}, 1) - go watcher.EnableWgWatcher(ctx,, func() { + go watcher.EnableWgWatcher(ctx, time.Now(), func() { onDisconnected <- struct{}{} - }, func(elapsed float64) {}) + }, func(when time.Time) {}) time.Sleep(2 * time.Second) mocWgIface.disconnect() diff --git a/shared/management/client/mock.go b/shared/management/client/mock.go index ac96f7b36c6..a85f5e3bee6 100644 --- a/shared/management/client/mock.go +++ b/shared/management/client/mock.go @@ -18,6 +18,7 @@ type MockClient struct { LoginFunc func(serverKey wgtypes.Key, info *system.Info, sshKey []byte, dnsLabels domain.List) (*proto.LoginResponse, error) GetDeviceAuthorizationFlowFunc func(serverKey wgtypes.Key) (*proto.DeviceAuthorizationFlow, error) GetPKCEAuthorizationFlowFunc func(serverKey wgtypes.Key) (*proto.PKCEAuthorizationFlow, error) + GetServerURLFunc func() string SyncMetaFunc func(sysInfo *system.Info) error LogoutFunc func() error JobFunc func(ctx context.Context, msgHandler func(msg *proto.JobRequest) *proto.JobResponse) error @@ -88,6 +89,14 @@ func (m *MockClient) GetNetworkMap(_ *system.Info) (*proto.NetworkMap, error) { return nil, nil } +// GetServerURL mock implementation of GetServerURL from mgm.Client interface +func (m *MockClient) GetServerURL() string { + if m.GetServerURLFunc == nil { + return "" + } + return m.GetServerURLFunc() +} + func (m *MockClient) 
SyncMeta(sysInfo *system.Info) error { if m.SyncMetaFunc == nil { return nil From 138e728427c100fbfba71d1a4bb840b2142ec330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 29 Jan 2026 11:43:31 +0100 Subject: [PATCH 07/52] Invoke callback on handshake success in WireGuard watcher --- client/internal/peer/wg_watcher.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/client/internal/peer/wg_watcher.go b/client/internal/peer/wg_watcher.go index e83875208bb..f966107c0c2 100644 --- a/client/internal/peer/wg_watcher.go +++ b/client/internal/peer/wg_watcher.go @@ -95,6 +95,9 @@ func (w *WGWatcher) periodicHandshakeCheck(ctx context.Context, onDisconnectedFn if lastHandshake.IsZero() { elapsed := calcElapsed(enabledTime, *handshake) w.log.Infof("first wg handshake detected within: %.2fsec, (%s)", elapsed, handshake) + if onHandshakeSuccessFn != nil { + onHandshakeSuccessFn(*handshake) + } } lastHandshake = *handshake From ca3e6d93d366e0cf475365f2783f67c532435da1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 11 Feb 2026 15:02:37 +0100 Subject: [PATCH 08/52] Add Netbird version tracking to client metrics Integrate Netbird version into VictoriaMetrics backend and metrics labels. Update `ClientMetrics` constructor and metric name formatting to include version information. --- client/internal/engine.go | 3 ++- client/internal/metrics/metrics.go | 4 ++-- client/internal/metrics/victoria.go | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/client/internal/engine.go b/client/internal/engine.go index 3c1518aa9c3..0fae4e61666 100644 --- a/client/internal/engine.go +++ b/client/internal/engine.go @@ -66,6 +66,7 @@ import ( signal "github.com/netbirdio/netbird/shared/signal/client" sProto "github.com/netbirdio/netbird/shared/signal/proto" "github.com/netbirdio/netbird/util" + "github.com/netbirdio/netbird/version" ) // PeerConnectionTimeoutMax is a timeout of an initial connection attempt to a remote peer. 
@@ -278,7 +279,7 @@ func NewEngine( connSemaphore: semaphoregroup.NewSemaphoreGroup(connInitLimit), probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL), jobExecutor: jobexec.NewExecutor(), - clientMetrics: metrics.NewClientMetrics(deploymentType, true), + clientMetrics: metrics.NewClientMetrics(deploymentType, version.NetbirdVersion(), true), } log.Infof("I am: %s", config.WgPrivateKey.PublicKey().String()) diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 98df1a6a389..7699f5b02c6 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -36,12 +36,12 @@ type ConnectionStageTimestamps struct { // NewClientMetrics creates a new ClientMetrics instance // If enabled is true, uses an OpenTelemetry implementation // If enabled is false, uses a no-op implementation -func NewClientMetrics(deploymentType DeploymentType, enabled bool) *ClientMetrics { +func NewClientMetrics(deploymentType DeploymentType, version string, enabled bool) *ClientMetrics { var impl metricsImplementation if !enabled { impl = &noopMetrics{} } else { - impl = newVictoriaMetrics(deploymentType) + impl = newVictoriaMetrics(deploymentType, version) } return &ClientMetrics{impl: impl} } diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 63fadc76a1f..10ee713da7a 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -13,14 +13,16 @@ import ( type victoriaMetrics struct { // Static attributes applied to all metrics deploymentType DeploymentType + version string // Metrics set for managing all metrics set *metrics.Set } -func newVictoriaMetrics(deploymentType DeploymentType) metricsImplementation { +func newVictoriaMetrics(deploymentType DeploymentType, version string) metricsImplementation { return &victoriaMetrics{ deploymentType: deploymentType, + version: version, set: metrics.NewSet(), } } @@ -92,11 +94,12 @@ func (m *victoriaMetrics) 
RecordConnectionStages( // getMetricName constructs a metric name with labels func (m *victoriaMetrics) getMetricName(baseName, connectionType, attemptType string) string { - return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q}`, + return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q}`, baseName, m.deploymentType.String(), connectionType, attemptType, + m.version, ) } From bec58b85b1c51575a575b8dcc35244e58d24b6b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 11 Feb 2026 15:18:45 +0100 Subject: [PATCH 09/52] Add sync duration tracking to client metrics Introduce `RecordSyncDuration` for measuring sync message processing time. Update all metrics implementations (VictoriaMetrics, no-op) to support the new method. Refactor `ClientMetrics` to use `AgentInfo` for static agent data. --- client/internal/engine.go | 9 +++++++-- client/internal/metrics/metrics.go | 18 ++++++++++++++++-- client/internal/metrics/noop.go | 5 +++++ client/internal/metrics/victoria.go | 29 +++++++++++++++++++---------- 4 files changed, 47 insertions(+), 14 deletions(-) diff --git a/client/internal/engine.go b/client/internal/engine.go index 0fae4e61666..968f635ecbb 100644 --- a/client/internal/engine.go +++ b/client/internal/engine.go @@ -279,7 +279,10 @@ func NewEngine( connSemaphore: semaphoregroup.NewSemaphoreGroup(connInitLimit), probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL), jobExecutor: jobexec.NewExecutor(), - clientMetrics: metrics.NewClientMetrics(deploymentType, version.NetbirdVersion(), true), + clientMetrics: metrics.NewClientMetrics(metrics.AgentInfo{ + DeploymentType: deploymentType, + Version: version.NetbirdVersion(), + }, true), } log.Infof("I am: %s", config.WgPrivateKey.PublicKey().String()) @@ -842,7 +845,9 @@ func (e *Engine) handleAutoUpdateVersion(autoUpdateSettings *mgmProto.AutoUpdate func (e *Engine) handleSync(update *mgmProto.SyncResponse) error { started := time.Now() 
defer func() { - log.Infof("sync finished in %s", time.Since(started)) + duration := time.Since(started) + log.Infof("sync finished in %s", duration) + e.clientMetrics.RecordSyncDuration(e.ctx, duration) }() e.syncMsgMux.Lock() defer e.syncMsgMux.Unlock() diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 7699f5b02c6..a8fd456a18d 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -6,6 +6,12 @@ import ( "time" ) +// AgentInfo holds static information about the agent +type AgentInfo struct { + DeploymentType DeploymentType + Version string +} + // metricsImplementation defines the internal interface for metrics implementations type metricsImplementation interface { // RecordConnectionStages records connection stage metrics from timestamps @@ -16,6 +22,9 @@ type metricsImplementation interface { timestamps ConnectionStageTimestamps, ) + // RecordSyncDuration records how long it took to process a sync message + RecordSyncDuration(ctx context.Context, duration time.Duration) + // Export exports metrics in Prometheus format Export(w io.Writer) error } @@ -36,12 +45,12 @@ type ConnectionStageTimestamps struct { // NewClientMetrics creates a new ClientMetrics instance // If enabled is true, uses an OpenTelemetry implementation // If enabled is false, uses a no-op implementation -func NewClientMetrics(deploymentType DeploymentType, version string, enabled bool) *ClientMetrics { +func NewClientMetrics(agentInfo AgentInfo, enabled bool) *ClientMetrics { var impl metricsImplementation if !enabled { impl = &noopMetrics{} } else { - impl = newVictoriaMetrics(deploymentType, version) + impl = newVictoriaMetrics(agentInfo) } return &ClientMetrics{impl: impl} } @@ -56,6 +65,11 @@ func (c *ClientMetrics) RecordConnectionStages( c.impl.RecordConnectionStages(ctx, connectionType, isReconnection, timestamps) } +// RecordSyncDuration records the duration of sync message processing +func (c *ClientMetrics) 
RecordSyncDuration(ctx context.Context, duration time.Duration) { + c.impl.RecordSyncDuration(ctx, duration) +} + // Export exports metrics to the writer func (c *ClientMetrics) Export(w io.Writer) error { return c.impl.Export(w) diff --git a/client/internal/metrics/noop.go b/client/internal/metrics/noop.go index bf8aa432007..fb1f6126e75 100644 --- a/client/internal/metrics/noop.go +++ b/client/internal/metrics/noop.go @@ -3,6 +3,7 @@ package metrics import ( "context" "io" + "time" ) // noopMetrics is a no-op implementation of metricsImplementation @@ -17,6 +18,10 @@ func (s *noopMetrics) RecordConnectionStages( // No-op } +func (s *noopMetrics) RecordSyncDuration(_ context.Context, _ time.Duration) { + // No-op +} + func (s *noopMetrics) Export(_ io.Writer) error { return nil } diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 10ee713da7a..32706a1f921 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "time" "github.com/VictoriaMetrics/metrics" log "github.com/sirupsen/logrus" @@ -11,19 +12,17 @@ import ( // victoriaMetrics is the VictoriaMetrics implementation of ClientMetrics type victoriaMetrics struct { - // Static attributes applied to all metrics - deploymentType DeploymentType - version string + // Static agent information applied to all metrics + agentInfo AgentInfo // Metrics set for managing all metrics set *metrics.Set } -func newVictoriaMetrics(deploymentType DeploymentType, version string) metricsImplementation { +func newVictoriaMetrics(agentInfo AgentInfo) metricsImplementation { return &victoriaMetrics{ - deploymentType: deploymentType, - version: version, - set: metrics.NewSet(), + agentInfo: agentInfo, + set: metrics.NewSet(), } } @@ -87,7 +86,7 @@ func (m *victoriaMetrics) RecordConnectionStages( ).Update(totalDuration) log.Tracef("peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, semaphore→signaling: 
%.3fs, signaling→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", - m.deploymentType.String(), connTypeStr, attemptType, + m.agentInfo.DeploymentType.String(), connTypeStr, attemptType, creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration) } @@ -96,13 +95,23 @@ func (m *victoriaMetrics) RecordConnectionStages( func (m *victoriaMetrics) getMetricName(baseName, connectionType, attemptType string) string { return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q}`, baseName, - m.deploymentType.String(), + m.agentInfo.DeploymentType.String(), connectionType, attemptType, - m.version, + m.agentInfo.Version, ) } +// RecordSyncDuration records the duration of sync message processing +func (m *victoriaMetrics) RecordSyncDuration(ctx context.Context, duration time.Duration) { + metricName := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q}`, + m.agentInfo.DeploymentType.String(), + m.agentInfo.Version, + ) + + m.set.GetOrCreateHistogram(metricName).Update(duration.Seconds()) +} + // Export writes metrics in Prometheus text format func (m *victoriaMetrics) Export(w io.Writer) error { if m.set == nil { From 3753bf7fc456651d1adf25a65fc8858e3c1c4f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 11 Feb 2026 15:28:59 +0100 Subject: [PATCH 10/52] Remove no-op metrics implementation and simplify ClientMetrics constructor Eliminate unused `noopMetrics` and refactor `ClientMetrics` to always use the VictoriaMetrics implementation. Update associated logic to reflect these changes. 
--- client/internal/engine.go | 8 ++++---- client/internal/metrics/metrics.go | 21 +++++++++++---------- client/internal/metrics/noop.go | 27 --------------------------- 3 files changed, 15 insertions(+), 41 deletions(-) delete mode 100644 client/internal/metrics/noop.go diff --git a/client/internal/engine.go b/client/internal/engine.go index 968f635ecbb..ebe4363480e 100644 --- a/client/internal/engine.go +++ b/client/internal/engine.go @@ -279,12 +279,12 @@ func NewEngine( connSemaphore: semaphoregroup.NewSemaphoreGroup(connInitLimit), probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL), jobExecutor: jobexec.NewExecutor(), - clientMetrics: metrics.NewClientMetrics(metrics.AgentInfo{ - DeploymentType: deploymentType, - Version: version.NetbirdVersion(), - }, true), } + engine.clientMetrics = metrics.NewClientMetrics(metrics.AgentInfo{ + DeploymentType: deploymentType, + Version: version.NetbirdVersion()}) + log.Infof("I am: %s", config.WgPrivateKey.PublicKey().String()) return engine } diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index a8fd456a18d..4e160270243 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -43,16 +43,8 @@ type ConnectionStageTimestamps struct { } // NewClientMetrics creates a new ClientMetrics instance -// If enabled is true, uses an OpenTelemetry implementation -// If enabled is false, uses a no-op implementation -func NewClientMetrics(agentInfo AgentInfo, enabled bool) *ClientMetrics { - var impl metricsImplementation - if !enabled { - impl = &noopMetrics{} - } else { - impl = newVictoriaMetrics(agentInfo) - } - return &ClientMetrics{impl: impl} +func NewClientMetrics(agentInfo AgentInfo) *ClientMetrics { + return &ClientMetrics{impl: newVictoriaMetrics(agentInfo)} } // RecordConnectionStages calculates stage durations from timestamps and records them @@ -62,15 +54,24 @@ func (c *ClientMetrics) RecordConnectionStages( isReconnection bool, timestamps 
ConnectionStageTimestamps, ) { + if c == nil { + return + } c.impl.RecordConnectionStages(ctx, connectionType, isReconnection, timestamps) } // RecordSyncDuration records the duration of sync message processing func (c *ClientMetrics) RecordSyncDuration(ctx context.Context, duration time.Duration) { + if c == nil { + return + } c.impl.RecordSyncDuration(ctx, duration) } // Export exports metrics to the writer func (c *ClientMetrics) Export(w io.Writer) error { + if c == nil { + return nil + } return c.impl.Export(w) } diff --git a/client/internal/metrics/noop.go b/client/internal/metrics/noop.go deleted file mode 100644 index fb1f6126e75..00000000000 --- a/client/internal/metrics/noop.go +++ /dev/null @@ -1,27 +0,0 @@ -package metrics - -import ( - "context" - "io" - "time" -) - -// noopMetrics is a no-op implementation of metricsImplementation -type noopMetrics struct{} - -func (s *noopMetrics) RecordConnectionStages( - _ context.Context, - _ ConnectionType, - _ bool, - _ ConnectionStageTimestamps, -) { - // No-op -} - -func (s *noopMetrics) RecordSyncDuration(_ context.Context, _ time.Duration) { - // No-op -} - -func (s *noopMetrics) Export(_ io.Writer) error { - return nil -} From 7e276a40d9f97acb6f8b1442f8245af72cc54751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 11 Feb 2026 15:41:32 +0100 Subject: [PATCH 11/52] Add total duration tracking for connection attempts Calculate total duration for both initial connections and reconnections, accounting for different timestamp scenarios. Update `Export` method to include Prometheus HELP comments. 
--- client/internal/metrics/victoria.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 32706a1f921..18889856d40 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -52,8 +52,13 @@ func (m *victoriaMetrics) RecordConnectionStages( connectionToHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() } + // Calculate total duration: + // For initial connections: Created → WgHandshakeSuccess + // For reconnections: Signaling → WgHandshakeSuccess (since Created is not tracked) if !timestamps.Created.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Created).Seconds() + } else if !timestamps.Signaling.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Signaling).Seconds() } // Determine attempt type @@ -112,7 +117,7 @@ func (m *victoriaMetrics) RecordSyncDuration(ctx context.Context, duration time. 
m.set.GetOrCreateHistogram(metricName).Update(duration.Seconds()) } -// Export writes metrics in Prometheus text format +// Export writes metrics in Prometheus text format with HELP comments func (m *victoriaMetrics) Export(w io.Writer) error { if m.set == nil { return fmt.Errorf("metrics set not initialized") From cf0a1fa0e2decaaf34c8729d67c0cc2d94af4c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Wed, 11 Feb 2026 15:42:00 +0100 Subject: [PATCH 12/52] Add metrics push support to VictoriaMetrics integration --- client/internal/connect.go | 41 ++- client/internal/engine.go | 13 +- client/internal/engine_test.go | 11 +- client/internal/metrics/docs/README.md | 181 +++++++++++++ .../metrics/docs/docker-compose.victoria.yml | 43 +++ .../provisioning/dashboards/dashboard.yml | 12 + .../json/netbird-connection-metrics.json | 245 ++++++++++++++++++ .../datasources/victoriametrics.yml | 12 + client/internal/metrics/env.go | 76 ++++++ client/internal/metrics/metrics.go | 104 +++++++- client/internal/metrics/push.go | 151 +++++++++++ client/internal/metrics/victoria.go | 39 ++- 12 files changed, 885 insertions(+), 43 deletions(-) create mode 100644 client/internal/metrics/docs/README.md create mode 100644 client/internal/metrics/docs/docker-compose.victoria.yml create mode 100644 client/internal/metrics/docs/grafana/provisioning/dashboards/dashboard.yml create mode 100644 client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json create mode 100644 client/internal/metrics/docs/grafana/provisioning/datasources/victoriametrics.yml create mode 100644 client/internal/metrics/env.go create mode 100644 client/internal/metrics/push.go diff --git a/client/internal/connect.go b/client/internal/connect.go index 17fc20c427a..392ba382d6a 100644 --- a/client/internal/connect.go +++ b/client/internal/connect.go @@ -23,6 +23,7 @@ import ( "github.com/netbirdio/netbird/client/iface/netstack" 
"github.com/netbirdio/netbird/client/internal/dns" "github.com/netbirdio/netbird/client/internal/listener" + "github.com/netbirdio/netbird/client/internal/metrics" "github.com/netbirdio/netbird/client/internal/peer" "github.com/netbirdio/netbird/client/internal/profilemanager" "github.com/netbirdio/netbird/client/internal/statemanager" @@ -49,8 +50,9 @@ type ConnectClient struct { statusRecorder *peer.Status doInitialAutoUpdate bool - engine *Engine - engineMutex sync.Mutex + engine *Engine + engineMutex sync.Mutex + clientMetrics *metrics.ClientMetrics persistSyncResponse bool } @@ -131,10 +133,34 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan } }() + // Stop metrics push on exit + defer func() { + if c.clientMetrics != nil { + c.clientMetrics.StopPush() + } + }() + log.Infof("starting NetBird client version %s on %s/%s", version.NetbirdVersion(), runtime.GOOS, runtime.GOARCH) nbnet.Init() + // Initialize metrics once at startup + if c.clientMetrics == nil { + // Start with unknown deployment type, will be updated on first successful connection + agentInfo := metrics.AgentInfo{ + DeploymentType: metrics.DeploymentTypeUnknown, + Version: version.NetbirdVersion(), + OS: runtime.GOOS, + } + c.clientMetrics = metrics.NewClientMetrics(agentInfo) + log.Debugf("initialized client metrics") + + // Start metrics push if enabled (uses daemon context, persists across engine restarts) + if metrics.IsMetricsPushEnabled() { + c.clientMetrics.StartPush(c.ctx, metrics.DefaultPushConfig) + } + } + backOff := &backoff.ExponentialBackOff{ InitialInterval: time.Second, RandomizationFactor: 1, @@ -222,6 +248,15 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan mgmNotifier := statusRecorderToMgmConnStateNotifier(c.statusRecorder) mgmClient.SetConnStateListener(mgmNotifier) + // Update metrics with actual deployment type after connection + deploymentType := metrics.DetermineDeploymentType(mgmClient.GetServerURL()) + 
agentInfo := metrics.AgentInfo{ + DeploymentType: deploymentType, + Version: version.NetbirdVersion(), + OS: runtime.GOOS, + } + c.clientMetrics.UpdateAgentInfo(agentInfo) + log.Debugf("connected to the Management service %s", c.config.ManagementURL.Host) defer func() { if err = mgmClient.Close(); err != nil { @@ -308,7 +343,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan checks := loginResp.GetChecks() c.engineMutex.Lock() - engine := NewEngine(engineCtx, cancel, signalClient, mgmClient, relayManager, engineConfig, mobileDependency, c.statusRecorder, checks, stateManager) + engine := NewEngine(engineCtx, cancel, signalClient, mgmClient, relayManager, engineConfig, mobileDependency, c.statusRecorder, checks, stateManager, c.clientMetrics) engine.SetSyncResponsePersistence(c.persistSyncResponse) c.engine = engine c.engineMutex.Unlock() diff --git a/client/internal/engine.go b/client/internal/engine.go index ebe4363480e..2100c0ae2fb 100644 --- a/client/internal/engine.go +++ b/client/internal/engine.go @@ -66,7 +66,6 @@ import ( signal "github.com/netbirdio/netbird/shared/signal/client" sProto "github.com/netbirdio/netbird/shared/signal/proto" "github.com/netbirdio/netbird/util" - "github.com/netbirdio/netbird/version" ) // PeerConnectionTimeoutMax is a timeout of an initial connection attempt to a remote peer. 
@@ -252,13 +251,8 @@ func NewEngine( statusRecorder *peer.Status, checks []*mgmProto.Checks, stateManager *statemanager.Manager, + clientMetrics *metrics.ClientMetrics, ) *Engine { - // Initialize metrics based on deployment type - var deploymentType metrics.DeploymentType - if mgmClient != nil { - deploymentType = metrics.DetermineDeploymentType(mgmClient.GetServerURL()) - } - engine := &Engine{ clientCtx: clientCtx, clientCancel: clientCancel, @@ -279,12 +273,9 @@ func NewEngine( connSemaphore: semaphoregroup.NewSemaphoreGroup(connInitLimit), probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL), jobExecutor: jobexec.NewExecutor(), + clientMetrics: clientMetrics, } - engine.clientMetrics = metrics.NewClientMetrics(metrics.AgentInfo{ - DeploymentType: deploymentType, - Version: version.NetbirdVersion()}) - log.Infof("I am: %s", config.WgPrivateKey.PublicKey().String()) return engine } diff --git a/client/internal/engine_test.go b/client/internal/engine_test.go index 012c8ad6e27..ba9cb3ad109 100644 --- a/client/internal/engine_test.go +++ b/client/internal/engine_test.go @@ -267,6 +267,7 @@ func TestEngine_SSH(t *testing.T) { peer.NewRecorder("https://mgm"), nil, nil, + nil, ) engine.dnsServer = &dns.MockServer{ @@ -434,7 +435,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) { WgPrivateKey: key, WgPort: 33100, MTU: iface.DefaultMTU, - }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil) + }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil, nil) wgIface := &MockWGIface{ NameFunc: func() string { return "utun102" }, @@ -653,7 +654,7 @@ func TestEngine_Sync(t *testing.T) { WgPrivateKey: key, WgPort: 33100, MTU: iface.DefaultMTU, - }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil) + }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil, nil) engine.ctx = ctx engine.dnsServer = &dns.MockServer{ @@ -818,7 +819,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) { WgPrivateKey: key, WgPort: 33100, 
MTU: iface.DefaultMTU, - }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil) + }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil, nil) engine.ctx = ctx newNet, err := stdnet.NewNet(context.Background(), nil) if err != nil { @@ -1020,7 +1021,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) { WgPrivateKey: key, WgPort: 33100, MTU: iface.DefaultMTU, - }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil) + }, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil, nil) engine.ctx = ctx newNet, err := stdnet.NewNet(context.Background(), nil) @@ -1546,7 +1547,7 @@ func createEngine(ctx context.Context, cancel context.CancelFunc, setupKey strin } relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU) - e, err := NewEngine(ctx, cancel, signalClient, mgmtClient, relayMgr, conf, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil), nil + e, err := NewEngine(ctx, cancel, signalClient, mgmtClient, relayMgr, conf, MobileDependency{}, peer.NewRecorder("https://mgm"), nil, nil, nil), nil e.ctx = ctx return e, err } diff --git a/client/internal/metrics/docs/README.md b/client/internal/metrics/docs/README.md new file mode 100644 index 00000000000..a51165433e5 --- /dev/null +++ b/client/internal/metrics/docs/README.md @@ -0,0 +1,181 @@ +# Client Metrics + +Internal documentation for the NetBird client metrics system. + +## Overview + +Client metrics track connection performance and sync durations. 
Metrics are: +- Collected in-memory using VictoriaMetrics histograms +- Pushed periodically to a VictoriaMetrics server +- Disabled by default (opt-in via environment variable) +- Managed at daemon layer (survives engine restarts) + +## Architecture + +### Layer Separation + +``` +Daemon Layer (connect.go) + ├─ Creates ClientMetrics instance once + ├─ Starts/stops push lifecycle + └─ Updates AgentInfo on profile switch + │ + ▼ +Engine Layer (engine.go) + └─ Records metrics via ClientMetrics methods +``` + +### Data Flow + +``` +NetBird Client + ├─ Records metrics in memory (histograms) + ├─ Push to VictoriaMetrics via HTTP POST + └─ Metrics endpoint: /api/v1/import/prometheus + │ + ▼ +VictoriaMetrics (port 8428) + ├─ Stores time-series data + ├─ 12 month retention + └─ Prometheus-compatible query API + │ + ▼ +Grafana (port 3000) + └─ Pre-configured dashboard +``` + +## Metrics Collected + +### Connection Stage Timing + +1. `netbird_peer_connection_stage_creation_to_semaphore` +2. `netbird_peer_connection_stage_semaphore_to_signaling` +3. `netbird_peer_connection_stage_signaling_to_connection` +4. `netbird_peer_connection_stage_connection_to_handshake` +5. `netbird_peer_connection_total_creation_to_handshake` + +Labels: +- `deployment_type`: "cloud" | "selfhosted" | "unknown" +- `connection_type`: "ice" | "relay" +- `attempt_type`: "initial" | "reconnection" +- `version`: NetBird version string +- `os`: Operating system (linux, darwin, windows, android, ios, etc.) + +### Sync Duration + +Tracks time to process sync messages from management server: + +1. `netbird_sync_duration_seconds` + +Labels: +- `deployment_type`: "cloud" | "selfhosted" | "unknown" +- `version`: NetBird version string +- `os`: Operating system (linux, darwin, windows, android, ios, etc.) 
+ +## Configuration + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-----------------------------------------| +| `NB_METRICS_ENABLED` | `false` | Enable metrics push | +| `NB_METRICS_SERVER_URL` | `https://api.netbird.io:8428/api/v1/import/prometheus` | VictoriaMetrics endpoint | +| `NB_METRICS_INTERVAL` | | Push interval (e.g., "1m", "30m", "4h") | + +### Configuration Precedence + +For URL and Interval, the precedence is: +1. **Config parameter** - Explicitly passed to `StartPush()` +2. **Environment variable** - `NB_METRICS_SERVER_URL` / `NB_METRICS_INTERVAL` +3. **Default value** - From `metrics.DefaultPushConfig` + +## Push Behavior + +1. `StartPush()` spawns background goroutine with ticker +2. First push happens immediately on startup +3. Periodically: `push()` → `Export()` → HTTP POST +4. On failure: log error, continue (non-blocking) +5. On success: log debug message +6. `StopPush()` cancels context and waits for goroutine + +**Important:** +- Metrics **accumulate** in memory (cumulative histograms) +- Metrics are **NOT reset** after push (correct Prometheus behavior) +- VictoriaMetrics calculates rates from deltas between pushes +- Each push sends **all** accumulated metrics +- Metrics only reset on process restart + +## Local Development Setup + +### 1. Start VictoriaMetrics + +```bash +# From this directory +docker-compose -f docker-compose.victoria.yml up -d + +# View logs +docker-compose -f docker-compose.victoria.yml logs -f +``` + +**Access:** +- VictoriaMetrics UI: http://localhost:8428 +- Grafana: http://localhost:3001 (admin/admin) + +### 2. Configure Client + +```bash +export NB_METRICS_ENABLED=true +export NB_METRICS_SERVER_URL=http://localhost:8428/api/v1/import/prometheus +export NB_METRICS_INTERVAL=1m + +# Run client +cd ../../../.. +go run main.go up +``` + +### 3. 
Verify Metrics + +```bash +# Watch client logs +go run main.go up 2>&1 | grep -i metric + +# List all available metric names +curl http://localhost:8428/api/v1/label/__name__/values + +# Query specific metric +curl 'http://localhost:8428/api/v1/query?query=netbird_peer_connection_total_creation_to_handshake_count' +``` + +### 4. View in Grafana + +Open http://localhost:3001/d/netbird-connection-metrics + +Dashboard JSON location: +``` +grafana/provisioning/dashboards/json/netbird-connection-metrics.json +``` + +Export modified dashboards from Grafana UI and replace this file. + +## Querying Metrics + +### VictoriaMetrics UI + +Open http://localhost:8428/vmui + +```promql +# P95 connection time +histogram_quantile(0.95, netbird_peer_connection_total_creation_to_handshake) + +# Connection rate +rate(netbird_peer_connection_total_creation_to_handshake_count[5m]) + +# Average sync duration +rate(netbird_sync_duration_seconds_sum[5m]) / rate(netbird_sync_duration_seconds_count[5m]) +``` + +### API Queries + +```bash +curl 'http://localhost:8428/api/v1/query?query=netbird_peer_connection_total_creation_to_handshake_count' +``` diff --git a/client/internal/metrics/docs/docker-compose.victoria.yml b/client/internal/metrics/docs/docker-compose.victoria.yml new file mode 100644 index 00000000000..75005a39e19 --- /dev/null +++ b/client/internal/metrics/docs/docker-compose.victoria.yml @@ -0,0 +1,43 @@ +version: '3.8' + +services: + victoriametrics: + container_name: victoriametrics + image: victoriametrics/victoria-metrics:latest + ports: + - "8428:8428" + volumes: + - victoria-metrics-data:/victoria-metrics-data + command: + - "--storageDataPath=/victoria-metrics-data" + - "--httpListenAddr=:8428" + - "--retentionPeriod=12" # Keep data for 12 months + restart: unless-stopped + networks: + - metrics + + grafana: + container_name: grafana + image: grafana/grafana:latest + ports: + - "3001:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - 
GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning + depends_on: + - victoriametrics + restart: unless-stopped + networks: + - metrics + +volumes: + victoria-metrics-data: + grafana-data: + +networks: + metrics: + driver: bridge \ No newline at end of file diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/dashboard.yml b/client/internal/metrics/docs/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 00000000000..a7e8d3989d5 --- /dev/null +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'NetBird Dashboards' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards/json \ No newline at end of file diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json new file mode 100644 index 00000000000..aa734fe39f7 --- /dev/null +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json @@ -0,0 +1,245 @@ +{ + "title": "NetBird Client Connection Metrics", + "tags": ["netbird", "connections"], + "timezone": "browser", + "panels": [ + { + "id": 1, + "title": "Connection Stage Durations (Average)", + "type": "timeseries", + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(netbird_peer_connection_stage_creation_to_semaphore_sum / netbird_peer_connection_stage_creation_to_semaphore_count) * 1000", + "legendFormat": "1. 
Creation→Semaphore ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "A" + }, + { + "expr": "(netbird_peer_connection_stage_semaphore_to_signaling_sum / netbird_peer_connection_stage_semaphore_to_signaling_count) * 1000", + "legendFormat": "2. Semaphore→Signaling ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "B" + }, + { + "expr": "(netbird_peer_connection_stage_signaling_to_connection_sum / netbird_peer_connection_stage_signaling_to_connection_count) * 1000", + "legendFormat": "3. Signaling→Connection ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "C" + }, + { + "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum / netbird_peer_connection_stage_connection_to_handshake_count) * 1000", + "legendFormat": "4. Connection→Handshake ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "D" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "stacking": { + "mode": "normal" + } + } + } + } + }, + { + "id": 8, + "title": "Sync Duration (Average)", + "type": "timeseries", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 9 + }, + "targets": [ + { + "expr": "(rate(netbird_sync_duration_seconds_sum[5m]) / rate(netbird_sync_duration_seconds_count[5m])) * 1000", + "legendFormat": "{{deployment_type}}/{{os}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + } + }, + { + "id": 9, + "title": "Sync Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 9 + }, + "targets": [ + { + "expr": "rate(netbird_sync_duration_seconds_count[5m]) * 60", + "legendFormat": "{{deployment_type}}/{{os}}", + "refId": "A" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area" + }, + "fieldConfig": { + "defaults": { + "unit": "ops/min", + "decimals": 2 + } + } + }, + { + "id": 2, + "title": "Total Connection Time (Average)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + 
"x": 0, + "y": 15 + }, + "targets": [ + { + "expr": "(netbird_peer_connection_total_creation_to_handshake_sum / netbird_peer_connection_total_creation_to_handshake_count) * 1000", + "legendFormat": "{{connection_type}}/{{attempt_type}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + } + }, + { + "id": 3, + "title": "Initial Connection - Average Time by Stage", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 15 + }, + "targets": [ + { + "expr": "(netbird_peer_connection_stage_creation_to_semaphore_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_creation_to_semaphore_count{attempt_type=\"initial\"}) * 1000", + "legendFormat": "Creation→Semaphore", + "refId": "A" + }, + { + "expr": "(netbird_peer_connection_stage_semaphore_to_signaling_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_semaphore_to_signaling_count{attempt_type=\"initial\"}) * 1000", + "legendFormat": "Semaphore→Signaling", + "refId": "B" + }, + { + "expr": "(netbird_peer_connection_stage_signaling_to_connection_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_signaling_to_connection_count{attempt_type=\"initial\"}) * 1000", + "legendFormat": "Signaling→Connection", + "refId": "C" + }, + { + "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_connection_to_handshake_count{attempt_type=\"initial\"}) * 1000", + "legendFormat": "Connection→Handshake", + "refId": "D" + } + ], + "options": { + "orientation": "horizontal", + "displayMode": "gradient" + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + } + }, + { + "id": 7, + "title": "Reconnection - Average Time by Stage", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 15 + }, + "targets": [ + { + "expr": "(netbird_peer_connection_stage_signaling_to_connection_sum{attempt_type=\"reconnection\"} / 
netbird_peer_connection_stage_signaling_to_connection_count{attempt_type=\"reconnection\"}) * 1000", + "legendFormat": "Signaling→Connection", + "refId": "A" + }, + { + "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum{attempt_type=\"reconnection\"} / netbird_peer_connection_stage_connection_to_handshake_count{attempt_type=\"reconnection\"}) * 1000", + "legendFormat": "Connection→Handshake", + "refId": "B" + } + ], + "options": { + "orientation": "horizontal", + "displayMode": "gradient" + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + } + }, + { + "id": 4, + "title": "Total Connections", + "type": "stat", + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "netbird_peer_connection_total_creation_to_handshake_count", + "legendFormat": "{{connection_type}}/{{attempt_type}}", + "refId": "A" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area" + } + }, + { + "id": 6, + "title": "ICE vs Relay", + "type": "piechart", + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 23 + }, + "targets": [ + { + "expr": "sum(netbird_peer_connection_total_creation_to_handshake_count) by (connection_type)", + "legendFormat": "{{connection_type}}", + "refId": "A" + } + ] + } + ], + "schemaVersion": 27, + "version": 3, + "refresh": "30s" +} diff --git a/client/internal/metrics/docs/grafana/provisioning/datasources/victoriametrics.yml b/client/internal/metrics/docs/grafana/provisioning/datasources/victoriametrics.yml new file mode 100644 index 00000000000..4b1c84b9552 --- /dev/null +++ b/client/internal/metrics/docs/grafana/provisioning/datasources/victoriametrics.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +datasources: + - name: VictoriaMetrics + type: prometheus + access: proxy + url: http://victoriametrics:8428 + isDefault: true + editable: true + jsonData: + httpMethod: POST + timeInterval: 30s \ No newline at end of file diff --git a/client/internal/metrics/env.go 
b/client/internal/metrics/env.go new file index 00000000000..c20b87ff9e2 --- /dev/null +++ b/client/internal/metrics/env.go @@ -0,0 +1,76 @@ +package metrics + +import ( + "net/url" + "os" + "strconv" + "time" + + log "github.com/sirupsen/logrus" +) + +const ( + // EnvMetricsEnabled is the environment variable to enable metrics push (default: disabled) + EnvMetricsEnabled = "NB_METRICS_ENABLED" + + // EnvMetricsServerURL is the environment variable to override the metrics server URL + EnvMetricsServerURL = "NB_METRICS_SERVER_URL" + + // EnvMetricsInterval is the environment variable to set the push interval (default: 5m) + // Format: duration string like "1h", "30m", "4h" + EnvMetricsInterval = "NB_METRICS_INTERVAL" +) + +var ( + defaultMetricsURL *url.URL +) + +func init() { + var err error + defaultMetricsURL, err = url.Parse("https://api.netbird.io:8428/api/v1/import/prometheus") + if err != nil { + log.Fatalf("failed to parse default metrics URL: %v", err) + } +} + +// IsMetricsPushEnabled returns true if metrics push is enabled via NB_METRICS_ENABLED env var +// Disabled by default. 
Set NB_METRICS_ENABLED=true to enable +func IsMetricsPushEnabled() bool { + enabled, _ := strconv.ParseBool(os.Getenv(EnvMetricsEnabled)) + return enabled +} + +// getMetricsServerURL returns the metrics server URL (never nil) +// First checks NB_METRICS_SERVER_URL environment variable and validates it +// If not set or invalid, returns the default NetBird metrics server (api.netbird.io:8428) +func getMetricsServerURL() url.URL { + // Check environment variable first + if envURLStr := os.Getenv(EnvMetricsServerURL); envURLStr != "" { + envURL, err := url.Parse(envURLStr) + if err != nil { + log.Warnf("invalid metrics server URL from env %q: %v, using default", envURLStr, err) + return *defaultMetricsURL + } + return *envURL + } + + return *defaultMetricsURL +} + +// getMetricsInterval returns the metrics push interval from environment variable +// If not set or invalid, returns 0 (which will use the default in NewPush) +func getMetricsInterval() time.Duration { + if intervalStr := os.Getenv(EnvMetricsInterval); intervalStr != "" { + interval, err := time.ParseDuration(intervalStr) + if err != nil { + log.Warnf("invalid metrics interval from env %q: %v, using default", intervalStr, err) + return 0 + } + if interval <= 0 { + log.Warnf("invalid metrics interval from env %q: must be positive, using default", intervalStr) + return 0 + } + return interval + } + return 0 +} diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 4e160270243..5265545be5c 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -2,14 +2,19 @@ package metrics import ( "context" + "fmt" "io" + "sync" "time" + + log "github.com/sirupsen/logrus" ) // AgentInfo holds static information about the agent type AgentInfo struct { DeploymentType DeploymentType Version string + OS string // runtime.GOOS (linux, darwin, windows, etc.) 
} // metricsImplementation defines the internal interface for metrics implementations @@ -17,13 +22,14 @@ type metricsImplementation interface { // RecordConnectionStages records connection stage metrics from timestamps RecordConnectionStages( ctx context.Context, + agentInfo AgentInfo, connectionType ConnectionType, isReconnection bool, timestamps ConnectionStageTimestamps, ) // RecordSyncDuration records how long it took to process a sync message - RecordSyncDuration(ctx context.Context, duration time.Duration) + RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) // Export exports metrics in Prometheus format Export(w io.Writer) error @@ -31,6 +37,14 @@ type metricsImplementation interface { type ClientMetrics struct { impl metricsImplementation + + agentInfo AgentInfo + mu sync.RWMutex + + push *Push + pushMu sync.Mutex + wg sync.WaitGroup + pushCancel context.CancelFunc } // ConnectionStageTimestamps holds timestamps for each connection stage @@ -42,9 +56,23 @@ type ConnectionStageTimestamps struct { WgHandshakeSuccess time.Time } +// String returns a human-readable representation of the connection stage timestamps +func (c ConnectionStageTimestamps) String() string { + return fmt.Sprintf("ConnectionStageTimestamps{Created=%v, SemaphoreAcquired=%v, Signaling=%v, ConnectionReady=%v, WgHandshakeSuccess=%v}", + c.Created.Format(time.RFC3339Nano), + c.SemaphoreAcquired.Format(time.RFC3339Nano), + c.Signaling.Format(time.RFC3339Nano), + c.ConnectionReady.Format(time.RFC3339Nano), + c.WgHandshakeSuccess.Format(time.RFC3339Nano), + ) +} + // NewClientMetrics creates a new ClientMetrics instance func NewClientMetrics(agentInfo AgentInfo) *ClientMetrics { - return &ClientMetrics{impl: newVictoriaMetrics(agentInfo)} + return &ClientMetrics{ + impl: newVictoriaMetrics(), + agentInfo: agentInfo, + } } // RecordConnectionStages calculates stage durations from timestamps and records them @@ -57,7 +85,11 @@ func (c *ClientMetrics) 
RecordConnectionStages( if c == nil { return } - c.impl.RecordConnectionStages(ctx, connectionType, isReconnection, timestamps) + c.mu.RLock() + agentInfo := c.agentInfo + c.mu.RUnlock() + + c.impl.RecordConnectionStages(ctx, agentInfo, connectionType, isReconnection, timestamps) } // RecordSyncDuration records the duration of sync message processing @@ -65,7 +97,28 @@ func (c *ClientMetrics) RecordSyncDuration(ctx context.Context, duration time.Du if c == nil { return } - c.impl.RecordSyncDuration(ctx, duration) + c.mu.RLock() + agentInfo := c.agentInfo + c.mu.RUnlock() + + c.impl.RecordSyncDuration(ctx, agentInfo, duration) +} + +// UpdateAgentInfo updates the agent information (e.g., when switching profiles) +func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo) { + if c == nil { + return + } + + c.mu.Lock() + oldDeploymentType := c.agentInfo.DeploymentType + c.agentInfo = agentInfo + c.mu.Unlock() + + if oldDeploymentType != agentInfo.DeploymentType { + log.Infof("metrics deployment type updated: %s -> %s", + oldDeploymentType.String(), agentInfo.DeploymentType.String()) + } } // Export exports metrics to the writer @@ -75,3 +128,46 @@ func (c *ClientMetrics) Export(w io.Writer) error { } return c.impl.Export(w) } + +// StartPush starts periodic pushing of metrics with the given configuration +// Precedence: config parameter > env var > DefaultPushConfig +func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { + if c == nil { + return + } + + c.pushMu.Lock() + defer c.pushMu.Unlock() + + if c.push != nil { + log.Warnf("metrics push already running") + return + } + + ctx, cancel := context.WithCancel(ctx) + c.pushCancel = cancel + + push := NewPush(c.impl, config) + c.wg.Add(1) + go func() { + defer c.wg.Done() + push.Start(ctx) + }() + c.push = push + + log.Infof("started metrics push to %s with interval %s", push.pushURL, push.interval) +} + +func (c *ClientMetrics) StopPush() { + if c == nil { + return + } + c.pushMu.Lock() + defer 
c.pushMu.Unlock() + if c.push == nil { + return + } + + c.pushCancel() + c.wg.Wait() +} diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go new file mode 100644 index 00000000000..b6223d988e3 --- /dev/null +++ b/client/internal/metrics/push.go @@ -0,0 +1,151 @@ +package metrics + +import ( + "bytes" + "context" + "fmt" + "net/http" + "net/url" + "time" + + log "github.com/sirupsen/logrus" +) + +const ( + // DefaultPushInterval is the default interval for pushing metrics + DefaultPushInterval = 5 * time.Minute +) + +var ( + DefaultPushConfig = PushConfig{ + URL: nil, // Will use getMetricsServerURL() + Interval: 0, // Will use getMetricsInterval() or DefaultPushInterval + } +) + +// PushConfig holds configuration for metrics push +type PushConfig struct { + // URL is the metrics server URL. If nil, uses env var or default + URL *url.URL + // Interval is how often to push metrics. If 0, uses env var or default (5m) + Interval time.Duration +} + +// Push handles periodic pushing of metrics to VictoriaMetrics +type Push struct { + metrics metricsImplementation + pushURL string + interval time.Duration + client *http.Client +} + +// NewPush creates a new Push instance with configuration resolution +// Precedence: config parameter > env var > DefaultPushConfig +func NewPush(metrics metricsImplementation, config PushConfig) *Push { + // Resolve URL: config > env var (always returns valid URL) + var pushURL url.URL + if config.URL != nil { + pushURL = *config.URL + } else { + pushURL = getMetricsServerURL() + } + + // Resolve interval: config > env var > default + interval := config.Interval + if interval == 0 { + if envInterval := getMetricsInterval(); envInterval > 0 { + interval = envInterval + } else { + interval = DefaultPushInterval + } + } + + return &Push{ + metrics: metrics, + pushURL: pushURL.String(), + interval: interval, + client: &http.Client{ + Timeout: 10 * time.Second, + }, + } +} + +// Start starts the periodic push ticker +// 
Pushes immediately on start, then every interval +func (p *Push) Start(ctx context.Context) { + if p.pushURL == "" { + log.Debug("metrics push URL not configured, skipping push") + return + } + + // Push immediately on start + if err := p.push(ctx); err != nil { + log.Errorf("failed to push metrics on start: %v", err) + } + + ticker := time.NewTicker(p.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Debug("stopping metrics push") + return + case <-ticker.C: + if err := p.push(ctx); err != nil { + log.Errorf("failed to push metrics: %v", err) + } + } + } +} + +// push exports metrics and sends them to VictoriaMetrics +func (p *Push) push(ctx context.Context) error { + // Export metrics to buffer + var buf bytes.Buffer + if err := p.metrics.Export(&buf); err != nil { + return fmt.Errorf("export metrics: %w", err) + } + + // Don't push if there are no metrics + if buf.Len() == 0 { + log.Tracef("no metrics to push") + return nil + } + + // Log what we're pushing (first 500 bytes) + preview := buf.String() + if len(preview) > 500 { + preview = preview[:500] + } + log.Tracef("pushing metrics (%d bytes): %s", buf.Len(), preview) + + // Create HTTP request + req, err := http.NewRequestWithContext(ctx, "POST", p.pushURL, &buf) + if err != nil { + return fmt.Errorf("create request: %w", err) + } + req.Header.Set("Content-Type", "text/plain") + + // Send request + resp, err := p.client.Do(req) + if err != nil { + return fmt.Errorf("send request: %w", err) + } + defer func() { + if resp.Body == nil { + return + } + if err := resp.Body.Close(); err != nil { + log.Warnf("failed to close response body: %v", err) + } + }() + + // Check response status + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("push failed with status %d", resp.StatusCode) + } + + log.Debugf("successfully pushed metrics to %s", p.pushURL) + return nil +} diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 
18889856d40..78826a395fb 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -12,23 +12,20 @@ import ( // victoriaMetrics is the VictoriaMetrics implementation of ClientMetrics type victoriaMetrics struct { - // Static agent information applied to all metrics - agentInfo AgentInfo - // Metrics set for managing all metrics set *metrics.Set } -func newVictoriaMetrics(agentInfo AgentInfo) metricsImplementation { +func newVictoriaMetrics() metricsImplementation { return &victoriaMetrics{ - agentInfo: agentInfo, - set: metrics.NewSet(), + set: metrics.NewSet(), } } // RecordConnectionStages records the duration of each connection stage from timestamps func (m *victoriaMetrics) RecordConnectionStages( ctx context.Context, + agentInfo AgentInfo, connectionType ConnectionType, isReconnection bool, timestamps ConnectionStageTimestamps, @@ -71,47 +68,49 @@ func (m *victoriaMetrics) RecordConnectionStages( // Record observations using histograms m.set.GetOrCreateHistogram( - m.getMetricName("netbird_peer_connection_stage_creation_to_semaphore", connTypeStr, attemptType), + m.getMetricName(agentInfo, "netbird_peer_connection_stage_creation_to_semaphore", connTypeStr, attemptType), ).Update(creationToSemaphore) m.set.GetOrCreateHistogram( - m.getMetricName("netbird_peer_connection_stage_semaphore_to_signaling", connTypeStr, attemptType), + m.getMetricName(agentInfo, "netbird_peer_connection_stage_semaphore_to_signaling", connTypeStr, attemptType), ).Update(semaphoreToSignaling) m.set.GetOrCreateHistogram( - m.getMetricName("netbird_peer_connection_stage_signaling_to_connection", connTypeStr, attemptType), + m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_to_connection", connTypeStr, attemptType), ).Update(signalingToConnection) m.set.GetOrCreateHistogram( - m.getMetricName("netbird_peer_connection_stage_connection_to_handshake", connTypeStr, attemptType), + m.getMetricName(agentInfo, 
"netbird_peer_connection_stage_connection_to_handshake", connTypeStr, attemptType), ).Update(connectionToHandshake) m.set.GetOrCreateHistogram( - m.getMetricName("netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), + m.getMetricName(agentInfo, "netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), ).Update(totalDuration) log.Tracef("peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, semaphore→signaling: %.3fs, signaling→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", - m.agentInfo.DeploymentType.String(), connTypeStr, attemptType, + agentInfo.DeploymentType.String(), connTypeStr, attemptType, creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration) } // getMetricName constructs a metric name with labels -func (m *victoriaMetrics) getMetricName(baseName, connectionType, attemptType string) string { - return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q}`, +func (m *victoriaMetrics) getMetricName(agentInfo AgentInfo, baseName, connectionType, attemptType string) string { + return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q,os=%q}`, baseName, - m.agentInfo.DeploymentType.String(), + agentInfo.DeploymentType.String(), connectionType, attemptType, - m.agentInfo.Version, + agentInfo.Version, + agentInfo.OS, ) } // RecordSyncDuration records the duration of sync message processing -func (m *victoriaMetrics) RecordSyncDuration(ctx context.Context, duration time.Duration) { - metricName := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q}`, - m.agentInfo.DeploymentType.String(), - m.agentInfo.Version, +func (m *victoriaMetrics) RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) { + metricName := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q,os=%q}`, + agentInfo.DeploymentType.String(), + 
agentInfo.Version, + agentInfo.OS, ) m.set.GetOrCreateHistogram(metricName).Update(duration.Seconds()) From 8ed99ba5cec9c450fba889f732a54cc020f92ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 20 Feb 2026 14:40:34 +0100 Subject: [PATCH 13/52] [client] anchor connection metrics to first signal received --- client/internal/metrics/docs/README.md | 18 +++++++--- .../json/netbird-connection-metrics.json | 30 ++++++---------- client/internal/metrics/metrics.go | 6 ++-- client/internal/metrics/victoria.go | 35 +++++++------------ client/internal/peer/conn.go | 1 - client/internal/peer/metrics_saver.go | 32 ++++------------- 6 files changed, 47 insertions(+), 75 deletions(-) diff --git a/client/internal/metrics/docs/README.md b/client/internal/metrics/docs/README.md index a51165433e5..dadd1316919 100644 --- a/client/internal/metrics/docs/README.md +++ b/client/internal/metrics/docs/README.md @@ -49,10 +49,20 @@ Grafana (port 3000) ### Connection Stage Timing 1. `netbird_peer_connection_stage_creation_to_semaphore` -2. `netbird_peer_connection_stage_semaphore_to_signaling` -3. `netbird_peer_connection_stage_signaling_to_connection` -4. `netbird_peer_connection_stage_connection_to_handshake` -5. `netbird_peer_connection_total_creation_to_handshake` +2. `netbird_peer_connection_stage_signaling_received_to_connection` +3. `netbird_peer_connection_stage_connection_to_handshake` +4. `netbird_peer_connection_total_creation_to_handshake` + +**Stage descriptions:** + +| Metric suffix | Timestamps | Description | +|--------------------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| `creation_to_semaphore` | `Created → SemaphoreAcquired` | Queuing delay waiting for a connection semaphore slot. Only recorded for initial connections. 
| +| `signaling_received_to_connection` | `SignalingReceived → ConnectionReady` | ICE/relay negotiation time after the first signal is received from the remote peer. Excludes the wait for the remote peer to come online. | +| `connection_to_handshake` | `ConnectionReady → WgHandshakeSuccess` | WireGuard cryptographic handshake latency once the transport layer is ready. | +| `total_creation_to_handshake` | `SignalingReceived → WgHandshakeSuccess`| End-to-end connection time anchored at the first received signal. Excludes semaphore queuing and offline-peer wait time. | + +**Note:** `SignalingReceived` is set when the first offer or answer arrives from the remote peer (in both initial and reconnection paths). It is the anchor for all downstream stage durations, ensuring metrics reflect actual negotiation performance rather than how long the remote peer was unreachable. Labels: - `deployment_type`: "cloud" | "selfhosted" | "unknown" diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json index aa734fe39f7..1c52819e783 100644 --- a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json @@ -20,19 +20,14 @@ "refId": "A" }, { - "expr": "(netbird_peer_connection_stage_semaphore_to_signaling_sum / netbird_peer_connection_stage_semaphore_to_signaling_count) * 1000", - "legendFormat": "2. Semaphore→Signaling ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum / netbird_peer_connection_stage_signaling_received_to_connection_count) * 1000", + "legendFormat": "2. 
SignalingReceived→Connection ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", "refId": "B" }, - { - "expr": "(netbird_peer_connection_stage_signaling_to_connection_sum / netbird_peer_connection_stage_signaling_to_connection_count) * 1000", - "legendFormat": "3. Signaling→Connection ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", - "refId": "C" - }, { "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum / netbird_peer_connection_stage_connection_to_handshake_count) * 1000", - "legendFormat": "4. Connection→Handshake ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", - "refId": "D" + "legendFormat": "3. Connection→Handshake ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "C" } ], "fieldConfig": { @@ -139,19 +134,14 @@ "refId": "A" }, { - "expr": "(netbird_peer_connection_stage_semaphore_to_signaling_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_semaphore_to_signaling_count{attempt_type=\"initial\"}) * 1000", - "legendFormat": "Semaphore→Signaling", + "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_signaling_received_to_connection_count{attempt_type=\"initial\"}) * 1000", + "legendFormat": "SignalingReceived→Connection", "refId": "B" }, - { - "expr": "(netbird_peer_connection_stage_signaling_to_connection_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_signaling_to_connection_count{attempt_type=\"initial\"}) * 1000", - "legendFormat": "Signaling→Connection", - "refId": "C" - }, { "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_connection_to_handshake_count{attempt_type=\"initial\"}) * 1000", "legendFormat": "Connection→Handshake", - "refId": "D" + "refId": "C" } ], "options": { @@ -177,8 +167,8 @@ }, "targets": [ { - "expr": 
"(netbird_peer_connection_stage_signaling_to_connection_sum{attempt_type=\"reconnection\"} / netbird_peer_connection_stage_signaling_to_connection_count{attempt_type=\"reconnection\"}) * 1000", - "legendFormat": "Signaling→Connection", + "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum{attempt_type=\"reconnection\"} / netbird_peer_connection_stage_signaling_received_to_connection_count{attempt_type=\"reconnection\"}) * 1000", + "legendFormat": "SignalingReceived→Connection", "refId": "A" }, { @@ -240,6 +230,6 @@ } ], "schemaVersion": 27, - "version": 3, + "version": 4, "refresh": "30s" } diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 5265545be5c..9dce7c87b96 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -51,17 +51,17 @@ type ClientMetrics struct { type ConnectionStageTimestamps struct { Created time.Time SemaphoreAcquired time.Time - Signaling time.Time // First signal sent (initial) or signal received (reconnection) + SignalingReceived time.Time // First signal received from remote peer (both initial and reconnection) ConnectionReady time.Time WgHandshakeSuccess time.Time } // String returns a human-readable representation of the connection stage timestamps func (c ConnectionStageTimestamps) String() string { - return fmt.Sprintf("ConnectionStageTimestamps{Created=%v, SemaphoreAcquired=%v, Signaling=%v, ConnectionReady=%v, WgHandshakeSuccess=%v}", + return fmt.Sprintf("ConnectionStageTimestamps{Created=%v, SemaphoreAcquired=%v, SignalingReceived=%v, ConnectionReady=%v, WgHandshakeSuccess=%v}", c.Created.Format(time.RFC3339Nano), c.SemaphoreAcquired.Format(time.RFC3339Nano), - c.Signaling.Format(time.RFC3339Nano), + c.SignalingReceived.Format(time.RFC3339Nano), c.ConnectionReady.Format(time.RFC3339Nano), c.WgHandshakeSuccess.Format(time.RFC3339Nano), ) diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 
78826a395fb..90dde7ae4d0 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -31,31 +31,26 @@ func (m *victoriaMetrics) RecordConnectionStages( timestamps ConnectionStageTimestamps, ) { // Calculate stage durations - var creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, totalDuration float64 + var creationToSemaphore, signalingReceivedToConnection, connectionToHandshake, totalDuration float64 if !timestamps.Created.IsZero() && !timestamps.SemaphoreAcquired.IsZero() { creationToSemaphore = timestamps.SemaphoreAcquired.Sub(timestamps.Created).Seconds() } - if !timestamps.SemaphoreAcquired.IsZero() && !timestamps.Signaling.IsZero() { - semaphoreToSignaling = timestamps.Signaling.Sub(timestamps.SemaphoreAcquired).Seconds() - } - - if !timestamps.Signaling.IsZero() && !timestamps.ConnectionReady.IsZero() { - signalingToConnection = timestamps.ConnectionReady.Sub(timestamps.Signaling).Seconds() + // Use SignalingReceived as the base: measures negotiation time after the remote peer + // responded, excluding unbounded wait time when the remote peer is offline. 
+ if !timestamps.SignalingReceived.IsZero() && !timestamps.ConnectionReady.IsZero() { + signalingReceivedToConnection = timestamps.ConnectionReady.Sub(timestamps.SignalingReceived).Seconds() } if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { connectionToHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() } - // Calculate total duration: - // For initial connections: Created → WgHandshakeSuccess - // For reconnections: Signaling → WgHandshakeSuccess (since Created is not tracked) - if !timestamps.Created.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Created).Seconds() - } else if !timestamps.Signaling.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.Signaling).Seconds() + // Calculate total duration anchored at SignalingReceived → WgHandshakeSuccess. + // This excludes the potentially unbounded wait for the remote peer to come online. 
+ if !timestamps.SignalingReceived.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.SignalingReceived).Seconds() } // Determine attempt type @@ -72,12 +67,8 @@ func (m *victoriaMetrics) RecordConnectionStages( ).Update(creationToSemaphore) m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_semaphore_to_signaling", connTypeStr, attemptType), - ).Update(semaphoreToSignaling) - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_to_connection", connTypeStr, attemptType), - ).Update(signalingToConnection) + m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_received_to_connection", connTypeStr, attemptType), + ).Update(signalingReceivedToConnection) m.set.GetOrCreateHistogram( m.getMetricName(agentInfo, "netbird_peer_connection_stage_connection_to_handshake", connTypeStr, attemptType), @@ -87,9 +78,9 @@ func (m *victoriaMetrics) RecordConnectionStages( m.getMetricName(agentInfo, "netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), ).Update(totalDuration) - log.Tracef("peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, semaphore→signaling: %.3fs, signaling→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", + log.Tracef("peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, signalingReceived→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", agentInfo.DeploymentType.String(), connTypeStr, attemptType, - creationToSemaphore, semaphoreToSignaling, signalingToConnection, connectionToHandshake, + creationToSemaphore, signalingReceivedToConnection, connectionToHandshake, totalDuration) } diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index a6cc3ed84fb..5ed51714d21 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -627,7 +627,6 @@ func (conn *Conn) onGuardEvent() { if err := 
conn.handshaker.SendOffer(); err != nil { conn.Log.Errorf("failed to send offer: %v", err) } - conn.metricsStages.RecordSignaling() } func (conn *Conn) onWGDisconnected() { diff --git a/client/internal/peer/metrics_saver.go b/client/internal/peer/metrics_saver.go index 39e71a6458a..96c275b97f0 100644 --- a/client/internal/peer/metrics_saver.go +++ b/client/internal/peer/metrics_saver.go @@ -25,32 +25,15 @@ func (s *MetricsStages) RecordSemaphoreAcquired() { s.stageTimestamps.SemaphoreAcquired = time.Now() } -// RecordSignaling records the signaling timestamp when sending offers -// For initial connections: records when we start sending -// For reconnections: does nothing (we wait for RecordSignalingReceived) -func (s *MetricsStages) RecordSignaling() { - s.mu.Lock() - defer s.mu.Unlock() - - if s.isReconnectionAttempt { - return - } - - if s.stageTimestamps.Signaling.IsZero() { - s.stageTimestamps.Signaling = time.Now() - } -} - -// RecordSignalingReceived records the signaling timestamp when receiving offers/answers -// For reconnections: records when we receive the first signal -// For initial connections: does nothing (already recorded in RecordSignaling) +// RecordSignalingReceived records when the first signal is received from the remote peer. +// Used as the base for all subsequent stage durations to avoid inflating metrics when +// the remote peer was offline. 
func (s *MetricsStages) RecordSignalingReceived() { s.mu.Lock() defer s.mu.Unlock() - // Only record for reconnections when we receive a signal - if s.isReconnectionAttempt && s.stageTimestamps.Signaling.IsZero() { - s.stageTimestamps.Signaling = time.Now() + if s.stageTimestamps.SignalingReceived.IsZero() { + s.stageTimestamps.SignalingReceived = time.Now() } } @@ -84,9 +67,8 @@ func (s *MetricsStages) Disconnected() { s.mu.Lock() defer s.mu.Unlock() - // Reset all timestamps for reconnection - // For reconnections, we only track from Signaling onwards - // This avoids meaningless creation→semaphore and semaphore→signaling metrics + // Reset all timestamps for reconnection; Created and SemaphoreAcquired are not + // tracked for reconnections since only SignalingReceived onwards is meaningful. s.stageTimestamps = metrics.ConnectionStageTimestamps{} s.isReconnectionAttempt = true } From 8a852d4ac396d65447b8182ee352efddf997d987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 09:24:45 +0100 Subject: [PATCH 14/52] Remove creation_to_semaphore connection stage metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The semaphore queuing stage (Created → SemaphoreAcquired) is no longer tracked. Connection metrics now start from SignalingReceived. Updated docs and Grafana dashboard accordingly. 
--- client/internal/metrics/docs/README.md | 14 +++++----- .../json/netbird-connection-metrics.json | 26 ++++++------------- client/internal/metrics/metrics.go | 6 +---- client/internal/metrics/victoria.go | 17 +++--------- client/internal/peer/conn.go | 1 - client/internal/peer/metrics_saver.go | 10 +------ 6 files changed, 19 insertions(+), 55 deletions(-) diff --git a/client/internal/metrics/docs/README.md b/client/internal/metrics/docs/README.md index dadd1316919..90b91565e3c 100644 --- a/client/internal/metrics/docs/README.md +++ b/client/internal/metrics/docs/README.md @@ -48,19 +48,17 @@ Grafana (port 3000) ### Connection Stage Timing -1. `netbird_peer_connection_stage_creation_to_semaphore` -2. `netbird_peer_connection_stage_signaling_received_to_connection` -3. `netbird_peer_connection_stage_connection_to_handshake` -4. `netbird_peer_connection_total_creation_to_handshake` +1. `netbird_peer_connection_stage_signaling_received_to_connection` +2. `netbird_peer_connection_stage_connection_to_handshake` +3. `netbird_peer_connection_total_creation_to_handshake` **Stage descriptions:** | Metric suffix | Timestamps | Description | |--------------------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| `creation_to_semaphore` | `Created → SemaphoreAcquired` | Queuing delay waiting for a connection semaphore slot. Only recorded for initial connections. | | `signaling_received_to_connection` | `SignalingReceived → ConnectionReady` | ICE/relay negotiation time after the first signal is received from the remote peer. Excludes the wait for the remote peer to come online. | | `connection_to_handshake` | `ConnectionReady → WgHandshakeSuccess` | WireGuard cryptographic handshake latency once the transport layer is ready. 
| -| `total_creation_to_handshake` | `SignalingReceived → WgHandshakeSuccess`| End-to-end connection time anchored at the first received signal. Excludes semaphore queuing and offline-peer wait time. | +| `total_creation_to_handshake` | `SignalingReceived → WgHandshakeSuccess`| End-to-end connection time anchored at the first received signal. Excludes offline-peer wait time. | **Note:** `SignalingReceived` is set when the first offer or answer arrives from the remote peer (in both initial and reconnection paths). It is the anchor for all downstream stage durations, ensuring metrics reflect actual negotiation performance rather than how long the remote peer was unreachable. @@ -136,7 +134,7 @@ docker-compose -f docker-compose.victoria.yml logs -f ```bash export NB_METRICS_ENABLED=true export NB_METRICS_SERVER_URL=http://localhost:8428/api/v1/import/prometheus -export NB_METRICS_INTERVAL=1m +export NB_METRICS_INTERVAL=1m # Run client cd ../../../.. @@ -188,4 +186,4 @@ rate(netbird_sync_duration_seconds_sum[5m]) / rate(netbird_sync_duration_seconds ```bash curl 'http://localhost:8428/api/v1/query?query=netbird_peer_connection_total_creation_to_handshake_count' -``` +``` \ No newline at end of file diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json index 1c52819e783..17ca9ed7cd0 100644 --- a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json @@ -14,20 +14,15 @@ "y": 0 }, "targets": [ - { - "expr": "(netbird_peer_connection_stage_creation_to_semaphore_sum / netbird_peer_connection_stage_creation_to_semaphore_count) * 1000", - "legendFormat": "1. 
Creation→Semaphore ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", - "refId": "A" - }, { "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum / netbird_peer_connection_stage_signaling_received_to_connection_count) * 1000", - "legendFormat": "2. SignalingReceived→Connection ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", - "refId": "B" + "legendFormat": "1. SignalingReceived→Connection ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "A" }, { "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum / netbird_peer_connection_stage_connection_to_handshake_count) * 1000", - "legendFormat": "3. Connection→Handshake ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", - "refId": "C" + "legendFormat": "2. Connection→Handshake ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "refId": "B" } ], "fieldConfig": { @@ -128,20 +123,15 @@ "y": 15 }, "targets": [ - { - "expr": "(netbird_peer_connection_stage_creation_to_semaphore_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_creation_to_semaphore_count{attempt_type=\"initial\"}) * 1000", - "legendFormat": "Creation→Semaphore", - "refId": "A" - }, { "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_signaling_received_to_connection_count{attempt_type=\"initial\"}) * 1000", "legendFormat": "SignalingReceived→Connection", - "refId": "B" + "refId": "A" }, { "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_connection_to_handshake_count{attempt_type=\"initial\"}) * 1000", "legendFormat": "Connection→Handshake", - "refId": "C" + "refId": "B" } ], "options": { @@ -230,6 +220,6 @@ } ], "schemaVersion": 27, - "version": 4, + "version": 5, "refresh": "30s" -} +} \ No newline at end of file diff --git a/client/internal/metrics/metrics.go 
b/client/internal/metrics/metrics.go index 9dce7c87b96..b5919eb4a2e 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -49,8 +49,6 @@ type ClientMetrics struct { // ConnectionStageTimestamps holds timestamps for each connection stage type ConnectionStageTimestamps struct { - Created time.Time - SemaphoreAcquired time.Time SignalingReceived time.Time // First signal received from remote peer (both initial and reconnection) ConnectionReady time.Time WgHandshakeSuccess time.Time @@ -58,9 +56,7 @@ type ConnectionStageTimestamps struct { // String returns a human-readable representation of the connection stage timestamps func (c ConnectionStageTimestamps) String() string { - return fmt.Sprintf("ConnectionStageTimestamps{Created=%v, SemaphoreAcquired=%v, SignalingReceived=%v, ConnectionReady=%v, WgHandshakeSuccess=%v}", - c.Created.Format(time.RFC3339Nano), - c.SemaphoreAcquired.Format(time.RFC3339Nano), + return fmt.Sprintf("ConnectionStageTimestamps{SignalingReceived=%v, ConnectionReady=%v, WgHandshakeSuccess=%v}", c.SignalingReceived.Format(time.RFC3339Nano), c.ConnectionReady.Format(time.RFC3339Nano), c.WgHandshakeSuccess.Format(time.RFC3339Nano), diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 90dde7ae4d0..c49c1f12d34 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -31,11 +31,7 @@ func (m *victoriaMetrics) RecordConnectionStages( timestamps ConnectionStageTimestamps, ) { // Calculate stage durations - var creationToSemaphore, signalingReceivedToConnection, connectionToHandshake, totalDuration float64 - - if !timestamps.Created.IsZero() && !timestamps.SemaphoreAcquired.IsZero() { - creationToSemaphore = timestamps.SemaphoreAcquired.Sub(timestamps.Created).Seconds() - } + var signalingReceivedToConnection, connectionToHandshake, totalDuration float64 // Use SignalingReceived as the base: measures negotiation time after the remote peer // 
responded, excluding unbounded wait time when the remote peer is offline. @@ -61,11 +57,6 @@ func (m *victoriaMetrics) RecordConnectionStages( connTypeStr := connectionType.String() - // Record observations using histograms - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_creation_to_semaphore", connTypeStr, attemptType), - ).Update(creationToSemaphore) - m.set.GetOrCreateHistogram( m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_received_to_connection", connTypeStr, attemptType), ).Update(signalingReceivedToConnection) @@ -78,10 +69,8 @@ func (m *victoriaMetrics) RecordConnectionStages( m.getMetricName(agentInfo, "netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), ).Update(totalDuration) - log.Tracef("peer connection metrics [%s, %s, %s]: creation→semaphore: %.3fs, signalingReceived→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", - agentInfo.DeploymentType.String(), connTypeStr, attemptType, - creationToSemaphore, signalingReceivedToConnection, connectionToHandshake, - totalDuration) + log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", + agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToHandshake, totalDuration) } // getMetricName constructs a metric name with labels diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index dfa5ba0d1a9..df71660aa67 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -175,7 +175,6 @@ func (conn *Conn) Open(engineCtx context.Context) error { // Record the start time - beginning of connection attempt conn.metricsStages = MetricsStages{} - conn.metricsStages.RecordCreated() conn.ctx, conn.ctxCancel = context.WithCancel(engineCtx) diff --git a/client/internal/peer/metrics_saver.go b/client/internal/peer/metrics_saver.go index 96c275b97f0..369877215fb 
100644 --- a/client/internal/peer/metrics_saver.go +++ b/client/internal/peer/metrics_saver.go @@ -16,13 +16,6 @@ type MetricsStages struct { func (s *MetricsStages) RecordCreated() { s.mu.Lock() defer s.mu.Unlock() - s.stageTimestamps.Created = time.Now() -} - -func (s *MetricsStages) RecordSemaphoreAcquired() { - s.mu.Lock() - defer s.mu.Unlock() - s.stageTimestamps.SemaphoreAcquired = time.Now() } // RecordSignalingReceived records when the first signal is received from the remote peer. @@ -67,8 +60,7 @@ func (s *MetricsStages) Disconnected() { s.mu.Lock() defer s.mu.Unlock() - // Reset all timestamps for reconnection; Created and SemaphoreAcquired are not - // tracked for reconnections since only SignalingReceived onwards is meaningful. + // Reset all timestamps for reconnection s.stageTimestamps = metrics.ConnectionStageTimestamps{} s.isReconnectionAttempt = true } From 473f59c263fb65599176310131cae52115ecc5dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 12:51:31 +0100 Subject: [PATCH 15/52] [client] Add remote push config for metrics with version-based eligibility Introduce remoteconfig.Manager that fetches a remote JSON config to control metrics push interval and restrict pushing to a specific agent version range. When NB_METRICS_INTERVAL is set, remote config is bypassed entirely for local override. 
--- client/internal/metrics/env.go | 17 +- client/internal/metrics/metrics.go | 15 +- client/internal/metrics/push.go | 117 ++++++--- client/internal/metrics/push_test.go | 241 ++++++++++++++++++ .../internal/metrics/remoteconfig/manager.go | 136 ++++++++++ .../metrics/remoteconfig/manager_test.go | 177 +++++++++++++ 6 files changed, 668 insertions(+), 35 deletions(-) create mode 100644 client/internal/metrics/push_test.go create mode 100644 client/internal/metrics/remoteconfig/manager.go create mode 100644 client/internal/metrics/remoteconfig/manager_test.go diff --git a/client/internal/metrics/env.go b/client/internal/metrics/env.go index c20b87ff9e2..66e1fbad8c0 100644 --- a/client/internal/metrics/env.go +++ b/client/internal/metrics/env.go @@ -16,9 +16,16 @@ const ( // EnvMetricsServerURL is the environment variable to override the metrics server URL EnvMetricsServerURL = "NB_METRICS_SERVER_URL" - // EnvMetricsInterval is the environment variable to set the push interval (default: 4h) + // EnvMetricsInterval overrides the push interval from the remote config. + // When set, metrics are always pushed at this interval, ignoring remote config's + // period_minutes and version range filtering. 
// Format: duration string like "1h", "30m", "4h" EnvMetricsInterval = "NB_METRICS_INTERVAL" + + // EnvMetricsConfigURL is the environment variable to override the metrics push config URL + EnvMetricsConfigURL = "NB_METRICS_CONFIG_URL" + + defaultMetricsConfigURL = "https://api.netbird.io/client-metrics-config.json" ) var ( @@ -57,6 +64,14 @@ func getMetricsServerURL() url.URL { return *defaultMetricsURL } +// getMetricsConfigURL returns the URL to fetch push configuration from +func getMetricsConfigURL() string { + if envURL := os.Getenv(EnvMetricsConfigURL); envURL != "" { + return envURL + } + return defaultMetricsConfigURL +} + // getMetricsInterval returns the metrics push interval from environment variable // If not set or invalid, returns 0 (which will use the default in NewPush) func getMetricsInterval() time.Duration { diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index b5919eb4a2e..b07f25e9ca1 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -8,6 +8,8 @@ import ( "time" log "github.com/sirupsen/logrus" + + "github.com/netbirdio/netbird/client/internal/metrics/remoteconfig" ) // AgentInfo holds static information about the agent @@ -143,7 +145,12 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { ctx, cancel := context.WithCancel(ctx) c.pushCancel = cancel - push := NewPush(c.impl, config) + c.mu.RLock() + agentVersion := c.agentInfo.Version + c.mu.RUnlock() + + configManager := remoteconfig.NewManager(getMetricsConfigURL(), remoteconfig.DefaultMinRefreshInterval) + push := NewPush(c.impl, configManager, config, agentVersion) c.wg.Add(1) go func() { defer c.wg.Done() @@ -151,7 +158,11 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { }() c.push = push - log.Infof("started metrics push to %s with interval %s", push.pushURL, push.interval) + if push.overrideInterval > 0 { + log.Infof("started metrics push to %s with 
override interval %s", push.pushURL, push.overrideInterval) + } else { + log.Infof("started metrics push to %s with remote config", push.pushURL) + } } func (c *ClientMetrics) StopPush() { diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index b6223d988e3..fc98d569193 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -8,7 +8,10 @@ import ( "net/url" "time" + goversion "github.com/hashicorp/go-version" log "github.com/sirupsen/logrus" + + "github.com/netbirdio/netbird/client/internal/metrics/remoteconfig" ) const ( @@ -18,8 +21,8 @@ const ( var ( DefaultPushConfig = PushConfig{ - URL: nil, // Will use getMetricsServerURL() - Interval: 0, // Will use getMetricsInterval() or DefaultPushInterval + URL: nil, + Interval: 0, } ) @@ -31,17 +34,24 @@ type PushConfig struct { Interval time.Duration } +// remoteConfigProvider abstracts remote push config fetching for testability +type remoteConfigProvider interface { + RefreshIfNeeded(ctx context.Context) *remoteconfig.Config +} + // Push handles periodic pushing of metrics to VictoriaMetrics type Push struct { - metrics metricsImplementation - pushURL string - interval time.Duration - client *http.Client + metrics metricsImplementation + pushURL string + agentVersion *goversion.Version + overrideInterval time.Duration // if set, bypass remote config and always push at this interval + + configManager remoteConfigProvider + client *http.Client } // NewPush creates a new Push instance with configuration resolution -// Precedence: config parameter > env var > DefaultPushConfig -func NewPush(metrics metricsImplementation, config PushConfig) *Push { +func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, config PushConfig, agentVersion string) *Push { // Resolve URL: config > env var (always returns valid URL) var pushURL url.URL if config.URL != nil { @@ -50,55 +60,93 @@ func NewPush(metrics metricsImplementation, config PushConfig) *Push { 
pushURL = getMetricsServerURL() } - // Resolve interval: config > env var > default - interval := config.Interval - if interval == 0 { - if envInterval := getMetricsInterval(); envInterval > 0 { - interval = envInterval - } else { - interval = DefaultPushInterval - } + // If interval is explicitly set (config param or env var), bypass remote config entirely + overrideInterval := config.Interval + if overrideInterval == 0 { + overrideInterval = getMetricsInterval() // 0 if env var not set + } + + parsedVersion, err := goversion.NewVersion(agentVersion) + if err != nil { + log.Warnf("failed to parse agent version %q: %v", agentVersion, err) } return &Push{ - metrics: metrics, - pushURL: pushURL.String(), - interval: interval, + metrics: metrics, + pushURL: pushURL.String(), + agentVersion: parsedVersion, + overrideInterval: overrideInterval, + configManager: configManager, client: &http.Client{ Timeout: 10 * time.Second, }, } } -// Start starts the periodic push ticker -// Pushes immediately on start, then every interval +// Start starts the periodic push loop. +// If overrideInterval is set (via env var), pushes unconditionally at that interval. +// Otherwise, fetches remote config to determine push period and version eligibility. 
func (p *Push) Start(ctx context.Context) { if p.pushURL == "" { log.Debug("metrics push URL not configured, skipping push") return } - // Push immediately on start - if err := p.push(ctx); err != nil { - log.Errorf("failed to push metrics on start: %v", err) - } - - ticker := time.NewTicker(p.interval) - defer ticker.Stop() + timer := time.NewTimer(0) // fire immediately on first iteration + defer timer.Stop() for { select { case <-ctx.Done(): log.Debug("stopping metrics push") return - case <-ticker.C: - if err := p.push(ctx); err != nil { - log.Errorf("failed to push metrics: %v", err) - } + case <-timer.C: } + + nextInterval := p.tick(ctx) + timer.Reset(nextInterval) } } +// tick performs a single push cycle and returns the duration to wait before the next one. +func (p *Push) tick(ctx context.Context) time.Duration { + interval, shouldPush := p.resolveInterval(ctx) + if shouldPush { + if err := p.push(ctx); err != nil { + log.Errorf("failed to push metrics: %v", err) + } + } + return interval +} + +// resolveInterval determines the push interval and whether a push should happen. +// If overrideInterval is set, it bypasses remote config and always pushes. +// Otherwise, it fetches remote config and checks version eligibility. 
+func (p *Push) resolveInterval(ctx context.Context) (time.Duration, bool) { + if p.overrideInterval > 0 { + return p.overrideInterval, true + } + + config := p.configManager.RefreshIfNeeded(ctx) + if config == nil { + log.Debug("no metrics push config available, waiting to retry") + return DefaultPushInterval, false + } + + if p.agentVersion == nil { + log.Debug("agent version not available, skipping metrics push") + return config.Period, false + } + + if !isVersionInRange(p.agentVersion, config.VersionSince, config.VersionUntil) { + log.Debugf("agent version %s not in range [%s, %s), skipping metrics push", + p.agentVersion, config.VersionSince, config.VersionUntil) + return config.Period, false + } + + return config.Period, true +} + // push exports metrics and sends them to VictoriaMetrics func (p *Push) push(ctx context.Context) error { // Export metrics to buffer @@ -149,3 +197,8 @@ func (p *Push) push(ctx context.Context) error { log.Debugf("successfully pushed metrics to %s", p.pushURL) return nil } + +// isVersionInRange checks if current falls within [since, until) +func isVersionInRange(current, since, until *goversion.Version) bool { + return !current.LessThan(since) && current.LessThan(until) +} diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go new file mode 100644 index 00000000000..6dd9bf3c495 --- /dev/null +++ b/client/internal/metrics/push_test.go @@ -0,0 +1,241 @@ +package metrics + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + goversion "github.com/hashicorp/go-version" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/netbirdio/netbird/client/internal/metrics/remoteconfig" +) + +func mustVersion(s string) *goversion.Version { + v, err := goversion.NewVersion(s) + if err != nil { + panic(err) + } + return v +} + +func testConfig(since, until string, period time.Duration) *remoteconfig.Config { + return 
&remoteconfig.Config{ + VersionSince: mustVersion(since), + VersionUntil: mustVersion(until), + Period: period, + } +} + +// mockConfigProvider implements remoteConfigProvider for testing +type mockConfigProvider struct { + config *remoteconfig.Config +} + +func (m *mockConfigProvider) RefreshIfNeeded(_ context.Context) *remoteconfig.Config { + return m.config +} + +// mockMetrics implements metricsImplementation for testing +type mockMetrics struct { + exportData string +} + +func (m *mockMetrics) RecordConnectionStages(_ context.Context, _ AgentInfo, _ ConnectionType, _ bool, _ ConnectionStageTimestamps) { +} + +func (m *mockMetrics) RecordSyncDuration(_ context.Context, _ AgentInfo, _ time.Duration) { +} + +func (m *mockMetrics) Export(w io.Writer) error { + if m.exportData != "" { + _, err := w.Write([]byte(m.exportData)) + return err + } + return nil +} + +func TestPush_OverrideIntervalAlwaysPushes(t *testing.T) { + var pushCount atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + pushCount.Add(1) + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: nil} // no remote config + + push := &Push{ + metrics: metrics, + pushURL: server.URL, + agentVersion: mustVersion("1.0.0"), + overrideInterval: 50 * time.Millisecond, + configManager: configProvider, + client: &http.Client{Timeout: 5 * time.Second}, + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + push.Start(ctx) + close(done) + }() + + // Wait for a few pushes + require.Eventually(t, func() bool { + return pushCount.Load() >= 3 + }, 2*time.Second, 10*time.Millisecond) + + cancel() + <-done +} + +func TestPush_RemoteConfigVersionInRange(t *testing.T) { + var pushCount atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + 
pushCount.Add(1) + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: testConfig("1.0.0", "2.0.0", 1*time.Minute)} + + push := &Push{ + metrics: metrics, + pushURL: server.URL, + agentVersion: mustVersion("1.5.0"), + configManager: configProvider, + client: &http.Client{Timeout: 5 * time.Second}, + } + + interval, shouldPush := push.resolveInterval(context.Background()) + assert.True(t, shouldPush) + assert.Equal(t, 1*time.Minute, interval) + assert.Equal(t, int32(0), pushCount.Load()) // resolveInterval doesn't push +} + +func TestPush_RemoteConfigVersionOutOfRange(t *testing.T) { + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: testConfig("1.0.0", "1.5.0", 1*time.Minute)} + + push := &Push{ + metrics: metrics, + pushURL: "http://localhost", + agentVersion: mustVersion("2.0.0"), + configManager: configProvider, + client: &http.Client{Timeout: 5 * time.Second}, + } + + interval, shouldPush := push.resolveInterval(context.Background()) + assert.False(t, shouldPush) + assert.Equal(t, 1*time.Minute, interval) +} + +func TestPush_NoConfigReturnsDefault(t *testing.T) { + metrics := &mockMetrics{} + configProvider := &mockConfigProvider{config: nil} + + push := &Push{ + metrics: metrics, + pushURL: "http://localhost", + agentVersion: mustVersion("1.0.0"), + configManager: configProvider, + client: &http.Client{Timeout: 5 * time.Second}, + } + + interval, shouldPush := push.resolveInterval(context.Background()) + assert.False(t, shouldPush) + assert.Equal(t, DefaultPushInterval, interval) +} + +func TestPush_OverrideIntervalBypassesRemoteConfig(t *testing.T) { + metrics := &mockMetrics{} + // Remote config says version is out of range, but override should bypass it + configProvider := &mockConfigProvider{config: testConfig("3.0.0", "4.0.0", 60*time.Minute)} + + push := &Push{ + metrics: metrics, + 
pushURL: "http://localhost", + agentVersion: mustVersion("1.0.0"), + overrideInterval: 30 * time.Second, + configManager: configProvider, + client: &http.Client{Timeout: 5 * time.Second}, + } + + interval, shouldPush := push.resolveInterval(context.Background()) + assert.True(t, shouldPush) + assert.Equal(t, 30*time.Second, interval) +} + +func TestPush_NoMetricsSkipsPush(t *testing.T) { + var pushCount atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + pushCount.Add(1) + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{exportData: ""} // no metrics to export + + push := &Push{ + metrics: metrics, + pushURL: server.URL, + client: &http.Client{Timeout: 5 * time.Second}, + } + + err := push.push(context.Background()) + assert.NoError(t, err) + assert.Equal(t, int32(0), pushCount.Load()) // no HTTP request made +} + +func TestPush_EmptyURLSkipsStart(t *testing.T) { + push := &Push{ + pushURL: "", + } + + // Should return immediately without blocking + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + + done := make(chan struct{}) + go func() { + push.Start(ctx) + close(done) + }() + + select { + case <-done: + // good, returned immediately + case <-ctx.Done(): + t.Fatal("Start did not return for empty URL") + } +} + +func TestIsVersionInRange(t *testing.T) { + tests := []struct { + name string + current string + since string + until string + expected bool + }{ + {"at lower bound inclusive", "1.2.2", "1.2.2", "1.2.3", true}, + {"in range", "1.2.2", "1.2.0", "1.3.0", true}, + {"at upper bound exclusive", "1.2.3", "1.2.2", "1.2.3", false}, + {"below range", "1.2.1", "1.2.2", "1.2.3", false}, + {"above range", "1.3.0", "1.2.2", "1.2.3", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, isVersionInRange(mustVersion(tt.current), mustVersion(tt.since), 
mustVersion(tt.until))) + }) + } +} diff --git a/client/internal/metrics/remoteconfig/manager.go b/client/internal/metrics/remoteconfig/manager.go new file mode 100644 index 00000000000..6ab39c33ae0 --- /dev/null +++ b/client/internal/metrics/remoteconfig/manager.go @@ -0,0 +1,136 @@ +package remoteconfig + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "sync" + "time" + + goversion "github.com/hashicorp/go-version" + log "github.com/sirupsen/logrus" +) + +const ( + DefaultMinRefreshInterval = 30 * time.Minute +) + +// Config holds the parsed remote push configuration +type Config struct { + VersionSince *goversion.Version + VersionUntil *goversion.Version + Period time.Duration +} + +// rawConfig is the JSON wire format fetched from the remote server +type rawConfig struct { + VersionSince string `json:"version-since"` + VersionUntil string `json:"version-until"` + PeriodMinutes int `json:"period_minutes"` +} + +// Manager handles fetching and caching remote push configuration +type Manager struct { + configURL string + minRefreshInterval time.Duration + client *http.Client + + mu sync.Mutex + lastConfig *Config + lastFetched time.Time +} + +func NewManager(configURL string, minRefreshInterval time.Duration) *Manager { + return &Manager{ + configURL: configURL, + minRefreshInterval: minRefreshInterval, + client: &http.Client{ + Timeout: 10 * time.Second, + }, + } +} + +// RefreshIfNeeded fetches new config if the cached one is stale. +// Returns the current config (possibly just fetched) or nil if unavailable. 
+func (m *Manager) RefreshIfNeeded(ctx context.Context) *Config { + m.mu.Lock() + defer m.mu.Unlock() + + if m.isConfigFresh() { + return m.lastConfig + } + + fetchedConfig, err := m.fetch(ctx) + if err != nil { + log.Warnf("failed to fetch metrics remote config: %v", err) + return m.lastConfig // return cached (may be nil) + } + + m.lastConfig = fetchedConfig + m.lastFetched = time.Now() + + log.Tracef("fetched metrics remote config: version-since=%s version-until=%s period=%s", + fetchedConfig.VersionSince, fetchedConfig.VersionUntil, fetchedConfig.Period) + + return fetchedConfig +} + +func (m *Manager) isConfigFresh() bool { + if m.lastConfig == nil { + return false + } + return time.Since(m.lastFetched) < m.minRefreshInterval +} + +func (m *Manager) fetch(ctx context.Context) (*Config, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, m.configURL, nil) + if err != nil { + return nil, fmt.Errorf("create request: %w", err) + } + + resp, err := m.client.Do(req) + if err != nil { + return nil, fmt.Errorf("send request: %w", err) + } + defer func() { + if resp.Body != nil { + _ = resp.Body.Close() + } + }() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + + body, err := io.ReadAll(io.LimitReader(resp.Body, 4096)) + if err != nil { + return nil, fmt.Errorf("read body: %w", err) + } + + var raw rawConfig + if err := json.Unmarshal(body, &raw); err != nil { + return nil, fmt.Errorf("parse config: %w", err) + } + + if raw.PeriodMinutes <= 0 { + return nil, fmt.Errorf("invalid period_minutes: %d", raw.PeriodMinutes) + } + + since, err := goversion.NewVersion(raw.VersionSince) + if err != nil { + return nil, fmt.Errorf("parse version-since %q: %w", raw.VersionSince, err) + } + + until, err := goversion.NewVersion(raw.VersionUntil) + if err != nil { + return nil, fmt.Errorf("parse version-until %q: %w", raw.VersionUntil, err) + } + + return &Config{ + VersionSince: 
since, + VersionUntil: until, + Period: time.Duration(raw.PeriodMinutes) * time.Minute, + }, nil +} diff --git a/client/internal/metrics/remoteconfig/manager_test.go b/client/internal/metrics/remoteconfig/manager_test.go new file mode 100644 index 00000000000..346b44a5e89 --- /dev/null +++ b/client/internal/metrics/remoteconfig/manager_test.go @@ -0,0 +1,177 @@ +package remoteconfig + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const testMinRefresh = 100 * time.Millisecond + +func TestManager_FetchSuccess(t *testing.T) { + server := newConfigServer(t, rawConfig{ + VersionSince: "1.0.0", + VersionUntil: "2.0.0", + PeriodMinutes: 60, + }) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + config := mgr.RefreshIfNeeded(context.Background()) + + require.NotNil(t, config) + assert.Equal(t, "1.0.0", config.VersionSince.String()) + assert.Equal(t, "2.0.0", config.VersionUntil.String()) + assert.Equal(t, 60*time.Minute, config.Period) +} + +func TestManager_CachesConfig(t *testing.T) { + var fetchCount atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fetchCount.Add(1) + err := json.NewEncoder(w).Encode(rawConfig{ + VersionSince: "1.0.0", + VersionUntil: "2.0.0", + PeriodMinutes: 60, + }) + require.NoError(t, err) + })) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + + // First call fetches + config1 := mgr.RefreshIfNeeded(context.Background()) + require.NotNil(t, config1) + assert.Equal(t, int32(1), fetchCount.Load()) + + // Second call uses cache (within minRefreshInterval) + config2 := mgr.RefreshIfNeeded(context.Background()) + require.NotNil(t, config2) + assert.Equal(t, int32(1), fetchCount.Load()) + assert.Equal(t, config1, config2) +} + +func TestManager_RefetchesWhenStale(t *testing.T) { + var 
fetchCount atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fetchCount.Add(1) + err := json.NewEncoder(w).Encode(rawConfig{ + VersionSince: "1.0.0", + VersionUntil: "2.0.0", + PeriodMinutes: 60, + }) + require.NoError(t, err) + })) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + + // First fetch + mgr.RefreshIfNeeded(context.Background()) + assert.Equal(t, int32(1), fetchCount.Load()) + + // Wait for config to become stale + time.Sleep(testMinRefresh + 10*time.Millisecond) + + // Should refetch + mgr.RefreshIfNeeded(context.Background()) + assert.Equal(t, int32(2), fetchCount.Load()) +} + +func TestManager_FetchFailureReturnsNil(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + config := mgr.RefreshIfNeeded(context.Background()) + + assert.Nil(t, config) +} + +func TestManager_FetchFailureReturnsCached(t *testing.T) { + var fetchCount atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fetchCount.Add(1) + if fetchCount.Load() > 1 { + w.WriteHeader(http.StatusInternalServerError) + return + } + err := json.NewEncoder(w).Encode(rawConfig{ + VersionSince: "1.0.0", + VersionUntil: "2.0.0", + PeriodMinutes: 60, + }) + require.NoError(t, err) + })) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + + // First call succeeds + config1 := mgr.RefreshIfNeeded(context.Background()) + require.NotNil(t, config1) + + // Wait for config to become stale + time.Sleep(testMinRefresh + 10*time.Millisecond) + + // Second call fails but returns cached + config2 := mgr.RefreshIfNeeded(context.Background()) + require.NotNil(t, config2) + assert.Equal(t, config1, config2) +} + +func TestManager_RejectsInvalidPeriod(t *testing.T) { + 
tests := []struct { + name string + period int + }{ + {"zero", 0}, + {"negative", -5}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := newConfigServer(t, rawConfig{ + VersionSince: "1.0.0", + VersionUntil: "2.0.0", + PeriodMinutes: tt.period, + }) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + config := mgr.RefreshIfNeeded(context.Background()) + assert.Nil(t, config) + }) + } +} + +func TestManager_RejectsInvalidJSON(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, err := w.Write([]byte("not json")) + require.NoError(t, err) + })) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + config := mgr.RefreshIfNeeded(context.Background()) + assert.Nil(t, config) +} + +func newConfigServer(t *testing.T, config rawConfig) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + err := json.NewEncoder(w).Encode(config) + require.NoError(t, err) + })) +} From ee13016154a134423891a8803a0290bf168b75a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 15:06:52 +0100 Subject: [PATCH 16/52] [client] Add WASM-compatible NewClientMetrics implementation Replace NewClientMetrics in metrics.go with a WASM-specific stub in metrics_js.go, returning nil for compatibility with JS builds. Simplify method usage for WASM targets. 
--- client/internal/metrics/metrics.go | 8 -------- client/internal/metrics/metrics_js.go | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 client/internal/metrics/metrics_js.go diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index b07f25e9ca1..3119d3e78ad 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -65,14 +65,6 @@ func (c ConnectionStageTimestamps) String() string { ) } -// NewClientMetrics creates a new ClientMetrics instance -func NewClientMetrics(agentInfo AgentInfo) *ClientMetrics { - return &ClientMetrics{ - impl: newVictoriaMetrics(), - agentInfo: agentInfo, - } -} - // RecordConnectionStages calculates stage durations from timestamps and records them func (c *ClientMetrics) RecordConnectionStages( ctx context.Context, diff --git a/client/internal/metrics/metrics_js.go b/client/internal/metrics/metrics_js.go new file mode 100644 index 00000000000..dfa6d8243f5 --- /dev/null +++ b/client/internal/metrics/metrics_js.go @@ -0,0 +1,8 @@ +//go:build js + +package metrics + +// NewClientMetrics returns nil on WASM builds — all ClientMetrics methods are nil-safe. 
+func NewClientMetrics(AgentInfo) *ClientMetrics { + return nil +} From eb1a9b1bd8156ae67178334b9983a494beadd1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 15:08:41 +0100 Subject: [PATCH 17/52] Add missing file --- client/internal/metrics/metrics_default.go | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 client/internal/metrics/metrics_default.go diff --git a/client/internal/metrics/metrics_default.go b/client/internal/metrics/metrics_default.go new file mode 100644 index 00000000000..85705a540db --- /dev/null +++ b/client/internal/metrics/metrics_default.go @@ -0,0 +1,11 @@ +//go:build !js + +package metrics + +// NewClientMetrics creates a new ClientMetrics instance +func NewClientMetrics(agentInfo AgentInfo) *ClientMetrics { + return &ClientMetrics{ + impl: newVictoriaMetrics(), + agentInfo: agentInfo, + } +} From f2ef0c4dc6d1ff0511ab45ffca64e5e71d959c35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 17:06:06 +0100 Subject: [PATCH 18/52] Update default case in DeploymentType.String to return "unknown" instead of "selfhosted" --- client/internal/metrics/deployment_type.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/internal/metrics/deployment_type.go b/client/internal/metrics/deployment_type.go index e1d9e8d539e..4bf4fa020d1 100644 --- a/client/internal/metrics/deployment_type.go +++ b/client/internal/metrics/deployment_type.go @@ -26,7 +26,7 @@ func (d DeploymentType) String() string { case DeploymentTypeSelfHosted: return "selfhosted" default: - return "selfhosted" + return "unknown" } } From e5850641c8b4e3d5aef999473cc4f1777afa4399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 17:43:01 +0100 Subject: [PATCH 19/52] [client] Rework metrics to use timestamped samples instead of histograms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace cumulative 
Prometheus histograms with timestamped point-in-time samples that are pushed once and cleared. This fixes metrics for sparse events (connections/syncs that happen once at startup) where rate() and increase() produced incorrect or empty results. Changes: - Switch from VictoriaMetrics histogram library to raw Prometheus text format with explicit millisecond timestamps - Reset samples after successful push (no resending stale data) - Rename connection_to_handshake → connection_to_wg_handshake - Add netbird_peer_connection_count metric for ICE vs Relay tracking - Simplify dashboard: point-based scatter plots, donut pie chart - Add maxStalenessInterval=1m to VictoriaMetrics to prevent forward-fill - Fix deployment_type Unknown returning "selfhosted" instead of "unknown" - Fix inverted shouldPush condition in push.go --- client/internal/metrics/docs/README.md | 10 +- .../metrics/docs/docker-compose.victoria.yml | 1 + .../json/netbird-connection-metrics.json | 194 +++++------------- client/internal/metrics/metrics.go | 11 +- client/internal/metrics/push.go | 1 + client/internal/metrics/push_test.go | 3 + client/internal/metrics/victoria.go | 125 ++++++----- 7 files changed, 141 insertions(+), 204 deletions(-) diff --git a/client/internal/metrics/docs/README.md b/client/internal/metrics/docs/README.md index 90b91565e3c..91866432580 100644 --- a/client/internal/metrics/docs/README.md +++ b/client/internal/metrics/docs/README.md @@ -49,7 +49,7 @@ Grafana (port 3000) ### Connection Stage Timing 1. `netbird_peer_connection_stage_signaling_received_to_connection` -2. `netbird_peer_connection_stage_connection_to_handshake` +2. `netbird_peer_connection_stage_connection_to_wg_handshake` 3. 
`netbird_peer_connection_total_creation_to_handshake` **Stage descriptions:** @@ -57,7 +57,7 @@ Grafana (port 3000) | Metric suffix | Timestamps | Description | |--------------------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| | `signaling_received_to_connection` | `SignalingReceived → ConnectionReady` | ICE/relay negotiation time after the first signal is received from the remote peer. Excludes the wait for the remote peer to come online. | -| `connection_to_handshake` | `ConnectionReady → WgHandshakeSuccess` | WireGuard cryptographic handshake latency once the transport layer is ready. | +| `connection_to_wg_handshake` | `ConnectionReady → WgHandshakeSuccess` | WireGuard cryptographic handshake latency once the transport layer is ready. | | `total_creation_to_handshake` | `SignalingReceived → WgHandshakeSuccess`| End-to-end connection time anchored at the first received signal. Excludes offline-peer wait time. | **Note:** `SignalingReceived` is set when the first offer or answer arrives from the remote peer (in both initial and reconnection paths). It is the anchor for all downstream stage durations, ensuring metrics reflect actual negotiation performance rather than how long the remote peer was unreachable. 
@@ -108,9 +108,9 @@ For URL and Interval, the precedence is: **Important:** - Metrics **accumulate** in memory (cumulative histograms) -- Metrics are **NOT reset** after push (correct Prometheus behavior) -- VictoriaMetrics calculates rates from deltas between pushes -- Each push sends **all** accumulated metrics +- Each push sends all accumulated metrics; VictoriaMetrics computes deltas +- Use `rate(sum)/rate(count)` for averages per time window +- Use `increase(count)` for event counts per time window - Metrics only reset on process restart ## Local Development Setup diff --git a/client/internal/metrics/docs/docker-compose.victoria.yml b/client/internal/metrics/docs/docker-compose.victoria.yml index 75005a39e19..429b89a202a 100644 --- a/client/internal/metrics/docs/docker-compose.victoria.yml +++ b/client/internal/metrics/docs/docker-compose.victoria.yml @@ -12,6 +12,7 @@ services: - "--storageDataPath=/victoria-metrics-data" - "--httpListenAddr=:8428" - "--retentionPeriod=12" # Keep data for 12 months + - "--search.maxStalenessInterval=1m" # Stop forward-filling after 1 minute of no new data restart: unless-stopped networks: - metrics diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json index 17ca9ed7cd0..b32a9c4a790 100644 --- a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json @@ -1,225 +1,131 @@ { + "uid": "netbird-connection-metrics", "title": "NetBird Client Connection Metrics", "tags": ["netbird", "connections"], "timezone": "browser", "panels": [ { - "id": 1, - "title": "Connection Stage Durations (Average)", + "id": 8, + "title": "Sync Duration", "type": "timeseries", "gridPos": { - "h": 9, + "h": 6, "w": 24, "x": 0, "y": 0 }, "targets": [ { - 
"expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum / netbird_peer_connection_stage_signaling_received_to_connection_count) * 1000", - "legendFormat": "1. SignalingReceived→Connection ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", + "expr": "netbird_sync_duration_seconds * 1000", + "legendFormat": "{{deployment_type}}/{{os}}", "refId": "A" - }, - { - "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum / netbird_peer_connection_stage_connection_to_handshake_count) * 1000", - "legendFormat": "2. Connection→Handshake ({{deployment_type}}/{{connection_type}}/{{attempt_type}})", - "refId": "B" } ], "fieldConfig": { "defaults": { "unit": "ms", + "min": 0, "custom": { - "stacking": { - "mode": "normal" - } + "drawStyle": "points", + "pointSize": 5 } } } }, { - "id": 8, - "title": "Sync Duration (Average)", + "id": 1, + "title": "Connection Stage Durations", "type": "timeseries", "gridPos": { - "h": 6, - "w": 12, + "h": 9, + "w": 24, "x": 0, - "y": 9 + "y": 6 }, "targets": [ { - "expr": "(rate(netbird_sync_duration_seconds_sum[5m]) / rate(netbird_sync_duration_seconds_count[5m])) * 1000", - "legendFormat": "{{deployment_type}}/{{os}}", + "expr": "netbird_peer_connection_stage_signaling_received_to_connection_seconds * 1000", + "legendFormat": "1. SignalingReceived→Connection ({{deployment_type}}/{{connection_type}})", "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0 - } - } - }, - { - "id": 9, - "title": "Sync Rate", - "type": "stat", - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 9 - }, - "targets": [ + }, { - "expr": "rate(netbird_sync_duration_seconds_count[5m]) * 60", - "legendFormat": "{{deployment_type}}/{{os}}", - "refId": "A" + "expr": "netbird_peer_connection_stage_connection_to_wg_handshake_seconds * 1000", + "legendFormat": "2. 
Connection→WG Handshake ({{deployment_type}}/{{connection_type}})", + "refId": "B" } ], - "options": { - "colorMode": "value", - "graphMode": "area" - }, "fieldConfig": { "defaults": { - "unit": "ops/min", - "decimals": 2 + "unit": "ms", + "custom": { + "drawStyle": "points", + "pointSize": 5 + } } } }, { "id": 2, - "title": "Total Connection Time (Average)", + "title": "Total Connection Time", "type": "timeseries", "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, "y": 15 }, "targets": [ { - "expr": "(netbird_peer_connection_total_creation_to_handshake_sum / netbird_peer_connection_total_creation_to_handshake_count) * 1000", - "legendFormat": "{{connection_type}}/{{attempt_type}}", + "expr": "netbird_peer_connection_total_seconds * 1000", + "legendFormat": "{{deployment_type}}/{{connection_type}}", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "ms", - "min": 0 + "min": 0, + "custom": { + "drawStyle": "points", + "pointSize": 5 + } } } }, { - "id": 3, - "title": "Initial Connection - Average Time by Stage", - "type": "bargauge", + "id": 4, + "title": "ICE vs Relay", + "type": "piechart", "gridPos": { "h": 8, - "w": 6, - "x": 12, - "y": 15 + "w": 24, + "x": 0, + "y": 23 }, "targets": [ { - "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_signaling_received_to_connection_count{attempt_type=\"initial\"}) * 1000", - "legendFormat": "SignalingReceived→Connection", + "expr": "count_over_time(netbird_peer_connection_count{connection_type=\"ice\"}[$__range])", + "legendFormat": "ICE", "refId": "A" }, { - "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum{attempt_type=\"initial\"} / netbird_peer_connection_stage_connection_to_handshake_count{attempt_type=\"initial\"}) * 1000", - "legendFormat": "Connection→Handshake", + "expr": "count_over_time(netbird_peer_connection_count{connection_type=\"relay\"}[$__range])", + "legendFormat": "Relay", "refId": "B" } ], 
"options": { - "orientation": "horizontal", - "displayMode": "gradient" - }, - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0 - } - } - }, - { - "id": 7, - "title": "Reconnection - Average Time by Stage", - "type": "bargauge", - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 15 - }, - "targets": [ - { - "expr": "(netbird_peer_connection_stage_signaling_received_to_connection_sum{attempt_type=\"reconnection\"} / netbird_peer_connection_stage_signaling_received_to_connection_count{attempt_type=\"reconnection\"}) * 1000", - "legendFormat": "SignalingReceived→Connection", - "refId": "A" + "reduceOptions": { + "calcs": ["sum"] }, - { - "expr": "(netbird_peer_connection_stage_connection_to_handshake_sum{attempt_type=\"reconnection\"} / netbird_peer_connection_stage_connection_to_handshake_count{attempt_type=\"reconnection\"}) * 1000", - "legendFormat": "Connection→Handshake", - "refId": "B" - } - ], - "options": { - "orientation": "horizontal", - "displayMode": "gradient" - }, - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0 + "pieType": "donut", + "tooltip": { + "mode": "multi" } } - }, - { - "id": 4, - "title": "Total Connections", - "type": "stat", - "gridPos": { - "h": 4, - "w": 12, - "x": 0, - "y": 23 - }, - "targets": [ - { - "expr": "netbird_peer_connection_total_creation_to_handshake_count", - "legendFormat": "{{connection_type}}/{{attempt_type}}", - "refId": "A" - } - ], - "options": { - "colorMode": "value", - "graphMode": "area" - } - }, - { - "id": 6, - "title": "ICE vs Relay", - "type": "piechart", - "gridPos": { - "h": 4, - "w": 12, - "x": 12, - "y": 23 - }, - "targets": [ - { - "expr": "sum(netbird_peer_connection_total_creation_to_handshake_count) by (connection_type)", - "legendFormat": "{{connection_type}}", - "refId": "A" - } - ] } ], "schemaVersion": 27, - "version": 5, + "version": 12, "refresh": "30s" } \ No newline at end of file diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 
3119d3e78ad..6d415e662c8 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -35,6 +35,9 @@ type metricsImplementation interface { // Export exports metrics in Prometheus format Export(w io.Writer) error + + // Reset clears all collected metrics + Reset() } type ClientMetrics struct { @@ -79,6 +82,7 @@ func (c *ClientMetrics) RecordConnectionStages( agentInfo := c.agentInfo c.mu.RUnlock() + log.Infof("--- conn stages: %v, %v", connectionType, timestamps) c.impl.RecordConnectionStages(ctx, agentInfo, connectionType, isReconnection, timestamps) } @@ -101,14 +105,8 @@ func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo) { } c.mu.Lock() - oldDeploymentType := c.agentInfo.DeploymentType c.agentInfo = agentInfo c.mu.Unlock() - - if oldDeploymentType != agentInfo.DeploymentType { - log.Infof("metrics deployment type updated: %s -> %s", - oldDeploymentType.String(), agentInfo.DeploymentType.String()) - } } // Export exports metrics to the writer @@ -116,6 +114,7 @@ func (c *ClientMetrics) Export(w io.Writer) error { if c == nil { return nil } + return c.impl.Export(w) } diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index fc98d569193..b21fd21621b 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -195,6 +195,7 @@ func (p *Push) push(ctx context.Context) error { } log.Debugf("successfully pushed metrics to %s", p.pushURL) + p.metrics.Reset() return nil } diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go index 6dd9bf3c495..41e2cc41f6a 100644 --- a/client/internal/metrics/push_test.go +++ b/client/internal/metrics/push_test.go @@ -60,6 +60,9 @@ func (m *mockMetrics) Export(w io.Writer) error { return nil } +func (m *mockMetrics) Reset() { +} + func TestPush_OverrideIntervalAlwaysPushes(t *testing.T) { var pushCount atomic.Int32 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff 
--git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index c49c1f12d34..70cad014294 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -4,105 +4,132 @@ import ( "context" "fmt" "io" + "sync" "time" - "github.com/VictoriaMetrics/metrics" log "github.com/sirupsen/logrus" ) -// victoriaMetrics is the VictoriaMetrics implementation of ClientMetrics +type metricSample struct { + name string + value float64 + timestamp time.Time +} + +// victoriaMetrics collects metric events as timestamped samples. +// Each event is recorded with its exact timestamp, pushed once, then cleared. type victoriaMetrics struct { - // Metrics set for managing all metrics - set *metrics.Set + mu sync.Mutex + samples []metricSample } func newVictoriaMetrics() metricsImplementation { - return &victoriaMetrics{ - set: metrics.NewSet(), - } + return &victoriaMetrics{} } -// RecordConnectionStages records the duration of each connection stage from timestamps func (m *victoriaMetrics) RecordConnectionStages( - ctx context.Context, + _ context.Context, agentInfo AgentInfo, connectionType ConnectionType, isReconnection bool, timestamps ConnectionStageTimestamps, ) { - // Calculate stage durations - var signalingReceivedToConnection, connectionToHandshake, totalDuration float64 + var signalingReceivedToConnection, connectionToWgHandshake, totalDuration float64 - // Use SignalingReceived as the base: measures negotiation time after the remote peer - // responded, excluding unbounded wait time when the remote peer is offline. 
if !timestamps.SignalingReceived.IsZero() && !timestamps.ConnectionReady.IsZero() { signalingReceivedToConnection = timestamps.ConnectionReady.Sub(timestamps.SignalingReceived).Seconds() } if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - connectionToHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() + connectionToWgHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() } - // Calculate total duration anchored at SignalingReceived → WgHandshakeSuccess. - // This excludes the potentially unbounded wait for the remote peer to come online. if !timestamps.SignalingReceived.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.SignalingReceived).Seconds() } - // Determine attempt type attemptType := "initial" if isReconnection { attemptType = "reconnection" } connTypeStr := connectionType.String() - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_received_to_connection", connTypeStr, attemptType), - ).Update(signalingReceivedToConnection) - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_connection_to_handshake", connTypeStr, attemptType), - ).Update(connectionToHandshake) - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_total_creation_to_handshake", connTypeStr, attemptType), - ).Update(totalDuration) - - log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→handshake: %.3fs, total: %.3fs", - agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToHandshake, totalDuration) -} - -// getMetricName constructs a metric name with labels -func (m *victoriaMetrics) getMetricName(agentInfo AgentInfo, baseName, connectionType, attemptType string) string { - return 
fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q,os=%q}`, - baseName, + labels := fmt.Sprintf(`deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q,os=%q`, agentInfo.DeploymentType.String(), - connectionType, + connTypeStr, attemptType, agentInfo.Version, agentInfo.OS, ) + + now := time.Now() + + m.mu.Lock() + defer m.mu.Unlock() + + m.samples = append(m.samples, + metricSample{ + name: fmt.Sprintf("netbird_peer_connection_stage_signaling_received_to_connection_seconds{%s}", labels), + value: signalingReceivedToConnection, + timestamp: now, + }, + metricSample{ + name: fmt.Sprintf("netbird_peer_connection_stage_connection_to_wg_handshake_seconds{%s}", labels), + value: connectionToWgHandshake, + timestamp: now, + }, + metricSample{ + name: fmt.Sprintf("netbird_peer_connection_total_seconds{%s}", labels), + value: totalDuration, + timestamp: now, + }, + metricSample{ + name: fmt.Sprintf("netbird_peer_connection_count{%s}", labels), + value: 1, + timestamp: now, + }, + ) + + log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→wg_handshake: %.3fs, total: %.3fs", + agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToWgHandshake, totalDuration) } -// RecordSyncDuration records the duration of sync message processing -func (m *victoriaMetrics) RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) { - metricName := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q,os=%q}`, +func (m *victoriaMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration) { + name := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q,os=%q}`, agentInfo.DeploymentType.String(), agentInfo.Version, agentInfo.OS, ) - m.set.GetOrCreateHistogram(metricName).Update(duration.Seconds()) + m.mu.Lock() + defer m.mu.Unlock() + + m.samples = append(m.samples, 
metricSample{ + name: name, + value: duration.Seconds(), + timestamp: time.Now(), + }) } -// Export writes metrics in Prometheus text format with HELP comments +// Export writes pending samples in Prometheus text format with explicit timestamps. +// Format: metric_name{labels} value timestamp_ms func (m *victoriaMetrics) Export(w io.Writer) error { - if m.set == nil { - return fmt.Errorf("metrics set not initialized") + m.mu.Lock() + samples := make([]metricSample, len(m.samples)) + copy(samples, m.samples) + m.mu.Unlock() + + for _, s := range samples { + if _, err := fmt.Fprintf(w, "%s %g %d\n", s.name, s.value, s.timestamp.UnixMilli()); err != nil { + return err + } } - - // Write metrics in Prometheus format - m.set.WritePrometheus(w) return nil } + +// Reset clears pending samples after a successful push +func (m *victoriaMetrics) Reset() { + m.mu.Lock() + defer m.mu.Unlock() + m.samples = m.samples[:0] +} From 5a018c10d502d3fca2832d29f5dba16efb60eca9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 5 Mar 2026 18:36:02 +0100 Subject: [PATCH 20/52] [client] Add InfluxDB metrics backend alongside VictoriaMetrics Add influxdb.go with timestamped line protocol export for sparse one-shot events. Restore victoria.go to use proper Prometheus histograms. Update Grafana dashboards, add InfluxDB datasource, and update docs. 
Co-Authored-By: Claude Opus 4.6 --- client/internal/metrics/docs/README.md | 171 ++++++++---------- .../metrics/docs/docker-compose.victoria.yml | 23 +++ .../json/netbird-connection-metrics.json | 12 +- .../json/netbird-influxdb-metrics.json | 133 ++++++++++++++ .../provisioning/datasources/influxdb.yml | 14 ++ client/internal/metrics/env.go | 10 +- client/internal/metrics/influxdb.go | 145 +++++++++++++++ client/internal/metrics/push.go | 5 +- client/internal/metrics/victoria.go | 111 +++++------- 9 files changed, 450 insertions(+), 174 deletions(-) create mode 100644 client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json create mode 100644 client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml create mode 100644 client/internal/metrics/influxdb.go diff --git a/client/internal/metrics/docs/README.md b/client/internal/metrics/docs/README.md index 91866432580..865b025dba6 100644 --- a/client/internal/metrics/docs/README.md +++ b/client/internal/metrics/docs/README.md @@ -4,10 +4,17 @@ Internal documentation for the NetBird client metrics system. ## Overview -Client metrics track connection performance and sync durations. Metrics are: -- Collected in-memory using VictoriaMetrics histograms -- Pushed periodically to a VictoriaMetrics server -- Disabled by default (opt-in via environment variable) +Client metrics track connection performance and sync durations. Two backend implementations are available: + +- **InfluxDB** (`influxdb.go`): Timestamped samples in InfluxDB line protocol. Best for sparse one-shot events (connections, syncs). Each event is pushed once then cleared. +- **VictoriaMetrics** (`victoria.go`): Prometheus-style cumulative histograms. Better for continuous/high-frequency metrics. 
+ +Select the implementation in `metrics_default.go`: +- `newInfluxDBMetrics()` — InfluxDB line protocol +- `newVictoriaMetrics()` — Prometheus format + +Metrics are: +- Disabled by default (opt-in via `NB_METRICS_ENABLED=true`) - Managed at daemon layer (survives engine restarts) ## Architecture @@ -25,57 +32,36 @@ Engine Layer (engine.go) └─ Records metrics via ClientMetrics methods ``` -### Data Flow - -``` -NetBird Client - ├─ Records metrics in memory (histograms) - ├─ Push to VictoriaMetrics via HTTP POST - └─ Metrics endpoint: /api/v1/import/prometheus - │ - ▼ -VictoriaMetrics (port 8428) - ├─ Stores time-series data - ├─ 12 month retention - └─ Prometheus-compatible query API - │ - ▼ -Grafana (port 3000) - └─ Pre-configured dashboard -``` - ## Metrics Collected ### Connection Stage Timing -1. `netbird_peer_connection_stage_signaling_received_to_connection` -2. `netbird_peer_connection_stage_connection_to_wg_handshake` -3. `netbird_peer_connection_total_creation_to_handshake` - -**Stage descriptions:** - -| Metric suffix | Timestamps | Description | -|--------------------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| `signaling_received_to_connection` | `SignalingReceived → ConnectionReady` | ICE/relay negotiation time after the first signal is received from the remote peer. Excludes the wait for the remote peer to come online. | -| `connection_to_wg_handshake` | `ConnectionReady → WgHandshakeSuccess` | WireGuard cryptographic handshake latency once the transport layer is ready. | -| `total_creation_to_handshake` | `SignalingReceived → WgHandshakeSuccess`| End-to-end connection time anchored at the first received signal. Excludes offline-peer wait time. 
| +Measurement: `netbird_peer_connection` -**Note:** `SignalingReceived` is set when the first offer or answer arrives from the remote peer (in both initial and reconnection paths). It is the anchor for all downstream stage durations, ensuring metrics reflect actual negotiation performance rather than how long the remote peer was unreachable. +| Field | Timestamps | Description | +|-------|-----------|-------------| +| `signaling_to_connection_seconds` | `SignalingReceived → ConnectionReady` | ICE/relay negotiation time after the first signal is received from the remote peer | +| `connection_to_wg_handshake_seconds` | `ConnectionReady → WgHandshakeSuccess` | WireGuard cryptographic handshake latency once the transport layer is ready | +| `total_seconds` | `SignalingReceived → WgHandshakeSuccess` | End-to-end connection time anchored at the first received signal | -Labels: +Tags: - `deployment_type`: "cloud" | "selfhosted" | "unknown" - `connection_type`: "ice" | "relay" - `attempt_type`: "initial" | "reconnection" - `version`: NetBird version string - `os`: Operating system (linux, darwin, windows, android, ios, etc.) +**Note:** `SignalingReceived` is set when the first offer or answer arrives from the remote peer (in both initial and reconnection paths). It excludes the potentially unbounded wait for the remote peer to come online. + ### Sync Duration -Tracks time to process sync messages from management server: +Measurement: `netbird_sync` -1. `netbird_sync_duration_seconds` +| Field | Description | +|-------|-------------| +| `duration_seconds` | Time to process a sync message from management server | -Labels: +Tags: - `deployment_type`: "cloud" | "selfhosted" | "unknown" - `version`: NetBird version string - `os`: Operating system (linux, darwin, windows, android, ios, etc.) 
@@ -84,11 +70,20 @@ Labels: ### Environment Variables -| Variable | Default | Description | -|----------|---------|-----------------------------------------| -| `NB_METRICS_ENABLED` | `false` | Enable metrics push | -| `NB_METRICS_SERVER_URL` | `https://api.netbird.io:8428/api/v1/import/prometheus` | VictoriaMetrics endpoint | -| `NB_METRICS_INTERVAL` | | Push interval (e.g., "1m", "30m", "4h") | +| Variable | Default | Description | +|----------|---------|-------------| +| `NB_METRICS_ENABLED` | `false` | Enable metrics push | +| `NB_METRICS_SERVER_URL` | `https://api.netbird.io:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns` | Metrics endpoint URL | +| `NB_METRICS_INTERVAL` | | Push interval (e.g., "1m", "30m", "4h"). When set, bypasses remote config. | +| `NB_METRICS_TOKEN` | | Optional auth token for the metrics server | +| `NB_METRICS_CONFIG_URL` | `https://api.netbird.io/client-metrics-config.json` | Remote push config URL | + +### Backend-specific URLs + +| Backend | URL | +|---------|-----| +| **InfluxDB** | `http://<host>:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns` | +| **VictoriaMetrics** | `http://<host>:8428/api/v1/import/prometheus` | ### Configuration Precedence @@ -99,91 +94,75 @@ For URL and Interval, the precedence is: ## Push Behavior -1. `StartPush()` spawns background goroutine with ticker +1. `StartPush()` spawns background goroutine with timer 2. First push happens immediately on startup 3. Periodically: `push()` → `Export()` → HTTP POST 4. On failure: log error, continue (non-blocking) -5. On success: log debug message +5. On success: `Reset()` clears pushed samples, log debug message 6. 
`StopPush()` cancels context and waits for goroutine -**Important:** -- Metrics **accumulate** in memory (cumulative histograms) -- Each push sends all accumulated metrics; VictoriaMetrics computes deltas -- Use `rate(sum)/rate(count)` for averages per time window -- Use `increase(count)` for event counts per time window -- Metrics only reset on process restart +**InfluxDB mode:** Samples are collected with exact timestamps, pushed once, then cleared. No data is resent. + +**VictoriaMetrics mode:** Cumulative histograms accumulate in memory. After successful push, metrics are unregistered. Use `rate(sum)/rate(count)` for averages. ## Local Development Setup -### 1. Start VictoriaMetrics +### 1. Start Services ```bash # From this directory -docker-compose -f docker-compose.victoria.yml up -d - -# View logs -docker-compose -f docker-compose.victoria.yml logs -f +docker compose -f docker-compose.victoria.yml up -d ``` **Access:** -- VictoriaMetrics UI: http://localhost:8428 - Grafana: http://localhost:3001 (admin/admin) +- InfluxDB: http://localhost:8086 +- VictoriaMetrics: http://localhost:8428 -### 2. Configure Client +### 2. Configure Client (InfluxDB) ```bash export NB_METRICS_ENABLED=true -export NB_METRICS_SERVER_URL=http://localhost:8428/api/v1/import/prometheus +export NB_METRICS_SERVER_URL='http://localhost:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns' +export NB_METRICS_TOKEN=netbird-metrics-token export NB_METRICS_INTERVAL=1m - -# Run client -cd ../../../.. -go run main.go up ``` -### 3. Verify Metrics +Make sure `metrics_default.go` uses `newInfluxDBMetrics()`. -```bash -# Watch client logs -go run main.go up 2>&1 | grep -i metric - -# List all available metric names -curl http://localhost:8428/api/v1/label/__name__/values +### 3. 
Configure Client (VictoriaMetrics) -# Query specific metric -curl 'http://localhost:8428/api/v1/query?query=netbird_peer_connection_total_creation_to_handshake_count' +```bash +export NB_METRICS_ENABLED=true +export NB_METRICS_SERVER_URL=http://localhost:8428/api/v1/import/prometheus +export NB_METRICS_INTERVAL=1m ``` -### 4. View in Grafana +Make sure `metrics_default.go` uses `newVictoriaMetrics()`. -Open http://localhost:3001/d/netbird-connection-metrics +### 4. Run Client -Dashboard JSON location: -``` -grafana/provisioning/dashboards/json/netbird-connection-metrics.json +```bash +cd ../../../.. +go run ./client/ up ``` -Export modified dashboards from Grafana UI and replace this file. - -## Querying Metrics - -### VictoriaMetrics UI - -Open http://localhost:8428/vmui +### 5. View in Grafana -```promql -# P95 connection time -histogram_quantile(0.95, netbird_peer_connection_total_creation_to_handshake) +- **InfluxDB dashboard:** http://localhost:3001/d/netbird-influxdb-metrics +- **VictoriaMetrics dashboard:** http://localhost:3001/d/netbird-connection-metrics -# Connection rate -rate(netbird_peer_connection_total_creation_to_handshake_count[5m]) +### 6. 
Verify Data -# Average sync duration -rate(netbird_sync_duration_seconds_sum[5m]) / rate(netbird_sync_duration_seconds_count[5m]) -``` +```bash +# InfluxDB - query data +curl -H "Authorization: Token netbird-metrics-token" \ + 'http://localhost:8086/api/v2/query?org=netbird' \ + --data-urlencode 'q=from(bucket:"metrics") |> range(start: -1h)' -### API Queries +# VictoriaMetrics - list metrics +curl http://localhost:8428/api/v1/label/__name__/values -```bash -curl 'http://localhost:8428/api/v1/query?query=netbird_peer_connection_total_creation_to_handshake_count' +# VictoriaMetrics - delete all data +curl -s http://localhost:8428/api/v1/admin/tsdb/delete_series --data-urlencode 'match[]={__name__=~".+"}' ``` \ No newline at end of file diff --git a/client/internal/metrics/docs/docker-compose.victoria.yml b/client/internal/metrics/docs/docker-compose.victoria.yml index 429b89a202a..79ae27206d7 100644 --- a/client/internal/metrics/docs/docker-compose.victoria.yml +++ b/client/internal/metrics/docs/docker-compose.victoria.yml @@ -17,6 +17,26 @@ services: networks: - metrics + influxdb: + container_name: influxdb + image: influxdb:2 + ports: + - "8086:8086" + volumes: + - influxdb-data:/var/lib/influxdb2 + environment: + - DOCKER_INFLUXDB_INIT_MODE=setup + - DOCKER_INFLUXDB_INIT_USERNAME=admin + - DOCKER_INFLUXDB_INIT_PASSWORD=adminadmin + - DOCKER_INFLUXDB_INIT_ORG=netbird + - DOCKER_INFLUXDB_INIT_BUCKET=metrics + - DOCKER_INFLUXDB_INIT_RETENTION=365d + - DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=netbird-metrics-token + - INFLUXD_HTTP_AUTH_ENABLED=false + restart: unless-stopped + networks: + - metrics + grafana: container_name: grafana image: grafana/grafana:latest @@ -26,17 +46,20 @@ services: - GF_SECURITY_ADMIN_USER=admin - GF_SECURITY_ADMIN_PASSWORD=admin - GF_USERS_ALLOW_SIGN_UP=false + - GF_INSTALL_PLUGINS= volumes: - grafana-data:/var/lib/grafana - ./grafana/provisioning:/etc/grafana/provisioning depends_on: - victoriametrics + - influxdb restart: unless-stopped 
networks: - metrics volumes: victoria-metrics-data: + influxdb-data: grafana-data: networks: diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json index b32a9c4a790..be2a28021aa 100644 --- a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json @@ -16,7 +16,7 @@ }, "targets": [ { - "expr": "netbird_sync_duration_seconds * 1000", + "expr": "netbird_sync_duration_seconds_sum / netbird_sync_duration_seconds_count * 1000", "legendFormat": "{{deployment_type}}/{{os}}", "refId": "A" } @@ -44,12 +44,12 @@ }, "targets": [ { - "expr": "netbird_peer_connection_stage_signaling_received_to_connection_seconds * 1000", + "expr": "netbird_peer_connection_stage_signaling_received_to_connection_sum / netbird_peer_connection_stage_signaling_received_to_connection_count * 1000", "legendFormat": "1. SignalingReceived→Connection ({{deployment_type}}/{{connection_type}})", "refId": "A" }, { - "expr": "netbird_peer_connection_stage_connection_to_wg_handshake_seconds * 1000", + "expr": "netbird_peer_connection_stage_connection_to_wg_handshake_sum / netbird_peer_connection_stage_connection_to_wg_handshake_count * 1000", "legendFormat": "2. 
Connection→WG Handshake ({{deployment_type}}/{{connection_type}})", "refId": "B" } @@ -76,7 +76,7 @@ }, "targets": [ { - "expr": "netbird_peer_connection_total_seconds * 1000", + "expr": "netbird_peer_connection_total_creation_to_wg_handshake_sum / netbird_peer_connection_total_creation_to_wg_handshake_count * 1000", "legendFormat": "{{deployment_type}}/{{connection_type}}", "refId": "A" } @@ -104,12 +104,12 @@ }, "targets": [ { - "expr": "count_over_time(netbird_peer_connection_count{connection_type=\"ice\"}[$__range])", + "expr": "sum(netbird_peer_connection_total_creation_to_wg_handshake_count{connection_type=\"ice\"})", "legendFormat": "ICE", "refId": "A" }, { - "expr": "count_over_time(netbird_peer_connection_count{connection_type=\"relay\"}[$__range])", + "expr": "sum(netbird_peer_connection_total_creation_to_wg_handshake_count{connection_type=\"relay\"})", "legendFormat": "Relay", "refId": "B" } diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json new file mode 100644 index 00000000000..6e5f83be5fc --- /dev/null +++ b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json @@ -0,0 +1,133 @@ +{ + "uid": "netbird-influxdb-metrics", + "title": "NetBird Client Metrics (InfluxDB)", + "tags": ["netbird", "connections", "influxdb"], + "timezone": "browser", + "panels": [ + { + "id": 1, + "title": "Sync Duration", + "type": "timeseries", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": 
{ + "unit": "ms", + "min": 0, + "custom": { + "drawStyle": "points", + "pointSize": 5 + } + } + } + }, + { + "id": 2, + "title": "Connection Stage Durations", + "type": "timeseries", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 6 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and (r._field == \"signaling_to_connection_seconds\" or r._field == \"connection_to_wg_handshake_seconds\"))\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "drawStyle": "points", + "pointSize": 5 + } + } + } + }, + { + "id": 3, + "title": "Total Connection Time", + "type": "timeseries", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 15 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0, + "custom": { + "drawStyle": "points", + "pointSize": 5 + } + } + } + }, + { + "id": 4, + "title": "ICE vs Relay", + "type": "piechart", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 23 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> group(columns: [\"connection_type\"])\n |> count()\n |> rename(columns: {_value: \"count\"})", + "refId": "A" + } + ], + "options": { + "reduceOptions": { + 
"calcs": ["lastNotNull"] + }, + "pieType": "donut", + "tooltip": { + "mode": "multi" + } + } + } + ], + "schemaVersion": 27, + "version": 1, + "refresh": "30s" +} \ No newline at end of file diff --git a/client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml b/client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml new file mode 100644 index 00000000000..952b080c778 --- /dev/null +++ b/client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml @@ -0,0 +1,14 @@ +apiVersion: 1 + +datasources: + - name: InfluxDB + type: influxdb + access: proxy + url: http://influxdb:8086 + editable: true + jsonData: + version: Flux + organization: netbird + defaultBucket: metrics + secureJsonData: + token: netbird-metrics-token \ No newline at end of file diff --git a/client/internal/metrics/env.go b/client/internal/metrics/env.go index 66e1fbad8c0..22b181e7a31 100644 --- a/client/internal/metrics/env.go +++ b/client/internal/metrics/env.go @@ -22,6 +22,9 @@ const ( // Format: duration string like "1h", "30m", "4h" EnvMetricsInterval = "NB_METRICS_INTERVAL" + // EnvMetricsToken is the optional authentication token for the metrics server + EnvMetricsToken = "NB_METRICS_TOKEN" + // EnvMetricsConfigURL is the environment variable to override the metrics push config URL EnvMetricsConfigURL = "NB_METRICS_CONFIG_URL" @@ -34,7 +37,7 @@ var ( func init() { var err error - defaultMetricsURL, err = url.Parse("https://api.netbird.io:8428/api/v1/import/prometheus") + defaultMetricsURL, err = url.Parse("https://api.netbird.io:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns") if err != nil { log.Fatalf("failed to parse default metrics URL: %v", err) } @@ -72,6 +75,11 @@ func getMetricsConfigURL() string { return defaultMetricsConfigURL } +// getMetricsToken returns the optional auth token for the metrics server +func getMetricsToken() string { + return os.Getenv(EnvMetricsToken) +} + // getMetricsInterval returns the metrics push 
interval from environment variable // If not set or invalid, returns 0 (which will use the default in NewPush) func getMetricsInterval() time.Duration { diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go new file mode 100644 index 00000000000..ce9941e6693 --- /dev/null +++ b/client/internal/metrics/influxdb.go @@ -0,0 +1,145 @@ +package metrics + +import ( + "context" + "fmt" + "io" + "sync" + "time" + + log "github.com/sirupsen/logrus" +) + +// influxSample is a single InfluxDB line protocol entry. +type influxSample struct { + measurement string + tags string + fields map[string]float64 + timestamp time.Time +} + +// influxDBMetrics collects metric events as timestamped samples. +// Each event is recorded with its exact timestamp, pushed once, then cleared. +type influxDBMetrics struct { + mu sync.Mutex + samples []influxSample +} + +func newInfluxDBMetrics() metricsImplementation { + return &influxDBMetrics{} +} + +func (m *influxDBMetrics) RecordConnectionStages( + _ context.Context, + agentInfo AgentInfo, + connectionType ConnectionType, + isReconnection bool, + timestamps ConnectionStageTimestamps, +) { + var signalingReceivedToConnection, connectionToWgHandshake, totalDuration float64 + + if !timestamps.SignalingReceived.IsZero() && !timestamps.ConnectionReady.IsZero() { + signalingReceivedToConnection = timestamps.ConnectionReady.Sub(timestamps.SignalingReceived).Seconds() + } + + if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + connectionToWgHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() + } + + if !timestamps.SignalingReceived.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { + totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.SignalingReceived).Seconds() + } + + attemptType := "initial" + if isReconnection { + attemptType = "reconnection" + } + + connTypeStr := connectionType.String() + tags := 
fmt.Sprintf("deployment_type=%s,connection_type=%s,attempt_type=%s,version=%s,os=%s", + agentInfo.DeploymentType.String(), + connTypeStr, + attemptType, + agentInfo.Version, + agentInfo.OS, + ) + + now := time.Now() + + m.mu.Lock() + defer m.mu.Unlock() + + m.samples = append(m.samples, influxSample{ + measurement: "netbird_peer_connection", + tags: tags, + fields: map[string]float64{ + "signaling_to_connection_seconds": signalingReceivedToConnection, + "connection_to_wg_handshake_seconds": connectionToWgHandshake, + "total_seconds": totalDuration, + }, + timestamp: now, + }) + + log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→wg_handshake: %.3fs, total: %.3fs", + agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToWgHandshake, totalDuration) +} + +func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration) { + tags := fmt.Sprintf("deployment_type=%s,version=%s,os=%s", + agentInfo.DeploymentType.String(), + agentInfo.Version, + agentInfo.OS, + ) + + m.mu.Lock() + defer m.mu.Unlock() + + m.samples = append(m.samples, influxSample{ + measurement: "netbird_sync", + tags: tags, + fields: map[string]float64{ + "duration_seconds": duration.Seconds(), + }, + timestamp: time.Now(), + }) +} + +// Export writes pending samples in InfluxDB line protocol format. 
+// Format: measurement,tag=val,tag=val field=val,field=val timestamp_ns +func (m *influxDBMetrics) Export(w io.Writer) error { + m.mu.Lock() + samples := make([]influxSample, len(m.samples)) + copy(samples, m.samples) + m.mu.Unlock() + + for _, s := range samples { + if _, err := fmt.Fprintf(w, "%s,%s ", s.measurement, s.tags); err != nil { + return err + } + + first := true + for k, v := range s.fields { + if !first { + if _, err := fmt.Fprint(w, ","); err != nil { + return err + } + } + if _, err := fmt.Fprintf(w, "%s=%g", k, v); err != nil { + return err + } + first = false + } + + if _, err := fmt.Fprintf(w, " %d\n", s.timestamp.UnixNano()); err != nil { + return err + } + } + return nil +} + +// Reset clears pending samples after a successful push +func (m *influxDBMetrics) Reset() { + m.mu.Lock() + defer m.mu.Unlock() + m.samples = m.samples[:0] +} \ No newline at end of file diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index b21fd21621b..6960354ad12 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -173,7 +173,10 @@ func (p *Push) push(ctx context.Context) error { if err != nil { return fmt.Errorf("create request: %w", err) } - req.Header.Set("Content-Type", "text/plain") + req.Header.Set("Content-Type", "text/plain; charset=utf-8") + if token := getMetricsToken(); token != "" { + req.Header.Set("Authorization", "Token "+token) + } // Send request resp, err := p.client.Do(req) diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 70cad014294..5ea73e38134 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -4,29 +4,25 @@ import ( "context" "fmt" "io" - "sync" "time" + "github.com/VictoriaMetrics/metrics" log "github.com/sirupsen/logrus" ) -type metricSample struct { - name string - value float64 - timestamp time.Time -} - -// victoriaMetrics collects metric events as timestamped samples. 
-// Each event is recorded with its exact timestamp, pushed once, then cleared. +// victoriaMetrics is the VictoriaMetrics implementation of ClientMetrics type victoriaMetrics struct { - mu sync.Mutex - samples []metricSample + // Metrics set for managing all metrics + set *metrics.Set } func newVictoriaMetrics() metricsImplementation { - return &victoriaMetrics{} + return &victoriaMetrics{ + set: metrics.NewSet(), + } } +// RecordConnectionStages records the duration of each connection stage from timestamps func (m *victoriaMetrics) RecordConnectionStages( _ context.Context, agentInfo AgentInfo, @@ -54,82 +50,57 @@ func (m *victoriaMetrics) RecordConnectionStages( } connTypeStr := connectionType.String() - labels := fmt.Sprintf(`deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q,os=%q`, + + m.set.GetOrCreateHistogram( + m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_received_to_connection", connTypeStr, attemptType), + ).Update(signalingReceivedToConnection) + + m.set.GetOrCreateHistogram( + m.getMetricName(agentInfo, "netbird_peer_connection_stage_connection_to_wg_handshake", connTypeStr, attemptType), + ).Update(connectionToWgHandshake) + + m.set.GetOrCreateHistogram( + m.getMetricName(agentInfo, "netbird_peer_connection_total_creation_to_wg_handshake", connTypeStr, attemptType), + ).Update(totalDuration) + + log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→wg_handshake: %.3fs, total: %.3fs", + agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToWgHandshake, totalDuration) +} + +// getMetricName constructs a metric name with labels +func (m *victoriaMetrics) getMetricName(agentInfo AgentInfo, baseName, connectionType, attemptType string) string { + return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q,os=%q}`, + baseName, agentInfo.DeploymentType.String(), - connTypeStr, + connectionType, 
attemptType, agentInfo.Version, agentInfo.OS, ) - - now := time.Now() - - m.mu.Lock() - defer m.mu.Unlock() - - m.samples = append(m.samples, - metricSample{ - name: fmt.Sprintf("netbird_peer_connection_stage_signaling_received_to_connection_seconds{%s}", labels), - value: signalingReceivedToConnection, - timestamp: now, - }, - metricSample{ - name: fmt.Sprintf("netbird_peer_connection_stage_connection_to_wg_handshake_seconds{%s}", labels), - value: connectionToWgHandshake, - timestamp: now, - }, - metricSample{ - name: fmt.Sprintf("netbird_peer_connection_total_seconds{%s}", labels), - value: totalDuration, - timestamp: now, - }, - metricSample{ - name: fmt.Sprintf("netbird_peer_connection_count{%s}", labels), - value: 1, - timestamp: now, - }, - ) - - log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→wg_handshake: %.3fs, total: %.3fs", - agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToWgHandshake, totalDuration) } +// RecordSyncDuration records the duration of sync message processing func (m *victoriaMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration) { - name := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q,os=%q}`, + metricName := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q,os=%q}`, agentInfo.DeploymentType.String(), agentInfo.Version, agentInfo.OS, ) - m.mu.Lock() - defer m.mu.Unlock() - - m.samples = append(m.samples, metricSample{ - name: name, - value: duration.Seconds(), - timestamp: time.Now(), - }) + m.set.GetOrCreateHistogram(metricName).Update(duration.Seconds()) } -// Export writes pending samples in Prometheus text format with explicit timestamps. 
-// Format: metric_name{labels} value timestamp_ms +// Export writes metrics in Prometheus text format func (m *victoriaMetrics) Export(w io.Writer) error { - m.mu.Lock() - samples := make([]metricSample, len(m.samples)) - copy(samples, m.samples) - m.mu.Unlock() - - for _, s := range samples { - if _, err := fmt.Fprintf(w, "%s %g %d\n", s.name, s.value, s.timestamp.UnixMilli()); err != nil { - return err - } + if m.set == nil { + return fmt.Errorf("metrics set not initialized") } + + m.set.WritePrometheus(w) return nil } -// Reset clears pending samples after a successful push +// Reset clears all collected metrics func (m *victoriaMetrics) Reset() { - m.mu.Lock() - defer m.mu.Unlock() - m.samples = m.samples[:0] -} + m.set.UnregisterAllMetrics() +} \ No newline at end of file From 4aeab6966a9cd7d97fe65379663a91d93fddb7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 6 Mar 2026 11:02:26 +0100 Subject: [PATCH 21/52] [client] Fix metrics issues and update dev docker setup - Fix StopPush not clearing push state, preventing restart - Fix race condition reading currentConnPriority without lock in recordConnectionMetrics - Fix stale comment referencing old metrics server URL - Update docker-compose for InfluxDB: add scoped tokens, .env config, init scripts - Rename docker-compose.victoria.yml to docker-compose.yml --- client/internal/connect.go | 2 +- .../json/netbird-influxdb-metrics.json | 133 --------- client/internal/metrics/env.go | 94 +++--- client/internal/metrics/influxdb.go | 38 ++- client/internal/metrics/infra/.env | 10 + client/internal/metrics/infra/.gitignore | 1 + .../metrics/{docs => infra}/README.md | 113 +++++--- .../docker-compose.yml} | 30 +- .../provisioning/dashboards/dashboard.yml | 0 .../json/netbird-connection-metrics.json | 0 .../json/netbird-influxdb-metrics.json | 219 ++++++++++++++ .../provisioning/datasources/influxdb.yml | 2 +- .../datasources/victoriametrics.yml | 0 .../infra/influxdb/scripts/create-tokens.sh | 
25 ++ .../internal/metrics/infra/ingest/Dockerfile | 8 + client/internal/metrics/infra/ingest/go.mod | 3 + client/internal/metrics/infra/ingest/main.go | 225 +++++++++++++++ client/internal/metrics/metrics.go | 23 +- client/internal/metrics/metrics_default.go | 2 +- client/internal/metrics/push.go | 190 +++++++----- client/internal/metrics/push_test.go | 272 ++++++++++++------ .../internal/metrics/remoteconfig/manager.go | 19 +- .../metrics/remoteconfig/manager_test.go | 22 +- client/internal/metrics/victoria.go | 2 +- client/internal/peer/conn.go | 6 +- client/internal/peer/metrics_saver.go | 5 - 26 files changed, 1016 insertions(+), 428 deletions(-) delete mode 100644 client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json create mode 100644 client/internal/metrics/infra/.env create mode 100644 client/internal/metrics/infra/.gitignore rename client/internal/metrics/{docs => infra}/README.md (53%) rename client/internal/metrics/{docs/docker-compose.victoria.yml => infra/docker-compose.yml} (61%) rename client/internal/metrics/{docs => infra}/grafana/provisioning/dashboards/dashboard.yml (100%) rename client/internal/metrics/{docs => infra}/grafana/provisioning/dashboards/json/netbird-connection-metrics.json (100%) create mode 100644 client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json rename client/internal/metrics/{docs => infra}/grafana/provisioning/datasources/influxdb.yml (87%) rename client/internal/metrics/{docs => infra}/grafana/provisioning/datasources/victoriametrics.yml (100%) create mode 100755 client/internal/metrics/infra/influxdb/scripts/create-tokens.sh create mode 100644 client/internal/metrics/infra/ingest/Dockerfile create mode 100644 client/internal/metrics/infra/ingest/go.mod create mode 100644 client/internal/metrics/infra/ingest/main.go diff --git a/client/internal/connect.go b/client/internal/connect.go index 1548021e6ae..107cce629bf 100644 --- 
a/client/internal/connect.go +++ b/client/internal/connect.go @@ -157,7 +157,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan // Start metrics push if enabled (uses daemon context, persists across engine restarts) if metrics.IsMetricsPushEnabled() { - c.clientMetrics.StartPush(c.ctx, metrics.DefaultPushConfig) + c.clientMetrics.StartPush(c.ctx, metrics.PushConfigFromEnv()) } } diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json b/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json deleted file mode 100644 index 6e5f83be5fc..00000000000 --- a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json +++ /dev/null @@ -1,133 +0,0 @@ -{ - "uid": "netbird-influxdb-metrics", - "title": "NetBird Client Metrics (InfluxDB)", - "tags": ["netbird", "connections", "influxdb"], - "timezone": "browser", - "panels": [ - { - "id": 1, - "title": "Sync Duration", - "type": "timeseries", - "datasource": { - "type": "influxdb", - "uid": "" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 0 - }, - "targets": [ - { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0, - "custom": { - "drawStyle": "points", - "pointSize": 5 - } - } - } - }, - { - "id": 2, - "title": "Connection Stage Durations", - "type": "timeseries", - "datasource": { - "type": "influxdb", - "uid": "" - }, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 6 - }, - "targets": [ - { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and 
(r._field == \"signaling_to_connection_seconds\" or r._field == \"connection_to_wg_handshake_seconds\"))\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "drawStyle": "points", - "pointSize": 5 - } - } - } - }, - { - "id": 3, - "title": "Total Connection Time", - "type": "timeseries", - "datasource": { - "type": "influxdb", - "uid": "" - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 15 - }, - "targets": [ - { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0, - "custom": { - "drawStyle": "points", - "pointSize": 5 - } - } - } - }, - { - "id": 4, - "title": "ICE vs Relay", - "type": "piechart", - "datasource": { - "type": "influxdb", - "uid": "" - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 23 - }, - "targets": [ - { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> group(columns: [\"connection_type\"])\n |> count()\n |> rename(columns: {_value: \"count\"})", - "refId": "A" - } - ], - "options": { - "reduceOptions": { - "calcs": ["lastNotNull"] - }, - "pieType": "donut", - "tooltip": { - "mode": "multi" - } - } - } - ], - "schemaVersion": 27, - "version": 1, - "refresh": "30s" -} \ No newline at end of file diff --git a/client/internal/metrics/env.go b/client/internal/metrics/env.go index 22b181e7a31..e829ee34e78 100644 --- a/client/internal/metrics/env.go +++ b/client/internal/metrics/env.go @@ -13,7 +13,14 @@ const ( // EnvMetricsEnabled is the environment variable to enable metrics push (default: 
disabled) EnvMetricsEnabled = "NB_METRICS_ENABLED" - // EnvMetricsServerURL is the environment variable to override the metrics server URL + // EnvMetricsForceSending if set to true, skips remote configuration fetch and forces metric sending + EnvMetricsForceSending = "NB_METRICS_FORCE_SENDING" + + // EnvMetricsConfigURL is the environment variable to override the metrics push config ServerAddress + EnvMetricsConfigURL = "NB_METRICS_CONFIG_URL" + + // EnvMetricsServerURL is the environment variable to override the metrics server address. + // When set, this takes precedence over the server_url from remote push config. EnvMetricsServerURL = "NB_METRICS_SERVER_URL" // EnvMetricsInterval overrides the push interval from the remote config. @@ -22,27 +29,9 @@ const ( // Format: duration string like "1h", "30m", "4h" EnvMetricsInterval = "NB_METRICS_INTERVAL" - // EnvMetricsToken is the optional authentication token for the metrics server - EnvMetricsToken = "NB_METRICS_TOKEN" - - // EnvMetricsConfigURL is the environment variable to override the metrics push config URL - EnvMetricsConfigURL = "NB_METRICS_CONFIG_URL" - - defaultMetricsConfigURL = "https://api.netbird.io/client-metrics-config.json" + defaultMetricsConfigURL = "https://ingest.stage.npeer.io/config" ) -var ( - defaultMetricsURL *url.URL -) - -func init() { - var err error - defaultMetricsURL, err = url.Parse("https://api.netbird.io:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns") - if err != nil { - log.Fatalf("failed to parse default metrics URL: %v", err) - } -} - // IsMetricsPushEnabled returns true if metrics push is enabled via NB_METRICS_ENABLED env var // Disabled by default. 
Set NB_METRICS_ENABLED=true to enable func IsMetricsPushEnabled() bool { @@ -50,21 +39,28 @@ func IsMetricsPushEnabled() bool { return enabled } -// getMetricsServerURL returns the metrics server URL (never nil) -// First checks NB_METRICS_SERVER_URL environment variable and validates it -// If not set or invalid, returns the default NetBird metrics server (api.netbird.io:8428) -func getMetricsServerURL() url.URL { - // Check environment variable first - if envURLStr := os.Getenv(EnvMetricsServerURL); envURLStr != "" { - envURL, err := url.Parse(envURLStr) - if err != nil { - log.Warnf("invalid metrics server URL from env %q: %v, using default", envURLStr, err) - return *defaultMetricsURL - } - return *envURL +// getMetricsInterval returns the metrics push interval from NB_METRICS_INTERVAL env var. +// Returns 0 if not set or invalid. +func getMetricsInterval() time.Duration { + intervalStr := os.Getenv(EnvMetricsInterval) + if intervalStr == "" { + return 0 + } + interval, err := time.ParseDuration(intervalStr) + if err != nil { + log.Warnf("invalid metrics interval from env %q: %v", intervalStr, err) + return 0 + } + if interval <= 0 { + log.Warnf("invalid metrics interval from env %q: must be positive", intervalStr) + return 0 } + return interval +} - return *defaultMetricsURL +func isForceSending() bool { + force, _ := strconv.ParseBool(os.Getenv(EnvMetricsForceSending)) + return force } // getMetricsConfigURL returns the URL to fetch push configuration from @@ -75,25 +71,17 @@ func getMetricsConfigURL() string { return defaultMetricsConfigURL } -// getMetricsToken returns the optional auth token for the metrics server -func getMetricsToken() string { - return os.Getenv(EnvMetricsToken) -} - -// getMetricsInterval returns the metrics push interval from environment variable -// If not set or invalid, returns 0 (which will use the default in NewPush) -func getMetricsInterval() time.Duration { - if intervalStr := os.Getenv(EnvMetricsInterval); intervalStr != "" { 
- interval, err := time.ParseDuration(intervalStr) - if err != nil { - log.Warnf("invalid metrics interval from env %q: %v, using default", intervalStr, err) - return 0 - } - if interval <= 0 { - log.Warnf("invalid metrics interval from env %q: must be positive, using default", intervalStr) - return 0 - } - return interval +// getMetricsServerURL returns the metrics server URL from NB_METRICS_SERVER_URL env var. +// Returns nil if not set or invalid. +func getMetricsServerURL() *url.URL { + envURL := os.Getenv(EnvMetricsServerURL) + if envURL == "" { + return nil + } + parsed, err := url.Parse(envURL) + if err != nil { + log.Warnf("invalid metrics server URL from env: %v", err) + return nil } - return 0 + return parsed } diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index ce9941e6693..74bfdca621e 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -10,6 +10,13 @@ import ( log "github.com/sirupsen/logrus" ) +const ( + maxSampleAge = 5 * 24 * time.Hour // drop samples older than 5 days + maxBufferSize = 5 * 1024 * 1024 // drop oldest samples when estimated size exceeds 5 MB + // estimatedSampleSize is a rough per-sample memory estimate (measurement + tags + fields + timestamp) + estimatedSampleSize = 256 +) + // influxSample is a single InfluxDB line protocol entry. 
type influxSample struct { measurement string @@ -28,7 +35,6 @@ type influxDBMetrics struct { func newInfluxDBMetrics() metricsImplementation { return &influxDBMetrics{} } - func (m *influxDBMetrics) RecordConnectionStages( _ context.Context, agentInfo AgentInfo, @@ -79,6 +85,7 @@ func (m *influxDBMetrics) RecordConnectionStages( }, timestamp: now, }) + m.trimLocked() log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→wg_handshake: %.3fs, total: %.3fs", agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToWgHandshake, totalDuration) @@ -102,6 +109,7 @@ func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentI }, timestamp: time.Now(), }) + m.trimLocked() } // Export writes pending samples in InfluxDB line protocol format. @@ -142,4 +150,30 @@ func (m *influxDBMetrics) Reset() { m.mu.Lock() defer m.mu.Unlock() m.samples = m.samples[:0] -} \ No newline at end of file +} + +// trimLocked removes samples that exceed age or size limits. +// Must be called with m.mu held. 
+func (m *influxDBMetrics) trimLocked() { + now := time.Now() + + // drop samples older than maxSampleAge + cutoff := 0 + for cutoff < len(m.samples) && now.Sub(m.samples[cutoff].timestamp) > maxSampleAge { + cutoff++ + } + if cutoff > 0 { + copy(m.samples, m.samples[cutoff:]) + m.samples = m.samples[:len(m.samples)-cutoff] + log.Warnf("influxdb metrics: dropped %d samples older than %s", cutoff, maxSampleAge) + } + + // drop oldest samples if estimated size exceeds maxBufferSize + maxSamples := maxBufferSize / estimatedSampleSize + if len(m.samples) > maxSamples { + drop := len(m.samples) - maxSamples + copy(m.samples, m.samples[drop:]) + m.samples = m.samples[:maxSamples] + log.Warnf("influxdb metrics: dropped %d oldest samples to stay under %d MB size limit", drop, maxBufferSize/(1024*1024)) + } +} diff --git a/client/internal/metrics/infra/.env b/client/internal/metrics/infra/.env new file mode 100644 index 00000000000..958d3ff8362 --- /dev/null +++ b/client/internal/metrics/infra/.env @@ -0,0 +1,10 @@ +# InfluxDB admin (server-side only, never exposed to clients) +INFLUXDB_ADMIN_PASSWORD=adminadmin +INFLUXDB_ADMIN_TOKEN=stage-admin-token + +# Remote config served by ingest at /config +# Set CONFIG_SERVER_URL to the ingest server's public address to enable +CONFIG_METRICS_SERVER_URL= +CONFIG_VERSION_SINCE=0.0.0 +CONFIG_VERSION_UNTIL=99.99.99 +CONFIG_PERIOD_MINUTES=5 diff --git a/client/internal/metrics/infra/.gitignore b/client/internal/metrics/infra/.gitignore new file mode 100644 index 00000000000..2eea525d885 --- /dev/null +++ b/client/internal/metrics/infra/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/client/internal/metrics/docs/README.md b/client/internal/metrics/infra/README.md similarity index 53% rename from client/internal/metrics/docs/README.md rename to client/internal/metrics/infra/README.md index 865b025dba6..87cb4db9bf4 100644 --- a/client/internal/metrics/docs/README.md +++ b/client/internal/metrics/infra/README.md @@ 
-32,6 +32,23 @@ Engine Layer (engine.go) └─ Records metrics via ClientMetrics methods ``` +### Ingest Server + +Clients do not talk to InfluxDB directly. An ingest server sits between clients and InfluxDB: + +``` +Client ──POST──▶ Ingest Server (:8087) ──▶ InfluxDB (internal) + │ + ├─ Validates line protocol + ├─ Whitelists measurements & fields + ├─ Rejects out-of-bound values + └─ Serves remote config at /config +``` + +- **No client-side auth required** — the ingest server holds the InfluxDB token server-side +- **InfluxDB is not exposed** — only accessible within the docker network +- Source: `ingest/main.go` + ## Metrics Collected ### Connection Stage Timing @@ -66,39 +83,52 @@ Tags: - `version`: NetBird version string - `os`: Operating system (linux, darwin, windows, android, ios, etc.) +## Buffer Limits + +The InfluxDB backend limits in-memory sample storage to prevent unbounded growth when pushes fail: +- **Max age:** Samples older than 5 days are dropped +- **Max size:** Estimated buffer size capped at 5 MB (~20k samples) + ## Configuration -### Environment Variables +### Client Environment Variables | Variable | Default | Description | |----------|---------|-------------| | `NB_METRICS_ENABLED` | `false` | Enable metrics push | -| `NB_METRICS_SERVER_URL` | `https://api.netbird.io:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns` | Metrics endpoint URL | -| `NB_METRICS_INTERVAL` | | Push interval (e.g., "1m", "30m", "4h"). When set, bypasses remote config. 
| -| `NB_METRICS_TOKEN` | | Optional auth token for the metrics server | +| `NB_METRICS_SERVER_URL` | *(from remote config)* | Ingest server URL (e.g., `https://ingest.npeer.io`) | +| `NB_METRICS_INTERVAL` | *(from remote config)* | Push interval (e.g., "1m", "30m", "4h") | +| `NB_METRICS_FORCE_SENDING` | `false` | Skip remote config, push unconditionally | | `NB_METRICS_CONFIG_URL` | `https://api.netbird.io/client-metrics-config.json` | Remote push config URL | -### Backend-specific URLs +### Ingest Server Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `INGEST_LISTEN_ADDR` | `:8087` | Listen address | +| `INFLUXDB_URL` | `http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns` | InfluxDB write endpoint | +| `INFLUXDB_TOKEN` | *(required)* | InfluxDB auth token (server-side only) | +| `CONFIG_METRICS_SERVER_URL` | *(empty — disables /config)* | `server_url` in the remote config JSON (the URL clients push metrics to) | +| `CONFIG_VERSION_SINCE` | `0.0.0` | Minimum client version to push metrics | +| `CONFIG_VERSION_UNTIL` | `99.99.99` | Maximum client version to push metrics | +| `CONFIG_PERIOD_MINUTES` | `5` | Push interval in minutes | -| Backend | URL | -|---------|-----| -| **InfluxDB** | `http://:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns` | -| **VictoriaMetrics** | `http://:8428/api/v1/import/prometheus` | +The ingest server serves a remote config JSON at `GET /config` when `CONFIG_METRICS_SERVER_URL` is set. Clients can use `NB_METRICS_CONFIG_URL=http:///config` to fetch it. ### Configuration Precedence For URL and Interval, the precedence is: -1. **Config parameter** - Explicitly passed to `StartPush()` -2. **Environment variable** - `NB_METRICS_SERVER_URL` / `NB_METRICS_INTERVAL` -3. **Default value** - From `metrics.DefaultPushConfig` +1. **Environment variable** - `NB_METRICS_SERVER_URL` / `NB_METRICS_INTERVAL` +2. 
**Remote config** - fetched from `NB_METRICS_CONFIG_URL` +3. **Default** - 5 minute interval, URL from remote config ## Push Behavior 1. `StartPush()` spawns background goroutine with timer 2. First push happens immediately on startup -3. Periodically: `push()` → `Export()` → HTTP POST +3. Periodically: `push()` → `Export()` → HTTP POST to ingest server 4. On failure: log error, continue (non-blocking) -5. On success: `Reset()` clears pushed samples, log debug message +5. On success: `Reset()` clears pushed samples 6. `StopPush()` cancels context and waits for goroutine **InfluxDB mode:** Samples are collected with exact timestamps, pushed once, then cleared. No data is resent. @@ -107,62 +137,57 @@ For URL and Interval, the precedence is: ## Local Development Setup -### 1. Start Services - -```bash -# From this directory -docker compose -f docker-compose.victoria.yml up -d -``` - -**Access:** -- Grafana: http://localhost:3001 (admin/admin) -- InfluxDB: http://localhost:8086 -- VictoriaMetrics: http://localhost:8428 - -### 2. Configure Client (InfluxDB) +### 1. Configure and Start Services ```bash -export NB_METRICS_ENABLED=true -export NB_METRICS_SERVER_URL='http://localhost:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns' -export NB_METRICS_TOKEN=netbird-metrics-token -export NB_METRICS_INTERVAL=1m +# From this directory (client/internal/metrics/infra) +cp .env.example .env +# Edit .env — set INFLUXDB_ADMIN_PASSWORD, INFLUXDB_ADMIN_TOKEN +docker compose up -d ``` -Make sure `metrics_default.go` uses `newInfluxDBMetrics()`. +This starts: +- **Ingest server** on http://localhost:8087 — accepts client metrics (no auth needed) +- **InfluxDB** — internal only, not exposed to host +- **Grafana** on http://localhost:3001 +- **VictoriaMetrics** on http://localhost:8428 -### 3. Configure Client (VictoriaMetrics) +### 2. 
Configure Client ```bash export NB_METRICS_ENABLED=true -export NB_METRICS_SERVER_URL=http://localhost:8428/api/v1/import/prometheus +export NB_METRICS_FORCE_SENDING=true +export NB_METRICS_SERVER_URL=http://localhost:8087 export NB_METRICS_INTERVAL=1m ``` -Make sure `metrics_default.go` uses `newVictoriaMetrics()`. - -### 4. Run Client +### 3. Run Client ```bash cd ../../../.. go run ./client/ up ``` -### 5. View in Grafana +### 4. View in Grafana - **InfluxDB dashboard:** http://localhost:3001/d/netbird-influxdb-metrics - **VictoriaMetrics dashboard:** http://localhost:3001/d/netbird-connection-metrics -### 6. Verify Data +### 5. Verify Data ```bash -# InfluxDB - query data -curl -H "Authorization: Token netbird-metrics-token" \ - 'http://localhost:8086/api/v2/query?org=netbird' \ - --data-urlencode 'q=from(bucket:"metrics") |> range(start: -1h)' +# Query via InfluxDB (using admin token from .env) +docker compose exec influxdb influx query \ + 'from(bucket: "metrics") |> range(start: -1h)' \ + --org netbird + +# Check ingest server health +curl http://localhost:8087/health # VictoriaMetrics - list metrics curl http://localhost:8428/api/v1/label/__name__/values # VictoriaMetrics - delete all data -curl -s http://localhost:8428/api/v1/admin/tsdb/delete_series --data-urlencode 'match[]={__name__=~".+"}' +curl -s http://localhost:8428/api/v1/admin/tsdb/delete_series \ + --data-urlencode 'match[]={__name__=~".+"}' ``` \ No newline at end of file diff --git a/client/internal/metrics/docs/docker-compose.victoria.yml b/client/internal/metrics/infra/docker-compose.yml similarity index 61% rename from client/internal/metrics/docs/docker-compose.victoria.yml rename to client/internal/metrics/infra/docker-compose.yml index 79ae27206d7..9533444abd4 100644 --- a/client/internal/metrics/docs/docker-compose.victoria.yml +++ b/client/internal/metrics/infra/docker-compose.yml @@ -20,19 +20,38 @@ services: influxdb: container_name: influxdb image: influxdb:2 - ports: - - 
"8086:8086" + # No ports exposed — only accessible within the metrics network volumes: - influxdb-data:/var/lib/influxdb2 + - ./influxdb/scripts:/docker-entrypoint-initdb.d environment: - DOCKER_INFLUXDB_INIT_MODE=setup - DOCKER_INFLUXDB_INIT_USERNAME=admin - - DOCKER_INFLUXDB_INIT_PASSWORD=adminadmin + - DOCKER_INFLUXDB_INIT_PASSWORD=${INFLUXDB_ADMIN_PASSWORD:?required} - DOCKER_INFLUXDB_INIT_ORG=netbird - DOCKER_INFLUXDB_INIT_BUCKET=metrics - DOCKER_INFLUXDB_INIT_RETENTION=365d - - DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=netbird-metrics-token - - INFLUXD_HTTP_AUTH_ENABLED=false + - DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=${INFLUXDB_ADMIN_TOKEN:-} + restart: unless-stopped + networks: + - metrics + + ingest: + container_name: ingest + build: + context: ./ingest + ports: + - "8087:8087" + environment: + - INGEST_LISTEN_ADDR=:8087 + - INFLUXDB_URL=http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns + - INFLUXDB_TOKEN=${INFLUXDB_ADMIN_TOKEN:?required} + - CONFIG_METRICS_SERVER_URL=${CONFIG_METRICS_SERVER_URL:-} + - CONFIG_VERSION_SINCE=${CONFIG_VERSION_SINCE:-0.0.0} + - CONFIG_VERSION_UNTIL=${CONFIG_VERSION_UNTIL:-99.99.99} + - CONFIG_PERIOD_MINUTES=${CONFIG_PERIOD_MINUTES:-5} + depends_on: + - influxdb restart: unless-stopped networks: - metrics @@ -47,6 +66,7 @@ services: - GF_SECURITY_ADMIN_PASSWORD=admin - GF_USERS_ALLOW_SIGN_UP=false - GF_INSTALL_PLUGINS= + - INFLUXDB_ADMIN_TOKEN=${INFLUXDB_ADMIN_TOKEN:-} volumes: - grafana-data:/var/lib/grafana - ./grafana/provisioning:/etc/grafana/provisioning diff --git a/client/internal/metrics/docs/grafana/provisioning/dashboards/dashboard.yml b/client/internal/metrics/infra/grafana/provisioning/dashboards/dashboard.yml similarity index 100% rename from client/internal/metrics/docs/grafana/provisioning/dashboards/dashboard.yml rename to client/internal/metrics/infra/grafana/provisioning/dashboards/dashboard.yml diff --git 
a/client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-connection-metrics.json similarity index 100% rename from client/internal/metrics/docs/grafana/provisioning/dashboards/json/netbird-connection-metrics.json rename to client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-connection-metrics.json diff --git a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json new file mode 100644 index 00000000000..ad0ba09fcee --- /dev/null +++ b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json @@ -0,0 +1,219 @@ +{ + "uid": "netbird-influxdb-metrics", + "title": "NetBird Client Metrics (InfluxDB)", + "tags": ["netbird", "connections", "influxdb"], + "timezone": "browser", + "panels": [ + { + "id": 5, + "title": "Sync Duration Extremes", + "type": "stat", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", + "refId": "A" + }, + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", + "refId": "B" + 
} + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "colorMode": "value", + "graphMode": "none", + "textMode": "auto" + } + }, + { + "id": 6, + "title": "Total Connection Time Extremes", + "type": "stat", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", + "refId": "A" + }, + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "colorMode": "value", + "graphMode": "none", + "textMode": "auto" + } + }, + { + "id": 1, + "title": "Sync Duration", + "type": "timeseries", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> 
drop(columns: [\"deployment_type\", \"version\", \"os\"])\n |> set(key: \"_field\", value: \"Sync Duration\")", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0, + "custom": { + "drawStyle": "points", + "pointSize": 5 + } + } + } + }, + { + "id": 4, + "title": "ICE vs Relay", + "type": "piechart", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> drop(columns: [\"deployment_type\", \"attempt_type\", \"version\", \"os\"])\n |> group(columns: [\"connection_type\"])\n |> count()", + "refId": "A" + } + ], + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "pieType": "donut", + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 2, + "title": "Connection Stage Durations (avg)", + "type": "bargauge", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"signaling_to_connection_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Signaling to Connection\"})", + "refId": "A" + }, + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"connection_to_wg_handshake_seconds\")\n |> map(fn: (r) 
=> ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Connection to WG Handshake\"})", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0 + } + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "orientation": "horizontal", + "displayMode": "gradient" + } + }, + { + "id": 3, + "title": "Total Connection Time", + "type": "timeseries", + "datasource": { + "type": "influxdb", + "uid": "" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> set(key: \"_field\", value: \"Total Connection Time\")", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0, + "custom": { + "drawStyle": "points", + "pointSize": 5 + } + } + } + } + ], + "schemaVersion": 27, + "version": 1, + "refresh": "30s" +} diff --git a/client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml b/client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml similarity index 87% rename from client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml rename to client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml index 952b080c778..ade9b89c2a5 100644 --- a/client/internal/metrics/docs/grafana/provisioning/datasources/influxdb.yml +++ b/client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml @@ -11,4 +11,4 @@ datasources: 
organization: netbird defaultBucket: metrics secureJsonData: - token: netbird-metrics-token \ No newline at end of file + token: ${INFLUXDB_ADMIN_TOKEN} \ No newline at end of file diff --git a/client/internal/metrics/docs/grafana/provisioning/datasources/victoriametrics.yml b/client/internal/metrics/infra/grafana/provisioning/datasources/victoriametrics.yml similarity index 100% rename from client/internal/metrics/docs/grafana/provisioning/datasources/victoriametrics.yml rename to client/internal/metrics/infra/grafana/provisioning/datasources/victoriametrics.yml diff --git a/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh b/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh new file mode 100755 index 00000000000..5f50d65abba --- /dev/null +++ b/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Creates a scoped InfluxDB read-only token for Grafana. +# Clients do not need a token — they push via the ingest server. 
+ +BUCKET_ID=$(influx bucket list --org netbird --name metrics --json | grep -oP '"id"\s*:\s*"\K[^"]+' | head -1) +ORG_ID=$(influx org list --name netbird --json | grep -oP '"id"\s*:\s*"\K[^"]+' | head -1) + +if [ -z "$BUCKET_ID" ] || [ -z "$ORG_ID" ]; then + echo "ERROR: Could not determine bucket or org ID" + echo "BUCKET_ID=$BUCKET_ID ORG_ID=$ORG_ID" + exit 1 +fi + +# Create read-only token for Grafana +READ_TOKEN=$(influx auth create \ + --org netbird \ + --read-bucket "$BUCKET_ID" \ + --description "Grafana read-only token" \ + --json | grep -oP '"token"\s*:\s*"\K[^"]+' | head -1) + +echo "" +echo "============================================" +echo "GRAFANA READ-ONLY TOKEN:" +echo "$READ_TOKEN" +echo "============================================" \ No newline at end of file diff --git a/client/internal/metrics/infra/ingest/Dockerfile b/client/internal/metrics/infra/ingest/Dockerfile new file mode 100644 index 00000000000..8293c6f0f35 --- /dev/null +++ b/client/internal/metrics/infra/ingest/Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.25-alpine AS build +WORKDIR /app +COPY go.mod main.go ./ +RUN CGO_ENABLED=0 go build -o ingest . 
+ +FROM alpine:3.20 +COPY --from=build /app/ingest /usr/local/bin/ingest +ENTRYPOINT ["ingest"] \ No newline at end of file diff --git a/client/internal/metrics/infra/ingest/go.mod b/client/internal/metrics/infra/ingest/go.mod new file mode 100644 index 00000000000..0d50be2bef8 --- /dev/null +++ b/client/internal/metrics/infra/ingest/go.mod @@ -0,0 +1,3 @@ +module github.com/netbirdio/netbird/client/internal/metrics/infra/ingest + +go 1.25 \ No newline at end of file diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go new file mode 100644 index 00000000000..663e3df9c30 --- /dev/null +++ b/client/internal/metrics/infra/ingest/main.go @@ -0,0 +1,225 @@ +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "strconv" + "strings" +) + +const ( + defaultListenAddr = ":8087" + defaultInfluxDBURL = "http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns" + maxBodySize = 1 * 1024 * 1024 // 1 MB max request body + maxTotalSeconds = 300.0 // reject total_seconds > 5 minutes +) + +var allowedMeasurements = map[string]map[string]bool{ + "netbird_peer_connection": { + "signaling_to_connection_seconds": true, + "connection_to_wg_handshake_seconds": true, + "total_seconds": true, + }, + "netbird_sync": { + "duration_seconds": true, + }, +} + +func main() { + listenAddr := envOr("INGEST_LISTEN_ADDR", defaultListenAddr) + influxURL := envOr("INFLUXDB_URL", defaultInfluxDBURL) + influxToken := os.Getenv("INFLUXDB_TOKEN") + + if influxToken == "" { + log.Fatal("INFLUXDB_TOKEN is required") + } + + client := &http.Client{Timeout: 10 * 1e9} // 10 seconds + + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + body, err := io.ReadAll(io.LimitReader(r.Body, maxBodySize+1)) + if err != nil { + http.Error(w, "read error", 
http.StatusBadRequest) + return + } + if len(body) > maxBodySize { + http.Error(w, "body too large", http.StatusRequestEntityTooLarge) + return + } + + validated, err := validateLineProtocol(body) + if err != nil { + log.Printf("WARN validation failed from %s: %v", r.RemoteAddr, err) + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + req, err := http.NewRequestWithContext(r.Context(), http.MethodPost, influxURL, bytes.NewReader(validated)) + if err != nil { + log.Printf("ERROR create request: %v", err) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + req.Header.Set("Content-Type", "text/plain; charset=utf-8") + req.Header.Set("Authorization", "Token "+influxToken) + + resp, err := client.Do(req) + if err != nil { + log.Printf("ERROR forward to influxdb: %v", err) + http.Error(w, "upstream error", http.StatusBadGateway) + return + } + defer resp.Body.Close() + + w.WriteHeader(resp.StatusCode) + io.Copy(w, resp.Body) //nolint:errcheck + }) + + // Build config JSON once at startup from env vars + configJSON := buildConfigJSON() + if configJSON != nil { + log.Printf("serving remote config at /config") + } + + http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if configJSON == nil { + http.Error(w, "config not configured", http.StatusNotFound) + return + } + w.Header().Set("Content-Type", "application/json") + w.Write(configJSON) //nolint:errcheck + }) + + http.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, "ok") //nolint:errcheck + }) + + log.Printf("ingest server listening on %s, forwarding to %s", listenAddr, influxURL) + if err := http.ListenAndServe(listenAddr, nil); err != nil { //nolint:gosec + log.Fatal(err) + } +} + +// validateLineProtocol parses InfluxDB line protocol lines, +// whitelists measurements and 
fields, and checks value bounds. +func validateLineProtocol(body []byte) ([]byte, error) { + lines := strings.Split(strings.TrimSpace(string(body)), "\n") + var valid []string + + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + // line protocol: measurement,tag=val,tag=val field=val,field=val timestamp + parts := strings.SplitN(line, " ", 3) + if len(parts) < 2 { + return nil, fmt.Errorf("invalid line protocol: %q", truncate(line, 100)) + } + + measurementAndTags := parts[0] + measurement := measurementAndTags + if idx := strings.IndexByte(measurementAndTags, ','); idx >= 0 { + measurement = measurementAndTags[:idx] + } + + allowedFields, ok := allowedMeasurements[measurement] + if !ok { + return nil, fmt.Errorf("unknown measurement: %q", measurement) + } + + fieldPairs := strings.Split(parts[1], ",") + for _, pair := range fieldPairs { + kv := strings.SplitN(pair, "=", 2) + if len(kv) != 2 { + return nil, fmt.Errorf("invalid field: %q", pair) + } + fieldName := kv[0] + if !allowedFields[fieldName] { + return nil, fmt.Errorf("unknown field %q in measurement %q", fieldName, measurement) + } + + val, err := strconv.ParseFloat(kv[1], 64) + if err != nil { + return nil, fmt.Errorf("invalid field value %q for %q", kv[1], fieldName) + } + if val < 0 { + return nil, fmt.Errorf("negative value for %q: %g", fieldName, val) + } + if fieldName == "total_seconds" && val > maxTotalSeconds { + return nil, fmt.Errorf("total_seconds too large: %g > %g", val, maxTotalSeconds) + } + } + + valid = append(valid, line) + } + + if len(valid) == 0 { + return nil, fmt.Errorf("no valid lines") + } + + return []byte(strings.Join(valid, "\n") + "\n"), nil +} + +// buildConfigJSON builds the remote config JSON from env vars. +// Returns nil if required vars are not set. 
+func buildConfigJSON() []byte { + serverURL := os.Getenv("CONFIG_METRICS_SERVER_URL") + versionSince := envOr("CONFIG_VERSION_SINCE", "0.0.0") + versionUntil := envOr("CONFIG_VERSION_UNTIL", "99.99.99") + periodMinutes := envOr("CONFIG_PERIOD_MINUTES", "5") + + if serverURL == "" { + return nil + } + + period, err := strconv.Atoi(periodMinutes) + if err != nil || period <= 0 { + log.Printf("WARN invalid CONFIG_PERIOD_MINUTES: %q, using 5", periodMinutes) + period = 5 + } + + cfg := map[string]any{ + "server_url": serverURL, + "version-since": versionSince, + "version-until": versionUntil, + "period_minutes": period, + } + + data, err := json.Marshal(cfg) + if err != nil { + log.Printf("ERROR failed to marshal config: %v", err) + return nil + } + return data +} + +func envOr(key, defaultVal string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultVal +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "..." +} diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 6d415e662c8..eba7d0471dc 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -82,7 +82,6 @@ func (c *ClientMetrics) RecordConnectionStages( agentInfo := c.agentInfo c.mu.RUnlock() - log.Infof("--- conn stages: %v, %v", connectionType, timestamps) c.impl.RecordConnectionStages(ctx, agentInfo, connectionType, isReconnection, timestamps) } @@ -119,7 +118,7 @@ func (c *ClientMetrics) Export(w io.Writer) error { } // StartPush starts periodic pushing of metrics with the given configuration -// Precedence: config parameter > env var > DefaultPushConfig +// Precedence: PushConfig.ServerAddress > remote config server_url func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { if c == nil { return @@ -133,27 +132,26 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { return } - ctx, cancel := context.WithCancel(ctx) - 
c.pushCancel = cancel - c.mu.RLock() agentVersion := c.agentInfo.Version c.mu.RUnlock() configManager := remoteconfig.NewManager(getMetricsConfigURL(), remoteconfig.DefaultMinRefreshInterval) - push := NewPush(c.impl, configManager, config, agentVersion) + push, err := NewPush(c.impl, configManager, config, agentVersion) + if err != nil { + log.Errorf("failed to create metrics push: %v", err) + return + } + + ctx, cancel := context.WithCancel(ctx) + c.pushCancel = cancel + c.wg.Add(1) go func() { defer c.wg.Done() push.Start(ctx) }() c.push = push - - if push.overrideInterval > 0 { - log.Infof("started metrics push to %s with override interval %s", push.pushURL, push.overrideInterval) - } else { - log.Infof("started metrics push to %s with remote config", push.pushURL) - } } func (c *ClientMetrics) StopPush() { @@ -168,4 +166,5 @@ func (c *ClientMetrics) StopPush() { c.pushCancel() c.wg.Wait() + c.push = nil } diff --git a/client/internal/metrics/metrics_default.go b/client/internal/metrics/metrics_default.go index 85705a540db..927ab51d1cb 100644 --- a/client/internal/metrics/metrics_default.go +++ b/client/internal/metrics/metrics_default.go @@ -5,7 +5,7 @@ package metrics // NewClientMetrics creates a new ClientMetrics instance func NewClientMetrics(agentInfo AgentInfo) *ClientMetrics { return &ClientMetrics{ - impl: newVictoriaMetrics(), + impl: newInfluxDBMetrics(), agentInfo: agentInfo, } } diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index 6960354ad12..d1579ebcf49 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -15,23 +15,36 @@ import ( ) const ( - // DefaultPushInterval is the default interval for pushing metrics - DefaultPushInterval = 5 * time.Minute + // defaultPushInterval is the default interval for pushing metrics + defaultPushInterval = 5 * time.Minute ) -var ( - DefaultPushConfig = PushConfig{ - URL: nil, - Interval: 0, - } -) +// defaultMetricsServerURL is used as fallback when 
NB_METRICS_FORCE_SENDING is true +var defaultMetricsServerURL *url.URL + +func init() { + defaultMetricsServerURL, _ = url.Parse("https://ingest.stage.npeer.io") +} // PushConfig holds configuration for metrics push type PushConfig struct { - // URL is the metrics server URL. If nil, uses env var or default - URL *url.URL - // Interval is how often to push metrics. If 0, uses env var or default (4h) + // ServerAddress is the metrics server URL. If nil, uses remote config server_url. + ServerAddress *url.URL + // Interval is how often to push metrics. If 0, uses remote config interval or defaultPushInterval. Interval time.Duration + // ForceSending skips remote configuration fetch and version checks, pushing unconditionally. + ForceSending bool +} + +// PushConfigFromEnv builds a PushConfig from environment variables. +func PushConfigFromEnv() PushConfig { + config := PushConfig{} + + config.ForceSending = isForceSending() + config.ServerAddress = getMetricsServerURL() + config.Interval = getMetricsInterval() + + return config } // remoteConfigProvider abstracts remote push config fetching for testability @@ -39,57 +52,75 @@ type remoteConfigProvider interface { RefreshIfNeeded(ctx context.Context) *remoteconfig.Config } -// Push handles periodic pushing of metrics to VictoriaMetrics +// Push handles periodic pushing of metrics type Push struct { - metrics metricsImplementation - pushURL string - agentVersion *goversion.Version - overrideInterval time.Duration // if set, bypass remote config and always push at this interval - + metrics metricsImplementation configManager remoteConfigProvider - client *http.Client + config PushConfig + agentVersion *goversion.Version + + client *http.Client + envInterval time.Duration + envAddress *url.URL } // NewPush creates a new Push instance with configuration resolution -func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, config PushConfig, agentVersion string) *Push { - // Resolve URL: config > env 
var (always returns valid URL) - var pushURL url.URL - if config.URL != nil { - pushURL = *config.URL +func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, config PushConfig, agentVersion string) (*Push, error) { + var envInterval time.Duration + var envAddress *url.URL + + if config.ForceSending { + envInterval = config.Interval + if config.Interval <= 0 { + envInterval = defaultPushInterval + } + + envAddress = config.ServerAddress + if envAddress == nil { + envAddress = defaultMetricsServerURL + } } else { - pushURL = getMetricsServerURL() - } + envAddress = config.ServerAddress - // If interval is explicitly set (config param or env var), bypass remote config entirely - overrideInterval := config.Interval - if overrideInterval == 0 { - overrideInterval = getMetricsInterval() // 0 if env var not set + if config.Interval < 0 { + log.Warnf("negative metrics push interval %s", config.Interval) + } else { + envInterval = config.Interval + } } parsedVersion, err := goversion.NewVersion(agentVersion) if err != nil { - log.Warnf("failed to parse agent version %q: %v", agentVersion, err) + if !config.ForceSending { + return nil, fmt.Errorf("failed to parse agent version %q: %w", agentVersion, err) + } } return &Push{ - metrics: metrics, - pushURL: pushURL.String(), - agentVersion: parsedVersion, - overrideInterval: overrideInterval, - configManager: configManager, + metrics: metrics, + configManager: configManager, + config: config, + agentVersion: parsedVersion, + envInterval: envInterval, + envAddress: envAddress, client: &http.Client{ Timeout: 10 * time.Second, }, - } + }, nil } // Start starts the periodic push loop. // If overrideInterval is set (via env var), pushes unconditionally at that interval. // Otherwise, fetches remote config to determine push period and version eligibility. 
func (p *Push) Start(ctx context.Context) { - if p.pushURL == "" { - log.Debug("metrics push URL not configured, skipping push") - return + // Log initial state + switch { + case p.config.ForceSending: + log.Infof("started metrics push with force sending to %s, interval %s", p.envAddress, p.envInterval) + case p.config.ServerAddress != nil: + log.Infof("started metrics push with server URL override: %s", p.config.ServerAddress.String()) + default: + log.Infof("started metrics push, server URL will be resolved from remote config") } timer := time.NewTimer(0) // fire immediately on first iteration @@ -103,52 +134,50 @@ func (p *Push) Start(ctx context.Context) { case <-timer.C: } - nextInterval := p.tick(ctx) - timer.Reset(nextInterval) - } -} - -// tick performs a single push cycle and returns the duration to wait before the next one. -func (p *Push) tick(ctx context.Context) time.Duration { - interval, shouldPush := p.resolveInterval(ctx) - if shouldPush { - if err := p.push(ctx); err != nil { - log.Errorf("failed to push metrics: %v", err) + pushURL, interval := p.resolve(ctx) + if pushURL != "" { + if err := p.push(ctx, pushURL); err != nil { + log.Errorf("failed to push metrics: %v", err) + } } + + timer.Reset(interval) } - return interval } -// resolveInterval determines the push interval and whether a push should happen. -// If overrideInterval is set, it bypasses remote config and always pushes. -// Otherwise, it fetches remote config and checks version eligibility. -func (p *Push) resolveInterval(ctx context.Context) (time.Duration, bool) { - if p.overrideInterval > 0 { - return p.overrideInterval, true +// resolve returns the push URL and interval for the next cycle. +// Returns empty pushURL to skip this cycle. 
+func (p *Push) resolve(ctx context.Context) (pushURL string, interval time.Duration) { + if p.config.ForceSending { + return p.resolveServerURL(nil), p.envInterval } config := p.configManager.RefreshIfNeeded(ctx) if config == nil { log.Debug("no metrics push config available, waiting to retry") - return DefaultPushInterval, false + return "", defaultPushInterval } - if p.agentVersion == nil { - log.Debug("agent version not available, skipping metrics push") - return config.Period, false + interval = config.Interval + if p.envInterval > 0 { + interval = p.envInterval } if !isVersionInRange(p.agentVersion, config.VersionSince, config.VersionUntil) { log.Debugf("agent version %s not in range [%s, %s), skipping metrics push", p.agentVersion, config.VersionSince, config.VersionUntil) - return config.Period, false + return "", interval } - return config.Period, true + pushURL = p.resolveServerURL(&config.ServerURL) + if pushURL == "" { + log.Warn("no metrics server URL available, skipping push") + } + return pushURL, interval } -// push exports metrics and sends them to VictoriaMetrics -func (p *Push) push(ctx context.Context) error { +// push exports metrics and sends them to the metrics server +func (p *Push) push(ctx context.Context, pushURL string) error { // Export metrics to buffer var buf bytes.Buffer if err := p.metrics.Export(&buf); err != nil { @@ -161,22 +190,12 @@ func (p *Push) push(ctx context.Context) error { return nil } - // Log what we're pushing (first 500 bytes) - preview := buf.String() - if len(preview) > 500 { - preview = preview[:500] - } - log.Tracef("pushing metrics (%d bytes): %s", buf.Len(), preview) - // Create HTTP request - req, err := http.NewRequestWithContext(ctx, "POST", p.pushURL, &buf) + req, err := http.NewRequestWithContext(ctx, "POST", pushURL, &buf) if err != nil { return fmt.Errorf("create request: %w", err) } req.Header.Set("Content-Type", "text/plain; charset=utf-8") - if token := getMetricsToken(); token != "" { - 
req.Header.Set("Authorization", "Token "+token) - } // Send request resp, err := p.client.Do(req) @@ -197,11 +216,28 @@ func (p *Push) push(ctx context.Context) error { return fmt.Errorf("push failed with status %d", resp.StatusCode) } - log.Debugf("successfully pushed metrics to %s", p.pushURL) + log.Debugf("successfully pushed metrics to %s", pushURL) p.metrics.Reset() return nil } +// resolveServerURL determines the push URL. +// Precedence: envAddress (env var) > remote config server_url +func (p *Push) resolveServerURL(remoteServerURL *url.URL) string { + var baseURL *url.URL + if p.envAddress != nil { + baseURL = p.envAddress + } else { + baseURL = remoteServerURL + } + + if baseURL == nil { + return "" + } + + return baseURL.String() +} + // isVersionInRange checks if current falls within [since, until) func isVersionInRange(current, since, until *goversion.Version) bool { return !current.LessThan(since) && current.LessThan(until) diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go index 41e2cc41f6a..20b26cf8a82 100644 --- a/client/internal/metrics/push_test.go +++ b/client/internal/metrics/push_test.go @@ -5,6 +5,7 @@ import ( "io" "net/http" "net/http/httptest" + "net/url" "sync/atomic" "testing" "time" @@ -24,11 +25,28 @@ func mustVersion(s string) *goversion.Version { return v } -func testConfig(since, until string, period time.Duration) *remoteconfig.Config { +func mustURL(s string) url.URL { + u, err := url.Parse(s) + if err != nil { + panic(err) + } + return *u +} + +func parseURL(s string) *url.URL { + u, err := url.Parse(s) + if err != nil { + panic(err) + } + return u +} + +func testConfig(serverURL, since, until string, period time.Duration) *remoteconfig.Config { return &remoteconfig.Config{ + ServerURL: mustURL(serverURL), VersionSince: mustVersion(since), VersionUntil: mustVersion(until), - Period: period, + Interval: period, } } @@ -63,7 +81,7 @@ func (m *mockMetrics) Export(w io.Writer) error { func (m 
*mockMetrics) Reset() { } -func TestPush_OverrideIntervalAlwaysPushes(t *testing.T) { +func TestPush_OverrideIntervalPushes(t *testing.T) { var pushCount atomic.Int32 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { pushCount.Add(1) @@ -72,16 +90,13 @@ func TestPush_OverrideIntervalAlwaysPushes(t *testing.T) { defer server.Close() metrics := &mockMetrics{exportData: "test_metric 1\n"} - configProvider := &mockConfigProvider{config: nil} // no remote config - - push := &Push{ - metrics: metrics, - pushURL: server.URL, - agentVersion: mustVersion("1.0.0"), - overrideInterval: 50 * time.Millisecond, - configManager: configProvider, - client: &http.Client{Timeout: 5 * time.Second}, - } + configProvider := &mockConfigProvider{config: testConfig(server.URL, "1.0.0", "2.0.0", 60*time.Minute)} + + push, err := NewPush(metrics, configProvider, PushConfig{ + Interval: 50 * time.Millisecond, + ServerAddress: parseURL(server.URL), + }, "1.0.0") + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) done := make(chan struct{}) @@ -90,7 +105,6 @@ func TestPush_OverrideIntervalAlwaysPushes(t *testing.T) { close(done) }() - // Wait for a few pushes require.Eventually(t, func() bool { return pushCount.Load() >= 3 }, 2*time.Second, 10*time.Millisecond) @@ -100,44 +114,31 @@ func TestPush_OverrideIntervalAlwaysPushes(t *testing.T) { } func TestPush_RemoteConfigVersionInRange(t *testing.T) { - var pushCount atomic.Int32 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - pushCount.Add(1) w.WriteHeader(http.StatusNoContent) })) defer server.Close() metrics := &mockMetrics{exportData: "test_metric 1\n"} - configProvider := &mockConfigProvider{config: testConfig("1.0.0", "2.0.0", 1*time.Minute)} - - push := &Push{ - metrics: metrics, - pushURL: server.URL, - agentVersion: mustVersion("1.5.0"), - configManager: configProvider, - client: &http.Client{Timeout: 5 * time.Second}, 
- } + configProvider := &mockConfigProvider{config: testConfig(server.URL, "1.0.0", "2.0.0", 1*time.Minute)} - interval, shouldPush := push.resolveInterval(context.Background()) - assert.True(t, shouldPush) + push, err := NewPush(metrics, configProvider, PushConfig{}, "1.5.0") + require.NoError(t, err) + + pushURL, interval := push.resolve(context.Background()) + assert.NotEmpty(t, pushURL) assert.Equal(t, 1*time.Minute, interval) - assert.Equal(t, int32(0), pushCount.Load()) // resolveInterval doesn't push } func TestPush_RemoteConfigVersionOutOfRange(t *testing.T) { metrics := &mockMetrics{exportData: "test_metric 1\n"} - configProvider := &mockConfigProvider{config: testConfig("1.0.0", "1.5.0", 1*time.Minute)} - - push := &Push{ - metrics: metrics, - pushURL: "http://localhost", - agentVersion: mustVersion("2.0.0"), - configManager: configProvider, - client: &http.Client{Timeout: 5 * time.Second}, - } + configProvider := &mockConfigProvider{config: testConfig("http://localhost", "1.0.0", "1.5.0", 1*time.Minute)} + + push, err := NewPush(metrics, configProvider, PushConfig{}, "2.0.0") + require.NoError(t, err) - interval, shouldPush := push.resolveInterval(context.Background()) - assert.False(t, shouldPush) + pushURL, interval := push.resolve(context.Background()) + assert.Empty(t, pushURL) assert.Equal(t, 1*time.Minute, interval) } @@ -145,35 +146,45 @@ func TestPush_NoConfigReturnsDefault(t *testing.T) { metrics := &mockMetrics{} configProvider := &mockConfigProvider{config: nil} - push := &Push{ - metrics: metrics, - pushURL: "http://localhost", - agentVersion: mustVersion("1.0.0"), - configManager: configProvider, - client: &http.Client{Timeout: 5 * time.Second}, - } + push, err := NewPush(metrics, configProvider, PushConfig{}, "1.0.0") + require.NoError(t, err) - interval, shouldPush := push.resolveInterval(context.Background()) - assert.False(t, shouldPush) - assert.Equal(t, DefaultPushInterval, interval) + pushURL, interval := 
push.resolve(context.Background()) + assert.Empty(t, pushURL) + assert.Equal(t, defaultPushInterval, interval) } -func TestPush_OverrideIntervalBypassesRemoteConfig(t *testing.T) { +func TestPush_OverrideIntervalRespectsVersionCheck(t *testing.T) { metrics := &mockMetrics{} - // Remote config says version is out of range, but override should bypass it - configProvider := &mockConfigProvider{config: testConfig("3.0.0", "4.0.0", 60*time.Minute)} - - push := &Push{ - metrics: metrics, - pushURL: "http://localhost", - agentVersion: mustVersion("1.0.0"), - overrideInterval: 30 * time.Second, - configManager: configProvider, - client: &http.Client{Timeout: 5 * time.Second}, - } + configProvider := &mockConfigProvider{config: testConfig("http://localhost", "3.0.0", "4.0.0", 60*time.Minute)} + + push, err := NewPush(metrics, configProvider, PushConfig{ + Interval: 30 * time.Second, + ServerAddress: parseURL("http://localhost"), + }, "1.0.0") + require.NoError(t, err) - interval, shouldPush := push.resolveInterval(context.Background()) - assert.True(t, shouldPush) + pushURL, interval := push.resolve(context.Background()) + assert.Empty(t, pushURL) // version out of range + assert.Equal(t, 30*time.Second, interval) // but uses override interval +} + +func TestPush_OverrideIntervalUsedWhenVersionInRange(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{} + configProvider := &mockConfigProvider{config: testConfig(server.URL, "1.0.0", "2.0.0", 60*time.Minute)} + + push, err := NewPush(metrics, configProvider, PushConfig{ + Interval: 30 * time.Second, + }, "1.5.0") + require.NoError(t, err) + + pushURL, interval := push.resolve(context.Background()) + assert.NotEmpty(t, pushURL) assert.Equal(t, 30*time.Second, interval) } @@ -186,39 +197,124 @@ func TestPush_NoMetricsSkipsPush(t *testing.T) { defer server.Close() metrics := 
&mockMetrics{exportData: ""} // no metrics to export + configProvider := &mockConfigProvider{config: nil} - push := &Push{ - metrics: metrics, - pushURL: server.URL, - client: &http.Client{Timeout: 5 * time.Second}, - } + push, err := NewPush(metrics, configProvider, PushConfig{}, "1.0.0") + require.NoError(t, err) - err := push.push(context.Background()) + err = push.push(context.Background(), server.URL) assert.NoError(t, err) - assert.Equal(t, int32(0), pushCount.Load()) // no HTTP request made + assert.Equal(t, int32(0), pushCount.Load()) } -func TestPush_EmptyURLSkipsStart(t *testing.T) { - push := &Push{ - pushURL: "", - } +func TestPush_ServerURLFromRemoteConfig(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() - // Should return immediately without blocking - ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) - defer cancel() + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: testConfig(server.URL, "1.0.0", "2.0.0", 1*time.Minute)} - done := make(chan struct{}) - go func() { - push.Start(ctx) - close(done) - }() + push, err := NewPush(metrics, configProvider, PushConfig{}, "1.5.0") + require.NoError(t, err) - select { - case <-done: - // good, returned immediately - case <-ctx.Done(): - t.Fatal("Start did not return for empty URL") - } + pushURL, interval := push.resolve(context.Background()) + assert.Contains(t, pushURL, server.URL) + assert.Equal(t, 1*time.Minute, interval) +} + +func TestPush_ServerAddressOverridesTakePrecedenceOverRemoteConfig(t *testing.T) { + overrideServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + defer overrideServer.Close() + + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: 
testConfig("http://remote-config-server", "1.0.0", "2.0.0", 1*time.Minute)} + + push, err := NewPush(metrics, configProvider, PushConfig{ + ServerAddress: parseURL(overrideServer.URL), + }, "1.5.0") + require.NoError(t, err) + + pushURL, _ := push.resolve(context.Background()) + assert.Contains(t, pushURL, overrideServer.URL) + assert.NotContains(t, pushURL, "remote-config-server") +} + +func TestPush_OverrideIntervalWithoutOverrideURL_UsesRemoteConfigURL(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: testConfig(server.URL, "1.0.0", "2.0.0", 60*time.Minute)} + + push, err := NewPush(metrics, configProvider, PushConfig{ + Interval: 30 * time.Second, + }, "1.0.0") + require.NoError(t, err) + + pushURL, interval := push.resolve(context.Background()) + assert.Contains(t, pushURL, server.URL) + assert.Equal(t, 30*time.Second, interval) +} + +func TestPush_NoConfigSkipsPush(t *testing.T) { + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: nil} + + push, err := NewPush(metrics, configProvider, PushConfig{ + Interval: 30 * time.Second, + }, "1.0.0") + require.NoError(t, err) + + pushURL, interval := push.resolve(context.Background()) + assert.Empty(t, pushURL) + assert.Equal(t, defaultPushInterval, interval) // no config available, use default retry interval +} + +func TestPush_ForceSendingSkipsRemoteConfig(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: nil} + + push, err := NewPush(metrics, configProvider, PushConfig{ + ForceSending: true, + Interval: 
1 * time.Minute, + ServerAddress: parseURL(server.URL), + }, "1.0.0") + require.NoError(t, err) + + pushURL, interval := push.resolve(context.Background()) + assert.NotEmpty(t, pushURL) + assert.Equal(t, 1*time.Minute, interval) +} + +func TestPush_ForceSendingUsesDefaultInterval(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + defer server.Close() + + metrics := &mockMetrics{exportData: "test_metric 1\n"} + configProvider := &mockConfigProvider{config: nil} + + push, err := NewPush(metrics, configProvider, PushConfig{ + ForceSending: true, + ServerAddress: parseURL(server.URL), + }, "1.0.0") + require.NoError(t, err) + + pushURL, interval := push.resolve(context.Background()) + assert.NotEmpty(t, pushURL) + assert.Equal(t, defaultPushInterval, interval) } func TestIsVersionInRange(t *testing.T) { diff --git a/client/internal/metrics/remoteconfig/manager.go b/client/internal/metrics/remoteconfig/manager.go index 6ab39c33ae0..31f406c0bfd 100644 --- a/client/internal/metrics/remoteconfig/manager.go +++ b/client/internal/metrics/remoteconfig/manager.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "net/http" + "net/url" "sync" "time" @@ -19,13 +20,15 @@ const ( // Config holds the parsed remote push configuration type Config struct { + ServerURL url.URL VersionSince *goversion.Version VersionUntil *goversion.Version - Period time.Duration + Interval time.Duration } // rawConfig is the JSON wire format fetched from the remote server type rawConfig struct { + ServerURL string `json:"server_url"` VersionSince string `json:"version-since"` VersionUntil string `json:"version-until"` PeriodMinutes int `json:"period_minutes"` @@ -72,7 +75,7 @@ func (m *Manager) RefreshIfNeeded(ctx context.Context) *Config { m.lastFetched = time.Now() log.Tracef("fetched metrics remote config: version-since=%s version-until=%s period=%s", - fetchedConfig.VersionSince, fetchedConfig.VersionUntil, 
fetchedConfig.Period) + fetchedConfig.VersionSince, fetchedConfig.VersionUntil, fetchedConfig.Interval) return fetchedConfig } @@ -118,6 +121,15 @@ func (m *Manager) fetch(ctx context.Context) (*Config, error) { return nil, fmt.Errorf("invalid period_minutes: %d", raw.PeriodMinutes) } + if raw.ServerURL == "" { + return nil, fmt.Errorf("server_url is required") + } + + serverURL, err := url.Parse(raw.ServerURL) + if err != nil { + return nil, fmt.Errorf("parse server_url %q: %w", raw.ServerURL, err) + } + since, err := goversion.NewVersion(raw.VersionSince) if err != nil { return nil, fmt.Errorf("parse version-since %q: %w", raw.VersionSince, err) @@ -129,8 +141,9 @@ func (m *Manager) fetch(ctx context.Context) (*Config, error) { } return &Config{ + ServerURL: *serverURL, VersionSince: since, VersionUntil: until, - Period: time.Duration(raw.PeriodMinutes) * time.Minute, + Interval: time.Duration(raw.PeriodMinutes) * time.Minute, }, nil } diff --git a/client/internal/metrics/remoteconfig/manager_test.go b/client/internal/metrics/remoteconfig/manager_test.go index 346b44a5e89..68ca3b4c411 100644 --- a/client/internal/metrics/remoteconfig/manager_test.go +++ b/client/internal/metrics/remoteconfig/manager_test.go @@ -17,6 +17,7 @@ const testMinRefresh = 100 * time.Millisecond func TestManager_FetchSuccess(t *testing.T) { server := newConfigServer(t, rawConfig{ + ServerURL: "https://ingest.example.com", VersionSince: "1.0.0", VersionUntil: "2.0.0", PeriodMinutes: 60, @@ -27,9 +28,10 @@ func TestManager_FetchSuccess(t *testing.T) { config := mgr.RefreshIfNeeded(context.Background()) require.NotNil(t, config) + assert.Equal(t, "https://ingest.example.com", config.ServerURL.String()) assert.Equal(t, "1.0.0", config.VersionSince.String()) assert.Equal(t, "2.0.0", config.VersionUntil.String()) - assert.Equal(t, 60*time.Minute, config.Period) + assert.Equal(t, 60*time.Minute, config.Interval) } func TestManager_CachesConfig(t *testing.T) { @@ -37,6 +39,7 @@ func 
TestManager_CachesConfig(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { fetchCount.Add(1) err := json.NewEncoder(w).Encode(rawConfig{ + ServerURL: "https://ingest.example.com", VersionSince: "1.0.0", VersionUntil: "2.0.0", PeriodMinutes: 60, @@ -64,6 +67,7 @@ func TestManager_RefetchesWhenStale(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { fetchCount.Add(1) err := json.NewEncoder(w).Encode(rawConfig{ + ServerURL: "https://ingest.example.com", VersionSince: "1.0.0", VersionUntil: "2.0.0", PeriodMinutes: 60, @@ -107,6 +111,7 @@ func TestManager_FetchFailureReturnsCached(t *testing.T) { return } err := json.NewEncoder(w).Encode(rawConfig{ + ServerURL: "https://ingest.example.com", VersionSince: "1.0.0", VersionUntil: "2.0.0", PeriodMinutes: 60, @@ -142,6 +147,7 @@ func TestManager_RejectsInvalidPeriod(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { server := newConfigServer(t, rawConfig{ + ServerURL: "https://ingest.example.com", VersionSince: "1.0.0", VersionUntil: "2.0.0", PeriodMinutes: tt.period, @@ -155,6 +161,20 @@ func TestManager_RejectsInvalidPeriod(t *testing.T) { } } +func TestManager_RejectsEmptyServerURL(t *testing.T) { + server := newConfigServer(t, rawConfig{ + ServerURL: "", + VersionSince: "1.0.0", + VersionUntil: "2.0.0", + PeriodMinutes: 60, + }) + defer server.Close() + + mgr := NewManager(server.URL, testMinRefresh) + config := mgr.RefreshIfNeeded(context.Background()) + assert.Nil(t, config) +} + func TestManager_RejectsInvalidJSON(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { _, err := w.Write([]byte("not json")) diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go index 5ea73e38134..94f134be0e5 100644 --- a/client/internal/metrics/victoria.go +++ b/client/internal/metrics/victoria.go @@ -103,4 
+103,4 @@ func (m *victoriaMetrics) Export(w io.Writer) error { // Reset clears all collected metrics func (m *victoriaMetrics) Reset() { m.set.UnregisterAllMetrics() -} \ No newline at end of file +} diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index df71660aa67..382638a3af4 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -831,8 +831,12 @@ func (conn *Conn) recordConnectionMetrics() { } // Determine connection type based on current priority + conn.mu.Lock() + priority := conn.currentConnPriority + conn.mu.Unlock() + var connType metrics.ConnectionType - switch conn.currentConnPriority { + switch priority { case conntype.Relay: connType = metrics.ConnectionTypeRelay default: diff --git a/client/internal/peer/metrics_saver.go b/client/internal/peer/metrics_saver.go index 369877215fb..981148854d6 100644 --- a/client/internal/peer/metrics_saver.go +++ b/client/internal/peer/metrics_saver.go @@ -13,11 +13,6 @@ type MetricsStages struct { mu sync.Mutex } -func (s *MetricsStages) RecordCreated() { - s.mu.Lock() - defer s.mu.Unlock() -} - // RecordSignalingReceived records when the first signal is received from the remote peer. // Used as the base for all subsequent stage durations to avoid inflating metrics when // the remote peer was offline. From 80543b5e2ffdc418042e072a87d139808ee75cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Mon, 9 Mar 2026 15:26:23 +0100 Subject: [PATCH 22/52] [client] Add anonymised peer tracking to pushed metrics Introduce peer_id and connection_pair_id tags to InfluxDB metrics. Public keys are hashed (truncated SHA-256) for anonymisation. The connection pair ID is deterministic regardless of which side computes it, enabling deduplication of reconnections in the ICE vs Relay dashboard. Also pin Grafana to v11.6.0 for file-based provisioning and fix datasource UID references. 
--- client/internal/connect.go | 2 +- client/internal/metrics/influxdb.go | 8 +- .../internal/metrics/infra/docker-compose.yml | 6 +- .../json/netbird-influxdb-metrics.json | 30 ++--- .../provisioning/datasources/influxdb.yml | 1 + client/internal/metrics/metrics.go | 37 +++++- client/internal/metrics/push_test.go | 2 +- client/internal/metrics/victoria.go | 106 ------------------ client/internal/peer/conn.go | 2 + 9 files changed, 62 insertions(+), 132 deletions(-) delete mode 100644 client/internal/metrics/victoria.go diff --git a/client/internal/connect.go b/client/internal/connect.go index 107cce629bf..00ab9fa6044 100644 --- a/client/internal/connect.go +++ b/client/internal/connect.go @@ -255,7 +255,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan Version: version.NetbirdVersion(), OS: runtime.GOOS, } - c.clientMetrics.UpdateAgentInfo(agentInfo) + c.clientMetrics.UpdateAgentInfo(agentInfo, myPrivateKey.PublicKey().String()) log.Debugf("connected to the Management service %s", c.config.ManagementURL.Host) defer func() { diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index 74bfdca621e..1caa3ac79e4 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -38,6 +38,7 @@ func newInfluxDBMetrics() metricsImplementation { func (m *influxDBMetrics) RecordConnectionStages( _ context.Context, agentInfo AgentInfo, + connectionPairID string, connectionType ConnectionType, isReconnection bool, timestamps ConnectionStageTimestamps, @@ -62,12 +63,14 @@ func (m *influxDBMetrics) RecordConnectionStages( } connTypeStr := connectionType.String() - tags := fmt.Sprintf("deployment_type=%s,connection_type=%s,attempt_type=%s,version=%s,os=%s", + tags := fmt.Sprintf("deployment_type=%s,connection_type=%s,attempt_type=%s,version=%s,os=%s,peer_id=%s,connection_pair_id=%s", agentInfo.DeploymentType.String(), connTypeStr, attemptType, agentInfo.Version, agentInfo.OS, + 
agentInfo.peerID, + connectionPairID, ) now := time.Now() @@ -92,10 +95,11 @@ func (m *influxDBMetrics) RecordConnectionStages( } func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration) { - tags := fmt.Sprintf("deployment_type=%s,version=%s,os=%s", + tags := fmt.Sprintf("deployment_type=%s,version=%s,os=%s,peer_id=%s", agentInfo.DeploymentType.String(), agentInfo.Version, agentInfo.OS, + agentInfo.peerID, ) m.mu.Lock() diff --git a/client/internal/metrics/infra/docker-compose.yml b/client/internal/metrics/infra/docker-compose.yml index 9533444abd4..1ad7ec7b4fe 100644 --- a/client/internal/metrics/infra/docker-compose.yml +++ b/client/internal/metrics/infra/docker-compose.yml @@ -11,7 +11,7 @@ services: command: - "--storageDataPath=/victoria-metrics-data" - "--httpListenAddr=:8428" - - "--retentionPeriod=12" # Keep data for 12 months + - "--retentionPeriod=3" - "--search.maxStalenessInterval=1m" # Stop forward-filling after 1 minute of no new data restart: unless-stopped networks: @@ -58,7 +58,7 @@ services: grafana: container_name: grafana - image: grafana/grafana:latest + image: grafana/grafana:11.6.0 ports: - "3001:3000" environment: @@ -84,4 +84,4 @@ volumes: networks: metrics: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json index ad0ba09fcee..89c751b1ea7 100644 --- a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json +++ b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json @@ -10,7 +10,7 @@ "type": "stat", "datasource": { "type": "influxdb", - "uid": "" + "uid": "influxdb" }, "gridPos": { "h": 8, @@ -20,11 +20,11 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: 
v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"peer_id\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", "refId": "A" }, { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"peer_id\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", "refId": "B" } ], @@ -49,7 +49,7 @@ "type": "stat", "datasource": { "type": "influxdb", - "uid": "" + "uid": "influxdb" }, "gridPos": { "h": 8, @@ -59,11 +59,11 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", 
\"os\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", "refId": "A" }, { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", "refId": "B" } ], @@ -88,7 +88,7 @@ "type": "timeseries", "datasource": { "type": "influxdb", - "uid": "" + "uid": "influxdb" }, "gridPos": { "h": 8, @@ -98,7 +98,7 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\"])\n |> set(key: \"_field\", value: \"Sync Duration\")", + "query": "from(bucket: \"metrics\")\n 
|> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"peer_id\"])\n |> set(key: \"_field\", value: \"Sync Duration\")", "refId": "A" } ], @@ -119,7 +119,7 @@ "type": "piechart", "datasource": { "type": "influxdb", - "uid": "" + "uid": "influxdb" }, "gridPos": { "h": 8, @@ -129,7 +129,7 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> drop(columns: [\"deployment_type\", \"attempt_type\", \"version\", \"os\"])\n |> group(columns: [\"connection_type\"])\n |> count()", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> drop(columns: [\"deployment_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\"])\n |> group(columns: [\"connection_pair_id\"])\n |> last()\n |> group(columns: [\"connection_type\"])\n |> count()", "refId": "A" } ], @@ -149,7 +149,7 @@ "type": "bargauge", "datasource": { "type": "influxdb", - "uid": "" + "uid": "influxdb" }, "gridPos": { "h": 8, @@ -159,11 +159,11 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"signaling_to_connection_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Signaling to 
Connection\"})", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"signaling_to_connection_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Signaling to Connection\"})", "refId": "A" }, { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"connection_to_wg_handshake_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Connection to WG Handshake\"})", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"connection_to_wg_handshake_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Connection to WG Handshake\"})", "refId": "B" } ], @@ -187,7 +187,7 @@ "type": "timeseries", "datasource": { "type": "influxdb", - "uid": "" + "uid": "influxdb" }, "gridPos": { "h": 8, @@ -197,7 +197,7 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: 
v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\"])\n |> set(key: \"_field\", value: \"Total Connection Time\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> set(key: \"_field\", value: \"Total Connection Time\")", "refId": "A" } ], diff --git a/client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml b/client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml index ade9b89c2a5..69b96a93a5b 100644 --- a/client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml +++ b/client/internal/metrics/infra/grafana/provisioning/datasources/influxdb.yml @@ -2,6 +2,7 @@ apiVersion: 1 datasources: - name: InfluxDB + uid: influxdb type: influxdb access: proxy url: http://influxdb:8086 diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index eba7d0471dc..31465b18718 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -2,6 +2,8 @@ package metrics import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" "io" "sync" @@ -17,6 +19,25 @@ type AgentInfo struct { DeploymentType DeploymentType Version string OS string // runtime.GOOS (linux, darwin, windows, etc.) 
+ peerID string // anonymised peer identifier (SHA-256 of WireGuard public key) +} + +// peerIDFromPublicKey returns a truncated SHA-256 hash (8 bytes / 16 hex chars) of the given WireGuard public key. +func peerIDFromPublicKey(pubKey string) string { + hash := sha256.Sum256([]byte(pubKey)) + return hex.EncodeToString(hash[:8]) +} + +// connectionPairID returns a deterministic identifier for a connection between two peers. +// It sorts the two peer IDs before hashing so the same pair always produces the same ID +// regardless of which side computes it. +func connectionPairID(peerID1, peerID2 string) string { + a, b := peerID1, peerID2 + if a > b { + a, b = b, a + } + hash := sha256.Sum256([]byte(a + b)) + return hex.EncodeToString(hash[:8]) } // metricsImplementation defines the internal interface for metrics implementations @@ -25,6 +46,7 @@ type metricsImplementation interface { RecordConnectionStages( ctx context.Context, agentInfo AgentInfo, + connectionPairID string, connectionType ConnectionType, isReconnection bool, timestamps ConnectionStageTimestamps, @@ -68,9 +90,11 @@ func (c ConnectionStageTimestamps) String() string { ) } -// RecordConnectionStages calculates stage durations from timestamps and records them +// RecordConnectionStages calculates stage durations from timestamps and records them. +// remotePubKey is the remote peer's WireGuard public key; it will be hashed for anonymisation. 
func (c *ClientMetrics) RecordConnectionStages( ctx context.Context, + remotePubKey string, connectionType ConnectionType, isReconnection bool, timestamps ConnectionStageTimestamps, @@ -82,7 +106,9 @@ func (c *ClientMetrics) RecordConnectionStages( agentInfo := c.agentInfo c.mu.RUnlock() - c.impl.RecordConnectionStages(ctx, agentInfo, connectionType, isReconnection, timestamps) + remotePeerID := peerIDFromPublicKey(remotePubKey) + pairID := connectionPairID(agentInfo.peerID, remotePeerID) + c.impl.RecordConnectionStages(ctx, agentInfo, pairID, connectionType, isReconnection, timestamps) } // RecordSyncDuration records the duration of sync message processing @@ -97,12 +123,15 @@ func (c *ClientMetrics) RecordSyncDuration(ctx context.Context, duration time.Du c.impl.RecordSyncDuration(ctx, agentInfo, duration) } -// UpdateAgentInfo updates the agent information (e.g., when switching profiles) -func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo) { +// UpdateAgentInfo updates the agent information (e.g., when switching profiles). +// publicKey is the WireGuard public key; it will be hashed for anonymisation. 
+func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo, publicKey string) { if c == nil { return } + agentInfo.peerID = peerIDFromPublicKey(publicKey) + c.mu.Lock() c.agentInfo = agentInfo c.mu.Unlock() diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go index 20b26cf8a82..7b783bc721e 100644 --- a/client/internal/metrics/push_test.go +++ b/client/internal/metrics/push_test.go @@ -64,7 +64,7 @@ type mockMetrics struct { exportData string } -func (m *mockMetrics) RecordConnectionStages(_ context.Context, _ AgentInfo, _ ConnectionType, _ bool, _ ConnectionStageTimestamps) { +func (m *mockMetrics) RecordConnectionStages(_ context.Context, _ AgentInfo, _ string, _ ConnectionType, _ bool, _ ConnectionStageTimestamps) { } func (m *mockMetrics) RecordSyncDuration(_ context.Context, _ AgentInfo, _ time.Duration) { diff --git a/client/internal/metrics/victoria.go b/client/internal/metrics/victoria.go deleted file mode 100644 index 94f134be0e5..00000000000 --- a/client/internal/metrics/victoria.go +++ /dev/null @@ -1,106 +0,0 @@ -package metrics - -import ( - "context" - "fmt" - "io" - "time" - - "github.com/VictoriaMetrics/metrics" - log "github.com/sirupsen/logrus" -) - -// victoriaMetrics is the VictoriaMetrics implementation of ClientMetrics -type victoriaMetrics struct { - // Metrics set for managing all metrics - set *metrics.Set -} - -func newVictoriaMetrics() metricsImplementation { - return &victoriaMetrics{ - set: metrics.NewSet(), - } -} - -// RecordConnectionStages records the duration of each connection stage from timestamps -func (m *victoriaMetrics) RecordConnectionStages( - _ context.Context, - agentInfo AgentInfo, - connectionType ConnectionType, - isReconnection bool, - timestamps ConnectionStageTimestamps, -) { - var signalingReceivedToConnection, connectionToWgHandshake, totalDuration float64 - - if !timestamps.SignalingReceived.IsZero() && !timestamps.ConnectionReady.IsZero() { - signalingReceivedToConnection = 
timestamps.ConnectionReady.Sub(timestamps.SignalingReceived).Seconds() - } - - if !timestamps.ConnectionReady.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - connectionToWgHandshake = timestamps.WgHandshakeSuccess.Sub(timestamps.ConnectionReady).Seconds() - } - - if !timestamps.SignalingReceived.IsZero() && !timestamps.WgHandshakeSuccess.IsZero() { - totalDuration = timestamps.WgHandshakeSuccess.Sub(timestamps.SignalingReceived).Seconds() - } - - attemptType := "initial" - if isReconnection { - attemptType = "reconnection" - } - - connTypeStr := connectionType.String() - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_signaling_received_to_connection", connTypeStr, attemptType), - ).Update(signalingReceivedToConnection) - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_stage_connection_to_wg_handshake", connTypeStr, attemptType), - ).Update(connectionToWgHandshake) - - m.set.GetOrCreateHistogram( - m.getMetricName(agentInfo, "netbird_peer_connection_total_creation_to_wg_handshake", connTypeStr, attemptType), - ).Update(totalDuration) - - log.Tracef("peer connection metrics [%s, %s, %s]: signalingReceived→connection: %.3fs, connection→wg_handshake: %.3fs, total: %.3fs", - agentInfo.DeploymentType.String(), connTypeStr, attemptType, signalingReceivedToConnection, connectionToWgHandshake, totalDuration) -} - -// getMetricName constructs a metric name with labels -func (m *victoriaMetrics) getMetricName(agentInfo AgentInfo, baseName, connectionType, attemptType string) string { - return fmt.Sprintf(`%s{deployment_type=%q,connection_type=%q,attempt_type=%q,version=%q,os=%q}`, - baseName, - agentInfo.DeploymentType.String(), - connectionType, - attemptType, - agentInfo.Version, - agentInfo.OS, - ) -} - -// RecordSyncDuration records the duration of sync message processing -func (m *victoriaMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration) { - 
metricName := fmt.Sprintf(`netbird_sync_duration_seconds{deployment_type=%q,version=%q,os=%q}`, - agentInfo.DeploymentType.String(), - agentInfo.Version, - agentInfo.OS, - ) - - m.set.GetOrCreateHistogram(metricName).Update(duration.Seconds()) -} - -// Export writes metrics in Prometheus text format -func (m *victoriaMetrics) Export(w io.Writer) error { - if m.set == nil { - return fmt.Errorf("metrics set not initialized") - } - - m.set.WritePrometheus(w) - return nil -} - -// Reset clears all collected metrics -func (m *victoriaMetrics) Reset() { - m.set.UnregisterAllMetrics() -} diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index 382638a3af4..b14376b086f 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -31,6 +31,7 @@ import ( type MetricsRecorder interface { RecordConnectionStages( ctx context.Context, + remotePubKey string, connectionType metrics.ConnectionType, isReconnection bool, timestamps metrics.ConnectionStageTimestamps, @@ -846,6 +847,7 @@ func (conn *Conn) recordConnectionMetrics() { // Record metrics with timestamps - duration calculation happens in metrics package conn.metricsRecorder.RecordConnectionStages( context.Background(), + conn.config.Key, connType, conn.metricsStages.IsReconnection(), conn.metricsStages.GetTimestamps(), From b5772892ff06e1001546f89ad160bf04e54e2359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Mon, 9 Mar 2026 15:29:53 +0100 Subject: [PATCH 23/52] Remove unused dependencies from go.mod and go.sum --- go.mod | 3 --- go.sum | 6 ------ 2 files changed, 9 deletions(-) diff --git a/go.mod b/go.mod index c47aaed3cab..4bcdbdc7810 100644 --- a/go.mod +++ b/go.mod @@ -33,7 +33,6 @@ require ( fyne.io/fyne/v2 v2.7.0 fyne.io/systray v1.12.1-0.20260116214250-81f8e1a496f9 github.com/TheJumpCloud/jcapi-go v3.0.0+incompatible - github.com/VictoriaMetrics/metrics v1.40.2 github.com/awnumar/memguard v0.23.0 github.com/aws/aws-sdk-go-v2 v1.36.3 
github.com/aws/aws-sdk-go-v2/config v1.29.14 @@ -266,8 +265,6 @@ require ( github.com/stretchr/objx v0.5.2 // indirect github.com/tklauser/go-sysconf v0.3.14 // indirect github.com/tklauser/numcpus v0.8.0 // indirect - github.com/valyala/fastrand v1.1.0 // indirect - github.com/valyala/histogram v1.2.0 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect github.com/wlynxg/anet v0.0.5 // indirect diff --git a/go.sum b/go.sum index 1cb4677a975..1bd9396bb79 100644 --- a/go.sum +++ b/go.sum @@ -36,8 +36,6 @@ github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/TheJumpCloud/jcapi-go v3.0.0+incompatible h1:hqcTK6ZISdip65SR792lwYJTa/axESA0889D3UlZbLo= github.com/TheJumpCloud/jcapi-go v3.0.0+incompatible/go.mod h1:6B1nuc1MUs6c62ODZDl7hVE5Pv7O2XGSkgg2olnq34I= -github.com/VictoriaMetrics/metrics v1.40.2 h1:OVSjKcQEx6JAwGeu8/KQm9Su5qJ72TMEW4xYn5vw3Ac= -github.com/VictoriaMetrics/metrics v1.40.2/go.mod h1:XE4uudAAIRaJE614Tl5HMrtoEU6+GDZO4QTnNSsZRuA= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= @@ -577,10 +575,6 @@ github.com/tklauser/numcpus v0.8.0 h1:Mx4Wwe/FjZLeQsK/6kt2EOepwwSl7SmJrK5bV/dXYg github.com/tklauser/numcpus v0.8.0/go.mod h1:ZJZlAY+dmR4eut8epnzf0u/VwodKmryxR8txiloSqBE= github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= -github.com/valyala/fastrand v1.1.0 h1:f+5HkLW4rsgzdNoleUOB69hyT9IlD2ZQh9GyDMfb5G8= -github.com/valyala/fastrand v1.1.0/go.mod 
h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ= -github.com/valyala/histogram v1.2.0 h1:wyYGAZZt3CpwUiIb9AU/Zbllg1llXyrtApRS815OLoQ= -github.com/valyala/histogram v1.2.0/go.mod h1:Hb4kBwb4UxsaNbbbh+RRz8ZR6pdodR57tzWUS3BUzXY= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= From 46984909dc299ac74717b8414f8ae046dd17fe31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Mon, 9 Mar 2026 15:33:20 +0100 Subject: [PATCH 24/52] Refactor InfluxDB ingest pipeline: extract validation logic - Move line validation logic to `validateLine` and `validateField` helper functions. - Improve error handling with structured validation and clearer separation of concerns. - Add stderr redirection for error messages in `create-tokens.sh`. --- .../infra/influxdb/scripts/create-tokens.sh | 6 +- client/internal/metrics/infra/ingest/main.go | 91 +++++++++++-------- 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh b/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh index 5f50d65abba..2464803e8f1 100755 --- a/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh +++ b/client/internal/metrics/infra/influxdb/scripts/create-tokens.sh @@ -5,9 +5,9 @@ BUCKET_ID=$(influx bucket list --org netbird --name metrics --json | grep -oP '"id"\s*:\s*"\K[^"]+' | head -1) ORG_ID=$(influx org list --name netbird --json | grep -oP '"id"\s*:\s*"\K[^"]+' | head -1) -if [ -z "$BUCKET_ID" ] || [ -z "$ORG_ID" ]; then - echo "ERROR: Could not determine bucket or org ID" - echo "BUCKET_ID=$BUCKET_ID ORG_ID=$ORG_ID" +if [[ -z "$BUCKET_ID" ]] || [[ -z "$ORG_ID" ]]; then + echo "ERROR: Could not determine bucket or org ID" >&2 + echo "BUCKET_ID=$BUCKET_ID ORG_ID=$ORG_ID" >&2 exit 1 fi diff --git 
a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index 663e3df9c30..012ee741a04 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -127,44 +127,8 @@ func validateLineProtocol(body []byte) ([]byte, error) { continue } - // line protocol: measurement,tag=val,tag=val field=val,field=val timestamp - parts := strings.SplitN(line, " ", 3) - if len(parts) < 2 { - return nil, fmt.Errorf("invalid line protocol: %q", truncate(line, 100)) - } - - measurementAndTags := parts[0] - measurement := measurementAndTags - if idx := strings.IndexByte(measurementAndTags, ','); idx >= 0 { - measurement = measurementAndTags[:idx] - } - - allowedFields, ok := allowedMeasurements[measurement] - if !ok { - return nil, fmt.Errorf("unknown measurement: %q", measurement) - } - - fieldPairs := strings.Split(parts[1], ",") - for _, pair := range fieldPairs { - kv := strings.SplitN(pair, "=", 2) - if len(kv) != 2 { - return nil, fmt.Errorf("invalid field: %q", pair) - } - fieldName := kv[0] - if !allowedFields[fieldName] { - return nil, fmt.Errorf("unknown field %q in measurement %q", fieldName, measurement) - } - - val, err := strconv.ParseFloat(kv[1], 64) - if err != nil { - return nil, fmt.Errorf("invalid field value %q for %q", kv[1], fieldName) - } - if val < 0 { - return nil, fmt.Errorf("negative value for %q: %g", fieldName, val) - } - if fieldName == "total_seconds" && val > maxTotalSeconds { - return nil, fmt.Errorf("total_seconds too large: %g > %g", val, maxTotalSeconds) - } + if err := validateLine(line); err != nil { + return nil, err } valid = append(valid, line) @@ -177,6 +141,57 @@ func validateLineProtocol(body []byte) ([]byte, error) { return []byte(strings.Join(valid, "\n") + "\n"), nil } +func validateLine(line string) error { + // line protocol: measurement,tag=val,tag=val field=val,field=val timestamp + parts := strings.SplitN(line, " ", 3) + if len(parts) < 2 { + return 
fmt.Errorf("invalid line protocol: %q", truncate(line, 100)) + } + + measurement := parts[0] + if idx := strings.IndexByte(measurement, ','); idx >= 0 { + measurement = measurement[:idx] + } + + allowedFields, ok := allowedMeasurements[measurement] + if !ok { + return fmt.Errorf("unknown measurement: %q", measurement) + } + + for _, pair := range strings.Split(parts[1], ",") { + if err := validateField(pair, measurement, allowedFields); err != nil { + return err + } + } + + return nil +} + +func validateField(pair, measurement string, allowedFields map[string]bool) error { + kv := strings.SplitN(pair, "=", 2) + if len(kv) != 2 { + return fmt.Errorf("invalid field: %q", pair) + } + + fieldName := kv[0] + if !allowedFields[fieldName] { + return fmt.Errorf("unknown field %q in measurement %q", fieldName, measurement) + } + + val, err := strconv.ParseFloat(kv[1], 64) + if err != nil { + return fmt.Errorf("invalid field value %q for %q", kv[1], fieldName) + } + if val < 0 { + return fmt.Errorf("negative value for %q: %g", fieldName, val) + } + if fieldName == "total_seconds" && val > maxTotalSeconds { + return fmt.Errorf("total_seconds too large: %g > %g", val, maxTotalSeconds) + } + + return nil +} + // buildConfigJSON builds the remote config JSON from env vars. // Returns nil if required vars are not set. 
func buildConfigJSON() []byte { From 1d5224bbad1016e211854463f9f7edda214f49b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Mon, 9 Mar 2026 15:37:45 +0100 Subject: [PATCH 25/52] Set non-root user in Dockerfile for Ingest service --- client/internal/metrics/infra/ingest/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/client/internal/metrics/infra/ingest/Dockerfile b/client/internal/metrics/infra/ingest/Dockerfile index 8293c6f0f35..3620c524be4 100644 --- a/client/internal/metrics/infra/ingest/Dockerfile +++ b/client/internal/metrics/infra/ingest/Dockerfile @@ -4,5 +4,7 @@ COPY go.mod main.go ./ RUN CGO_ENABLED=0 go build -o ingest . FROM alpine:3.20 +RUN adduser -D -H ingest COPY --from=build /app/ingest /usr/local/bin/ingest +USER ingest ENTRYPOINT ["ingest"] \ No newline at end of file From 15f12c883ff19a411c2f8e299cccdf20a6166591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Mon, 9 Mar 2026 17:01:59 +0100 Subject: [PATCH 26/52] Fix Windows CI: command line too long --- .github/workflows/golang-test-windows.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/golang-test-windows.yml b/.github/workflows/golang-test-windows.yml index 8af4046a780..8e672043d15 100644 --- a/.github/workflows/golang-test-windows.yml +++ b/.github/workflows/golang-test-windows.yml @@ -63,10 +63,15 @@ jobs: - run: PsExec64 -s -w ${{ github.workspace }} C:\hostedtoolcache\windows\go\${{ steps.go.outputs.go-version }}\x64\bin\go.exe env -w GOMODCACHE=${{ env.cache }} - run: PsExec64 -s -w ${{ github.workspace }} C:\hostedtoolcache\windows\go\${{ steps.go.outputs.go-version }}\x64\bin\go.exe env -w GOCACHE=${{ env.modcache }} - run: PsExec64 -s -w ${{ github.workspace }} C:\hostedtoolcache\windows\go\${{ steps.go.outputs.go-version }}\x64\bin\go.exe mod tidy - - run: echo "files=$(go list ./... 
| ForEach-Object { $_ } | Where-Object { $_ -notmatch '/management' } | Where-Object { $_ -notmatch '/relay' } | Where-Object { $_ -notmatch '/signal' } | Where-Object { $_ -notmatch '/proxy' } | Where-Object { $_ -notmatch '/combined' })" >> $env:GITHUB_ENV + - name: Generate test script + run: | + $packages = go list ./... | Where-Object { $_ -notmatch '/management' } | Where-Object { $_ -notmatch '/relay' } | Where-Object { $_ -notmatch '/signal' } | Where-Object { $_ -notmatch '/proxy' } | Where-Object { $_ -notmatch '/combined' } + $goExe = "C:\hostedtoolcache\windows\go\${{ steps.go.outputs.go-version }}\x64\bin\go.exe" + $cmd = "$goExe test -tags=devcert -timeout 10m -p 1 $($packages -join ' ') > test-out.txt 2>&1" + Set-Content -Path "${{ github.workspace }}\run-tests.cmd" -Value $cmd - name: test - run: PsExec64 -s -w ${{ github.workspace }} cmd.exe /c "C:\hostedtoolcache\windows\go\${{ steps.go.outputs.go-version }}\x64\bin\go.exe test -tags=devcert -timeout 10m -p 1 ${{ env.files }} > test-out.txt 2>&1" + run: PsExec64 -s -w ${{ github.workspace }} cmd.exe /c "${{ github.workspace }}\run-tests.cmd" - name: test output if: ${{ always() }} run: Get-Content test-out.txt From ddaaa927456c570c5f2c0b956fc959b817aec7da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 10:58:36 +0100 Subject: [PATCH 27/52] Remove Victoria metrics --- client/internal/metrics/infra/README.md | 22 +---------- .../internal/metrics/infra/docker-compose.yml | 37 ------------------- 2 files changed, 2 insertions(+), 57 deletions(-) diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index 87cb4db9bf4..5240c783c65 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -4,14 +4,7 @@ Internal documentation for the NetBird client metrics system. ## Overview -Client metrics track connection performance and sync durations. 
Two backend implementations are available: - -- **InfluxDB** (`influxdb.go`): Timestamped samples in InfluxDB line protocol. Best for sparse one-shot events (connections, syncs). Each event is pushed once then cleared. -- **VictoriaMetrics** (`victoria.go`): Prometheus-style cumulative histograms. Better for continuous/high-frequency metrics. - -Select the implementation in `metrics_default.go`: -- `newInfluxDBMetrics()` — InfluxDB line protocol -- `newVictoriaMetrics()` — Prometheus format +Client metrics track connection performance and sync durations using InfluxDB line protocol (`influxdb.go`). Each event is pushed once then cleared. Metrics are: - Disabled by default (opt-in via `NB_METRICS_ENABLED=true`) @@ -131,9 +124,7 @@ For URL and Interval, the precedence is: 5. On success: `Reset()` clears pushed samples 6. `StopPush()` cancels context and waits for goroutine -**InfluxDB mode:** Samples are collected with exact timestamps, pushed once, then cleared. No data is resent. - -**VictoriaMetrics mode:** Cumulative histograms accumulate in memory. After successful push, metrics are unregistered. Use `rate(sum)/rate(count)` for averages. +Samples are collected with exact timestamps, pushed once, then cleared. No data is resent. ## Local Development Setup @@ -150,7 +141,6 @@ This starts: - **Ingest server** on http://localhost:8087 — accepts client metrics (no auth needed) - **InfluxDB** — internal only, not exposed to host - **Grafana** on http://localhost:3001 -- **VictoriaMetrics** on http://localhost:8428 ### 2. Configure Client @@ -171,7 +161,6 @@ go run ./client/ up ### 4. View in Grafana - **InfluxDB dashboard:** http://localhost:3001/d/netbird-influxdb-metrics -- **VictoriaMetrics dashboard:** http://localhost:3001/d/netbird-connection-metrics ### 5. 
Verify Data @@ -183,11 +172,4 @@ docker compose exec influxdb influx query \ # Check ingest server health curl http://localhost:8087/health - -# VictoriaMetrics - list metrics -curl http://localhost:8428/api/v1/label/__name__/values - -# VictoriaMetrics - delete all data -curl -s http://localhost:8428/api/v1/admin/tsdb/delete_series \ - --data-urlencode 'match[]={__name__=~".+"}' ``` \ No newline at end of file diff --git a/client/internal/metrics/infra/docker-compose.yml b/client/internal/metrics/infra/docker-compose.yml index 1ad7ec7b4fe..7ca320f9c3b 100644 --- a/client/internal/metrics/infra/docker-compose.yml +++ b/client/internal/metrics/infra/docker-compose.yml @@ -1,41 +1,6 @@ version: '3.8' services: - victoriametrics: - container_name: victoriametrics - image: victoriametrics/victoria-metrics:latest - ports: - - "8428:8428" - volumes: - - victoria-metrics-data:/victoria-metrics-data - command: - - "--storageDataPath=/victoria-metrics-data" - - "--httpListenAddr=:8428" - - "--retentionPeriod=3" - - "--search.maxStalenessInterval=1m" # Stop forward-filling after 1 minute of no new data - restart: unless-stopped - networks: - - metrics - - influxdb: - container_name: influxdb - image: influxdb:2 - # No ports exposed — only accessible within the metrics network - volumes: - - influxdb-data:/var/lib/influxdb2 - - ./influxdb/scripts:/docker-entrypoint-initdb.d - environment: - - DOCKER_INFLUXDB_INIT_MODE=setup - - DOCKER_INFLUXDB_INIT_USERNAME=admin - - DOCKER_INFLUXDB_INIT_PASSWORD=${INFLUXDB_ADMIN_PASSWORD:?required} - - DOCKER_INFLUXDB_INIT_ORG=netbird - - DOCKER_INFLUXDB_INIT_BUCKET=metrics - - DOCKER_INFLUXDB_INIT_RETENTION=365d - - DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=${INFLUXDB_ADMIN_TOKEN:-} - restart: unless-stopped - networks: - - metrics - ingest: container_name: ingest build: @@ -71,14 +36,12 @@ services: - grafana-data:/var/lib/grafana - ./grafana/provisioning:/etc/grafana/provisioning depends_on: - - victoriametrics - influxdb restart: unless-stopped 
networks: - metrics volumes: - victoria-metrics-data: influxdb-data: grafana-data: From d4c80ef4936bd9b48af200ad8b58b3f2d531313d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 11:04:12 +0100 Subject: [PATCH 28/52] Add hashed peer ID as Authorization header in metrics push --- client/internal/metrics/metrics.go | 9 +++++++++ client/internal/metrics/push.go | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 31465b18718..3c63f7a3fab 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -135,6 +135,13 @@ func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo, publicKey string) { c.mu.Lock() c.agentInfo = agentInfo c.mu.Unlock() + + c.pushMu.Lock() + push := c.push + c.pushMu.Unlock() + if push != nil { + push.SetPeerID(agentInfo.peerID) + } } // Export exports metrics to the writer @@ -163,6 +170,7 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { c.mu.RLock() agentVersion := c.agentInfo.Version + peerID := c.agentInfo.peerID c.mu.RUnlock() configManager := remoteconfig.NewManager(getMetricsConfigURL(), remoteconfig.DefaultMinRefreshInterval) @@ -171,6 +179,7 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) { log.Errorf("failed to create metrics push: %v", err) return } + push.SetPeerID(peerID) ctx, cancel := context.WithCancel(ctx) c.pushCancel = cancel diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index d1579ebcf49..19086991dc7 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -6,6 +6,7 @@ import ( "fmt" "net/http" "net/url" + "sync" "time" goversion "github.com/hashicorp/go-version" @@ -59,6 +60,9 @@ type Push struct { config PushConfig agentVersion *goversion.Version + peerID string + peerMu sync.RWMutex + client *http.Client envInterval time.Duration envAddress 
*url.URL @@ -109,6 +113,13 @@ func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, }, nil } +// SetPeerID updates the hashed peer ID used for the Authorization header. +func (p *Push) SetPeerID(peerID string) { + p.peerMu.Lock() + p.peerID = peerID + p.peerMu.Unlock() +} + // Start starts the periodic push loop. // If overrideInterval is set (via env var), pushes unconditionally at that interval. // Otherwise, fetches remote config to determine push period and version eligibility. @@ -197,6 +208,13 @@ func (p *Push) push(ctx context.Context, pushURL string) error { } req.Header.Set("Content-Type", "text/plain; charset=utf-8") + p.peerMu.RLock() + peerID := p.peerID + p.peerMu.RUnlock() + if peerID != "" { + req.Header.Set("Authorization", peerID) + } + // Send request resp, err := p.client.Do(req) if err != nil { From ebfd984a652ef04b80100e41429a5729d93e3698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 11:10:24 +0100 Subject: [PATCH 29/52] Revert influxdb in docker compose --- .../internal/metrics/infra/docker-compose.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/client/internal/metrics/infra/docker-compose.yml b/client/internal/metrics/infra/docker-compose.yml index 7ca320f9c3b..a7f5791de30 100644 --- a/client/internal/metrics/infra/docker-compose.yml +++ b/client/internal/metrics/infra/docker-compose.yml @@ -21,6 +21,25 @@ services: networks: - metrics + influxdb: + container_name: influxdb + image: influxdb:2 + # No ports exposed — only accessible within the metrics network + volumes: + - influxdb-data:/var/lib/influxdb2 + - ./influxdb/scripts:/docker-entrypoint-initdb.d + environment: + - DOCKER_INFLUXDB_INIT_MODE=setup + - DOCKER_INFLUXDB_INIT_USERNAME=admin + - DOCKER_INFLUXDB_INIT_PASSWORD=${INFLUXDB_ADMIN_PASSWORD:?required} + - DOCKER_INFLUXDB_INIT_ORG=netbird + - DOCKER_INFLUXDB_INIT_BUCKET=metrics + - DOCKER_INFLUXDB_INIT_RETENTION=365d + - 
DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=${INFLUXDB_ADMIN_TOKEN:-} + restart: unless-stopped + networks: + - metrics + grafana: container_name: grafana image: grafana/grafana:11.6.0 From da63e2f99729c93a5c7949c3da9ec816cac26e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 11:17:55 +0100 Subject: [PATCH 30/52] Enable gzip compression and authorization validation for metrics push and ingest --- client/internal/metrics/infra/ingest/main.go | 43 +++++++++++++++++++- client/internal/metrics/push.go | 24 ++++++++++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index 012ee741a04..3c3d2981699 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -2,6 +2,8 @@ package main import ( "bytes" + "compress/gzip" + "encoding/hex" "encoding/json" "fmt" "io" @@ -17,6 +19,7 @@ const ( defaultInfluxDBURL = "http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns" maxBodySize = 1 * 1024 * 1024 // 1 MB max request body maxTotalSeconds = 300.0 // reject total_seconds > 5 minutes + peerIDLength = 16 // truncated SHA-256: 8 bytes = 16 hex chars ) var allowedMeasurements = map[string]map[string]bool{ @@ -47,9 +50,14 @@ func main() { return } - body, err := io.ReadAll(io.LimitReader(r.Body, maxBodySize+1)) + if err := validateAuth(r); err != nil { + http.Error(w, err.Error(), http.StatusUnauthorized) + return + } + + body, err := readBody(r) if err != nil { - http.Error(w, "read error", http.StatusBadRequest) + http.Error(w, err.Error(), http.StatusBadRequest) return } if len(body) > maxBodySize { @@ -115,6 +123,37 @@ func main() { } } +// validateAuth checks that the Authorization header contains a valid hashed peer ID. 
+func validateAuth(r *http.Request) error { + peerID := r.Header.Get("Authorization") + if peerID == "" { + return fmt.Errorf("missing Authorization header") + } + if len(peerID) != peerIDLength { + return fmt.Errorf("invalid Authorization header length") + } + if _, err := hex.DecodeString(peerID); err != nil { + return fmt.Errorf("invalid Authorization header format") + } + return nil +} + +// readBody reads the request body, decompressing gzip if Content-Encoding indicates it. +func readBody(r *http.Request) ([]byte, error) { + reader := io.LimitReader(r.Body, maxBodySize+1) + + if r.Header.Get("Content-Encoding") == "gzip" { + gz, err := gzip.NewReader(reader) + if err != nil { + return nil, fmt.Errorf("invalid gzip: %w", err) + } + defer gz.Close() + reader = gz + } + + return io.ReadAll(reader) +} + // validateLineProtocol parses InfluxDB line protocol lines, // whitelists measurements and fields, and checks value bounds. func validateLineProtocol(body []byte) ([]byte, error) { diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index 19086991dc7..06bc29a1c89 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -2,6 +2,7 @@ package metrics import ( "bytes" + "compress/gzip" "context" "fmt" "net/http" @@ -201,12 +202,19 @@ func (p *Push) push(ctx context.Context, pushURL string) error { return nil } + // Gzip compress the body + compressed, err := gzipCompress(buf.Bytes()) + if err != nil { + return fmt.Errorf("gzip compress: %w", err) + } + // Create HTTP request - req, err := http.NewRequestWithContext(ctx, "POST", pushURL, &buf) + req, err := http.NewRequestWithContext(ctx, "POST", pushURL, compressed) if err != nil { return fmt.Errorf("create request: %w", err) } req.Header.Set("Content-Type", "text/plain; charset=utf-8") + req.Header.Set("Content-Encoding", "gzip") p.peerMu.RLock() peerID := p.peerID @@ -256,6 +264,20 @@ func (p *Push) resolveServerURL(remoteServerURL *url.URL) string { return 
baseURL.String() } +// gzipCompress compresses data using gzip and returns the compressed buffer. +func gzipCompress(data []byte) (*bytes.Buffer, error) { + var buf bytes.Buffer + gz := gzip.NewWriter(&buf) + defer func() { _ = gz.Close() }() + if _, err := gz.Write(data); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + return &buf, nil +} + // isVersionInRange checks if current falls within [since, until) func isVersionInRange(current, since, until *goversion.Version) bool { return !current.LessThan(since) && current.LessThan(until) From 5815facc133c44166cbe8b2892386be5a404fcf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 11:21:01 +0100 Subject: [PATCH 31/52] Reducate code of complexity --- client/internal/metrics/infra/ingest/main.go | 102 ++++++++++--------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index 3c3d2981699..784d503c71d 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -44,7 +44,40 @@ func main() { client := &http.Client{Timeout: 10 * 1e9} // 10 seconds - http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + http.HandleFunc("/", handleIngest(client, influxURL, influxToken)) + + // Build config JSON once at startup from env vars + configJSON := buildConfigJSON() + if configJSON != nil { + log.Printf("serving remote config at /config") + } + + http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if configJSON == nil { + http.Error(w, "config not configured", http.StatusNotFound) + return + } + w.Header().Set("Content-Type", "application/json") + w.Write(configJSON) //nolint:errcheck + }) + + http.HandleFunc("/health", func(w http.ResponseWriter, 
_ *http.Request) { + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, "ok") //nolint:errcheck + }) + + log.Printf("ingest server listening on %s, forwarding to %s", listenAddr, influxURL) + if err := http.ListenAndServe(listenAddr, nil); err != nil { //nolint:gosec + log.Fatal(err) + } +} + +func handleIngest(client *http.Client, influxURL, influxToken string) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return @@ -72,55 +105,32 @@ func main() { return } - req, err := http.NewRequestWithContext(r.Context(), http.MethodPost, influxURL, bytes.NewReader(validated)) - if err != nil { - log.Printf("ERROR create request: %v", err) - http.Error(w, "internal error", http.StatusInternalServerError) - return - } - req.Header.Set("Content-Type", "text/plain; charset=utf-8") - req.Header.Set("Authorization", "Token "+influxToken) - - resp, err := client.Do(req) - if err != nil { - log.Printf("ERROR forward to influxdb: %v", err) - http.Error(w, "upstream error", http.StatusBadGateway) - return - } - defer resp.Body.Close() - - w.WriteHeader(resp.StatusCode) - io.Copy(w, resp.Body) //nolint:errcheck - }) - - // Build config JSON once at startup from env vars - configJSON := buildConfigJSON() - if configJSON != nil { - log.Printf("serving remote config at /config") + forwardToInflux(w, r, client, influxURL, influxToken, validated) } +} - http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodGet { - http.Error(w, "method not allowed", http.StatusMethodNotAllowed) - return - } - if configJSON == nil { - http.Error(w, "config not configured", http.StatusNotFound) - return - } - w.Header().Set("Content-Type", "application/json") - w.Write(configJSON) //nolint:errcheck - }) - - http.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusOK) - fmt.Fprint(w, "ok") 
//nolint:errcheck - }) +func forwardToInflux(w http.ResponseWriter, r *http.Request, client *http.Client, influxURL, influxToken string, body []byte) { + req, err := http.NewRequestWithContext(r.Context(), http.MethodPost, influxURL, bytes.NewReader(body)) + if err != nil { + log.Printf("ERROR create request: %v", err) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + req.Header.Set("Content-Type", "text/plain; charset=utf-8") + req.Header.Set("Authorization", "Token "+influxToken) - log.Printf("ingest server listening on %s, forwarding to %s", listenAddr, influxURL) - if err := http.ListenAndServe(listenAddr, nil); err != nil { //nolint:gosec - log.Fatal(err) + resp, err := client.Do(req) + if err != nil { + log.Printf("ERROR forward to influxdb: %v", err) + http.Error(w, "upstream error", http.StatusBadGateway) + return } + defer func(Body io.ReadCloser) { + _ = Body.Close() + }(resp.Body) + + w.WriteHeader(resp.StatusCode) + io.Copy(w, resp.Body) //nolint:errcheck } // validateAuth checks that the Authorization header contains a valid hashed peer ID. From 1dd2b9bb5eb213a02f98ee8222ae474298d4ab06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:02:40 +0100 Subject: [PATCH 32/52] Update debug documentation to include metrics.txt description --- client/internal/debug/debug.go | 1 + 1 file changed, 1 insertion(+) diff --git a/client/internal/debug/debug.go b/client/internal/debug/debug.go index b2208e68f5e..a4a4afb405d 100644 --- a/client/internal/debug/debug.go +++ b/client/internal/debug/debug.go @@ -53,6 +53,7 @@ resolved_domains.txt: Anonymized resolved domain IP addresses from the status re config.txt: Anonymized configuration information of the NetBird client. network_map.json: Anonymized sync response containing peer configurations, routes, DNS settings, and firewall rules. state.json: Anonymized client state dump containing netbird states for the active profile. 
+metrics.txt: Buffered client metrics in InfluxDB line protocol format. Only present when metrics collection is enabled. Peer identifiers are anonymized. mutex.prof: Mutex profiling information. goroutine.prof: Goroutine profiling information. block.prof: Block profiling information. From f6353c3473ca97375dbc173ba5e568a1d7a71ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:04:48 +0100 Subject: [PATCH 33/52] Increase `maxBodySize` limit to 50 MB and update gzip reader wrapping logic --- client/internal/metrics/infra/ingest/main.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index 784d503c71d..fc5a51b03e6 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -17,9 +17,9 @@ import ( const ( defaultListenAddr = ":8087" defaultInfluxDBURL = "http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns" - maxBodySize = 1 * 1024 * 1024 // 1 MB max request body - maxTotalSeconds = 300.0 // reject total_seconds > 5 minutes - peerIDLength = 16 // truncated SHA-256: 8 bytes = 16 hex chars + maxBodySize = 50 * 1024 * 1024 // 50 MB max request body + maxTotalSeconds = 300.0 // reject total_seconds > 5 minutes + peerIDLength = 16 // truncated SHA-256: 8 bytes = 16 hex chars ) var allowedMeasurements = map[string]map[string]bool{ @@ -158,7 +158,7 @@ func readBody(r *http.Request) ([]byte, error) { return nil, fmt.Errorf("invalid gzip: %w", err) } defer gz.Close() - reader = gz + reader = io.LimitReader(gz, maxBodySize+1) } return io.ReadAll(reader) From d8118df5149e03b6a191a55493258155b7bcb100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:08:21 +0100 Subject: [PATCH 34/52] Refactor deployment type detection to use URL parsing for improved accuracy --- client/internal/metrics/deployment_type.go | 9 +++++++-- 
1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/client/internal/metrics/deployment_type.go b/client/internal/metrics/deployment_type.go index 4bf4fa020d1..141173cb811 100644 --- a/client/internal/metrics/deployment_type.go +++ b/client/internal/metrics/deployment_type.go @@ -1,6 +1,7 @@ package metrics import ( + "net/url" "strings" ) @@ -37,8 +38,12 @@ func DetermineDeploymentType(managementURL string) DeploymentType { return DeploymentTypeUnknown } - // Check for NetBird cloud API domain - if strings.Contains(strings.ToLower(managementURL), "api.netbird.io") { + u, err := url.Parse(managementURL) + if err != nil { + return DeploymentTypeSelfHosted + } + + if strings.ToLower(u.Hostname()) == "api.netbird.io" { return DeploymentTypeCloud } From d78f05d8c91730bc4aa19eabc1a8502b9eb52a87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:10:06 +0100 Subject: [PATCH 35/52] Update readme --- client/internal/metrics/infra/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index 5240c783c65..c31fe759a68 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -132,8 +132,7 @@ Samples are collected with exact timestamps, pushed once, then cleared. 
No data ```bash # From this directory (client/internal/metrics/infra) -cp .env.example .env -# Edit .env — set INFLUXDB_ADMIN_PASSWORD, INFLUXDB_ADMIN_TOKEN +# Edit .env to change INFLUXDB_ADMIN_PASSWORD and INFLUXDB_ADMIN_TOKEN docker compose up -d ``` From 3625b3b18c4b9f1a0a0b25b308faa3d73535fd11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:16:51 +0100 Subject: [PATCH 36/52] Throttle remote config retries on fetch failure --- client/internal/metrics/remoteconfig/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/internal/metrics/remoteconfig/manager.go b/client/internal/metrics/remoteconfig/manager.go index 31f406c0bfd..01c37891f91 100644 --- a/client/internal/metrics/remoteconfig/manager.go +++ b/client/internal/metrics/remoteconfig/manager.go @@ -66,13 +66,13 @@ func (m *Manager) RefreshIfNeeded(ctx context.Context) *Config { } fetchedConfig, err := m.fetch(ctx) + m.lastFetched = time.Now() if err != nil { log.Warnf("failed to fetch metrics remote config: %v", err) return m.lastConfig // return cached (may be nil) } m.lastConfig = fetchedConfig - m.lastFetched = time.Now() log.Tracef("fetched metrics remote config: version-since=%s version-until=%s period=%s", fetchedConfig.VersionSince, fetchedConfig.VersionUntil, fetchedConfig.Interval) From e24c0bb88e23cac4509241ad289101a33f3aca73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:19:34 +0100 Subject: [PATCH 37/52] Preserve first WG handshake timestamp, ignore rekeys --- client/internal/peer/metrics_saver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/internal/peer/metrics_saver.go b/client/internal/peer/metrics_saver.go index 981148854d6..e32afbfe522 100644 --- a/client/internal/peer/metrics_saver.go +++ b/client/internal/peer/metrics_saver.go @@ -37,7 +37,7 @@ func (s *MetricsStages) RecordWGHandshakeSuccess(handshakeTime time.Time) { s.mu.Lock() defer s.mu.Unlock() 
- if !s.stageTimestamps.ConnectionReady.IsZero() { + if !s.stageTimestamps.ConnectionReady.IsZero() && s.stageTimestamps.WgHandshakeSuccess.IsZero() { // WireGuard only reports handshake times with second precision, but ConnectionReady // is captured with microsecond precision. If handshake appears before ConnectionReady // due to truncation (e.g., handshake at 6.042s truncated to 6.000s), normalize to From 804cd5dd55be8ef74b9ec5ac167416bd3a4a0b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Tue, 10 Mar 2026 16:21:28 +0100 Subject: [PATCH 38/52] Skip adding empty metrics.txt to debug bundle in debug mode --- client/internal/debug/debug.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/client/internal/debug/debug.go b/client/internal/debug/debug.go index a4a4afb405d..a1ebf324457 100644 --- a/client/internal/debug/debug.go +++ b/client/internal/debug/debug.go @@ -768,6 +768,11 @@ func (g *BundleGenerator) addMetrics() error { return fmt.Errorf("export metrics: %w", err) } + if buf.Len() == 0 { + log.Debugf("skipping metrics.txt in debug bundle: no metrics data") + return nil + } + if err := g.addFileToZip(&buf, "metrics.txt"); err != nil { return fmt.Errorf("add metrics file to zip: %w", err) } From b4cd717390b79e372d3c4e82a9acdeeb3f840370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Thu, 12 Mar 2026 16:52:29 +0100 Subject: [PATCH 39/52] Update default metrics server URL to https://ingest.netbird.io --- client/internal/metrics/push.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index 06bc29a1c89..205ff850cb9 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -25,7 +25,7 @@ const ( var defaultMetricsServerURL *url.URL func init() { - defaultMetricsServerURL, _ = url.Parse("https://ingest.stage.npeer.io") + defaultMetricsServerURL, _ = url.Parse("https://ingest.netbird.io") } // PushConfig holds 
configuration for metrics push From d2727555a6a37614b9cacbedae52f2d3c7658218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 13 Mar 2026 10:25:25 +0100 Subject: [PATCH 40/52] Atomic metrics export-and-reset to prevent sample loss between Export and Reset calls --- client/internal/metrics/influxdb.go | 26 ++++++++++++++++++-------- client/internal/metrics/metrics.go | 7 ++++--- client/internal/metrics/push.go | 5 ++--- client/internal/metrics/push_test.go | 3 ++- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index 1caa3ac79e4..e0de0668b39 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -116,7 +116,7 @@ func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentI m.trimLocked() } -// Export writes pending samples in InfluxDB line protocol format. +// Export writes pending samples in InfluxDB line protocol format without clearing them. // Format: measurement,tag=val,tag=val field=val,field=val timestamp_ns func (m *influxDBMetrics) Export(w io.Writer) error { m.mu.Lock() @@ -124,6 +124,23 @@ func (m *influxDBMetrics) Export(w io.Writer) error { copy(samples, m.samples) m.mu.Unlock() + return writeSamples(w, samples) +} + +// ExportAndReset atomically snapshots and clears pending samples, then writes +// the snapshot in InfluxDB line protocol format. Samples recorded after the +// swap are preserved for the next cycle. +func (m *influxDBMetrics) ExportAndReset(w io.Writer) error { + m.mu.Lock() + samples := m.samples + m.samples = nil + m.mu.Unlock() + + return writeSamples(w, samples) +} + +// writeSamples writes samples in InfluxDB line protocol format. 
+func writeSamples(w io.Writer, samples []influxSample) error { for _, s := range samples { if _, err := fmt.Fprintf(w, "%s,%s ", s.measurement, s.tags); err != nil { return err @@ -149,13 +166,6 @@ func (m *influxDBMetrics) Export(w io.Writer) error { return nil } -// Reset clears pending samples after a successful push -func (m *influxDBMetrics) Reset() { - m.mu.Lock() - defer m.mu.Unlock() - m.samples = m.samples[:0] -} - // trimLocked removes samples that exceed age or size limits. // Must be called with m.mu held. func (m *influxDBMetrics) trimLocked() { diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 3c63f7a3fab..a1abe903e9b 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -55,11 +55,12 @@ type metricsImplementation interface { // RecordSyncDuration records how long it took to process a sync message RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) - // Export exports metrics in Prometheus format + // Export exports metrics in Prometheus format (read-only, does not clear) Export(w io.Writer) error - // Reset clears all collected metrics - Reset() + // ExportAndReset atomically exports metrics and clears the buffer, + // ensuring no samples recorded between export and clear are lost. 
+ ExportAndReset(w io.Writer) error } type ClientMetrics struct { diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index 205ff850cb9..ee85db3d824 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -190,9 +190,9 @@ func (p *Push) resolve(ctx context.Context) (pushURL string, interval time.Durat // push exports metrics and sends them to the metrics server func (p *Push) push(ctx context.Context, pushURL string) error { - // Export metrics to buffer + // Atomically export and clear metrics var buf bytes.Buffer - if err := p.metrics.Export(&buf); err != nil { + if err := p.metrics.ExportAndReset(&buf); err != nil { return fmt.Errorf("export metrics: %w", err) } @@ -243,7 +243,6 @@ func (p *Push) push(ctx context.Context, pushURL string) error { } log.Debugf("successfully pushed metrics to %s", pushURL) - p.metrics.Reset() return nil } diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go index 7b783bc721e..a339759bd1b 100644 --- a/client/internal/metrics/push_test.go +++ b/client/internal/metrics/push_test.go @@ -78,7 +78,8 @@ func (m *mockMetrics) Export(w io.Writer) error { return nil } -func (m *mockMetrics) Reset() { +func (m *mockMetrics) ExportAndReset(w io.Writer) error { + return m.Export(w) } func TestPush_OverrideIntervalPushes(t *testing.T) { From 21ffd879a506a3fb54a0cbf91371f8ab493159a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 13 Mar 2026 10:33:43 +0100 Subject: [PATCH 41/52] Fix doc --- client/internal/metrics/env.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client/internal/metrics/env.go b/client/internal/metrics/env.go index e829ee34e78..9c23c49eac6 100644 --- a/client/internal/metrics/env.go +++ b/client/internal/metrics/env.go @@ -24,8 +24,8 @@ const ( EnvMetricsServerURL = "NB_METRICS_SERVER_URL" // EnvMetricsInterval overrides the push interval from the remote config. 
- // When set, metrics are always pushed at this interval, ignoring remote config's - // period_minutes and version range filtering. + // Only affects how often metrics are pushed; remote config availability + // and version range checks are still respected. // Format: duration string like "1h", "30m", "4h" EnvMetricsInterval = "NB_METRICS_INTERVAL" From 9df13ba7a02db032238b47e41fb8cd3f83a829df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 13 Mar 2026 11:04:11 +0100 Subject: [PATCH 42/52] Refactor Push configuration to improve clarity and enforce minimum push interval --- client/internal/metrics/push.go | 66 ++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index ee85db3d824..15892e5487d 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -19,6 +19,7 @@ import ( const ( // defaultPushInterval is the default interval for pushing metrics defaultPushInterval = 5 * time.Minute + minPushInterval = 1 * time.Second ) // defaultMetricsServerURL is used as fallback when NB_METRICS_FORCE_SENDING is true @@ -58,39 +59,39 @@ type remoteConfigProvider interface { type Push struct { metrics metricsImplementation configManager remoteConfigProvider - config PushConfig agentVersion *goversion.Version peerID string peerMu sync.RWMutex - client *http.Client - envInterval time.Duration - envAddress *url.URL + client *http.Client + cfgForceSending bool + cfgInterval time.Duration + cfgAddress *url.URL } // NewPush creates a new Push instance with configuration resolution func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, config PushConfig, agentVersion string) (*Push, error) { - var envInterval time.Duration - var envAddress *url.URL + var cfgInterval time.Duration + var cfgAddress *url.URL if config.ForceSending { - envInterval = config.Interval + cfgInterval = config.Interval if 
config.Interval <= 0 { - envInterval = defaultPushInterval + cfgInterval = defaultPushInterval } - envAddress = config.ServerAddress - if envAddress == nil { - envAddress = defaultMetricsServerURL + cfgAddress = config.ServerAddress + if cfgAddress == nil { + cfgAddress = defaultMetricsServerURL } } else { - envAddress = config.ServerAddress + cfgAddress = config.ServerAddress if config.Interval < 0 { log.Warnf("negative metrics push interval %s", config.Interval) } else { - envInterval = config.Interval + cfgInterval = config.Interval } } @@ -102,12 +103,12 @@ func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, } return &Push{ - metrics: metrics, - configManager: configManager, - config: config, - agentVersion: parsedVersion, - envInterval: envInterval, - envAddress: envAddress, + metrics: metrics, + configManager: configManager, + agentVersion: parsedVersion, + cfgForceSending: config.ForceSending, + cfgInterval: cfgInterval, + cfgAddress: cfgAddress, client: &http.Client{ Timeout: 10 * time.Second, }, @@ -127,10 +128,10 @@ func (p *Push) SetPeerID(peerID string) { func (p *Push) Start(ctx context.Context) { // Log initial state switch { - case p.config.ForceSending: - log.Infof("started metrics push with force sending to %s, interval %s", p.envAddress, p.envInterval) - case p.config.ServerAddress != nil: - log.Infof("started metrics push with server URL override: %s", p.config.ServerAddress.String()) + case p.cfgForceSending: + log.Infof("started metrics push with force sending to %s, interval %s", p.cfgAddress, p.cfgInterval) + case p.cfgAddress != nil: + log.Infof("started metrics push with server URL override: %s", p.cfgAddress.String()) default: log.Infof("started metrics push, server URL will be resolved from remote config") } @@ -153,6 +154,9 @@ func (p *Push) Start(ctx context.Context) { } } + if interval < minPushInterval { + interval = defaultPushInterval + } timer.Reset(interval) } } @@ -160,8 +164,8 @@ func (p *Push) 
Start(ctx context.Context) { // resolve returns the push URL and interval for the next cycle. // Returns empty pushURL to skip this cycle. func (p *Push) resolve(ctx context.Context) (pushURL string, interval time.Duration) { - if p.config.ForceSending { - return p.resolveServerURL(nil), p.envInterval + if p.cfgForceSending { + return p.resolveServerURL(nil), p.cfgInterval } config := p.configManager.RefreshIfNeeded(ctx) @@ -170,9 +174,11 @@ func (p *Push) resolve(ctx context.Context) (pushURL string, interval time.Durat return "", defaultPushInterval } - interval = config.Interval - if p.envInterval > 0 { - interval = p.envInterval + // prefer env variables instead of remote config + if p.cfgInterval > 0 { + interval = p.cfgInterval + } else { + interval = config.Interval } if !isVersionInRange(p.agentVersion, config.VersionSince, config.VersionUntil) { @@ -250,8 +256,8 @@ func (p *Push) push(ctx context.Context, pushURL string) error { // Precedence: envAddress (env var) > remote config server_url func (p *Push) resolveServerURL(remoteServerURL *url.URL) string { var baseURL *url.URL - if p.envAddress != nil { - baseURL = p.envAddress + if p.cfgAddress != nil { + baseURL = p.cfgAddress } else { baseURL = remoteServerURL } From 1085ad0cb2c7fd66170666fd64e51f1588e809a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 13 Mar 2026 11:19:15 +0100 Subject: [PATCH 43/52] Remove `minPushInterval` and update push interval validation logic --- client/internal/metrics/push.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index 15892e5487d..b683f0491da 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -19,7 +19,6 @@ import ( const ( // defaultPushInterval is the default interval for pushing metrics defaultPushInterval = 5 * time.Minute - minPushInterval = 1 * time.Second ) // defaultMetricsServerURL is used as fallback when 
NB_METRICS_FORCE_SENDING is true @@ -154,7 +153,7 @@ func (p *Push) Start(ctx context.Context) { } } - if interval < minPushInterval { + if interval <= 0 { interval = defaultPushInterval } timer.Reset(interval) From 44edbfd9b6bbc11608c40c0003f54c1b0702391b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Fri, 13 Mar 2026 13:21:40 +0100 Subject: [PATCH 44/52] Revert ExportAndReset, it is acceptable data loss --- client/internal/metrics/influxdb.go | 26 ++++++++------------------ client/internal/metrics/metrics.go | 7 +++---- client/internal/metrics/push.go | 5 +++-- client/internal/metrics/push_test.go | 3 +-- 4 files changed, 15 insertions(+), 26 deletions(-) diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index e0de0668b39..1caa3ac79e4 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -116,7 +116,7 @@ func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentI m.trimLocked() } -// Export writes pending samples in InfluxDB line protocol format without clearing them. +// Export writes pending samples in InfluxDB line protocol format. // Format: measurement,tag=val,tag=val field=val,field=val timestamp_ns func (m *influxDBMetrics) Export(w io.Writer) error { m.mu.Lock() @@ -124,23 +124,6 @@ func (m *influxDBMetrics) Export(w io.Writer) error { copy(samples, m.samples) m.mu.Unlock() - return writeSamples(w, samples) -} - -// ExportAndReset atomically snapshots and clears pending samples, then writes -// the snapshot in InfluxDB line protocol format. Samples recorded after the -// swap are preserved for the next cycle. -func (m *influxDBMetrics) ExportAndReset(w io.Writer) error { - m.mu.Lock() - samples := m.samples - m.samples = nil - m.mu.Unlock() - - return writeSamples(w, samples) -} - -// writeSamples writes samples in InfluxDB line protocol format. 
-func writeSamples(w io.Writer, samples []influxSample) error { for _, s := range samples { if _, err := fmt.Fprintf(w, "%s,%s ", s.measurement, s.tags); err != nil { return err @@ -166,6 +149,13 @@ func writeSamples(w io.Writer, samples []influxSample) error { return nil } +// Reset clears pending samples after a successful push +func (m *influxDBMetrics) Reset() { + m.mu.Lock() + defer m.mu.Unlock() + m.samples = m.samples[:0] +} + // trimLocked removes samples that exceed age or size limits. // Must be called with m.mu held. func (m *influxDBMetrics) trimLocked() { diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index a1abe903e9b..3c63f7a3fab 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -55,12 +55,11 @@ type metricsImplementation interface { // RecordSyncDuration records how long it took to process a sync message RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) - // Export exports metrics in Prometheus format (read-only, does not clear) + // Export exports metrics in Prometheus format Export(w io.Writer) error - // ExportAndReset atomically exports metrics and clears the buffer, - // ensuring no samples recorded between export and clear are lost. 
- ExportAndReset(w io.Writer) error + // Reset clears all collected metrics + Reset() } type ClientMetrics struct { diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index b683f0491da..e9dfebf9a07 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -195,9 +195,9 @@ func (p *Push) resolve(ctx context.Context) (pushURL string, interval time.Durat // push exports metrics and sends them to the metrics server func (p *Push) push(ctx context.Context, pushURL string) error { - // Atomically export and clear metrics + // Export metrics without clearing var buf bytes.Buffer - if err := p.metrics.ExportAndReset(&buf); err != nil { + if err := p.metrics.Export(&buf); err != nil { return fmt.Errorf("export metrics: %w", err) } @@ -248,6 +248,7 @@ func (p *Push) push(ctx context.Context, pushURL string) error { } log.Debugf("successfully pushed metrics to %s", pushURL) + p.metrics.Reset() return nil } diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go index a339759bd1b..7b783bc721e 100644 --- a/client/internal/metrics/push_test.go +++ b/client/internal/metrics/push_test.go @@ -78,8 +78,7 @@ func (m *mockMetrics) Export(w io.Writer) error { return nil } -func (m *mockMetrics) ExportAndReset(w io.Writer) error { - return m.Export(w) +func (m *mockMetrics) Reset() { } func TestPush_OverrideIntervalPushes(t *testing.T) { From 20d056923ed6fc2261d8e11154578058a5b3aa3e Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 19:31:16 +0100 Subject: [PATCH 45/52] Fix metrics review issues: rename env var, remove stale infra, add tests - Rename NB_METRICS_ENABLED to NB_METRICS_PUSH_ENABLED to clarify that collection is always active (for debug bundles) and only push is opt-in - Change default config URL from staging to production (ingest.netbird.io) - Delete broken Prometheus dashboard (used non-existent metric names) - Delete unused VictoriaMetrics datasource config - Replace 
committed .env with .env.example containing placeholder values - Wire Grafana admin credentials through env vars in docker-compose - Make metricsStages a pointer to prevent reset-vs-write race on reconnect - Fix typed-nil interface in debug bundle path (GetClientMetrics) - Use deterministic field order in InfluxDB Export (sorted keys) - Replace Authorization header with X-Peer-ID for metrics push - Fix ingest server timeout to use time.Second instead of float - Fix gzip double-close, stale comments, trim log levels - Add tests for influxdb.go and MetricsStages --- client/internal/connect.go | 3 +- client/internal/metrics/env.go | 14 +- client/internal/metrics/influxdb.go | 11 +- client/internal/metrics/influxdb_test.go | 179 ++++++++++++++++++ client/internal/metrics/infra/.env | 10 - client/internal/metrics/infra/.env.example | 16 ++ client/internal/metrics/infra/.gitignore | 2 +- client/internal/metrics/infra/README.md | 15 +- .../internal/metrics/infra/docker-compose.yml | 4 +- .../json/netbird-connection-metrics.json | 131 ------------- .../datasources/victoriametrics.yml | 12 -- client/internal/metrics/infra/ingest/main.go | 13 +- client/internal/metrics/metrics.go | 2 +- client/internal/metrics/push.go | 6 +- client/internal/peer/conn.go | 8 +- client/internal/peer/metrics_saver_test.go | 125 ++++++++++++ client/server/debug.go | 4 +- 17 files changed, 365 insertions(+), 190 deletions(-) create mode 100644 client/internal/metrics/influxdb_test.go delete mode 100644 client/internal/metrics/infra/.env create mode 100644 client/internal/metrics/infra/.env.example delete mode 100644 client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-connection-metrics.json delete mode 100644 client/internal/metrics/infra/grafana/provisioning/datasources/victoriametrics.yml create mode 100644 client/internal/peer/metrics_saver_test.go diff --git a/client/internal/connect.go b/client/internal/connect.go index 00ab9fa6044..b1d8d93caff 100644 --- 
a/client/internal/connect.go +++ b/client/internal/connect.go @@ -144,9 +144,8 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan nbnet.Init() - // Initialize metrics once at startup + // Initialize metrics once at startup (always active for debug bundles) if c.clientMetrics == nil { - // Start with unknown deployment type, will be updated on first successful connection agentInfo := metrics.AgentInfo{ DeploymentType: metrics.DeploymentTypeUnknown, Version: version.NetbirdVersion(), diff --git a/client/internal/metrics/env.go b/client/internal/metrics/env.go index 9c23c49eac6..99465621528 100644 --- a/client/internal/metrics/env.go +++ b/client/internal/metrics/env.go @@ -10,8 +10,10 @@ import ( ) const ( - // EnvMetricsEnabled is the environment variable to enable metrics push (default: disabled) - EnvMetricsEnabled = "NB_METRICS_ENABLED" + // EnvMetricsPushEnabled controls whether collected metrics are pushed to the backend. + // Metrics collection itself is always active (for debug bundles). + // Disabled by default. Set NB_METRICS_PUSH_ENABLED=true to enable push. + EnvMetricsPushEnabled = "NB_METRICS_PUSH_ENABLED" // EnvMetricsForceSending if set to true, skips remote configuration fetch and forces metric sending EnvMetricsForceSending = "NB_METRICS_FORCE_SENDING" @@ -29,13 +31,13 @@ const ( // Format: duration string like "1h", "30m", "4h" EnvMetricsInterval = "NB_METRICS_INTERVAL" - defaultMetricsConfigURL = "https://ingest.stage.npeer.io/config" + defaultMetricsConfigURL = "https://ingest.netbird.io/config" ) -// IsMetricsPushEnabled returns true if metrics push is enabled via NB_METRICS_ENABLED env var -// Disabled by default. Set NB_METRICS_ENABLED=true to enable +// IsMetricsPushEnabled returns true if metrics push is enabled via NB_METRICS_PUSH_ENABLED env var. +// Disabled by default. Metrics collection is always active for debug bundles. 
func IsMetricsPushEnabled() bool { - enabled, _ := strconv.ParseBool(os.Getenv(EnvMetricsEnabled)) + enabled, _ := strconv.ParseBool(os.Getenv(EnvMetricsPushEnabled)) return enabled } diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index 1caa3ac79e4..2084ab71fd7 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -4,6 +4,8 @@ import ( "context" "fmt" "io" + "maps" + "slices" "sync" "time" @@ -129,14 +131,15 @@ func (m *influxDBMetrics) Export(w io.Writer) error { return err } + sortedKeys := slices.Sorted(maps.Keys(s.fields)) first := true - for k, v := range s.fields { + for _, k := range sortedKeys { if !first { if _, err := fmt.Fprint(w, ","); err != nil { return err } } - if _, err := fmt.Fprintf(w, "%s=%g", k, v); err != nil { + if _, err := fmt.Fprintf(w, "%s=%g", k, s.fields[k]); err != nil { return err } first = false @@ -169,7 +172,7 @@ func (m *influxDBMetrics) trimLocked() { if cutoff > 0 { copy(m.samples, m.samples[cutoff:]) m.samples = m.samples[:len(m.samples)-cutoff] - log.Warnf("influxdb metrics: dropped %d samples older than %s", cutoff, maxSampleAge) + log.Debugf("influxdb metrics: dropped %d samples older than %s", cutoff, maxSampleAge) } // drop oldest samples if estimated size exceeds maxBufferSize @@ -178,6 +181,6 @@ func (m *influxDBMetrics) trimLocked() { drop := len(m.samples) - maxSamples copy(m.samples, m.samples[drop:]) m.samples = m.samples[:maxSamples] - log.Warnf("influxdb metrics: dropped %d oldest samples to stay under %d MB size limit", drop, maxBufferSize/(1024*1024)) + log.Debugf("influxdb metrics: dropped %d oldest samples to stay under %d MB size limit", drop, maxBufferSize/(1024*1024)) } } diff --git a/client/internal/metrics/influxdb_test.go b/client/internal/metrics/influxdb_test.go new file mode 100644 index 00000000000..c8c47f57948 --- /dev/null +++ b/client/internal/metrics/influxdb_test.go @@ -0,0 +1,179 @@ +package metrics + +import ( + 
"bytes" + "context" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestInfluxDBMetrics_RecordAndExport(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + agentInfo := AgentInfo{ + DeploymentType: DeploymentTypeCloud, + Version: "1.0.0", + OS: "linux", + peerID: "abc123", + } + + ts := ConnectionStageTimestamps{ + SignalingReceived: time.Now().Add(-3 * time.Second), + ConnectionReady: time.Now().Add(-2 * time.Second), + WgHandshakeSuccess: time.Now().Add(-1 * time.Second), + } + + m.RecordConnectionStages(context.Background(), agentInfo, "pair123", ConnectionTypeICE, false, ts) + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + + output := buf.String() + assert.Contains(t, output, "netbird_peer_connection,") + assert.Contains(t, output, "connection_to_wg_handshake_seconds=") + assert.Contains(t, output, "signaling_to_connection_seconds=") + assert.Contains(t, output, "total_seconds=") +} + +func TestInfluxDBMetrics_ExportDeterministicFieldOrder(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + agentInfo := AgentInfo{ + DeploymentType: DeploymentTypeCloud, + Version: "1.0.0", + OS: "linux", + peerID: "abc123", + } + + ts := ConnectionStageTimestamps{ + SignalingReceived: time.Now().Add(-3 * time.Second), + ConnectionReady: time.Now().Add(-2 * time.Second), + WgHandshakeSuccess: time.Now().Add(-1 * time.Second), + } + + // Record multiple times and verify consistent field order + for i := 0; i < 10; i++ { + m.RecordConnectionStages(context.Background(), agentInfo, "pair123", ConnectionTypeICE, false, ts) + } + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + + lines := strings.Split(strings.TrimSpace(buf.String()), "\n") + require.Len(t, lines, 10) + + // Extract field portion from each line and verify they're all identical + var fieldSections []string + for _, line := range lines { + parts := 
strings.SplitN(line, " ", 3) + require.Len(t, parts, 3, "each line should have measurement, fields, timestamp") + fieldSections = append(fieldSections, parts[1]) + } + + for i := 1; i < len(fieldSections); i++ { + assert.Equal(t, fieldSections[0], fieldSections[i], "field order should be deterministic across samples") + } + + // Fields should be alphabetically sorted + assert.True(t, strings.HasPrefix(fieldSections[0], "connection_to_wg_handshake_seconds="), + "fields should be sorted: connection_to_wg < signaling_to < total") +} + +func TestInfluxDBMetrics_RecordSyncDuration(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + agentInfo := AgentInfo{ + DeploymentType: DeploymentTypeSelfHosted, + Version: "2.0.0", + OS: "darwin", + peerID: "def456", + } + + m.RecordSyncDuration(context.Background(), agentInfo, 1500*time.Millisecond) + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + + output := buf.String() + assert.Contains(t, output, "netbird_sync,") + assert.Contains(t, output, "duration_seconds=1.5") + assert.Contains(t, output, "deployment_type=selfhosted") +} + +func TestInfluxDBMetrics_Reset(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + agentInfo := AgentInfo{ + DeploymentType: DeploymentTypeCloud, + Version: "1.0.0", + OS: "linux", + peerID: "abc123", + } + + m.RecordSyncDuration(context.Background(), agentInfo, time.Second) + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + assert.NotEmpty(t, buf.String()) + + m.Reset() + + buf.Reset() + err = m.Export(&buf) + require.NoError(t, err) + assert.Empty(t, buf.String(), "should be empty after reset") +} + +func TestInfluxDBMetrics_ExportEmpty(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + assert.Empty(t, buf.String()) +} + +func TestInfluxDBMetrics_TrimByAge(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + 
m.mu.Lock() + m.samples = append(m.samples, influxSample{ + measurement: "old", + tags: "t=1", + fields: map[string]float64{"v": 1}, + timestamp: time.Now().Add(-maxSampleAge - time.Hour), + }) + m.trimLocked() + remaining := len(m.samples) + m.mu.Unlock() + + assert.Equal(t, 0, remaining, "old samples should be trimmed") +} + +func TestInfluxDBMetrics_TrimBySize(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + maxSamples := maxBufferSize / estimatedSampleSize + m.mu.Lock() + for i := 0; i < maxSamples+100; i++ { + m.samples = append(m.samples, influxSample{ + measurement: "test", + tags: "t=1", + fields: map[string]float64{"v": float64(i)}, + timestamp: time.Now(), + }) + } + m.trimLocked() + remaining := len(m.samples) + m.mu.Unlock() + + assert.Equal(t, maxSamples, remaining, "should trim to max samples") +} diff --git a/client/internal/metrics/infra/.env b/client/internal/metrics/infra/.env deleted file mode 100644 index 958d3ff8362..00000000000 --- a/client/internal/metrics/infra/.env +++ /dev/null @@ -1,10 +0,0 @@ -# InfluxDB admin (server-side only, never exposed to clients) -INFLUXDB_ADMIN_PASSWORD=adminadmin -INFLUXDB_ADMIN_TOKEN=stage-admin-token - -# Remote config served by ingest at /config -# Set CONFIG_SERVER_URL to the ingest server's public address to enable -CONFIG_METRICS_SERVER_URL= -CONFIG_VERSION_SINCE=0.0.0 -CONFIG_VERSION_UNTIL=99.99.99 -CONFIG_PERIOD_MINUTES=5 diff --git a/client/internal/metrics/infra/.env.example b/client/internal/metrics/infra/.env.example new file mode 100644 index 00000000000..9c5c1a258eb --- /dev/null +++ b/client/internal/metrics/infra/.env.example @@ -0,0 +1,16 @@ +# Copy to .env and adjust values before running docker compose + +# InfluxDB admin (server-side only, never exposed to clients) +INFLUXDB_ADMIN_PASSWORD=changeme +INFLUXDB_ADMIN_TOKEN=changeme + +# Grafana admin credentials +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=changeme + +# Remote config served by ingest at /config +# Set 
CONFIG_METRICS_SERVER_URL to the ingest server's public address to enable +CONFIG_METRICS_SERVER_URL= +CONFIG_VERSION_SINCE=0.0.0 +CONFIG_VERSION_UNTIL=99.99.99 +CONFIG_PERIOD_MINUTES=5 diff --git a/client/internal/metrics/infra/.gitignore b/client/internal/metrics/infra/.gitignore index 2eea525d885..4c49bd78f1d 100644 --- a/client/internal/metrics/infra/.gitignore +++ b/client/internal/metrics/infra/.gitignore @@ -1 +1 @@ -.env \ No newline at end of file +.env diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index c31fe759a68..4f87ca402b3 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -6,8 +6,8 @@ Internal documentation for the NetBird client metrics system. Client metrics track connection performance and sync durations using InfluxDB line protocol (`influxdb.go`). Each event is pushed once then cleared. -Metrics are: -- Disabled by default (opt-in via `NB_METRICS_ENABLED=true`) +Metrics collection is always active (for debug bundles). 
Push to backend is: +- Disabled by default (opt-in via `NB_METRICS_PUSH_ENABLED=true`) - Managed at daemon layer (survives engine restarts) ## Architecture @@ -88,11 +88,11 @@ The InfluxDB backend limits in-memory sample storage to prevent unbounded growth | Variable | Default | Description | |----------|---------|-------------| -| `NB_METRICS_ENABLED` | `false` | Enable metrics push | -| `NB_METRICS_SERVER_URL` | *(from remote config)* | Ingest server URL (e.g., `https://ingest.npeer.io`) | +| `NB_METRICS_PUSH_ENABLED` | `false` | Enable metrics push to backend | +| `NB_METRICS_SERVER_URL` | *(from remote config)* | Ingest server URL (e.g., `https://ingest.netbird.io`) | | `NB_METRICS_INTERVAL` | *(from remote config)* | Push interval (e.g., "1m", "30m", "4h") | | `NB_METRICS_FORCE_SENDING` | `false` | Skip remote config, push unconditionally | -| `NB_METRICS_CONFIG_URL` | `https://api.netbird.io/client-metrics-config.json` | Remote push config URL | +| `NB_METRICS_CONFIG_URL` | `https://ingest.netbird.io/config` | Remote push config URL | ### Ingest Server Environment Variables @@ -132,7 +132,8 @@ Samples are collected with exact timestamps, pushed once, then cleared. No data ```bash # From this directory (client/internal/metrics/infra) -# Edit .env to change INFLUXDB_ADMIN_PASSWORD and INFLUXDB_ADMIN_TOKEN +cp .env.example .env +# Edit .env to set INFLUXDB_ADMIN_PASSWORD, INFLUXDB_ADMIN_TOKEN, and GRAFANA_ADMIN_PASSWORD docker compose up -d ``` @@ -144,7 +145,7 @@ This starts: ### 2. 
Configure Client ```bash -export NB_METRICS_ENABLED=true +export NB_METRICS_PUSH_ENABLED=true export NB_METRICS_FORCE_SENDING=true export NB_METRICS_SERVER_URL=http://localhost:8087 export NB_METRICS_INTERVAL=1m diff --git a/client/internal/metrics/infra/docker-compose.yml b/client/internal/metrics/infra/docker-compose.yml index a7f5791de30..0f2b6b8894a 100644 --- a/client/internal/metrics/infra/docker-compose.yml +++ b/client/internal/metrics/infra/docker-compose.yml @@ -46,8 +46,8 @@ services: ports: - "3001:3000" environment: - - GF_SECURITY_ADMIN_USER=admin - - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?required} - GF_USERS_ALLOW_SIGN_UP=false - GF_INSTALL_PLUGINS= - INFLUXDB_ADMIN_TOKEN=${INFLUXDB_ADMIN_TOKEN:-} diff --git a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-connection-metrics.json b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-connection-metrics.json deleted file mode 100644 index be2a28021aa..00000000000 --- a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-connection-metrics.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "uid": "netbird-connection-metrics", - "title": "NetBird Client Connection Metrics", - "tags": ["netbird", "connections"], - "timezone": "browser", - "panels": [ - { - "id": 8, - "title": "Sync Duration", - "type": "timeseries", - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "netbird_sync_duration_seconds_sum / netbird_sync_duration_seconds_count * 1000", - "legendFormat": "{{deployment_type}}/{{os}}", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0, - "custom": { - "drawStyle": "points", - "pointSize": 5 - } - } - } - }, - { - "id": 1, - "title": "Connection Stage Durations", - "type": "timeseries", - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 6 - }, - 
"targets": [ - { - "expr": "netbird_peer_connection_stage_signaling_received_to_connection_sum / netbird_peer_connection_stage_signaling_received_to_connection_count * 1000", - "legendFormat": "1. SignalingReceived→Connection ({{deployment_type}}/{{connection_type}})", - "refId": "A" - }, - { - "expr": "netbird_peer_connection_stage_connection_to_wg_handshake_sum / netbird_peer_connection_stage_connection_to_wg_handshake_count * 1000", - "legendFormat": "2. Connection→WG Handshake ({{deployment_type}}/{{connection_type}})", - "refId": "B" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "drawStyle": "points", - "pointSize": 5 - } - } - } - }, - { - "id": 2, - "title": "Total Connection Time", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 15 - }, - "targets": [ - { - "expr": "netbird_peer_connection_total_creation_to_wg_handshake_sum / netbird_peer_connection_total_creation_to_wg_handshake_count * 1000", - "legendFormat": "{{deployment_type}}/{{connection_type}}", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "ms", - "min": 0, - "custom": { - "drawStyle": "points", - "pointSize": 5 - } - } - } - }, - { - "id": 4, - "title": "ICE vs Relay", - "type": "piechart", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 23 - }, - "targets": [ - { - "expr": "sum(netbird_peer_connection_total_creation_to_wg_handshake_count{connection_type=\"ice\"})", - "legendFormat": "ICE", - "refId": "A" - }, - { - "expr": "sum(netbird_peer_connection_total_creation_to_wg_handshake_count{connection_type=\"relay\"})", - "legendFormat": "Relay", - "refId": "B" - } - ], - "options": { - "reduceOptions": { - "calcs": ["sum"] - }, - "pieType": "donut", - "tooltip": { - "mode": "multi" - } - } - } - ], - "schemaVersion": 27, - "version": 12, - "refresh": "30s" -} \ No newline at end of file diff --git a/client/internal/metrics/infra/grafana/provisioning/datasources/victoriametrics.yml 
b/client/internal/metrics/infra/grafana/provisioning/datasources/victoriametrics.yml deleted file mode 100644 index 4b1c84b9552..00000000000 --- a/client/internal/metrics/infra/grafana/provisioning/datasources/victoriametrics.yml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: 1 - -datasources: - - name: VictoriaMetrics - type: prometheus - access: proxy - url: http://victoriametrics:8428 - isDefault: true - editable: true - jsonData: - httpMethod: POST - timeInterval: 30s \ No newline at end of file diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index fc5a51b03e6..9d05c0460c9 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -12,6 +12,7 @@ import ( "os" "strconv" "strings" + "time" ) const ( @@ -42,7 +43,7 @@ func main() { log.Fatal("INFLUXDB_TOKEN is required") } - client := &http.Client{Timeout: 10 * 1e9} // 10 seconds + client := &http.Client{Timeout: 10 * time.Second} http.HandleFunc("/", handleIngest(client, influxURL, influxToken)) @@ -133,17 +134,17 @@ func forwardToInflux(w http.ResponseWriter, r *http.Request, client *http.Client io.Copy(w, resp.Body) //nolint:errcheck } -// validateAuth checks that the Authorization header contains a valid hashed peer ID. +// validateAuth checks that the X-Peer-ID header contains a valid hashed peer ID. 
func validateAuth(r *http.Request) error { - peerID := r.Header.Get("Authorization") + peerID := r.Header.Get("X-Peer-ID") if peerID == "" { - return fmt.Errorf("missing Authorization header") + return fmt.Errorf("missing X-Peer-ID header") } if len(peerID) != peerIDLength { - return fmt.Errorf("invalid Authorization header length") + return fmt.Errorf("invalid X-Peer-ID header length") } if _, err := hex.DecodeString(peerID); err != nil { - return fmt.Errorf("invalid Authorization header format") + return fmt.Errorf("invalid X-Peer-ID header format") } return nil } diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 3c63f7a3fab..d40430e5215 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -55,7 +55,7 @@ type metricsImplementation interface { // RecordSyncDuration records how long it took to process a sync message RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) - // Export exports metrics in Prometheus format + // Export exports metrics in InfluxDB line protocol format Export(w io.Writer) error // Reset clears all collected metrics diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index e9dfebf9a07..9381832e027 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -97,7 +97,7 @@ func NewPush(metrics metricsImplementation, configManager remoteConfigProvider, parsedVersion, err := goversion.NewVersion(agentVersion) if err != nil { if !config.ForceSending { - return nil, fmt.Errorf("failed to parse agent version %q: %w", agentVersion, err) + return nil, fmt.Errorf("parse agent version %q: %w", agentVersion, err) } } @@ -225,7 +225,7 @@ func (p *Push) push(ctx context.Context, pushURL string) error { peerID := p.peerID p.peerMu.RUnlock() if peerID != "" { - req.Header.Set("Authorization", peerID) + req.Header.Set("X-Peer-ID", peerID) } // Send request @@ -273,8 +273,8 @@ func (p *Push) 
resolveServerURL(remoteServerURL *url.URL) string { func gzipCompress(data []byte) (*bytes.Buffer, error) { var buf bytes.Buffer gz := gzip.NewWriter(&buf) - defer func() { _ = gz.Close() }() if _, err := gz.Write(data); err != nil { + _ = gz.Close() return nil, err } if err := gz.Close(); err != nil { diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index b14376b086f..bea0725f213 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -131,7 +131,7 @@ type Conn struct { // Connection stage timestamps for metrics metricsRecorder MetricsRecorder - metricsStages MetricsStages + metricsStages *MetricsStages } // NewConn creates a new not opened Conn to the remote peer. @@ -174,8 +174,8 @@ func (conn *Conn) Open(engineCtx context.Context) error { return nil } - // Record the start time - beginning of connection attempt - conn.metricsStages = MetricsStages{} + // Allocate new metrics stages so old goroutines don't corrupt new state + conn.metricsStages = &MetricsStages{} conn.ctx, conn.ctxCancel = context.WithCancel(engineCtx) @@ -188,7 +188,7 @@ func (conn *Conn) Open(engineCtx context.Context) error { } conn.workerICE = workerICE - conn.handshaker = NewHandshaker(conn.Log, conn.config, conn.signaler, conn.workerICE, conn.workerRelay, &conn.metricsStages) + conn.handshaker = NewHandshaker(conn.Log, conn.config, conn.signaler, conn.workerICE, conn.workerRelay, conn.metricsStages) conn.handshaker.AddRelayListener(conn.workerRelay.OnNewOffer) if !isForceRelayed() { diff --git a/client/internal/peer/metrics_saver_test.go b/client/internal/peer/metrics_saver_test.go new file mode 100644 index 00000000000..01c0aa9acff --- /dev/null +++ b/client/internal/peer/metrics_saver_test.go @@ -0,0 +1,125 @@ +package peer + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/netbirdio/netbird/client/internal/metrics" +) + +func 
TestMetricsStages_RecordSignalingReceived(t *testing.T) { + s := &MetricsStages{} + + s.RecordSignalingReceived() + ts := s.GetTimestamps() + require.False(t, ts.SignalingReceived.IsZero()) + + // Second call should not overwrite + first := ts.SignalingReceived + time.Sleep(time.Millisecond) + s.RecordSignalingReceived() + ts = s.GetTimestamps() + assert.Equal(t, first, ts.SignalingReceived, "should keep the first signaling timestamp") +} + +func TestMetricsStages_RecordConnectionReady(t *testing.T) { + s := &MetricsStages{} + + now := time.Now() + s.RecordConnectionReady(now) + ts := s.GetTimestamps() + assert.Equal(t, now, ts.ConnectionReady) + + // Second call should not overwrite + later := now.Add(time.Second) + s.RecordConnectionReady(later) + ts = s.GetTimestamps() + assert.Equal(t, now, ts.ConnectionReady, "should keep the first connection ready timestamp") +} + +func TestMetricsStages_RecordWGHandshakeSuccess(t *testing.T) { + s := &MetricsStages{} + + connReady := time.Now() + s.RecordConnectionReady(connReady) + + handshake := connReady.Add(500 * time.Millisecond) + s.RecordWGHandshakeSuccess(handshake) + + ts := s.GetTimestamps() + assert.Equal(t, handshake, ts.WgHandshakeSuccess) +} + +func TestMetricsStages_HandshakeBeforeConnectionReady_Normalizes(t *testing.T) { + s := &MetricsStages{} + + connReady := time.Now() + s.RecordConnectionReady(connReady) + + // WG handshake appears before ConnectionReady due to second-precision truncation + handshake := connReady.Add(-100 * time.Millisecond) + s.RecordWGHandshakeSuccess(handshake) + + ts := s.GetTimestamps() + assert.Equal(t, connReady, ts.WgHandshakeSuccess, "should normalize to ConnectionReady when handshake appears earlier") +} + +func TestMetricsStages_HandshakeIgnoredWithoutConnectionReady(t *testing.T) { + s := &MetricsStages{} + + s.RecordWGHandshakeSuccess(time.Now()) + ts := s.GetTimestamps() + assert.True(t, ts.WgHandshakeSuccess.IsZero(), "should not record handshake without connection ready") 
+} + +func TestMetricsStages_HandshakeRecordedOnce(t *testing.T) { + s := &MetricsStages{} + + connReady := time.Now() + s.RecordConnectionReady(connReady) + + first := connReady.Add(time.Second) + s.RecordWGHandshakeSuccess(first) + + // Second call (rekey) should be ignored + second := connReady.Add(2 * time.Second) + s.RecordWGHandshakeSuccess(second) + + ts := s.GetTimestamps() + assert.Equal(t, first, ts.WgHandshakeSuccess, "should preserve first handshake, ignore rekeys") +} + +func TestMetricsStages_Disconnected(t *testing.T) { + s := &MetricsStages{} + + s.RecordSignalingReceived() + s.RecordConnectionReady(time.Now()) + assert.False(t, s.IsReconnection()) + + s.Disconnected() + + assert.True(t, s.IsReconnection()) + ts := s.GetTimestamps() + assert.True(t, ts.SignalingReceived.IsZero(), "timestamps should be reset after disconnect") + assert.True(t, ts.ConnectionReady.IsZero(), "timestamps should be reset after disconnect") + assert.True(t, ts.WgHandshakeSuccess.IsZero(), "timestamps should be reset after disconnect") +} + +func TestMetricsStages_GetTimestamps(t *testing.T) { + s := &MetricsStages{} + + ts := s.GetTimestamps() + assert.Equal(t, metrics.ConnectionStageTimestamps{}, ts) + + now := time.Now() + s.RecordSignalingReceived() + s.RecordConnectionReady(now) + + ts = s.GetTimestamps() + assert.False(t, ts.SignalingReceived.IsZero()) + assert.Equal(t, now, ts.ConnectionReady) + assert.True(t, ts.WgHandshakeSuccess.IsZero()) +} diff --git a/client/server/debug.go b/client/server/debug.go index e563f79c17e..81708e57641 100644 --- a/client/server/debug.go +++ b/client/server/debug.go @@ -29,7 +29,9 @@ func (s *Server) DebugBundle(_ context.Context, req *proto.DebugBundleRequest) ( var clientMetrics debug.MetricsExporter if s.connectClient != nil { if engine := s.connectClient.Engine(); engine != nil { - clientMetrics = engine.GetClientMetrics() + if cm := engine.GetClientMetrics(); cm != nil { + clientMetrics = cm + } } } From 
d4be42b68fb485eae72aae96b5d7736aa6c9e329 Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 19:48:42 +0100 Subject: [PATCH 46/52] Add login duration metric, ingest tag validation, and duration bounds - Add netbird_login measurement recording login/auth duration to management server, with success/failure result tag - Validate InfluxDB tags against per-measurement allowlists in ingest server to prevent arbitrary tag injection - Cap all duration fields (*_seconds) at 300s instead of only total_seconds - Add ingest server tests for tag/field validation, bounds, and auth --- client/internal/connect.go | 3 + client/internal/metrics/influxdb.go | 30 +++++ client/internal/metrics/influxdb_test.go | 44 +++++++ client/internal/metrics/infra/README.md | 14 ++ client/internal/metrics/infra/ingest/go.mod | 10 +- client/internal/metrics/infra/ingest/go.sum | 10 ++ client/internal/metrics/infra/ingest/main.go | 90 +++++++++++-- .../metrics/infra/ingest/main_test.go | 124 ++++++++++++++++++ client/internal/metrics/metrics.go | 15 +++ client/internal/metrics/push_test.go | 3 + 10 files changed, 328 insertions(+), 15 deletions(-) create mode 100644 client/internal/metrics/infra/ingest/go.sum create mode 100644 client/internal/metrics/infra/ingest/main_test.go diff --git a/client/internal/connect.go b/client/internal/connect.go index b1d8d93caff..1b1976cb1ff 100644 --- a/client/internal/connect.go +++ b/client/internal/connect.go @@ -264,8 +264,10 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan }() // connect (just a connection, no stream yet) and login to Management Service to get an initial global Netbird config + loginStarted := time.Now() loginResp, err := loginToManagement(engineCtx, mgmClient, publicSSHKey, c.config) if err != nil { + c.clientMetrics.RecordLoginDuration(engineCtx, time.Since(loginStarted), false) log.Debug(err) if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) { 
state.Set(StatusNeedsLogin) @@ -274,6 +276,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan } return wrapErr(err) } + c.clientMetrics.RecordLoginDuration(engineCtx, time.Since(loginStarted), true) c.statusRecorder.MarkManagementConnected() localPeerState := peer.LocalPeerState{ diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index 2084ab71fd7..847486413b2 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -118,6 +118,36 @@ func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentI m.trimLocked() } +func (m *influxDBMetrics) RecordLoginDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration, success bool) { + result := "success" + if !success { + result = "failure" + } + + tags := fmt.Sprintf("deployment_type=%s,result=%s,version=%s,os=%s,peer_id=%s", + agentInfo.DeploymentType.String(), + result, + agentInfo.Version, + agentInfo.OS, + agentInfo.peerID, + ) + + m.mu.Lock() + defer m.mu.Unlock() + + m.samples = append(m.samples, influxSample{ + measurement: "netbird_login", + tags: tags, + fields: map[string]float64{ + "duration_seconds": duration.Seconds(), + }, + timestamp: time.Now(), + }) + m.trimLocked() + + log.Tracef("login metrics [%s, %s]: duration=%.3fs", agentInfo.DeploymentType.String(), result, duration.Seconds()) +} + // Export writes pending samples in InfluxDB line protocol format. 
// Format: measurement,tag=val,tag=val field=val,field=val timestamp_ns func (m *influxDBMetrics) Export(w io.Writer) error { diff --git a/client/internal/metrics/influxdb_test.go b/client/internal/metrics/influxdb_test.go index c8c47f57948..0c6ddcb6fea 100644 --- a/client/internal/metrics/influxdb_test.go +++ b/client/internal/metrics/influxdb_test.go @@ -158,6 +158,50 @@ func TestInfluxDBMetrics_TrimByAge(t *testing.T) { assert.Equal(t, 0, remaining, "old samples should be trimmed") } +func TestInfluxDBMetrics_RecordLoginDuration(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + agentInfo := AgentInfo{ + DeploymentType: DeploymentTypeCloud, + Version: "1.0.0", + OS: "linux", + peerID: "abc123", + } + + m.RecordLoginDuration(context.Background(), agentInfo, 2500*time.Millisecond, true) + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + + output := buf.String() + assert.Contains(t, output, "netbird_login,") + assert.Contains(t, output, "duration_seconds=2.5") + assert.Contains(t, output, "result=success") +} + +func TestInfluxDBMetrics_RecordLoginDurationFailure(t *testing.T) { + m := newInfluxDBMetrics().(*influxDBMetrics) + + agentInfo := AgentInfo{ + DeploymentType: DeploymentTypeSelfHosted, + Version: "1.0.0", + OS: "darwin", + peerID: "xyz789", + } + + m.RecordLoginDuration(context.Background(), agentInfo, 5*time.Second, false) + + var buf bytes.Buffer + err := m.Export(&buf) + require.NoError(t, err) + + output := buf.String() + assert.Contains(t, output, "netbird_login,") + assert.Contains(t, output, "result=failure") + assert.Contains(t, output, "deployment_type=selfhosted") +} + func TestInfluxDBMetrics_TrimBySize(t *testing.T) { m := newInfluxDBMetrics().(*influxDBMetrics) diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index 4f87ca402b3..0cbbee01120 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -76,6 +76,20 @@ 
Tags: - `version`: NetBird version string - `os`: Operating system (linux, darwin, windows, android, ios, etc.) +### Login Duration + +Measurement: `netbird_login` + +| Field | Description | +|-------|-------------| +| `duration_seconds` | Time to complete the login/auth exchange with management server | + +Tags: +- `deployment_type`: "cloud" | "selfhosted" | "unknown" +- `result`: "success" | "failure" +- `version`: NetBird version string +- `os`: Operating system (linux, darwin, windows, android, ios, etc.) + ## Buffer Limits The InfluxDB backend limits in-memory sample storage to prevent unbounded growth when pushes fail: diff --git a/client/internal/metrics/infra/ingest/go.mod b/client/internal/metrics/infra/ingest/go.mod index 0d50be2bef8..aaf1ea9dab7 100644 --- a/client/internal/metrics/infra/ingest/go.mod +++ b/client/internal/metrics/infra/ingest/go.mod @@ -1,3 +1,11 @@ module github.com/netbirdio/netbird/client/internal/metrics/infra/ingest -go 1.25 \ No newline at end of file +go 1.25 + +require github.com/stretchr/testify v1.11.1 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/client/internal/metrics/infra/ingest/go.sum b/client/internal/metrics/infra/ingest/go.sum new file mode 100644 index 00000000000..c4c1710c475 --- /dev/null +++ b/client/internal/metrics/infra/ingest/go.sum @@ -0,0 +1,10 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index 9d05c0460c9..02beae607d1 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -19,18 +19,55 @@ const ( defaultListenAddr = ":8087" defaultInfluxDBURL = "http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns" maxBodySize = 50 * 1024 * 1024 // 50 MB max request body - maxTotalSeconds = 300.0 // reject total_seconds > 5 minutes + maxDurationSeconds = 300.0 // reject any duration field > 5 minutes peerIDLength = 16 // truncated SHA-256: 8 bytes = 16 hex chars + maxTagValueLength = 64 // reject tag values longer than this ) -var allowedMeasurements = map[string]map[string]bool{ +type measurementSpec struct { + allowedFields map[string]bool + allowedTags map[string]bool +} + +var allowedMeasurements = map[string]measurementSpec{ "netbird_peer_connection": { - "signaling_to_connection_seconds": true, - "connection_to_wg_handshake_seconds": true, - "total_seconds": true, + allowedFields: map[string]bool{ + "signaling_to_connection_seconds": true, + "connection_to_wg_handshake_seconds": true, + "total_seconds": true, + }, + allowedTags: map[string]bool{ + "deployment_type": true, + "connection_type": true, + "attempt_type": true, + "version": true, + "os": true, + "peer_id": true, + "connection_pair_id": true, + }, }, "netbird_sync": { - "duration_seconds": true, + allowedFields: map[string]bool{ + "duration_seconds": true, + }, + allowedTags: map[string]bool{ + "deployment_type": true, + "version": true, + "os": true, + "peer_id": true, + }, + }, + 
"netbird_login": { + allowedFields: map[string]bool{ + "duration_seconds": true, + }, + allowedTags: map[string]bool{ + "deployment_type": true, + "result": true, + "version": true, + "os": true, + "peer_id": true, + }, }, } @@ -198,18 +235,25 @@ func validateLine(line string) error { return fmt.Errorf("invalid line protocol: %q", truncate(line, 100)) } - measurement := parts[0] - if idx := strings.IndexByte(measurement, ','); idx >= 0 { - measurement = measurement[:idx] - } + // parts[0] is "measurement,tag=val,tag=val" + measurementAndTags := strings.Split(parts[0], ",") + measurement := measurementAndTags[0] - allowedFields, ok := allowedMeasurements[measurement] + spec, ok := allowedMeasurements[measurement] if !ok { return fmt.Errorf("unknown measurement: %q", measurement) } + // Validate tags (everything after measurement name in parts[0]) + for _, tagPair := range measurementAndTags[1:] { + if err := validateTag(tagPair, measurement, spec.allowedTags); err != nil { + return err + } + } + + // Validate fields for _, pair := range strings.Split(parts[1], ",") { - if err := validateField(pair, measurement, allowedFields); err != nil { + if err := validateField(pair, measurement, spec.allowedFields); err != nil { return err } } @@ -217,6 +261,24 @@ func validateLine(line string) error { return nil } +func validateTag(pair, measurement string, allowedTags map[string]bool) error { + kv := strings.SplitN(pair, "=", 2) + if len(kv) != 2 { + return fmt.Errorf("invalid tag: %q", pair) + } + + tagName := kv[0] + if !allowedTags[tagName] { + return fmt.Errorf("unknown tag %q in measurement %q", tagName, measurement) + } + + if len(kv[1]) > maxTagValueLength { + return fmt.Errorf("tag value too long for %q: %d > %d", tagName, len(kv[1]), maxTagValueLength) + } + + return nil +} + func validateField(pair, measurement string, allowedFields map[string]bool) error { kv := strings.SplitN(pair, "=", 2) if len(kv) != 2 { @@ -235,8 +297,8 @@ func validateField(pair, measurement 
string, allowedFields map[string]bool) erro if val < 0 { return fmt.Errorf("negative value for %q: %g", fieldName, val) } - if fieldName == "total_seconds" && val > maxTotalSeconds { - return fmt.Errorf("total_seconds too large: %g > %g", val, maxTotalSeconds) + if strings.HasSuffix(fieldName, "_seconds") && val > maxDurationSeconds { + return fmt.Errorf("%q too large: %g > %g", fieldName, val, maxDurationSeconds) } return nil diff --git a/client/internal/metrics/infra/ingest/main_test.go b/client/internal/metrics/infra/ingest/main_test.go new file mode 100644 index 00000000000..3c16c3ec0e8 --- /dev/null +++ b/client/internal/metrics/infra/ingest/main_test.go @@ -0,0 +1,124 @@ +package main + +import ( + "net/http" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateLine_ValidPeerConnection(t *testing.T) { + line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,peer_id=abcdef0123456789,connection_pair_id=pair1234 signaling_to_connection_seconds=1.5,connection_to_wg_handshake_seconds=0.5,total_seconds=2 1234567890` + assert.NoError(t, validateLine(line)) +} + +func TestValidateLine_ValidSync(t *testing.T) { + line := `netbird_sync,deployment_type=selfhosted,version=2.0.0,os=darwin,peer_id=abcdef0123456789 duration_seconds=1.5 1234567890` + assert.NoError(t, validateLine(line)) +} + +func TestValidateLine_ValidLogin(t *testing.T) { + line := `netbird_login,deployment_type=cloud,result=success,version=1.0.0,os=linux,peer_id=abcdef0123456789 duration_seconds=3.2 1234567890` + assert.NoError(t, validateLine(line)) +} + +func TestValidateLine_UnknownMeasurement(t *testing.T) { + line := `unknown_metric,foo=bar value=1 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown measurement") +} + +func TestValidateLine_UnknownTag(t *testing.T) { + line := 
`netbird_sync,deployment_type=cloud,evil_tag=injected,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown tag") +} + +func TestValidateLine_UnknownField(t *testing.T) { + line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc injected_field=1 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown field") +} + +func TestValidateLine_NegativeValue(t *testing.T) { + line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=-1.5 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "negative") +} + +func TestValidateLine_DurationTooLarge(t *testing.T) { + line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=999 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "too large") +} + +func TestValidateLine_TotalSecondsTooLarge(t *testing.T) { + line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,peer_id=abc,connection_pair_id=pair total_seconds=500 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "too large") +} + +func TestValidateLine_TagValueTooLong(t *testing.T) { + longTag := strings.Repeat("a", maxTagValueLength+1) + line := `netbird_sync,deployment_type=` + longTag + `,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890` + err := validateLine(line) + require.Error(t, err) + assert.Contains(t, err.Error(), "tag value too long") +} + +func TestValidateLineProtocol_MultipleLines(t *testing.T) { + body := []byte( + "netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890\n" + + 
"netbird_login,deployment_type=cloud,result=success,version=1.0.0,os=linux,peer_id=abc duration_seconds=2.0 1234567890\n", + ) + validated, err := validateLineProtocol(body) + require.NoError(t, err) + assert.Contains(t, string(validated), "netbird_sync") + assert.Contains(t, string(validated), "netbird_login") +} + +func TestValidateLineProtocol_RejectsOnBadLine(t *testing.T) { + body := []byte( + "netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890\n" + + "evil_metric,foo=bar value=1 1234567890\n", + ) + _, err := validateLineProtocol(body) + require.Error(t, err) +} + +func TestValidateAuth(t *testing.T) { + tests := []struct { + name string + peerID string + wantErr bool + }{ + {"valid hex", "abcdef0123456789", false}, + {"empty", "", true}, + {"too short", "abcdef01234567", true}, + {"too long", "abcdef01234567890", true}, + {"invalid hex", "ghijklmnopqrstuv", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r, _ := http.NewRequest(http.MethodPost, "/", nil) + if tt.peerID != "" { + r.Header.Set("X-Peer-ID", tt.peerID) + } + err := validateAuth(r) + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index d40430e5215..1f99c42b19b 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -55,6 +55,9 @@ type metricsImplementation interface { // RecordSyncDuration records how long it took to process a sync message RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration) + // RecordLoginDuration records how long the login to management took + RecordLoginDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration, success bool) + // Export exports metrics in InfluxDB line protocol format Export(w io.Writer) error @@ -123,6 +126,18 @@ func (c *ClientMetrics) RecordSyncDuration(ctx 
context.Context, duration time.Du c.impl.RecordSyncDuration(ctx, agentInfo, duration) } +// RecordLoginDuration records how long the login to management server took +func (c *ClientMetrics) RecordLoginDuration(ctx context.Context, duration time.Duration, success bool) { + if c == nil { + return + } + c.mu.RLock() + agentInfo := c.agentInfo + c.mu.RUnlock() + + c.impl.RecordLoginDuration(ctx, agentInfo, duration, success) +} + // UpdateAgentInfo updates the agent information (e.g., when switching profiles). // publicKey is the WireGuard public key; it will be hashed for anonymisation. func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo, publicKey string) { diff --git a/client/internal/metrics/push_test.go b/client/internal/metrics/push_test.go index 7b783bc721e..20a509da16a 100644 --- a/client/internal/metrics/push_test.go +++ b/client/internal/metrics/push_test.go @@ -70,6 +70,9 @@ func (m *mockMetrics) RecordConnectionStages(_ context.Context, _ AgentInfo, _ s func (m *mockMetrics) RecordSyncDuration(_ context.Context, _ AgentInfo, _ time.Duration) { } +func (m *mockMetrics) RecordLoginDuration(_ context.Context, _ AgentInfo, _ time.Duration, _ bool) { +} + func (m *mockMetrics) Export(w io.Writer) error { if m.exportData != "" { _, err := w.Write([]byte(m.exportData)) From 672fc66cf27a457d20771a4d72586b613584398c Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 19:55:10 +0100 Subject: [PATCH 47/52] Add arch tag to all metrics --- client/internal/connect.go | 2 ++ client/internal/metrics/influxdb.go | 9 ++++--- client/internal/metrics/influxdb_test.go | 6 +++++ client/internal/metrics/infra/README.md | 3 +++ client/internal/metrics/infra/ingest/main.go | 17 +++++++------ .../metrics/infra/ingest/main_test.go | 24 +++++++++---------- client/internal/metrics/metrics.go | 1 + 7 files changed, 40 insertions(+), 22 deletions(-) diff --git a/client/internal/connect.go b/client/internal/connect.go index 1b1976cb1ff..dcf0770c211 100644 --- 
a/client/internal/connect.go +++ b/client/internal/connect.go @@ -150,6 +150,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan DeploymentType: metrics.DeploymentTypeUnknown, Version: version.NetbirdVersion(), OS: runtime.GOOS, + Arch: runtime.GOARCH, } c.clientMetrics = metrics.NewClientMetrics(agentInfo) log.Debugf("initialized client metrics") @@ -253,6 +254,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan DeploymentType: deploymentType, Version: version.NetbirdVersion(), OS: runtime.GOOS, + Arch: runtime.GOARCH, } c.clientMetrics.UpdateAgentInfo(agentInfo, myPrivateKey.PublicKey().String()) diff --git a/client/internal/metrics/influxdb.go b/client/internal/metrics/influxdb.go index 847486413b2..531f6a9867c 100644 --- a/client/internal/metrics/influxdb.go +++ b/client/internal/metrics/influxdb.go @@ -65,12 +65,13 @@ func (m *influxDBMetrics) RecordConnectionStages( } connTypeStr := connectionType.String() - tags := fmt.Sprintf("deployment_type=%s,connection_type=%s,attempt_type=%s,version=%s,os=%s,peer_id=%s,connection_pair_id=%s", + tags := fmt.Sprintf("deployment_type=%s,connection_type=%s,attempt_type=%s,version=%s,os=%s,arch=%s,peer_id=%s,connection_pair_id=%s", agentInfo.DeploymentType.String(), connTypeStr, attemptType, agentInfo.Version, agentInfo.OS, + agentInfo.Arch, agentInfo.peerID, connectionPairID, ) @@ -97,10 +98,11 @@ func (m *influxDBMetrics) RecordConnectionStages( } func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration) { - tags := fmt.Sprintf("deployment_type=%s,version=%s,os=%s,peer_id=%s", + tags := fmt.Sprintf("deployment_type=%s,version=%s,os=%s,arch=%s,peer_id=%s", agentInfo.DeploymentType.String(), agentInfo.Version, agentInfo.OS, + agentInfo.Arch, agentInfo.peerID, ) @@ -124,11 +126,12 @@ func (m *influxDBMetrics) RecordLoginDuration(_ context.Context, agentInfo Agent result = "failure" } - tags := 
fmt.Sprintf("deployment_type=%s,result=%s,version=%s,os=%s,peer_id=%s", + tags := fmt.Sprintf("deployment_type=%s,result=%s,version=%s,os=%s,arch=%s,peer_id=%s", agentInfo.DeploymentType.String(), result, agentInfo.Version, agentInfo.OS, + agentInfo.Arch, agentInfo.peerID, ) diff --git a/client/internal/metrics/influxdb_test.go b/client/internal/metrics/influxdb_test.go index 0c6ddcb6fea..b964e31a3f5 100644 --- a/client/internal/metrics/influxdb_test.go +++ b/client/internal/metrics/influxdb_test.go @@ -18,6 +18,7 @@ func TestInfluxDBMetrics_RecordAndExport(t *testing.T) { DeploymentType: DeploymentTypeCloud, Version: "1.0.0", OS: "linux", + Arch: "amd64", peerID: "abc123", } @@ -47,6 +48,7 @@ func TestInfluxDBMetrics_ExportDeterministicFieldOrder(t *testing.T) { DeploymentType: DeploymentTypeCloud, Version: "1.0.0", OS: "linux", + Arch: "amd64", peerID: "abc123", } @@ -92,6 +94,7 @@ func TestInfluxDBMetrics_RecordSyncDuration(t *testing.T) { DeploymentType: DeploymentTypeSelfHosted, Version: "2.0.0", OS: "darwin", + Arch: "arm64", peerID: "def456", } @@ -114,6 +117,7 @@ func TestInfluxDBMetrics_Reset(t *testing.T) { DeploymentType: DeploymentTypeCloud, Version: "1.0.0", OS: "linux", + Arch: "amd64", peerID: "abc123", } @@ -165,6 +169,7 @@ func TestInfluxDBMetrics_RecordLoginDuration(t *testing.T) { DeploymentType: DeploymentTypeCloud, Version: "1.0.0", OS: "linux", + Arch: "amd64", peerID: "abc123", } @@ -187,6 +192,7 @@ func TestInfluxDBMetrics_RecordLoginDurationFailure(t *testing.T) { DeploymentType: DeploymentTypeSelfHosted, Version: "1.0.0", OS: "darwin", + Arch: "arm64", peerID: "xyz789", } diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index 0cbbee01120..d2134a36c01 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -60,6 +60,7 @@ Tags: - `attempt_type`: "initial" | "reconnection" - `version`: NetBird version string - `os`: Operating system (linux, darwin, 
windows, android, ios, etc.) +- `arch`: CPU architecture (amd64, arm64, etc.) **Note:** `SignalingReceived` is set when the first offer or answer arrives from the remote peer (in both initial and reconnection paths). It excludes the potentially unbounded wait for the remote peer to come online. @@ -75,6 +76,7 @@ Tags: - `deployment_type`: "cloud" | "selfhosted" | "unknown" - `version`: NetBird version string - `os`: Operating system (linux, darwin, windows, android, ios, etc.) +- `arch`: CPU architecture (amd64, arm64, etc.) ### Login Duration @@ -89,6 +91,7 @@ Tags: - `result`: "success" | "failure" - `version`: NetBird version string - `os`: Operating system (linux, darwin, windows, android, ios, etc.) +- `arch`: CPU architecture (amd64, arm64, etc.) ## Buffer Limits diff --git a/client/internal/metrics/infra/ingest/main.go b/client/internal/metrics/infra/ingest/main.go index 02beae607d1..a5031a873fc 100644 --- a/client/internal/metrics/infra/ingest/main.go +++ b/client/internal/metrics/infra/ingest/main.go @@ -37,13 +37,14 @@ var allowedMeasurements = map[string]measurementSpec{ "total_seconds": true, }, allowedTags: map[string]bool{ - "deployment_type": true, - "connection_type": true, - "attempt_type": true, - "version": true, - "os": true, - "peer_id": true, - "connection_pair_id": true, + "deployment_type": true, + "connection_type": true, + "attempt_type": true, + "version": true, + "os": true, + "arch": true, + "peer_id": true, + "connection_pair_id": true, }, }, "netbird_sync": { @@ -54,6 +55,7 @@ var allowedMeasurements = map[string]measurementSpec{ "deployment_type": true, "version": true, "os": true, + "arch": true, "peer_id": true, }, }, @@ -66,6 +68,7 @@ var allowedMeasurements = map[string]measurementSpec{ "result": true, "version": true, "os": true, + "arch": true, "peer_id": true, }, }, diff --git a/client/internal/metrics/infra/ingest/main_test.go b/client/internal/metrics/infra/ingest/main_test.go index 3c16c3ec0e8..bacaa4588fc 100644 --- 
a/client/internal/metrics/infra/ingest/main_test.go +++ b/client/internal/metrics/infra/ingest/main_test.go @@ -10,17 +10,17 @@ import ( ) func TestValidateLine_ValidPeerConnection(t *testing.T) { - line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,peer_id=abcdef0123456789,connection_pair_id=pair1234 signaling_to_connection_seconds=1.5,connection_to_wg_handshake_seconds=0.5,total_seconds=2 1234567890` + line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,arch=amd64,peer_id=abcdef0123456789,connection_pair_id=pair1234 signaling_to_connection_seconds=1.5,connection_to_wg_handshake_seconds=0.5,total_seconds=2 1234567890` assert.NoError(t, validateLine(line)) } func TestValidateLine_ValidSync(t *testing.T) { - line := `netbird_sync,deployment_type=selfhosted,version=2.0.0,os=darwin,peer_id=abcdef0123456789 duration_seconds=1.5 1234567890` + line := `netbird_sync,deployment_type=selfhosted,version=2.0.0,os=darwin,arch=arm64,peer_id=abcdef0123456789 duration_seconds=1.5 1234567890` assert.NoError(t, validateLine(line)) } func TestValidateLine_ValidLogin(t *testing.T) { - line := `netbird_login,deployment_type=cloud,result=success,version=1.0.0,os=linux,peer_id=abcdef0123456789 duration_seconds=3.2 1234567890` + line := `netbird_login,deployment_type=cloud,result=success,version=1.0.0,os=linux,arch=amd64,peer_id=abcdef0123456789 duration_seconds=3.2 1234567890` assert.NoError(t, validateLine(line)) } @@ -32,35 +32,35 @@ func TestValidateLine_UnknownMeasurement(t *testing.T) { } func TestValidateLine_UnknownTag(t *testing.T) { - line := `netbird_sync,deployment_type=cloud,evil_tag=injected,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890` + line := `netbird_sync,deployment_type=cloud,evil_tag=injected,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=1.5 1234567890` err := validateLine(line) require.Error(t, 
err) assert.Contains(t, err.Error(), "unknown tag") } func TestValidateLine_UnknownField(t *testing.T) { - line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc injected_field=1 1234567890` + line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc injected_field=1 1234567890` err := validateLine(line) require.Error(t, err) assert.Contains(t, err.Error(), "unknown field") } func TestValidateLine_NegativeValue(t *testing.T) { - line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=-1.5 1234567890` + line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=-1.5 1234567890` err := validateLine(line) require.Error(t, err) assert.Contains(t, err.Error(), "negative") } func TestValidateLine_DurationTooLarge(t *testing.T) { - line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=999 1234567890` + line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=999 1234567890` err := validateLine(line) require.Error(t, err) assert.Contains(t, err.Error(), "too large") } func TestValidateLine_TotalSecondsTooLarge(t *testing.T) { - line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,peer_id=abc,connection_pair_id=pair total_seconds=500 1234567890` + line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,arch=amd64,peer_id=abc,connection_pair_id=pair total_seconds=500 1234567890` err := validateLine(line) require.Error(t, err) assert.Contains(t, err.Error(), "too large") @@ -68,7 +68,7 @@ func TestValidateLine_TotalSecondsTooLarge(t *testing.T) { func TestValidateLine_TagValueTooLong(t *testing.T) { longTag := strings.Repeat("a", maxTagValueLength+1) - line := `netbird_sync,deployment_type=` + longTag + 
`,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890` + line := `netbird_sync,deployment_type=` + longTag + `,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=1.5 1234567890` err := validateLine(line) require.Error(t, err) assert.Contains(t, err.Error(), "tag value too long") @@ -76,8 +76,8 @@ func TestValidateLine_TagValueTooLong(t *testing.T) { func TestValidateLineProtocol_MultipleLines(t *testing.T) { body := []byte( - "netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890\n" + - "netbird_login,deployment_type=cloud,result=success,version=1.0.0,os=linux,peer_id=abc duration_seconds=2.0 1234567890\n", + "netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=1.5 1234567890\n" + + "netbird_login,deployment_type=cloud,result=success,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=2.0 1234567890\n", ) validated, err := validateLineProtocol(body) require.NoError(t, err) @@ -87,7 +87,7 @@ func TestValidateLineProtocol_MultipleLines(t *testing.T) { func TestValidateLineProtocol_RejectsOnBadLine(t *testing.T) { body := []byte( - "netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,peer_id=abc duration_seconds=1.5 1234567890\n" + + "netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=1.5 1234567890\n" + "evil_metric,foo=bar value=1 1234567890\n", ) _, err := validateLineProtocol(body) diff --git a/client/internal/metrics/metrics.go b/client/internal/metrics/metrics.go index 1f99c42b19b..4ebb4349659 100644 --- a/client/internal/metrics/metrics.go +++ b/client/internal/metrics/metrics.go @@ -19,6 +19,7 @@ type AgentInfo struct { DeploymentType DeploymentType Version string OS string // runtime.GOOS (linux, darwin, windows, etc.) + Arch string // runtime.GOARCH (amd64, arm64, etc.) 
peerID string // anonymised peer identifier (SHA-256 of WireGuard public key) } From c44b7975039345024625446a7c03ee93fa6383cc Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 20:07:49 +0100 Subject: [PATCH 48/52] Fix Grafana dashboard: add arch to drop columns, add login panels --- .../json/netbird-influxdb-metrics.json | 81 ++++++++++++++++--- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json index 89c751b1ea7..2bcc9cbabf3 100644 --- a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json +++ b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-influxdb-metrics.json @@ -20,11 +20,11 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"peer_id\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"arch\", \"peer_id\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", "refId": "A" }, { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", 
\"version\", \"os\", \"peer_id\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"arch\", \"peer_id\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", "refId": "B" } ], @@ -59,11 +59,11 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"arch\", \"peer_id\", \"connection_pair_id\"])\n |> min()\n |> set(key: \"_field\", value: \"Min\")", "refId": "A" }, { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, 
stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"arch\", \"peer_id\", \"connection_pair_id\"])\n |> max()\n |> set(key: \"_field\", value: \"Max\")", "refId": "B" } ], @@ -98,7 +98,7 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"peer_id\"])\n |> set(key: \"_field\", value: \"Sync Duration\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"arch\", \"peer_id\"])\n |> set(key: \"_field\", value: \"Sync Duration\")", "refId": "A" } ], @@ -129,7 +129,7 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> drop(columns: [\"deployment_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\"])\n |> group(columns: [\"connection_pair_id\"])\n |> last()\n |> group(columns: [\"connection_type\"])\n |> count()", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> drop(columns: [\"deployment_type\", \"attempt_type\", \"version\", \"os\", \"arch\", \"peer_id\"])\n |> 
group(columns: [\"connection_pair_id\"])\n |> last()\n |> group(columns: [\"connection_type\"])\n |> count()", "refId": "A" } ], @@ -159,11 +159,11 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"signaling_to_connection_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Signaling to Connection\"})", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"signaling_to_connection_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"arch\", \"peer_id\", \"connection_pair_id\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Signaling to Connection\"})", "refId": "A" }, { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"connection_to_wg_handshake_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Connection to WG Handshake\"})", + "query": "from(bucket: \"metrics\")\n |> range(start: 
v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"connection_to_wg_handshake_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"arch\", \"peer_id\", \"connection_pair_id\"])\n |> mean()\n |> drop(columns: [\"_start\", \"_stop\", \"_measurement\", \"_time\", \"_field\"])\n |> rename(columns: {_value: \"Avg Connection to WG Handshake\"})", "refId": "B" } ], @@ -197,7 +197,7 @@ }, "targets": [ { - "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"peer_id\", \"connection_pair_id\"])\n |> set(key: \"_field\", value: \"Total Connection Time\")", + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_peer_connection\" and r._field == \"total_seconds\")\n |> map(fn: (r) => ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"connection_type\", \"attempt_type\", \"version\", \"os\", \"arch\", \"peer_id\", \"connection_pair_id\"])\n |> set(key: \"_field\", value: \"Total Connection Time\")", "refId": "A" } ], @@ -211,9 +211,70 @@ } } } + }, + { + "id": 7, + "title": "Login Duration", + "type": "timeseries", + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_login\" and r._field == \"duration_seconds\")\n |> map(fn: (r) 
=> ({r with _value: r._value * 1000.0}))\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"arch\", \"peer_id\"])\n |> set(key: \"_field\", value: \"Login Duration\")", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "min": 0, + "custom": { + "drawStyle": "points", + "pointSize": 5 + } + } + } + }, + { + "id": 8, + "title": "Login Success vs Failure", + "type": "piechart", + "datasource": { + "type": "influxdb", + "uid": "influxdb" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "targets": [ + { + "query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_login\" and r._field == \"duration_seconds\")\n |> drop(columns: [\"deployment_type\", \"version\", \"os\", \"arch\", \"peer_id\"])\n |> group(columns: [\"result\"])\n |> count()", + "refId": "A" + } + ], + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "pieType": "donut", + "tooltip": { + "mode": "multi" + } + } } ], "schemaVersion": 27, - "version": 1, + "version": 2, "refresh": "30s" } From e67ae9a742937cd15e4c7dfc84d4aac6dd254d7b Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 20:20:02 +0100 Subject: [PATCH 49/52] Validate NB_METRICS_SERVER_URL is an absolute HTTP(S) URL --- client/internal/metrics/env.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/client/internal/metrics/env.go b/client/internal/metrics/env.go index 99465621528..1f06ce4849d 100644 --- a/client/internal/metrics/env.go +++ b/client/internal/metrics/env.go @@ -80,9 +80,13 @@ func getMetricsServerURL() *url.URL { if envURL == "" { return nil } - parsed, err := url.Parse(envURL) - if err != nil { - log.Warnf("invalid metrics server URL from env: %v", err) + parsed, err := url.ParseRequestURI(envURL) + if err != nil || parsed.Host == "" { + log.Warnf("invalid metrics server URL %q: must be an absolute HTTP(S) URL", envURL) + return 
nil + } + if parsed.Scheme != "http" && parsed.Scheme != "https" { + log.Warnf("invalid metrics server URL %q: unsupported scheme %q", envURL, parsed.Scheme) return nil } return parsed From 59f8c5de755b9cdd0ff8c6d2c3ea635b9cbe770d Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 20:22:18 +0100 Subject: [PATCH 50/52] Address review comments: fix README wording, update stale comments --- client/internal/metrics/infra/README.md | 10 +++++----- client/internal/metrics/push.go | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index d2134a36c01..5f50d72b859 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -14,7 +14,7 @@ Metrics collection is always active (for debug bundles). Push to backend is: ### Layer Separation -``` +```text Daemon Layer (connect.go) ├─ Creates ClientMetrics instance once ├─ Starts/stops push lifecycle @@ -29,16 +29,16 @@ Engine Layer (engine.go) Clients do not talk to InfluxDB directly. An ingest server sits between clients and InfluxDB: -``` +```text Client ──POST──▶ Ingest Server (:8087) ──▶ InfluxDB (internal) │ ├─ Validates line protocol - ├─ Whitelists measurements & fields + ├─ Allowlists measurements, fields, and tags ├─ Rejects out-of-bound values └─ Serves remote config at /config ``` -- **No client-side auth required** — the ingest server holds the InfluxDB token server-side +- **No secret/token-based client auth** — the ingest server holds the InfluxDB token server-side. Clients must send a hashed peer ID via `X-Peer-ID` header. 
- **InfluxDB is not exposed** — only accessible within the docker network - Source: `ingest/main.go` @@ -155,7 +155,7 @@ docker compose up -d ``` This starts: -- **Ingest server** on http://localhost:8087 — accepts client metrics (no auth needed) +- **Ingest server** on http://localhost:8087 — accepts client metrics (requires `X-Peer-ID` header, no secret/token auth) - **InfluxDB** — internal only, not exposed to host - **Grafana** on http://localhost:3001 diff --git a/client/internal/metrics/push.go b/client/internal/metrics/push.go index 9381832e027..ee0508f360e 100644 --- a/client/internal/metrics/push.go +++ b/client/internal/metrics/push.go @@ -122,8 +122,8 @@ func (p *Push) SetPeerID(peerID string) { } // Start starts the periodic push loop. -// If overrideInterval is set (via env var), pushes unconditionally at that interval. -// Otherwise, fetches remote config to determine push period and version eligibility. +// The env interval override controls tick frequency but does not bypass remote config +// version gating. Use ForceSending to skip remote config entirely. 
func (p *Push) Start(ctx context.Context) { // Log initial state switch { From 8a761ccb7f6cdd400e003ee0335d6dd1a0a49f15 Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 20:34:37 +0100 Subject: [PATCH 51/52] Clarify env var precedence does not bypass remote config eligibility --- client/internal/metrics/infra/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/client/internal/metrics/infra/README.md b/client/internal/metrics/infra/README.md index 5f50d72b859..5a93dbd8738 100644 --- a/client/internal/metrics/infra/README.md +++ b/client/internal/metrics/infra/README.md @@ -111,6 +111,8 @@ The InfluxDB backend limits in-memory sample storage to prevent unbounded growth | `NB_METRICS_FORCE_SENDING` | `false` | Skip remote config, push unconditionally | | `NB_METRICS_CONFIG_URL` | `https://ingest.netbird.io/config` | Remote push config URL | +`NB_METRICS_SERVER_URL` and `NB_METRICS_INTERVAL` override their respective values but do not bypass remote config eligibility checks (version range). Use `NB_METRICS_FORCE_SENDING=true` to skip all remote config gating. 
+ ### Ingest Server Environment Variables | Variable | Default | Description | From 909445dafc44eb9c49982086c5333b9afced5220 Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Wed, 18 Mar 2026 20:35:41 +0100 Subject: [PATCH 52/52] Remove accidentally committed pprof files --- management/cmd/pprof.go | 31 ------------------------------- signal/cmd/pprof.go | 31 ------------------------------- 2 files changed, 62 deletions(-) delete mode 100644 management/cmd/pprof.go delete mode 100644 signal/cmd/pprof.go diff --git a/management/cmd/pprof.go b/management/cmd/pprof.go deleted file mode 100644 index f707ac74382..00000000000 --- a/management/cmd/pprof.go +++ /dev/null @@ -1,31 +0,0 @@ -//go:build pprof - -package cmd - -import ( - "net/http" - _ "net/http/pprof" - "os" - - log "github.com/sirupsen/logrus" -) - -func init() { - addr := pprofAddr() - go pprof(addr) -} - -func pprofAddr() string { - listenAddr := os.Getenv("NB_PPROF_ADDR") - if listenAddr == "" { - return "localhost:6060" - } - return listenAddr -} - -func pprof(listenAddr string) { - log.Infof("listening pprof on: %s", listenAddr) - if err := http.ListenAndServe(listenAddr, nil); err != nil { - log.Fatalf("pprof server: %v", err) - } -} diff --git a/signal/cmd/pprof.go b/signal/cmd/pprof.go deleted file mode 100644 index f707ac74382..00000000000 --- a/signal/cmd/pprof.go +++ /dev/null @@ -1,31 +0,0 @@ -//go:build pprof - -package cmd - -import ( - "net/http" - _ "net/http/pprof" - "os" - - log "github.com/sirupsen/logrus" -) - -func init() { - addr := pprofAddr() - go pprof(addr) -} - -func pprofAddr() string { - listenAddr := os.Getenv("NB_PPROF_ADDR") - if listenAddr == "" { - return "localhost:6060" - } - return listenAddr -} - -func pprof(listenAddr string) { - log.Infof("listening pprof on: %s", listenAddr) - if err := http.ListenAndServe(listenAddr, nil); err != nil { - log.Fatalf("pprof server: %v", err) - } -}