From 85b6727f8fde68d125e6f3d434f2177bdb4d79b6 Mon Sep 17 00:00:00 2001 From: Russell Jones Date: Thu, 14 Oct 2021 00:04:40 +0000 Subject: [PATCH] Added metrics for missing SSH tunnels. Added metrics and logging for missing SSH reverse tunnels. This is useful for debugging to find if nodes are discovering all proxies. --- docs/pages/setup/reference/metrics.mdx | 1 + lib/reversetunnel/localsite.go | 95 +++++++++++++++++++++++++- metrics.go | 3 + 3 files changed, 97 insertions(+), 2 deletions(-) diff --git a/docs/pages/setup/reference/metrics.mdx b/docs/pages/setup/reference/metrics.mdx index 7864830fe7e70..3501ef5325f4c 100644 --- a/docs/pages/setup/reference/metrics.mdx +++ b/docs/pages/setup/reference/metrics.mdx @@ -104,6 +104,7 @@ Now you can see the monitoring information by visiting several endpoints: | `promhttp_metric_handler_requests_in_flight` | gauge | prometheus | Current number of scrapes being served. | | `promhttp_metric_handler_requests_total` | counter | prometheus | Total number of scrapes by HTTP status code. | | `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit. | +| `proxy_missing_ssh_tunnels` | gauge | Teleport Proxy | Number of missing SSH tunnels. Used to debug if nodes have discovered all proxies. | | `reversetunnel_connected_proxies` | gauge | Teleport | Number of known proxies being sought. | | `rx` | counter | Teleport | Number of bytes received. | | `server_interactive_sessions_total` | gauge | Teleport | Number of active sessions. | diff --git a/lib/reversetunnel/localsite.go b/lib/reversetunnel/localsite.go index 87ec5480a5a63..6599e5af7fb60 100644 --- a/lib/reversetunnel/localsite.go +++ b/lib/reversetunnel/localsite.go @@ -29,9 +29,12 @@ import ( "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/api/utils/sshutils" "github.com/gravitational/teleport/lib/auth" + "github.com/gravitational/teleport/lib/defaults" "github.com/gravitational/teleport/lib/services" "github.com/gravitational/teleport/lib/srv/forward" + "github.com/gravitational/teleport/lib/utils" "github.com/gravitational/teleport/lib/utils/proxy" + "github.com/prometheus/client_golang/prometheus" "github.com/gravitational/trace" "github.com/jonboulle/clockwork" @@ -39,6 +42,11 @@ import ( ) func newlocalSite(srv *server, domainName string, client auth.ClientI) (*localSite, error) { + err := utils.RegisterPrometheusCollectors(localClusterCollectors...) + if err != nil { + return nil, trace.Wrap(err) + } + accessPoint, err := srv.newAccessPoint(client, []string{"reverse", domainName}) if err != nil { return nil, trace.Wrap(err) @@ -53,7 +61,7 @@ func newlocalSite(srv *server, domainName string, client auth.ClientI) (*localSi return nil, trace.Wrap(err) } - return &localSite{ + s := &localSite{ srv: srv, client: client, accessPoint: accessPoint, @@ -68,7 +76,12 @@ func newlocalSite(srv *server, domainName string, client auth.ClientI) (*localSi }, }), offlineThreshold: srv.offlineThreshold, - }, nil + } + + // Start periodic functions for the the local cluster in the background. + go s.periodicFunctions() + + return s, nil } // localSite allows to directly access the remote servers @@ -468,3 +481,81 @@ func (s *localSite) chanTransportConn(rconn *remoteConn, dreq *sshutils.DialReq) return conn, nil } + +// periodicFunctions runs functions periodic functions for the local cluster. +func (s *localSite) periodicFunctions() { + ticker := time.NewTicker(defaults.ResyncInterval) + defer ticker.Stop() + + for { + select { + case <-s.srv.ctx.Done(): + return + case <-ticker.C: + if err := s.sshTunnelStats(); err != nil { + s.log.Warningf("Failed to report SSH tunnel statistics for: %v: %v.", s.domainName, err) + } + } + } +} + +// sshTunnelStats reports SSH tunnel statistics for the cluster. +func (s *localSite) sshTunnelStats() error { + servers, err := s.accessPoint.GetNodes(s.srv.ctx, apidefaults.Namespace) + if err != nil { + return trace.Wrap(err) + } + + var missing []string + + for _, server := range servers { + // Skip over any servers that that have a TTL larger than announce TTL (10 + // minutes) and are non-IoT SSH servers (they won't have tunnels). + // + // Servers with a TTL larger than the announce TTL skipped over to work around + // an issue with DynamoDB where objects can hang around for 48 hours after + // their TTL value. + ttl := s.clock.Now().Add(-1 * apidefaults.ServerAnnounceTTL) + if server.Expiry().Before(ttl) { + continue + } + if !server.GetUseTunnel() { + continue + } + + // Check if the tunnel actually exists. + _, err := s.getRemoteConn(&sshutils.DialReq{ + ServerID: fmt.Sprintf("%v.%v", server.GetName(), s.domainName), + ConnType: types.NodeTunnel, + }) + if err == nil { + continue + } + + missing = append(missing, server.GetName()) + } + + // Update Prometheus metrics and also log if any tunnels are missing. + missingSSHTunnels.Set(float64(len(missing))) + if len(missing) > 0 { + // Don't show all the missing nodes, thousands could be missing, just show + // the first 10. + n := len(missing) + if n > 10 { + n = 10 + } + log.Debugf("Cluster %v is missing %v tunnels. A small number of missing tunnels is normal, for example, a node could have just been shut down, the proxy restarted, etc. However, if this error persists with an elevated number of missing tunnels, it often indicates nodes can not discover all registered proxies. Check that all of your proxies are behind a load balancer and the load balancer is using a round robin strategy. Some of the missing hosts: %v.", s.domainName, len(missing), missing[:n]) + } + return nil +} + +var ( + missingSSHTunnels = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: teleport.MetricMissingSSHTunnels, + Help: "Number of missing SSH tunnels", + }, + ) + + localClusterCollectors = []prometheus.Collector{missingSSHTunnels} +) diff --git a/metrics.go b/metrics.go index 6a18e840ab974..dc29142f4aa4d 100644 --- a/metrics.go +++ b/metrics.go @@ -76,6 +76,9 @@ const ( // MetricWatcherEventSizes measures the size of watcher events that are emitted MetricWatcherEventSizes = "watcher_event_sizes" + // MetricMissingSSHTunnels returns the number of missing SSH tunnels for this proxy. + MetricMissingSSHTunnels = "proxy_missing_ssh_tunnels" + // TagCluster is a metric tag for a cluster TagCluster = "cluster" )