diff --git a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json index 9ee69716..77ee965f 100644 --- a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json @@ -27,7 +27,7 @@ "liveNow": false, "panels": [ { - "description": "Combined messages received by all servers", + "description": "Aggregate messages received by all servers from clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -101,7 +101,7 @@ } }, { - "description": "Combined messages sent by all servers", + "description": "Aggregate messages sent by all servers to clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -175,7 +175,7 @@ } }, { - "description": "Combined bytes received by all servers", + "description": "Aggregate bytes received by all servers from clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -249,7 +249,7 @@ } }, { - "description": "Combined bytes sent by all servers", + "description": "Aggregate bytes sent by all servers to clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -2214,7 +2214,7 @@ "type": "timeseries" }, { - "description": "Combined total messages for all servers", + "description": "Aggregate messages across all servers and all connection types: clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -2310,7 +2310,7 @@ } }, { - "description": "Combined cluster network usage", + "description": "Aggregate network usage across all servers and all connection types: clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -3905,14 +3905,14 @@ "id": 24, "panels": [ { - "description": "Data sent per second", + "description": "Messages/s sent by servers to client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisLabel": "msgs/s", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -3930,7 +3930,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -3954,17 +3954,17 @@ } ] }, - "unit": "Bps" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 0, "y": 9 }, - "id": 26, + "id": 68, "interval": "", "options": { "legend": { @@ -3975,32 +3975,35 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_bytes{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_to_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Bytes Sent", + "title": "Messages Sent to Clients", "type": "timeseries" }, { - "description": "Data received per second", + "description": "Messages/s received by servers from client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisLabel": "msgs/s", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4018,7 +4021,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -4042,17 +4045,17 @@ } ] }, - "unit": "Bps" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 12, "y": 9 }, - "id": 27, + "id": 69, "interval": "", "options": { "legend": { @@ -4063,25 +4066,28 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_bytes{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_from_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Bytes Received", + "title": "Messages Received from Clients", "type": "timeseries" }, { - "description": "Messages sent per second", + "description": "Bytes/s sent by servers to client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { @@ -4106,7 +4112,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -4130,17 +4136,17 @@ } ] }, - "unit": "short" + "unit": "Bps" }, "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 0, - "y": 16 + "y": 18 }, - "id": 68, + "id": 26, "interval": "", "options": { "legend": { @@ -4151,25 +4157,28 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_msgs_count{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_to_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Messages Sent", + "title": "Bytes Sent to Clients", "type": "timeseries" }, { - "description": "Messages received per second", + "description": "Bytes/s received by servers from client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { @@ -4194,7 +4203,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -4218,17 +4227,17 @@ } ] }, - "unit": "short" + "unit": "Bps" }, "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 12, - "y": 16 + "y": 18 }, - "id": 69, + "id": 27, "interval": "", "options": { "legend": { @@ -4239,21 +4248,24 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_msgs_count{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_from_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Messages Received", + "title": "Bytes Received from Clients", "type": "timeseries" } ], diff --git a/go.mod b/go.mod index 7dac884f..d4d96d07 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/fsnotify/fsnotify v1.10.1 github.com/klauspost/compress v1.18.6 github.com/nats-io/jsm.go v0.4.1 - github.com/nats-io/nats-server/v2 v2.14.0 + github.com/nats-io/nats-server/v2 v2.14.1 github.com/nats-io/nats.go v1.52.0 github.com/nats-io/nuid v1.0.1 github.com/prometheus/client_golang v1.23.2 @@ -38,7 +38,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nats-io/jwt/v2 v2.8.1 // indirect github.com/nats-io/nkeys v0.4.15 // indirect - github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/pelletier/go-toml/v2 v2.3.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/procfs v0.20.1 // indirect github.com/sagikazarmark/locafero v0.12.0 // indirect diff --git a/go.sum b/go.sum index 36f64bf9..7d0a8ebb 100644 --- a/go.sum +++ b/go.sum @@ -49,8 +49,8 @@ github.com/nats-io/jsm.go v0.4.1 h1:hT0Ksd8Jk5wg9uZKiM5EMAJwSoC8zAu1ivPVB5UA9cI= github.com/nats-io/jsm.go v0.4.1/go.mod h1:rWdrAnJSsCBjjeGbkSvMCB17oPTn+A5kXidixqn0M/E= github.com/nats-io/jwt/v2 v2.8.1 h1:V0xpGuD/N8Mi+fQNDynXohVvp7ZztevW5io8CUWlPmU= github.com/nats-io/jwt/v2 v2.8.1/go.mod h1:nWnOEEiVMiKHQpnAy4eXlizVEtSfzacZ1Q43LIRavZg= -github.com/nats-io/nats-server/v2 v2.14.0 h1:+8q0HrDFotwLLcGH/legOEOnowunhK+aZ4GYBIWpQlM= -github.com/nats-io/nats-server/v2 v2.14.0/go.mod h1:ImVUUDvfClJbb6cuJQRc1VmgDCXKM5ds0OoiG9MVOKo= +github.com/nats-io/nats-server/v2 v2.14.1 h1:wXs/a5fw9Hzm3CvuzLxGeIwpjPulSa7gMT3eSuhGkcg= +github.com/nats-io/nats-server/v2 v2.14.1/go.mod h1:4N17zLpuS7WMbG8T9gsE2B7z9hC9PraPyulVBfpK6nU= github.com/nats-io/nats.go v1.52.0 h1:n3avV4VBsCgsdwh71TppsTwtv+QdPs7ntSKM8qJLGsc= github.com/nats-io/nats.go v1.52.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= @@ -63,6 +63,8 @@ github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc= +github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index eba69df7..d9b36902 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -55,25 +55,29 @@ var CollectJszIds = map[CollectJsz][]string{ // statzDescs holds the metric descriptions type statzDescs struct { - Info *GaugeVec - Start *GaugeVec - Uptime *GaugeVec - Mem *GaugeVec - GoMemLimit *GaugeVec - Cores *GaugeVec - CPU *GaugeVec - Connections *GaugeVec - TotalConnections *CounterVec - ActiveAccounts *GaugeVec - NumSubs *GaugeVec - SentMsgs *CounterVec - SentBytes *CounterVec - RecvMsgs *CounterVec - RecvBytes *CounterVec - SlowConsumers *GaugeVec - RTT *GaugeVec - Routes *GaugeVec - Gateways *GaugeVec + Info *GaugeVec + Start *GaugeVec + Uptime *GaugeVec + Mem *GaugeVec + GoMemLimit *GaugeVec + Cores *GaugeVec + CPU *GaugeVec + Connections *GaugeVec + TotalConnections *CounterVec + ActiveAccounts *GaugeVec + NumSubs *GaugeVec + SentMsgs *CounterVec + SentBytes *CounterVec + SentToClientMsgs *CounterVec + SentToClientBytes *CounterVec + RecvMsgs *CounterVec + RecvBytes *CounterVec + RecvFromClientMsgs *CounterVec + RecvFromClientBytes *CounterVec + SlowConsumers *GaugeVec + RTT *GaugeVec + Routes *GaugeVec + Gateways *GaugeVec // Routes RouteSentMsgs *CounterVec @@ -400,10 +404,15 @@ func (sc *StatzCollector) buildDescs() { sc.descs.TotalConnections = newCounterVec(newName("total_connection_count"), "Total number of client connections serviced counter", sc.constLabels, sc.serverLabels) sc.descs.ActiveAccounts = newGaugeVec(newName("active_account_count"), "Number of active accounts gauge", sc.constLabels, sc.serverLabels) sc.descs.NumSubs = newGaugeVec(newName("subs_count"), "Current number of subscriptions gauge", sc.constLabels, sc.serverLabels) - sc.descs.SentMsgs = newCounterVec(newName("sent_msgs_count"), "Number of messages sent counter", sc.constLabels, sc.serverLabels) - sc.descs.SentBytes = newCounterVec(newName("sent_bytes"), "Number of bytes sent counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvMsgs = newCounterVec(newName("recv_msgs_count"), "Number of messages received counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvBytes = newCounterVec(newName("recv_bytes"), "Number of bytes received counter", sc.constLabels, sc.serverLabels) + // _total is the correct Prometheus suffix for cumulative counters; legacy _count names are kept to avoid breaking existing user dashboards + sc.descs.SentMsgs = newCounterVec(newName("sent_msgs_count"), "Number of messages sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.SentBytes = newCounterVec(newName("sent_bytes"), "Number of bytes sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.SentToClientMsgs = newCounterVec(newName("sent_to_client_msgs_total"), "Number of messages sent by the server to client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) + sc.descs.SentToClientBytes = newCounterVec(newName("sent_to_client_bytes_total"), "Number of bytes sent by the server to client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvMsgs = newCounterVec(newName("recv_msgs_count"), "Number of messages received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvBytes = newCounterVec(newName("recv_bytes"), "Number of bytes received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvFromClientMsgs = newCounterVec(newName("recv_from_client_msgs_total"), "Number of messages received by the server from client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvFromClientBytes = newCounterVec(newName("recv_from_client_bytes_total"), "Number of bytes received by the server from client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) sc.descs.SlowConsumers = newGaugeVec(newName("slow_consumer_count"), "Number of slow consumers gauge", sc.constLabels, sc.serverLabels) sc.descs.RTT = newGaugeVec(newName("rtt_nanoseconds"), "RTT in nanoseconds gauge", sc.constLabels, sc.serverLabels) sc.descs.Routes = newGaugeVec(newName("route_count"), "Number of active routes gauge", sc.constLabels, sc.serverLabels) @@ -1536,8 +1545,12 @@ func (sc *StatzCollector) MetricInfos() []MetricInfo { sc.descs.NumSubs, sc.descs.SentMsgs, sc.descs.SentBytes, + sc.descs.SentToClientMsgs, + sc.descs.SentToClientBytes, sc.descs.RecvMsgs, sc.descs.RecvBytes, + sc.descs.RecvFromClientMsgs, + sc.descs.RecvFromClientBytes, sc.descs.SlowConsumers, sc.descs.RTT, sc.descs.Routes, @@ -1791,8 +1804,12 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { metrics.newGaugeMetric(sc.descs.NumSubs, float64(sm.Stats.NumSubs), labels) metrics.newCounterMetric(sc.descs.SentMsgs, float64(sm.Stats.Sent.Msgs), labels) metrics.newCounterMetric(sc.descs.SentBytes, float64(sm.Stats.Sent.Bytes), labels) + metrics.newCounterMetric(sc.descs.SentToClientMsgs, float64(sm.Stats.SentToClients.Msgs), labels) + metrics.newCounterMetric(sc.descs.SentToClientBytes, float64(sm.Stats.SentToClients.Bytes), labels) metrics.newCounterMetric(sc.descs.RecvMsgs, float64(sm.Stats.Received.Msgs), labels) metrics.newCounterMetric(sc.descs.RecvBytes, float64(sm.Stats.Received.Bytes), labels) + metrics.newCounterMetric(sc.descs.RecvFromClientMsgs, float64(sm.Stats.ReceivedFromClients.Msgs), labels) + metrics.newCounterMetric(sc.descs.RecvFromClientBytes, float64(sm.Stats.ReceivedFromClients.Bytes), labels) metrics.newGaugeMetric(sc.descs.SlowConsumers, float64(sm.Stats.SlowConsumers), labels) metrics.newGaugeMetric(sc.descs.RTT, float64(sc.rtts[sm.Server.ID]), labels) metrics.newGaugeMetric(sc.descs.Routes, float64(len(sm.Stats.Routes)), labels) diff --git a/surveyor/collector_statz_test.go b/surveyor/collector_statz_test.go index 7cca5aa2..5f0c85bf 100644 --- a/surveyor/collector_statz_test.go +++ b/surveyor/collector_statz_test.go @@ -421,6 +421,44 @@ func TestStatzCollector_GoMemLimit(t *testing.T) { } } +func TestStatzCollector_ClientTrafficMetrics(t *testing.T) { + const ( + sentMsgs = 11 + sentBytes = 23 + recvMsgs = 37 + recvBytes = 53 + ) + + stats := &server.ServerStatsMsg{ + Server: server.ServerInfo{ID: "test-server", Name: "test-server"}, + Stats: server.ServerStats{ + SentToClients: server.DataStats{MsgBytes: server.MsgBytes{Msgs: sentMsgs, Bytes: sentBytes}}, + ReceivedFromClients: server.DataStats{MsgBytes: server.MsgBytes{Msgs: recvMsgs, Bytes: recvBytes}}, + }, + } + + sc, err := NewStatzCollectorOpts( + WithStats(WithStatsBatch{Stats: []*server.ServerStatsMsg{stats}}), + ) + if err != nil { + t.Fatalf("error creating statz collector: %v", err) + } + + output := gatherStatzCollectorMetrics(t, sc) + + want := []string{ + `nats_core_sent_to_client_msgs_total{server_cluster="",server_id="test-server",server_name="test-server"} 11`, + `nats_core_sent_to_client_bytes_total{server_cluster="",server_id="test-server",server_name="test-server"} 23`, + `nats_core_recv_from_client_msgs_total{server_cluster="",server_id="test-server",server_name="test-server"} 37`, + `nats_core_recv_from_client_bytes_total{server_cluster="",server_id="test-server",server_name="test-server"} 53`, + } + for _, w := range want { + if !strings.Contains(output, w) { + t.Errorf("missing or wrong metric line:\n want: %s\n output:\n%v", w, output) + } + } +} + func TestStatzCollector_MetricInfos(t *testing.T) { sc, err := NewStatzCollectorOpts( WithStats(WithStatsBatch{