From c34274e64549e14a94ad23546c30353849024111 Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Wed, 25 Feb 2026 19:56:10 +0300 Subject: [PATCH 1/7] add new metrics --- .../dashboards/nats-surveyor-dashboard.json | 242 +++++++++++++++--- surveyor/collector_statz.go | 24 +- 2 files changed, 232 insertions(+), 34 deletions(-) diff --git a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json index 9ee69716..68a5c1b6 100644 --- a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json @@ -27,7 +27,7 @@ "liveNow": false, "panels": [ { - "description": "Combined messages received by all servers", + "description": "Aggregate messages received by all servers from clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -101,7 +101,7 @@ } }, { - "description": "Combined messages sent by all servers", + "description": "Aggregate messages sent by all servers to clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -175,7 +175,7 @@ } }, { - "description": "Combined bytes received by all servers", + "description": "Aggregate bytes received by all servers from clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -249,7 +249,7 @@ } }, { - "description": "Combined bytes sent by all servers", + "description": "Aggregate bytes sent by all servers to clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -2214,7 +2214,181 @@ "type": "timeseries" }, { - "description": "Combined total messages for all servers", + "datasource": { + "type": "prometheus", + "uid": "${prom_ds}" + }, + "description": "Number of streams each server is the leader for (includes R1 streams). The top of the stack is the total unique stream count across the cluster.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Streams", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 311, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false, + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "nats_core_jetstream_server_total_stream_leaders{cluster_name=~\"$cluster\"}", + "legendFormat": "{{server_name}}", + "refId": "B" + } + ], + "title": "Stream Leaders by Server (stacked)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prom_ds}" + }, + "description": "Number of consumers each server is the leader for (includes R1 consumers). The top of the stack is the total unique consumer count across the cluster.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Consumers", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 312, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false, + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "nats_core_jetstream_server_total_consumer_leaders{cluster_name=~\"$cluster\"}", + "legendFormat": "{{server_name}}", + "refId": "B" + } + ], + "title": "Consumer Leaders by Server (stacked)", + "type": "timeseries" + }, + { + "description": "Aggregate messages across all servers and all connection types: clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -2310,7 +2484,7 @@ } }, { - "description": "Combined cluster network usage", + "description": "Aggregate network usage across all servers and all connection types: clients, routes, gateways and leafnodes", "fieldConfig": { "defaults": { "color": { @@ -3905,7 +4079,7 @@ "id": 24, "panels": [ { - "description": "Data sent per second", + "description": "Data sent to clients per second", "fieldConfig": { "defaults": { "color": { @@ -3959,7 +4133,7 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 0, "y": 9 @@ -3975,7 +4149,9 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { "mode": "single" @@ -3984,16 +4160,16 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_bytes{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_client_bytes{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Bytes Sent", + "title": "Bytes Sent to Clients", "type": "timeseries" }, { - "description": "Data received per second", + "description": "Data received from clients per second", "fieldConfig": { "defaults": { "color": { @@ -4047,7 +4223,7 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 12, "y": 9 @@ -4063,7 +4239,9 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { "mode": "single" @@ -4072,23 +4250,23 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_bytes{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_client_bytes{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Bytes Received", + "title": "Bytes Received from Clients", "type": "timeseries" }, { - "description": "Messages sent per second", + "description": "Messages sent to clients per second", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisLabel": "msgs/s", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4135,10 +4313,10 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 0, - "y": 16 + "y": 18 }, "id": 68, "interval": "", @@ -4151,7 +4329,9 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { "mode": "single" @@ -4160,23 +4340,23 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_msgs_count{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_client_msgs_count{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Messages Sent", + "title": "Messages Sent to Clients", "type": "timeseries" }, { - "description": "Messages received per second", + "description": "Messages received from clients per second", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisLabel": "msgs/s", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4223,10 +4403,10 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 9, "w": 12, "x": 12, - "y": 16 + "y": 18 }, "id": 69, "interval": "", @@ -4239,7 +4419,9 @@ "min" ], "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "sortBy": "Name", + "sortDesc": false }, "tooltip": { "mode": "single" @@ -4248,12 +4430,12 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_msgs_count{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_client_msgs_count{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Messages Received", + "title": "Messages Received from Clients", "type": "timeseries" } ], diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index eba69df7..cfaff6d7 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -68,8 +68,12 @@ type statzDescs struct { NumSubs *GaugeVec SentMsgs *CounterVec SentBytes *CounterVec + SentClientMsgs *CounterVec + SentClientBytes *CounterVec RecvMsgs *CounterVec RecvBytes *CounterVec + RecvClientMsgs *CounterVec + RecvClientBytes *CounterVec SlowConsumers *GaugeVec RTT *GaugeVec Routes *GaugeVec @@ -400,10 +404,14 @@ func (sc *StatzCollector) buildDescs() { sc.descs.TotalConnections = newCounterVec(newName("total_connection_count"), "Total number of client connections serviced counter", sc.constLabels, sc.serverLabels) sc.descs.ActiveAccounts = newGaugeVec(newName("active_account_count"), "Number of active accounts gauge", sc.constLabels, sc.serverLabels) sc.descs.NumSubs = newGaugeVec(newName("subs_count"), "Current number of subscriptions gauge", sc.constLabels, sc.serverLabels) - sc.descs.SentMsgs = newCounterVec(newName("sent_msgs_count"), "Number of messages sent counter", sc.constLabels, sc.serverLabels) - sc.descs.SentBytes = newCounterVec(newName("sent_bytes"), "Number of bytes sent counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvMsgs = newCounterVec(newName("recv_msgs_count"), "Number of messages received counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvBytes = newCounterVec(newName("recv_bytes"), "Number of bytes received counter", sc.constLabels, sc.serverLabels) + sc.descs.SentMsgs = newCounterVec(newName("sent_msgs_count"), "Number of messages sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.SentBytes = newCounterVec(newName("sent_bytes"), "Number of bytes sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.SentClientMsgs = newCounterVec(newName("sent_client_msgs_count"), "Number of messages sent by the server to clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.SentClientBytes = newCounterVec(newName("sent_client_bytes"), "Number of bytes sent by the server to clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvMsgs = newCounterVec(newName("recv_msgs_count"), "Number of messages received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvBytes = newCounterVec(newName("recv_bytes"), "Number of bytes received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvClientMsgs = newCounterVec(newName("recv_client_msgs_count"), "Number of messages received by the server from clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvClientBytes = newCounterVec(newName("recv_client_bytes"), "Number of bytes received by the server from clients only counter", sc.constLabels, sc.serverLabels) sc.descs.SlowConsumers = newGaugeVec(newName("slow_consumer_count"), "Number of slow consumers gauge", sc.constLabels, sc.serverLabels) sc.descs.RTT = newGaugeVec(newName("rtt_nanoseconds"), "RTT in nanoseconds gauge", sc.constLabels, sc.serverLabels) sc.descs.Routes = newGaugeVec(newName("route_count"), "Number of active routes gauge", sc.constLabels, sc.serverLabels) @@ -1536,8 +1544,12 @@ func (sc *StatzCollector) MetricInfos() []MetricInfo { sc.descs.NumSubs, sc.descs.SentMsgs, sc.descs.SentBytes, + sc.descs.SentClientMsgs, + sc.descs.SentClientBytes, sc.descs.RecvMsgs, sc.descs.RecvBytes, + sc.descs.RecvClientMsgs, + sc.descs.RecvClientBytes, sc.descs.SlowConsumers, sc.descs.RTT, sc.descs.Routes, @@ -1791,8 +1803,12 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { metrics.newGaugeMetric(sc.descs.NumSubs, float64(sm.Stats.NumSubs), labels) metrics.newCounterMetric(sc.descs.SentMsgs, float64(sm.Stats.Sent.Msgs), labels) metrics.newCounterMetric(sc.descs.SentBytes, float64(sm.Stats.Sent.Bytes), labels) + metrics.newCounterMetric(sc.descs.SentClientMsgs, float64(sm.Stats.SentToClients.Msgs), labels) + metrics.newCounterMetric(sc.descs.SentClientBytes, float64(sm.Stats.SentToClients.Bytes), labels) metrics.newCounterMetric(sc.descs.RecvMsgs, float64(sm.Stats.Received.Msgs), labels) metrics.newCounterMetric(sc.descs.RecvBytes, float64(sm.Stats.Received.Bytes), labels) + metrics.newCounterMetric(sc.descs.RecvClientMsgs, float64(sm.Stats.ReceivedFromClients.Msgs), labels) + metrics.newCounterMetric(sc.descs.RecvClientBytes, float64(sm.Stats.ReceivedFromClients.Bytes), labels) metrics.newGaugeMetric(sc.descs.SlowConsumers, float64(sm.Stats.SlowConsumers), labels) metrics.newGaugeMetric(sc.descs.RTT, float64(sc.rtts[sm.Server.ID]), labels) metrics.newGaugeMetric(sc.descs.Routes, float64(len(sm.Stats.Routes)), labels) From b9ef52a4453d67cc1b6b8c047edd195d3d4c9c98 Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Thu, 14 May 2026 16:09:07 -0700 Subject: [PATCH 2/7] fix merge artifact --- .../dashboards/nats-surveyor-dashboard.json | 174 ------------------ 1 file changed, 174 deletions(-) diff --git a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json index 68a5c1b6..257c509f 100644 --- a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json @@ -2213,180 +2213,6 @@ "title": "Consumer Leaders by Server (stacked)", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus", - "uid": "${prom_ds}" - }, - "description": "Number of streams each server is the leader for (includes R1 streams). The top of the stack is the total unique stream count across the cluster.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Streams", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 26 - }, - "id": 311, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "sortBy": "Name", - "sortDesc": false, - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "expr": "nats_core_jetstream_server_total_stream_leaders{cluster_name=~\"$cluster\"}", - "legendFormat": "{{server_name}}", - "refId": "B" - } - ], - "title": "Stream Leaders by Server (stacked)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${prom_ds}" - }, - "description": "Number of consumers each server is the leader for (includes R1 consumers). The top of the stack is the total unique consumer count across the cluster.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Consumers", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 26 - }, - "id": 312, - "options": { - "legend": { - "calcs": [ - "lastNotNull" - ], - "displayMode": "list", - "placement": "bottom", - "sortBy": "Name", - "sortDesc": false, - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "expr": "nats_core_jetstream_server_total_consumer_leaders{cluster_name=~\"$cluster\"}", - "legendFormat": "{{server_name}}", - "refId": "B" - } - ], - "title": "Consumer Leaders by Server (stacked)", - "type": "timeseries" - }, { "description": "Aggregate messages across all servers and all connection types: clients, routes, gateways and leafnodes", "fieldConfig": { From 0c0c9bcc807b35b6f89940e8197c3030ef36141f Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Thu, 14 May 2026 16:41:51 -0700 Subject: [PATCH 3/7] rename counters --- .../dashboards/nats-surveyor-dashboard.json | 8 +-- surveyor/collector_statz.go | 71 ++++++++++--------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json index 257c509f..c0bc4b5b 100644 --- a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json @@ -3986,7 +3986,7 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_client_bytes{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_to_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } @@ -4076,7 +4076,7 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_client_bytes{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_from_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } @@ -4166,7 +4166,7 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_client_msgs_count{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_to_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } @@ -4256,7 +4256,7 @@ "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_client_msgs_count{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_from_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index cfaff6d7..c793eb50 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -55,29 +55,29 @@ var CollectJszIds = map[CollectJsz][]string{ // statzDescs holds the metric descriptions type statzDescs struct { - Info *GaugeVec - Start *GaugeVec - Uptime *GaugeVec - Mem *GaugeVec - GoMemLimit *GaugeVec - Cores *GaugeVec - CPU *GaugeVec - Connections *GaugeVec - TotalConnections *CounterVec - ActiveAccounts *GaugeVec - NumSubs *GaugeVec - SentMsgs *CounterVec - SentBytes *CounterVec - SentClientMsgs *CounterVec - SentClientBytes *CounterVec - RecvMsgs *CounterVec - RecvBytes *CounterVec - RecvClientMsgs *CounterVec - RecvClientBytes *CounterVec - SlowConsumers *GaugeVec - RTT *GaugeVec - Routes *GaugeVec - Gateways *GaugeVec + Info *GaugeVec + Start *GaugeVec + Uptime *GaugeVec + Mem *GaugeVec + GoMemLimit *GaugeVec + Cores *GaugeVec + CPU *GaugeVec + Connections *GaugeVec + TotalConnections *CounterVec + ActiveAccounts *GaugeVec + NumSubs *GaugeVec + SentMsgs *CounterVec + SentBytes *CounterVec + SentToClientMsgs *CounterVec + SentToClientBytes *CounterVec + RecvMsgs *CounterVec + RecvBytes *CounterVec + RecvFromClientMsgs *CounterVec + RecvFromClientBytes *CounterVec + SlowConsumers *GaugeVec + RTT *GaugeVec + Routes *GaugeVec + Gateways *GaugeVec // Routes RouteSentMsgs *CounterVec @@ -406,12 +406,13 @@ func (sc *StatzCollector) buildDescs() { sc.descs.NumSubs = newGaugeVec(newName("subs_count"), "Current number of subscriptions gauge", sc.constLabels, sc.serverLabels) sc.descs.SentMsgs = newCounterVec(newName("sent_msgs_count"), "Number of messages sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) sc.descs.SentBytes = newCounterVec(newName("sent_bytes"), "Number of bytes sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) - sc.descs.SentClientMsgs = newCounterVec(newName("sent_client_msgs_count"), "Number of messages sent by the server to clients only counter", sc.constLabels, sc.serverLabels) - sc.descs.SentClientBytes = newCounterVec(newName("sent_client_bytes"), "Number of bytes sent by the server to clients only counter", sc.constLabels, sc.serverLabels) + // New counters use _total per Prometheus convention; legacy counters keep _count for backwards compat. + sc.descs.SentToClientMsgs = newCounterVec(newName("sent_to_client_msgs_total"), "Number of messages sent by the server to clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.SentToClientBytes = newCounterVec(newName("sent_to_client_bytes_total"), "Number of bytes sent by the server to clients only counter", sc.constLabels, sc.serverLabels) sc.descs.RecvMsgs = newCounterVec(newName("recv_msgs_count"), "Number of messages received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) sc.descs.RecvBytes = newCounterVec(newName("recv_bytes"), "Number of bytes received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvClientMsgs = newCounterVec(newName("recv_client_msgs_count"), "Number of messages received by the server from clients only counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvClientBytes = newCounterVec(newName("recv_client_bytes"), "Number of bytes received by the server from clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvFromClientMsgs = newCounterVec(newName("recv_from_client_msgs_total"), "Number of messages received by the server from clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvFromClientBytes = newCounterVec(newName("recv_from_client_bytes_total"), "Number of bytes received by the server from clients only counter", sc.constLabels, sc.serverLabels) sc.descs.SlowConsumers = newGaugeVec(newName("slow_consumer_count"), "Number of slow consumers gauge", sc.constLabels, sc.serverLabels) sc.descs.RTT = newGaugeVec(newName("rtt_nanoseconds"), "RTT in nanoseconds gauge", sc.constLabels, sc.serverLabels) sc.descs.Routes = newGaugeVec(newName("route_count"), "Number of active routes gauge", sc.constLabels, sc.serverLabels) @@ -1544,12 +1545,12 @@ func (sc *StatzCollector) MetricInfos() []MetricInfo { sc.descs.NumSubs, sc.descs.SentMsgs, sc.descs.SentBytes, - sc.descs.SentClientMsgs, - sc.descs.SentClientBytes, + sc.descs.SentToClientMsgs, + sc.descs.SentToClientBytes, sc.descs.RecvMsgs, sc.descs.RecvBytes, - sc.descs.RecvClientMsgs, - sc.descs.RecvClientBytes, + sc.descs.RecvFromClientMsgs, + sc.descs.RecvFromClientBytes, sc.descs.SlowConsumers, sc.descs.RTT, sc.descs.Routes, @@ -1803,12 +1804,12 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { metrics.newGaugeMetric(sc.descs.NumSubs, float64(sm.Stats.NumSubs), labels) metrics.newCounterMetric(sc.descs.SentMsgs, float64(sm.Stats.Sent.Msgs), labels) metrics.newCounterMetric(sc.descs.SentBytes, float64(sm.Stats.Sent.Bytes), labels) - metrics.newCounterMetric(sc.descs.SentClientMsgs, float64(sm.Stats.SentToClients.Msgs), labels) - metrics.newCounterMetric(sc.descs.SentClientBytes, float64(sm.Stats.SentToClients.Bytes), labels) + metrics.newCounterMetric(sc.descs.SentToClientMsgs, float64(sm.Stats.SentToClients.Msgs), labels) + metrics.newCounterMetric(sc.descs.SentToClientBytes, float64(sm.Stats.SentToClients.Bytes), labels) metrics.newCounterMetric(sc.descs.RecvMsgs, float64(sm.Stats.Received.Msgs), labels) metrics.newCounterMetric(sc.descs.RecvBytes, float64(sm.Stats.Received.Bytes), labels) - metrics.newCounterMetric(sc.descs.RecvClientMsgs, float64(sm.Stats.ReceivedFromClients.Msgs), labels) - metrics.newCounterMetric(sc.descs.RecvClientBytes, float64(sm.Stats.ReceivedFromClients.Bytes), labels) + metrics.newCounterMetric(sc.descs.RecvFromClientMsgs, float64(sm.Stats.ReceivedFromClients.Msgs), labels) + metrics.newCounterMetric(sc.descs.RecvFromClientBytes, float64(sm.Stats.ReceivedFromClients.Bytes), labels) metrics.newGaugeMetric(sc.descs.SlowConsumers, float64(sm.Stats.SlowConsumers), labels) metrics.newGaugeMetric(sc.descs.RTT, float64(sc.rtts[sm.Server.ID]), labels) metrics.newGaugeMetric(sc.descs.Routes, float64(len(sm.Stats.Routes)), labels) From 6b4a24e67e737af27daac060684d4161d40d369a Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Thu, 14 May 2026 16:59:25 -0700 Subject: [PATCH 4/7] fix comment --- surveyor/collector_statz.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index c793eb50..d9b36902 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -404,15 +404,15 @@ func (sc *StatzCollector) buildDescs() { sc.descs.TotalConnections = newCounterVec(newName("total_connection_count"), "Total number of client connections serviced counter", sc.constLabels, sc.serverLabels) sc.descs.ActiveAccounts = newGaugeVec(newName("active_account_count"), "Number of active accounts gauge", sc.constLabels, sc.serverLabels) sc.descs.NumSubs = newGaugeVec(newName("subs_count"), "Current number of subscriptions gauge", sc.constLabels, sc.serverLabels) + // _total is the correct Prometheus suffix for cumulative counters; legacy _count names are kept to avoid breaking existing user dashboards sc.descs.SentMsgs = newCounterVec(newName("sent_msgs_count"), "Number of messages sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) sc.descs.SentBytes = newCounterVec(newName("sent_bytes"), "Number of bytes sent by the server to all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) - // New counters use _total per Prometheus convention; legacy counters keep _count for backwards compat. - sc.descs.SentToClientMsgs = newCounterVec(newName("sent_to_client_msgs_total"), "Number of messages sent by the server to clients only counter", sc.constLabels, sc.serverLabels) - sc.descs.SentToClientBytes = newCounterVec(newName("sent_to_client_bytes_total"), "Number of bytes sent by the server to clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.SentToClientMsgs = newCounterVec(newName("sent_to_client_msgs_total"), "Number of messages sent by the server to client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) + sc.descs.SentToClientBytes = newCounterVec(newName("sent_to_client_bytes_total"), "Number of bytes sent by the server to client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) sc.descs.RecvMsgs = newCounterVec(newName("recv_msgs_count"), "Number of messages received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) sc.descs.RecvBytes = newCounterVec(newName("recv_bytes"), "Number of bytes received by the server from all connections including clients, routes, gateways and leafnodes counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvFromClientMsgs = newCounterVec(newName("recv_from_client_msgs_total"), "Number of messages received by the server from clients only counter", sc.constLabels, sc.serverLabels) - sc.descs.RecvFromClientBytes = newCounterVec(newName("recv_from_client_bytes_total"), "Number of bytes received by the server from clients only counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvFromClientMsgs = newCounterVec(newName("recv_from_client_msgs_total"), "Number of messages received by the server from client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) + sc.descs.RecvFromClientBytes = newCounterVec(newName("recv_from_client_bytes_total"), "Number of bytes received by the server from client connections (excludes routes, gateways, leafnodes) counter", sc.constLabels, sc.serverLabels) sc.descs.SlowConsumers = newGaugeVec(newName("slow_consumer_count"), "Number of slow consumers gauge", sc.constLabels, sc.serverLabels) sc.descs.RTT = newGaugeVec(newName("rtt_nanoseconds"), "RTT in nanoseconds gauge", sc.constLabels, sc.serverLabels) sc.descs.Routes = newGaugeVec(newName("route_count"), "Number of active routes gauge", sc.constLabels, sc.serverLabels) From 6744407910f7ab51d4e3fa143599568dfc49387e Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Thu, 14 May 2026 19:14:38 -0700 Subject: [PATCH 5/7] add test for new metrics --- surveyor/collector_statz_test.go | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/surveyor/collector_statz_test.go b/surveyor/collector_statz_test.go index 7cca5aa2..5f0c85bf 100644 --- a/surveyor/collector_statz_test.go +++ b/surveyor/collector_statz_test.go @@ -421,6 +421,44 @@ func TestStatzCollector_GoMemLimit(t *testing.T) { } } +func TestStatzCollector_ClientTrafficMetrics(t *testing.T) { + const ( + sentMsgs = 11 + sentBytes = 23 + recvMsgs = 37 + recvBytes = 53 + ) + + stats := &server.ServerStatsMsg{ + Server: server.ServerInfo{ID: "test-server", Name: "test-server"}, + Stats: server.ServerStats{ + SentToClients: server.DataStats{MsgBytes: server.MsgBytes{Msgs: sentMsgs, Bytes: sentBytes}}, + ReceivedFromClients: server.DataStats{MsgBytes: server.MsgBytes{Msgs: recvMsgs, Bytes: recvBytes}}, + }, + } + + sc, err := NewStatzCollectorOpts( + WithStats(WithStatsBatch{Stats: []*server.ServerStatsMsg{stats}}), + ) + if err != nil { + t.Fatalf("error creating statz collector: %v", err) + } + + output := gatherStatzCollectorMetrics(t, sc) + + want := []string{ + `nats_core_sent_to_client_msgs_total{server_cluster="",server_id="test-server",server_name="test-server"} 11`, + `nats_core_sent_to_client_bytes_total{server_cluster="",server_id="test-server",server_name="test-server"} 23`, + `nats_core_recv_from_client_msgs_total{server_cluster="",server_id="test-server",server_name="test-server"} 37`, + `nats_core_recv_from_client_bytes_total{server_cluster="",server_id="test-server",server_name="test-server"} 53`, + } + for _, w := range want { + if !strings.Contains(output, w) { + t.Errorf("missing or wrong metric line:\n want: %s\n output:\n%v", w, output) + } + } +} + func TestStatzCollector_MetricInfos(t *testing.T) { sc, err := NewStatzCollectorOpts( WithStats(WithStatsBatch{ From 49af3163d14188ff3cc4899c2a071c31dc2c4bc1 Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Thu, 14 May 2026 20:45:40 -0700 Subject: [PATCH 6/7] nit dashboard fixes --- .../dashboards/nats-surveyor-dashboard.json | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json index c0bc4b5b..77ee965f 100644 --- a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json @@ -3905,14 +3905,14 @@ "id": 24, "panels": [ { - "description": "Data sent to clients per second", + "description": "Messages/s sent by servers to client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisLabel": "msgs/s", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -3930,7 +3930,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -3954,7 +3954,7 @@ } ] }, - "unit": "Bps" + "unit": "short" }, "overrides": [] }, @@ -3964,7 +3964,7 @@ "x": 0, "y": 9 }, - "id": 26, + "id": 68, "interval": "", "options": { "legend": { @@ -3980,29 +3980,30 @@ "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_to_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_to_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Bytes Sent to Clients", + "title": "Messages Sent to Clients", "type": "timeseries" }, { - "description": "Data received from clients per second", + "description": "Messages/s received by servers from client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisLabel": "msgs/s", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4020,7 +4021,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -4044,7 +4045,7 @@ } ] }, - "unit": "Bps" + "unit": "short" }, "overrides": [] }, @@ -4054,7 +4055,7 @@ "x": 12, "y": 9 }, - "id": 27, + "id": 69, "interval": "", "options": { "legend": { @@ -4070,29 +4071,30 @@ "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_from_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_from_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Bytes Received from Clients", + "title": "Messages Received from Clients", "type": "timeseries" }, { - "description": "Messages sent to clients per second", + "description": "Bytes/s sent by servers to client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "msgs/s", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4110,7 +4112,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -4134,7 +4136,7 @@ } ] }, - "unit": "short" + "unit": "Bps" }, "overrides": [] }, @@ -4144,7 +4146,7 @@ "x": 0, "y": 18 }, - "id": 68, + "id": 26, "interval": "", "options": { "legend": { @@ -4160,29 +4162,30 @@ "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_sent_to_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_sent_to_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Messages Sent to Clients", + "title": "Bytes Sent to Clients", "type": "timeseries" }, { - "description": "Messages received from clients per second", + "description": "Bytes/s received by servers from client connections (excludes routes, gateways, leafnodes)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisLabel": "msgs/s", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4200,7 +4203,7 @@ "type": "linear" }, "showPoints": "never", - "spanNulls": true, + "spanNulls": false, "stacking": { "group": "A", "mode": "none" @@ -4224,7 +4227,7 @@ } ] }, - "unit": "short" + "unit": "Bps" }, "overrides": [] }, @@ -4234,7 +4237,7 @@ "x": 12, "y": 18 }, - "id": 69, + "id": 27, "interval": "", "options": { "legend": { @@ -4250,18 +4253,19 @@ "sortDesc": false }, "tooltip": { - "mode": "single" + "mode": "multi", + "sort": "none" } }, "pluginVersion": "8.3.4", "targets": [ { - "expr": "rate(nats_core_recv_from_client_msgs_total{server_cluster=~\"$cluster\"}[1m])", + "expr": "rate(nats_core_recv_from_client_bytes_total{server_cluster=~\"$cluster\"}[1m])", "legendFormat": "{{server_name}}", "refId": "A" } ], - "title": "Messages Received from Clients", + "title": "Bytes Received from Clients", "type": "timeseries" } ], From e3cdb0f570fd3d666e572a259e30040231fe7c5a Mon Sep 17 00:00:00 2001 From: Alex Bozhenko Date: Wed, 20 May 2026 10:30:21 -0700 Subject: [PATCH 7/7] upd go deps --- go.mod | 4 ++-- go.sum | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 7dac884f..d4d96d07 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/fsnotify/fsnotify v1.10.1 github.com/klauspost/compress v1.18.6 github.com/nats-io/jsm.go v0.4.1 - github.com/nats-io/nats-server/v2 v2.14.0 + github.com/nats-io/nats-server/v2 v2.14.1 github.com/nats-io/nats.go v1.52.0 github.com/nats-io/nuid v1.0.1 github.com/prometheus/client_golang v1.23.2 @@ -38,7 +38,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nats-io/jwt/v2 v2.8.1 // indirect github.com/nats-io/nkeys v0.4.15 // indirect - github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/pelletier/go-toml/v2 v2.3.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/procfs v0.20.1 // indirect github.com/sagikazarmark/locafero v0.12.0 // indirect diff --git a/go.sum b/go.sum index 36f64bf9..7d0a8ebb 100644 --- a/go.sum +++ b/go.sum @@ -49,8 +49,8 @@ github.com/nats-io/jsm.go v0.4.1 h1:hT0Ksd8Jk5wg9uZKiM5EMAJwSoC8zAu1ivPVB5UA9cI= github.com/nats-io/jsm.go v0.4.1/go.mod h1:rWdrAnJSsCBjjeGbkSvMCB17oPTn+A5kXidixqn0M/E= github.com/nats-io/jwt/v2 v2.8.1 h1:V0xpGuD/N8Mi+fQNDynXohVvp7ZztevW5io8CUWlPmU= github.com/nats-io/jwt/v2 v2.8.1/go.mod h1:nWnOEEiVMiKHQpnAy4eXlizVEtSfzacZ1Q43LIRavZg= -github.com/nats-io/nats-server/v2 v2.14.0 h1:+8q0HrDFotwLLcGH/legOEOnowunhK+aZ4GYBIWpQlM= -github.com/nats-io/nats-server/v2 v2.14.0/go.mod h1:ImVUUDvfClJbb6cuJQRc1VmgDCXKM5ds0OoiG9MVOKo= +github.com/nats-io/nats-server/v2 v2.14.1 h1:wXs/a5fw9Hzm3CvuzLxGeIwpjPulSa7gMT3eSuhGkcg= +github.com/nats-io/nats-server/v2 v2.14.1/go.mod h1:4N17zLpuS7WMbG8T9gsE2B7z9hC9PraPyulVBfpK6nU= github.com/nats-io/nats.go v1.52.0 h1:n3avV4VBsCgsdwh71TppsTwtv+QdPs7ntSKM8qJLGsc= github.com/nats-io/nats.go v1.52.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= @@ -63,6 +63,8 @@ github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc= +github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=