diff --git a/Dockerfile b/Dockerfile index 2a4cfcc1..d982bff0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.24-alpine3.21 AS build +FROM golang:1.25-alpine3.23 AS build COPY . /go/src/nats-surveyor WORKDIR /go/src/nats-surveyor ENV GO111MODULE=on @@ -8,7 +8,7 @@ RUN go build FROM alpine:latest AS osdeps RUN apk add --no-cache ca-certificates -FROM alpine:3.21 +FROM alpine:3.23 COPY --from=build /go/src/nats-surveyor/nats-surveyor /nats-surveyor COPY --from=osdeps /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 4dbc5fc1..5cf604f9 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -23,7 +23,7 @@ services: - NATS_SURVEYOR_PASSWORD - NATS_SURVEYOR_SERVERS - NATS_SURVEYOR_SERVER_COUNT - command: --count ${NATS_SURVEYOR_SERVER_COUNT} -s "${NATS_SURVEYOR_SERVERS}" --accounts --observe /observations --jetstream /jetstream --jsz all + command: --count ${NATS_SURVEYOR_SERVER_COUNT} -s "${NATS_SURVEYOR_SERVERS}" --accounts --accounts-detailed --observe /observations --jetstream /jetstream --jsz all networks: - monitor-net labels: @@ -33,6 +33,8 @@ services: image: prom/prometheus:${PROMETHEUS_DOCKER_TAG} container_name: prometheus restart: "no" + ports: + - "9090:9090" volumes: - ./prometheus/:/etc/prometheus/ - $PROMETHEUS_STORAGE:/usr/local/share/prometheus diff --git a/docker-compose/grafana/provisioning/dashboards/jetstream-metalayer-snapshot-dashboard.json b/docker-compose/grafana/provisioning/dashboards/jetstream-metalayer-snapshot-dashboard.json new file mode 100644 index 00000000..1a7a0c79 --- /dev/null +++ b/docker-compose/grafana/provisioning/dashboards/jetstream-metalayer-snapshot-dashboard.json @@ -0,0 +1,352 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + 
"target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prom_ds}" + }, + "enable": true, + "hide": false, + "iconColor": "light-orange", + "mappings": {}, + "name": "Metalayer snapshot events annotations", + "tagKeys": "server_name", + "target": { + "expr": "nats_core_jetstream_meta_snapshot_last_timestamp_seconds *1000", + "interval": "", + "refId": "Anno" + }, + "textFormat": "Metalayer snapshot event", + "titleFormat": "{{server_name}}", + "useValueForTime": true + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": true, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${prom_ds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } 
+ }, + "targets": [ + { + "editorMode": "code", + "expr": "nats_core_jetstream_meta_snapshot_last_duration_seconds", + "legendFormat": "{{server_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Metalayer snapshot duration, seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prom_ds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "nats_core_jetstream_meta_snapshot_pending_bytes", + 
"legendFormat": "{{server_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Size of the pending entries in the metalayer, bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${prom_ds}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "nats_core_jetstream_meta_snapshot_pending_entries", + "legendFormat": "{{server_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Pending entries in the metalayer", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 42, + "tags": [ + "surveyor" + ], + "templating": { + "list": [ + { + "type": "datasource", + "name": "prom_ds", + "label": "Prometheus Datasource", + "query": "prometheus", + "refresh": 1 + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + 
"timepicker": {}, + "timezone": "", + "title": "JetStream Metalayer snapshot metrics", + "uid": "adrr5gg", + "version": 1, + "weekStart": "" +} diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index c9708d15..244b636d 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -112,6 +112,7 @@ type statzDescs struct { JetstreamMetaSnapshotPendingEntries *GaugeVec JetstreamMetaSnapshotPendingBytes *GaugeVec JetstreamMetaSnapshotLastDuration *GaugeVec + JetstreamMetaSnapshotLastTimestamp *GaugeVec // JetStream server stats JetstreamServerDisabled *GaugeVec JetstreamServerStreams *GaugeVec @@ -448,7 +449,8 @@ func (sc *StatzCollector) buildDescs() { jsMetaSnapshotLabelKeys := []string{"server_id", "server_name", "cluster_name"} sc.descs.JetstreamMetaSnapshotPendingEntries = newGaugeVec(newName("jetstream_meta_snapshot_pending_entries"), "Number of pending entries awaiting meta snapshot", sc.constLabels, jsMetaSnapshotLabelKeys) sc.descs.JetstreamMetaSnapshotPendingBytes = newGaugeVec(newName("jetstream_meta_snapshot_pending_bytes"), "Size in bytes of pending entries awaiting meta snapshot", sc.constLabels, jsMetaSnapshotLabelKeys) - sc.descs.JetstreamMetaSnapshotLastDuration = newGaugeVec(newName("jetstream_meta_snapshot_last_duration"), "Duration of the last meta snapshot in nanoseconds", sc.constLabels, jsMetaSnapshotLabelKeys) + sc.descs.JetstreamMetaSnapshotLastDuration = newGaugeVec(newName("jetstream_meta_snapshot_last_duration_seconds"), "Duration of the last meta snapshot in seconds", sc.constLabels, jsMetaSnapshotLabelKeys) + sc.descs.JetstreamMetaSnapshotLastTimestamp = newGaugeVec(newName("jetstream_meta_snapshot_last_timestamp_seconds"), "Timestamp of the last meta snapshot as Unix epoch in seconds", sc.constLabels, jsMetaSnapshotLabelKeys) jsServerLabelKeys := []string{"server_id", "server_name", "cluster_name"} sc.descs.JetstreamServerDisabled = newGaugeVec(newName("jetstream_server_jetstream_disabled"), "JetStream 
disabled or not", sc.constLabels, jsServerLabelKeys) @@ -845,6 +847,7 @@ func WithStats(batch WithStatsBatch) StatzCollectorOpt { } // NewStatzCollector creates a NATS Statz Collector. +// // Deprecated: NewStatzCollector is deprecated. Use NewStatzCollectorOpts instead. func NewStatzCollector(nc *nats.Conn, logger *logrus.Logger, numServers int, serverDiscoveryWait, pollTimeout time.Duration, accounts, accountsDetailed bool, gatewayz bool, @@ -1460,6 +1463,7 @@ func (sc *StatzCollector) MetricInfos() []MetricInfo { sc.descs.JetstreamMetaSnapshotPendingEntries, sc.descs.JetstreamMetaSnapshotPendingBytes, sc.descs.JetstreamMetaSnapshotLastDuration, + sc.descs.JetstreamMetaSnapshotLastTimestamp, } // Account scope metrics @@ -1718,14 +1722,6 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { } } - // Meta snapshot stats - if sm.Stats.JetStream.Meta.Snapshot != nil { - jsMetaSnapshotLabelValues := []string{sm.Server.ID, serverName(&sm.Server), sm.Server.Cluster} - snapshot := sm.Stats.JetStream.Meta.Snapshot - metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotPendingEntries, float64(snapshot.PendingEntries), jsMetaSnapshotLabelValues) - metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotPendingBytes, float64(snapshot.PendingSize), jsMetaSnapshotLabelValues) - metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotLastDuration, float64(snapshot.LastDuration), jsMetaSnapshotLabelValues) - } } } @@ -1978,6 +1974,19 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { metrics.newGaugeMetric(sc.descs.JetstreamServerBytes, float64(jss.Data.Bytes), jsServerLabelValues) metrics.newGaugeMetric(sc.descs.JetstreamServerMaxMemory, float64(jss.Data.Config.MaxMemory), jsServerLabelValues) metrics.newGaugeMetric(sc.descs.JetstreamServerMaxStorage, float64(jss.Data.Config.MaxStore), jsServerLabelValues) + + // Meta snapshot stats; Snapshot is nil-able, so guard it as well as Meta before dereferencing. + if jss.Data.Meta != nil && jss.Data.Meta.Snapshot != nil { + stats := jss.Data.Meta.Snapshot + 
metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotPendingEntries, + float64(stats.PendingEntries), jsServerLabelValues) + metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotPendingBytes, + float64(stats.PendingSize), jsServerLabelValues) + metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotLastDuration, + stats.LastDuration.Seconds(), jsServerLabelValues) + metrics.newGaugeMetric(sc.descs.JetstreamMetaSnapshotLastTimestamp, + float64(stats.LastTime.Unix()), jsServerLabelValues) + } } } diff --git a/surveyor/collector_statz_test.go b/surveyor/collector_statz_test.go index 7575524f..057d65f8 100644 --- a/surveyor/collector_statz_test.go +++ b/surveyor/collector_statz_test.go @@ -194,6 +194,12 @@ func TestStatzCollector_WithStats_Jsz(t *testing.T) { "nats_consumer_num_redelivered", "nats_consumer_num_waiting", } + metaSnapshotMetrics := []string{ + "nats_core_jetstream_meta_snapshot_last_duration_seconds", + "nats_core_jetstream_meta_snapshot_last_timestamp_seconds", + "nats_core_jetstream_meta_snapshot_pending_bytes", + "nats_core_jetstream_meta_snapshot_pending_entries", + } allMetrics := []string{} allMetrics = append(allMetrics, streamMetrics...) @@ -228,7 +234,7 @@ func TestStatzCollector_WithStats_Jsz(t *testing.T) { jszLeadersOnly: false, jszFilters: nil, assert: func(t *testing.T, test *test, output string) { - for _, m := range allMetrics { + for _, m := range append(allMetrics, metaSnapshotMetrics...) 
{ if !strings.Contains(output, m) { t.Fatalf("invalid output, missing '%s':\n%v\n", m, output) } @@ -412,55 +418,6 @@ func TestStatzCollector_GoMemLimit(t *testing.T) { } } -func TestStatzCollector_WithStats_JetstreamMetaSnapshot(t *testing.T) { - statsRaw, err := os.ReadFile("testdata/stats/stats-meta-snapshot.json") - if err != nil { - t.Fatalf("error reading testdata: %v", err) - } - stats := &server.ServerStatsMsg{} - err = json.Unmarshal(statsRaw, stats) - if err != nil { - t.Fatalf("error unmarshalling stats: %v", err) - } - - sc, err := NewStatzCollectorOpts( - WithStats(WithStatsBatch{ - Stats: []*server.ServerStatsMsg{stats}, - }), - ) - if err != nil { - t.Fatalf("error creating statz collector: %v", err) - } - - output := gatherStatzCollectorMetrics(t, sc) - - // Verify all JetStream meta cluster snapshot metrics are present - want := []string{ - "nats_core_jetstream_meta_snapshot_pending_entries", - "nats_core_jetstream_meta_snapshot_pending_bytes", - "nats_core_jetstream_meta_snapshot_last_duration", - } - - for _, m := range want { - if !strings.Contains(output, m) { - t.Fatalf("invalid output, missing '%s':\n%v\n", m, output) - } - } - - // Verify specific metric values - expectedValues := []string{ - "nats_core_jetstream_meta_snapshot_pending_entries{cluster_name=\"meta-cluster\",server_id=\"meta-server1\",server_name=\"meta-server1\"} 1500", - "nats_core_jetstream_meta_snapshot_pending_bytes{cluster_name=\"meta-cluster\",server_id=\"meta-server1\",server_name=\"meta-server1\"} 524288", - "nats_core_jetstream_meta_snapshot_last_duration{cluster_name=\"meta-cluster\",server_id=\"meta-server1\",server_name=\"meta-server1\"} 1.23456789e+09", - } - - for _, expectedValue := range expectedValues { - if !strings.Contains(output, expectedValue) { - t.Fatalf("expected metric value not found. 
Expected: %s\nActual output:\n%v", expectedValue, output) - } - } -} - func TestStatzCollector_MetricInfos(t *testing.T) { sc, err := NewStatzCollectorOpts( WithStats(WithStatsBatch{ diff --git a/surveyor/testdata/stats/jsz.json b/surveyor/testdata/stats/jsz.json index c8640f43..f02e0f28 100644 --- a/surveyor/testdata/stats/jsz.json +++ b/surveyor/testdata/stats/jsz.json @@ -32,6 +32,19 @@ "consumers": 10, "messages": 10, "bytes": 1024, + "meta_cluster": { + "name": "proc_compose", + "leader": "s2", + "peer": "noC8kOtg", + "cluster_size": 3, + "pending": 0, + "snapshot": { + "pending_entries": 42, + "pending_size": 336, + "last_time": "2026-02-03T09:20:55.329870139Z", + "last_duration": 77120006 + } + }, "account_details": [ { "name": "account1", @@ -68,7 +81,7 @@ "replicas": 3, "storage": "file" }, - "state": {}, + "state": { }, "stream_raft_group": "raft", "consumer_raft_groups": [ { diff --git a/surveyor/testdata/stats/stats-meta-snapshot.json b/surveyor/testdata/stats/stats-meta-snapshot.json deleted file mode 100644 index 17e99f2f..00000000 --- a/surveyor/testdata/stats/stats-meta-snapshot.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "server": { - "name": "meta-server1", - "host": "0.0.0.0", - "id": "meta-server1", - "cluster": "meta-cluster", - "version": "2.12.3", - "jetstream": true, - "flags": 1 - }, - "statsz": { - "start": "2026-01-01T00:00:00Z", - "mem": 2048, - "cores": 4, - "cpu": 0.3, - "connections": 20, - "total_connections": 200, - "active_accounts": 15, - "num_subs": 150, - "sent": { - "bytes": 2048, - "msgs": 200 - }, - "received": { - "bytes": 2048, - "msgs": 200 - }, - "active_servers": 3, - "jetstream": { - "config": { - "max_memory": 2048, - "max_storage": 2048, - "sync_interval": 2000, - "sync_always": true - }, - "stats": { - "memory": 2048, - "storage": 2048, - "reserved_memory": 1024, - "reserved_storage": 1024, - "accounts": 15, - "ha_assets": 15, - "api": { - "level": 1, - "total": 100, - "errors": 2 - } - }, - "meta": { - "name": 
"meta-cluster", - "leader": "meta-server1", - "peer": "meta-server2", - "replicas": [ - { - "name": "meta-server2", - "current": true, - "offline": false, - "active": 2000, - "lag": 50, - "peer": "meta-server2" - }, - { - "name": "meta-server3", - "current": true, - "offline": false, - "active": 2100, - "lag": 25, - "peer": "meta-server3" - } - ], - "size": 3, - "pending": 0, - "snapshot": { - "pending_entries": 1500, - "pending_size": 524288, - "last_time": "2026-01-20T15:30:45.123456789Z", - "last_duration": 1234567890 - } - } - } - } -}