From 51c199ac5eeeecb44bfb7bad06f85c0df7c5f223 Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Mon, 20 Apr 2026 20:01:16 +0600 Subject: [PATCH 1/8] QVAC-17131 feat: add Prometheus metrics monitoring to registry server (#1600) * feat: add Prometheus metrics monitoring to registry server * fix: restrict registry ping RPC to role and timestamp to avoid exposing operational data * fix: make metrics bind host configurable and move off port 9090 --- packages/qvac-lib-registry-server/README.md | 2 +- .../client/lib/client.js | 1 + .../docs/DEPLOYMENT_GUIDE.md | 133 ++ .../docs/grafana/REGISTRY_DASHBOARD.json | 1917 +++++++++++++++++ .../ecosystem.config.js | 22 + .../lib/metrics-server.js | 64 + .../qvac-lib-registry-server/lib/metrics.js | 195 ++ .../lib/registry-service.js | 216 +- .../qvac-lib-registry-server/package.json | 4 + .../scripts/add-model.js | 2 + .../qvac-lib-registry-server/scripts/bin.js | 37 +- .../integration/metrics.integration.test.js | 249 +++ 12 files changed, 2760 insertions(+), 82 deletions(-) create mode 100644 packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json create mode 100644 packages/qvac-lib-registry-server/ecosystem.config.js create mode 100644 packages/qvac-lib-registry-server/lib/metrics-server.js create mode 100644 packages/qvac-lib-registry-server/lib/metrics.js create mode 100644 packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js diff --git a/packages/qvac-lib-registry-server/README.md b/packages/qvac-lib-registry-server/README.md index 25fe9e68a5..a24dd04871 100644 --- a/packages/qvac-lib-registry-server/README.md +++ b/packages/qvac-lib-registry-server/README.md @@ -302,7 +302,7 @@ Regenerate specs with `npm run build:spec` and restart the service. node scripts/check-peers.js [--key ] ``` -**`ping-server.js`**: Pings a running registry server via RPC to check availability and retrieve server status (role, view key, lengths, connected peers). 
+**`ping-server.js`**: Pings a running registry server via RPC to verify availability and confirm the connected peer is the indexer rather than a blind relay. Returns `role` and `timestamp` only — operational metrics (model count, view core lag, peer counts, etc.) are exposed via the Prometheus `/metrics` endpoint instead. ```bash node scripts/ping-server.js [--peer ] diff --git a/packages/qvac-lib-registry-server/client/lib/client.js b/packages/qvac-lib-registry-server/client/lib/client.js index 6acac3e751..27e8e44cec 100644 --- a/packages/qvac-lib-registry-server/client/lib/client.js +++ b/packages/qvac-lib-registry-server/client/lib/client.js @@ -56,6 +56,7 @@ class QVACRegistryClient extends ReadyResource { this.hyperswarm.on('connection', this._connectionHandler) this._metadataReady = this._connectMetadataCore() + await this._metadataReady } async _connectMetadataCore () { diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index 079f3aedb7..0fa924a141 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -531,6 +531,137 @@ node scripts/bin.js run --storage ./new-writer --bootstrap --skip-storage- | Admin command retries | May need 1-2 retries | Usually works first try | | Writer coordination | Manual timing recommended | Automated/scripted works | +## Monitoring + +Four layers of operational visibility, each independently deployable. + +### Layer 1: In-Process Prometheus /metrics Endpoint + +The registry server exposes Prometheus metrics via an HTTP endpoint bound to `127.0.0.1`. 
+ +**Start with metrics enabled (default port 9210):** + +```bash +node scripts/bin.js run --storage ./corestore --metrics-port 9210 +``` + +**Or disable metrics:** + +```bash +node scripts/bin.js run --storage ./corestore --metrics-port 0 +``` + +**What is exposed:** + +- **Holepunch P2P metrics** (via `hypercore-stats`, `hyperswarm-stats`, `hypermetrics`): core stats, swarm connections, DHT, UDX bytes/packets, per-core upload/download counters. +- **QVAC-specific metrics:** + +| Metric | Type | Description | +|--------|------|-------------| +| `qvac_registry_models_total` | Gauge | Total models in the registry | +| `qvac_registry_blob_cores_total` | Gauge | Number of blob cores | +| `qvac_registry_blob_core_peers` | Gauge | Connected peers per blob core | +| `qvac_registry_blob_core_fully_downloaded` | Gauge | Whether each blob core is fully replicated | +| `qvac_registry_view_core_length` | Gauge | View core length (total blocks) | +| `qvac_registry_view_core_contiguous_length` | Gauge | View core contiguous length (gap indicates replication lag) | +| `qvac_registry_rpc_requests_total` | Counter | RPC requests by method | +| `qvac_registry_rpc_errors_total` | Counter | RPC errors by method | +| `qvac_registry_is_indexer` | Gauge | Whether this node is an indexer | +| `qvac_registry_blind_peers_connected` | Gauge | Number of configured blind peers with an active connection | +| `qvac_registry_blind_peer_connected` | Gauge | Per-blind-peer connection status (labeled by `peer_key`) | +| `qvac_registry_blob_core_byte_length` | Gauge | Byte length per blob core | +| `qvac_registry_model_size_bytes` | Gauge | Size of each model blob (labeled by path, engine, quantization) | + +**Prometheus scrape config (local Prometheus, loopback bind):** + +```yaml +scrape_configs: + - job_name: 'qvac-registry' + scrape_interval: 30s + static_configs: + - targets: ['127.0.0.1:9210'] +``` + +**Prometheus scrape config (central Prometheus scraping multiple registry VMs):** + +Run the 
registry with `--metrics-host 0.0.0.0` (or the private-network NIC address) so a remote Prometheus can reach the endpoint. Attach matching labels across jobs (`node-exporter`, `pm2-prometheus-exporter`, `qvac-registry`) so Grafana template variables work uniformly. + +```yaml +scrape_configs: + - job_name: 'qvac-registry' + scrape_interval: 30s + static_configs: + - targets: [':9210'] + labels: + vm_name: '' + network: '' + zone: '' + - targets: [':9210'] + labels: + vm_name: '' + network: '' + zone: '' +``` + +**Security:** Port 9210 is chosen to avoid confusion with Prometheus's own port 9090 and to sit next to pm2-prometheus-exporter on 9209. The endpoint binds to `127.0.0.1` by default. When exposing on a private network via `--metrics-host`, restrict access with firewall rules, VPN/overlay network ACLs (WireGuard, Tailscale, Nebula), or a VPC security group. Do not expose to the public internet. + +### Layer 2: hyper-health-check Sidecar + +Run [hyper-health-check](https://github.com/holepunchto/hyper-health-check) as a separate PM2 process to independently verify that cores are discoverable and downloadable from the swarm. The server might report healthy internals while peers cannot actually reach it. + +```bash +pm2 start node_modules/.bin/hyper-health-check -- run \ + --core :registry-view \ + --core :blob-models \ + --port 9091 \ + --grace-period 600000 +``` + +The 10-minute grace period accommodates replication lag after model additions — blind peers need time to download multi-GB blobs before being flagged as unhealthy. 
+ +**Exposed metrics (on port 9091):** + +- `hyper_health_peers_total` — peers swarming each core +- `hyper_health_peers_with_all_data_total` — peers with full replication +- `hyper_health_ips_with_all_data_total` — unique IPs with full data (geographic diversity) + +### Layer 3: PM2 Ecosystem Config + +The repository includes `ecosystem.config.js` for standardized PM2 process management: + +```bash +pm2 start ecosystem.config.js +``` + +This starts both the registry server (with metrics on port 9210, loopback by default) and the health-check sidecar (on port 9091). For remote Prometheus scraping, edit the `args` field to add `--metrics-host ` and ensure the port is firewalled to trusted scrapers only. + +**Per-deployment customization:** Override `--core` flags for the health-check app via PM2 environment variables or by editing the `args` field. + +**Process-level metrics:** Install `pm2-prometheus-exporter` for CPU, memory, heap, event loop latency, restarts, and uptime metrics: + +```bash +pm2 install pm2-prometheus-exporter +``` + +This exposes process metrics on `localhost:9209` alongside the application-level metrics from Layers 1 and 2. + +### Layer 4: Grafana Dashboard + +Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashboards/22313-hypercore-hyperswarm/) (ID: 22313) as a baseline. It includes panels for Hypercore, Hyperswarm, HyperDHT, UDX, and Node.js process stats. + +**Add QVAC-specific panels for:** + +- **Model availability:** `qvac_registry_models_total`, `hyper_health_peers_with_all_data_total` +- **Storage breakdown:** `qvac_registry_model_size_bytes` by engine/quantization, `sum(qvac_registry_blob_core_byte_length)` +- **RPC activity:** `rate(qvac_registry_rpc_requests_total[5m])`, error ratio +- **Cluster health:** `qvac_registry_is_indexer` across nodes, `qvac_registry_view_core_length` vs `qvac_registry_view_core_contiguous_length` + +**Import the baseline dashboard:** + +1. 
Add Prometheus as a data source in Grafana (URL of the Prometheus server itself, e.g. `http://prometheus-vm:9090`) +2. Import dashboard ID `22313` +3. Add custom panels for QVAC metrics + ## Reference ### Environment Variables @@ -554,6 +685,8 @@ node scripts/bin.js run --storage ./new-writer --bootstrap --skip-storage- | `node scripts/bin.js run --storage ` | Start a writer | | `node scripts/bin.js run --bootstrap ` | Join existing cluster | | `node scripts/bin.js run --blind-peers ` | Enable blind peer replication | +| `node scripts/bin.js run --metrics-port ` | Prometheus metrics port (default: 9210, 0 to disable) | +| `node scripts/bin.js run --metrics-host ` | Prometheus metrics bind address (default: 127.0.0.1; use 0.0.0.0 or a private NIC IP to expose) | | `node scripts/bin.js run --skip-storage-check` | Bypass storage/bootstrap key mismatch check | | `node scripts/bin.js init-writer --storage ` | Initialize/authorize a writer client | | `node scripts/bin.js sync-models --file ` | Sync models from JSON config | diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json new file mode 100644 index 0000000000..92a8f92304 --- /dev/null +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -0,0 +1,1917 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_up{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Process Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "green", + "value": 86400 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_uptime{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + 
] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_restarts{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Restarts", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": 
"pm2_memory{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_cpu{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + 
"drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "100 - (avg by (vm_name) (rate(node_cpu_seconds_total{mode=\"idle\", vm_name=~\"$vm\"}[5m])) * 100)", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "VM CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" 
+ } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "(1 - (node_memory_MemAvailable_bytes{vm_name=~\"$vm\"} / node_memory_MemTotal_bytes{vm_name=~\"$vm\"})) * 100", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "VM Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{mountpoint=\"/\", vm_name=~\"$vm\"} / node_filesystem_size_bytes{mountpoint=\"/\", vm_name=~\"$vm\"}) * 100)", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Disk Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 
0 + }, + { + "color": "yellow", + "value": 5000000000 + }, + { + "color": "green", + "value": 20000000000 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "node_filesystem_avail_bytes{mountpoint=\"/\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Disk Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{device!~\"lo|tailscale.*\", vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "A" + }, + { + "expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|tailscale.*\", vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} read", + "refId": "A" + }, + { + "expr": 
"-rate(node_disk_written_bytes_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} write", + "refId": "B" + } + ], + "title": "Disk I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "node_load15{vm_name=~\"$vm\"} / count without (cpu, mode) (node_cpu_seconds_total{mode=\"idle\", vm_name=~\"$vm\"})", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 15, + "panels": [], + "title": "QVAC Registry Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 37 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_models_total", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Models", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 37 + }, + "id": 17, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_is_indexer", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Indexer", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "yellow", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + 
"overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 37 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blind_peers_connected", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blind Peers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 37 + }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_cores_total", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blob Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "No" + }, + "1": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 37 + }, + "id": 20, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": 
"auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "min(qvac_registry_blob_core_fully_downloaded)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blobs Synced", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 37 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(qvac_registry_blob_core_byte_length)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blob Storage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": 
"A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_view_core_length", + "legendFormat": "length", + "refId": "A" + }, + { + "expr": "qvac_registry_view_core_contiguous_length", + "legendFormat": "contiguous", + "refId": "B" + }, + { + "expr": "qvac_registry_view_core_length - qvac_registry_view_core_contiguous_length", + "legendFormat": "gap (replication lag)", + "refId": "C" + } + ], + "title": "View Core Replication", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 23, + "options": { + 
"legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_peers", + "legendFormat": "{{core_name}}", + "refId": "A" + } + ], + "title": "Blob Core Peers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(qvac_registry_rpc_requests_total[5m])", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "RPC Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(qvac_registry_rpc_errors_total[5m])", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "RPC Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 26, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + 
"pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_model_size_bytes", + "legendFormat": "{{path}}", + "refId": "A" + } + ], + "title": "Model Size Breakdown", + "type": "bargauge" + }, + { + "datasource": { + "type": "loki" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 13, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "{job=\"registry\"}", + "refId": "A" + } + ], + "title": "Registry Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 75 + }, + "id": 14, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "{job=\"registry\", level=\"error\"}", + "refId": "A" + } + ], + "title": "Registry Error Logs", + "type": "logs" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [ + "registry", + "pm2", + "qvac" + ], + "templating": { + "list": [ + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus" + }, + "includeAll": true, + "multi": true, + "name": "vm", + "options": [], + "query": "label_values(pm2_up{name=\"registry\"}, vm_name)", + "refresh": 1, + "regexApplyTo": "value", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "QVAC Registry 
Overview", + "uid": "f43aa479-9d22-4b3b-baa6-f527a2615981", + "version": 1, + "weekStart": "" +} diff --git a/packages/qvac-lib-registry-server/ecosystem.config.js b/packages/qvac-lib-registry-server/ecosystem.config.js new file mode 100644 index 0000000000..526cdd6f0d --- /dev/null +++ b/packages/qvac-lib-registry-server/ecosystem.config.js @@ -0,0 +1,22 @@ +'use strict' + +module.exports = { + apps: [ + { + name: 'registry', + script: 'scripts/bin.js', + args: 'run --storage ./corestore --metrics-port 9210', + env: { + NODE_ENV: 'production' + } + }, + { + name: 'health-check', + script: 'node_modules/.bin/hyper-health-check', + args: 'run --port 9091 --grace-period 600000', + env: { + NODE_ENV: 'production' + } + } + ] +} diff --git a/packages/qvac-lib-registry-server/lib/metrics-server.js b/packages/qvac-lib-registry-server/lib/metrics-server.js new file mode 100644 index 0000000000..a01a6b3302 --- /dev/null +++ b/packages/qvac-lib-registry-server/lib/metrics-server.js @@ -0,0 +1,64 @@ +'use strict' + +const http = require('http') +const ReadyResource = require('ready-resource') + +const DEFAULT_PORT = 9210 +const DEFAULT_HOST = '127.0.0.1' + +class MetricsServer extends ReadyResource { + constructor (promRegister, opts = {}) { + super() + + this._register = promRegister + this._port = opts.port || DEFAULT_PORT + this._host = opts.host || DEFAULT_HOST + this._logger = opts.logger || console + this._server = null + } + + async _open () { + this._server = http.createServer(async (req, res) => { + if (req.url === '/metrics' && req.method === 'GET') { + try { + const metrics = await this._register.metrics() + res.writeHead(200, { 'Content-Type': this._register.contentType }) + res.end(metrics) + } catch (err) { + this._logger.error({ err }, 'MetricsServer: failed to collect metrics') + res.writeHead(500) + res.end('Internal Server Error') + } + return + } + + res.writeHead(404) + res.end('Not Found') + }) + + await new Promise((resolve, reject) => { + 
this._server.listen(this._port, this._host, () => { + this._logger.info({ + host: this._host, + port: this._port + }, 'MetricsServer: listening') + resolve() + }) + this._server.on('error', reject) + }) + } + + async _close () { + if (!this._server) return + + await new Promise((resolve) => { + this._server.close(() => { + this._logger.info('MetricsServer: closed') + resolve() + }) + }) + this._server = null + } +} + +module.exports = MetricsServer diff --git a/packages/qvac-lib-registry-server/lib/metrics.js b/packages/qvac-lib-registry-server/lib/metrics.js new file mode 100644 index 0000000000..e39d4ce17b --- /dev/null +++ b/packages/qvac-lib-registry-server/lib/metrics.js @@ -0,0 +1,195 @@ +'use strict' + +const promClient = require('prom-client') + +const MODEL_CACHE_TTL_MS = 15000 + +class QvacMetrics { + constructor (service, opts = {}) { + this._service = service + this._logger = opts.logger || console + + this._modelCache = null + this._modelCacheExpiry = 0 + + this._rpcRequests = new promClient.Counter({ + name: 'qvac_registry_rpc_requests_total', + help: 'Total RPC requests by method', + labelNames: ['method'] + }) + + this._rpcErrors = new promClient.Counter({ + name: 'qvac_registry_rpc_errors_total', + help: 'Total RPC errors by method', + labelNames: ['method'] + }) + + this._registerGauges() + } + + recordRpcRequest (method) { + this._rpcRequests.inc({ method }) + } + + recordRpcError (method) { + this._rpcErrors.inc({ method }) + } + + _registerGauges () { + const self = this + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_models_total', + help: 'Total number of models in the registry', + async collect () { + const models = await self._getCachedModels() + this.set(models.length) + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_blob_cores_total', + help: 'Number of blob cores', + collect () { + this.set(self._service.blobsCores.size) + } + }) + + // 
eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_blob_core_peers', + help: 'Number of connected peers per blob core', + labelNames: ['core_name'], + collect () { + this.reset() + for (const [name, { core }] of self._service.blobsCores) { + this.set({ core_name: name }, core.peers.length) + } + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_blob_core_fully_downloaded', + help: 'Whether each blob core is fully downloaded (1=yes, 0=no)', + labelNames: ['core_name'], + collect () { + this.reset() + for (const [name, { core }] of self._service.blobsCores) { + const full = core.contiguousLength === core.length && core.length > 0 ? 1 : 0 + this.set({ core_name: name }, full) + } + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_view_core_length', + help: 'View core length (total blocks)', + collect () { + const viewCore = self._service.view?.core + this.set(viewCore ? viewCore.length : 0) + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_view_core_contiguous_length', + help: 'View core contiguous length (gap = length - contiguous indicates replication lag)', + collect () { + const viewCore = self._service.view?.core + this.set(viewCore ? viewCore.contiguousLength : 0) + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_is_indexer', + help: 'Whether this node is an indexer (1=yes, 0=no)', + collect () { + this.set(self._service.base?.isIndexer ? 
1 : 0) + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_blind_peers_connected', + help: 'Number of configured blind peers with an active connection', + collect () { + this.set(self._service.getConnectedBlindPeerKeys().length) + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_blind_peer_connected', + help: 'Whether each configured blind peer currently has an active connection (1=yes, 0=no)', + labelNames: ['peer_key'], + collect () { + this.reset() + for (const peerKey of self._service.getConfiguredBlindPeerKeys()) { + this.set( + { peer_key: peerKey }, + self._service.isBlindPeerConnected(peerKey) ? 1 : 0 + ) + } + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_blob_core_byte_length', + help: 'Byte length of each blob core', + labelNames: ['core_name'], + collect () { + this.reset() + for (const [name, { core }] of self._service.blobsCores) { + this.set({ core_name: name }, core.byteLength) + } + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_model_size_bytes', + help: 'Size in bytes of each model blob', + labelNames: ['path', 'engine', 'quantization'], + async collect () { + const models = await self._getCachedModels() + this.reset() + for (const m of models) { + if (m.blobBinding && m.blobBinding.byteLength > 0) { + this.set({ + path: m.path, + engine: m.engine || '', + quantization: m.quantization || '' + }, m.blobBinding.byteLength) + } + } + } + }) + } + + async _getCachedModels () { + const now = Date.now() + if (this._modelCache && now < this._modelCacheExpiry) { + return this._modelCache + } + + try { + const view = this._service.view + if (!view || !view.opened) return this._modelCache || [] + const models = await view.findModelsByPath({}).toArray() + this._modelCache = models + this._modelCacheExpiry = now + MODEL_CACHE_TTL_MS + return models + } catch (err) { + 
this._logger.warn({ err: err.message }, 'QvacMetrics: failed to query models') + return this._modelCache || [] + } + } +} + +module.exports = QvacMetrics diff --git a/packages/qvac-lib-registry-server/lib/registry-service.js b/packages/qvac-lib-registry-server/lib/registry-service.js index 63794b2bea..74c30a27d0 100644 --- a/packages/qvac-lib-registry-server/lib/registry-service.js +++ b/packages/qvac-lib-registry-server/lib/registry-service.js @@ -70,10 +70,12 @@ class RegistryService extends ReadyResource { this.blobsStore = this.store.namespace('blobs') this.blobsCores = new Map() + this._peerConnectionCounts = new Map() this._indexerMonitor = null this._mirroredCoreIds = new Set() this.blindPeering = null this.reseedTracker = null + this.metrics = null this._registerApplyHandlers() @@ -148,10 +150,15 @@ class RegistryService extends ReadyResource { }) this.swarm.on('connection', (conn, peerInfo) => { - const peerKey = peerInfo?.publicKey ? IdEnc.normalize(peerInfo.publicKey) : 'unknown' + const peerKey = peerInfo?.publicKey ? 
IdEnc.normalize(peerInfo.publicKey) : null - this.logger.info({ peer: peerKey }, 'Swarm connection opened') - conn.on('close', () => this.logger.info({ peer: peerKey }, 'Swarm connection closed')) + if (peerKey) this._trackPeerConnection(peerKey) + + this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection opened') + conn.on('close', () => { + if (peerKey) this._untrackPeerConnection(peerKey) + this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection closed') + }) this._setupRpc(conn) @@ -307,6 +314,7 @@ class RegistryService extends ReadyResource { }) } this.blobsCores.clear() + this._peerConnectionCounts.clear() this._mirroredCoreIds.clear() this.logger.info('RegistryService: closed') @@ -468,28 +476,34 @@ class RegistryService extends ReadyResource { rpc.respond( 'add-model', async (entry) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('add-model') + try { + ensureWriterAccess() - if (!this.opened) await this.ready() - await this._ensureIndexer() + if (!this.opened) await this.ready() + await this._ensureIndexer() - const skipExisting = entry.skipExisting || false - const modelEntry = { ...entry } - delete modelEntry.skipExisting + const skipExisting = entry.skipExisting || false + const modelEntry = { ...entry } + delete modelEntry.skipExisting - const result = await this.addModel(modelEntry, { skipExisting }) - - this.logger.info({ - path: result.path, - source: result.source - }, 'RPC: add-model completed') + const result = await this.addModel(modelEntry, { skipExisting }) - return { - success: true, - model: { + this.logger.info({ path: result.path, source: result.source + }, 'RPC: add-model completed') + + return { + success: true, + model: { + path: result.path, + source: result.source + } } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('add-model') + throw err } } ) @@ -497,19 +511,25 @@ class RegistryService extends ReadyResource { rpc.respond( 'put-license', async (licenseRecord) => 
{ - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('put-license') + try { + ensureWriterAccess() - if (!this.opened) await this.ready() - await this._ensureIndexer() - await this.putLicense(licenseRecord) + if (!this.opened) await this.ready() + await this._ensureIndexer() + await this.putLicense(licenseRecord) - this.logger.info({ - spdxId: licenseRecord.spdxId - }, 'RPC: put-license completed') + this.logger.info({ + spdxId: licenseRecord.spdxId + }, 'RPC: put-license completed') - return { - success: true, - message: 'License operation appended' + return { + success: true, + message: 'License operation appended' + } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('put-license') + throw err } } ) @@ -517,45 +537,51 @@ class RegistryService extends ReadyResource { rpc.respond( 'update-model-metadata', async (data) => { - ensureWriterAccess() - - if (!data.path) throw new TypeError('path is required') - if (!data.source) throw new TypeError('source is required') - - if (!this.opened) await this.ready() - await this._ensureIndexer() - - const existing = await this.getModelByKey({ path: data.path, source: data.source }) - if (!existing) throw new Error(`Model not found: ${data.path}`) - - // If explicitly undeprecating, clear deprecation fields - const isUndeprecating = data.deprecated === false - - const updated = { - ...existing, - engine: data.engine ?? existing.engine, - licenseId: data.licenseId ?? existing.licenseId, - description: data.description ?? existing.description, - quantization: data.quantization ?? existing.quantization, - params: data.params ?? existing.params, - notes: data.notes ?? existing.notes, - tags: data.tags ?? existing.tags, - deprecated: data.deprecated !== undefined ? data.deprecated : existing.deprecated, - deprecatedAt: isUndeprecating ? '' : (data.deprecatedAt ?? existing.deprecatedAt), - replacedBy: isUndeprecating ? '' : (data.replacedBy ?? 
existing.replacedBy), - deprecationReason: isUndeprecating ? '' : (data.deprecationReason ?? existing.deprecationReason) - } + if (this.metrics) this.metrics.recordRpcRequest('update-model-metadata') + try { + ensureWriterAccess() + + if (!data.path) throw new TypeError('path is required') + if (!data.source) throw new TypeError('source is required') + + if (!this.opened) await this.ready() + await this._ensureIndexer() + + const existing = await this.getModelByKey({ path: data.path, source: data.source }) + if (!existing) throw new Error(`Model not found: ${data.path}`) + + // If explicitly undeprecating, clear deprecation fields + const isUndeprecating = data.deprecated === false + + const updated = { + ...existing, + engine: data.engine ?? existing.engine, + licenseId: data.licenseId ?? existing.licenseId, + description: data.description ?? existing.description, + quantization: data.quantization ?? existing.quantization, + params: data.params ?? existing.params, + notes: data.notes ?? existing.notes, + tags: data.tags ?? existing.tags, + deprecated: data.deprecated !== undefined ? data.deprecated : existing.deprecated, + deprecatedAt: isUndeprecating ? '' : (data.deprecatedAt ?? existing.deprecatedAt), + replacedBy: isUndeprecating ? '' : (data.replacedBy ?? existing.replacedBy), + deprecationReason: isUndeprecating ? '' : (data.deprecationReason ?? existing.deprecationReason) + } - await this._appendOperation(DISPATCH_PUT_MODEL, updated) + await this._appendOperation(DISPATCH_PUT_MODEL, updated) - const viewLength = this.view?.core?.length ?? 0 - const viewContiguous = this.view?.core?.contiguousLength ?? 0 - const viewSigned = this.view?.core?.signedLength ?? 0 - this.logger.info({ path: data.path, viewLength, viewContiguous, viewSigned }, 'RPC: update-model-metadata completed') + const viewLength = this.view?.core?.length ?? 0 + const viewContiguous = this.view?.core?.contiguousLength ?? 0 + const viewSigned = this.view?.core?.signedLength ?? 
0 + this.logger.info({ path: data.path, viewLength, viewContiguous, viewSigned }, 'RPC: update-model-metadata completed') - return { - success: true, - model: updated + return { + success: true, + model: updated + } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('update-model-metadata') + throw err } } ) @@ -563,28 +589,33 @@ class RegistryService extends ReadyResource { rpc.respond( 'delete-model', async (data) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('delete-model') + try { + ensureWriterAccess() - if (!data.path) throw new TypeError('path is required') - if (!data.source) throw new TypeError('source is required') + if (!data.path) throw new TypeError('path is required') + if (!data.source) throw new TypeError('source is required') - if (!this.opened) await this.ready() - await this._ensureIndexer() + if (!this.opened) await this.ready() + await this._ensureIndexer() - const result = await this.deleteModel({ path: data.path, source: data.source }) + const result = await this.deleteModel({ path: data.path, source: data.source }) - this.logger.info({ - path: data.path, - source: data.source - }, 'RPC: delete-model completed') - - return result + this.logger.info({ + path: data.path, + source: data.source + }, 'RPC: delete-model completed') + + return result + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('delete-model') + throw err + } } ) - // Server identification endpoint - allows RPC clients to verify they connected - // to the actual server and not a blind peer (which won't have RPC responders) rpc.respond('ping', async () => { + if (this.metrics) this.metrics.recordRpcRequest('ping') return { role: 'registry-server', timestamp: Date.now() @@ -826,6 +857,34 @@ class RegistryService extends ReadyResource { this.logger.info('Indexer status confirmed') } + _trackPeerConnection (peerKey) { + const current = this._peerConnectionCounts.get(peerKey) || 0 + 
this._peerConnectionCounts.set(peerKey, current + 1) + } + + _untrackPeerConnection (peerKey) { + const current = this._peerConnectionCounts.get(peerKey) || 0 + if (current <= 1) { + this._peerConnectionCounts.delete(peerKey) + return + } + + this._peerConnectionCounts.set(peerKey, current - 1) + } + + getConfiguredBlindPeerKeys () { + return [...new Set(this.blindPeerKeys)] + } + + isBlindPeerConnected (peerKey) { + return (this._peerConnectionCounts.get(peerKey) || 0) > 0 + } + + getConnectedBlindPeerKeys () { + return this.getConfiguredBlindPeerKeys() + .filter(peerKey => this.isBlindPeerConnected(peerKey)) + } + async _downloadArtifact (sourceInfo, localPath) { switch (sourceInfo.protocol) { case 'hf': @@ -969,7 +1028,6 @@ class RegistryService extends ReadyResource { discoveryKey: core.discoveryKey.toString('hex') }, 'Hyperblobs core ready') - // Caller is responsible for mirroring after data is added return entry } diff --git a/packages/qvac-lib-registry-server/package.json b/packages/qvac-lib-registry-server/package.json index 3330611838..34ecf056d1 100644 --- a/packages/qvac-lib-registry-server/package.json +++ b/packages/qvac-lib-registry-server/package.json @@ -67,15 +67,19 @@ "crypto": "npm:bare-node-crypto", "http": "npm:bare-node-http", "https": "npm:bare-node-https", + "hyper-health-check": "^1.3.0", "hyperblobs": "^2.8.0", "hypercore-id-encoding": "^1.3.0", + "hypercore-stats": "^2.4.0", "hyperdb": "^4.16.0", "hyperdispatch": "^1.4.0", "hyperschema": "^1.13.0", "hyperswarm": "^4.10.5", + "hyperswarm-stats": "^1.3.0", "paparam": "^1.8.6", "pino": "^9.9.4", "pino-pretty": "^13.1.1", + "prom-client": "^15.1.3", "protomux-rpc": "^1.7.0", "readline": "npm:bare-node-readline", "ready-resource": "^1.2.0", diff --git a/packages/qvac-lib-registry-server/scripts/add-model.js b/packages/qvac-lib-registry-server/scripts/add-model.js index 076f18dd6d..116b10ed40 100644 --- a/packages/qvac-lib-registry-server/scripts/add-model.js +++ 
b/packages/qvac-lib-registry-server/scripts/add-model.js @@ -24,6 +24,8 @@ async function addModel () { primaryKey = args[++i] } else if (args[i] === '--models-file' || args[i] === '-f') { modelsFile = args[++i] + } else if (args[i] === '--') { + continue } else if (!canonicalSource && args[i] !== '--skip-existing') { canonicalSource = args[i] } diff --git a/packages/qvac-lib-registry-server/scripts/bin.js b/packages/qvac-lib-registry-server/scripts/bin.js index c01f21d53a..c9cf4069d2 100644 --- a/packages/qvac-lib-registry-server/scripts/bin.js +++ b/packages/qvac-lib-registry-server/scripts/bin.js @@ -14,8 +14,16 @@ const fs = require('fs') const RegistryService = require('../lib/registry-service') const RegistryConfig = require('../lib/config') +const MetricsServer = require('../lib/metrics-server') +const QvacMetrics = require('../lib/metrics') +const HypercoreStats = require('hypercore-stats') +const HyperswarmStats = require('hyperswarm-stats') +const promClient = require('prom-client') const { AUTOBASE_NAMESPACE } = require('@qvac/registry-schema') +const DEFAULT_METRICS_PORT = 9210 +const DEFAULT_METRICS_HOST = '127.0.0.1' + const DEFAULT_STORAGE = './corestore' const DEFAULT_WRITER_STORAGE = './writer-storage' @@ -36,6 +44,8 @@ const runCmd = command('run', flag('--clear-after-reseed', 'Clear blob blocks after successful replication to blind peers'), flag('--compaction-interval [ms]', `Periodic RocksDB compaction interval in ms (default: ${DEFAULT_COMPACTION_INTERVAL_MS}, 0 to disable)`), flag('--skip-storage-check', 'Skip storage/bootstrap key mismatch check (use when joining existing cluster with fresh storage)'), + flag('--metrics-port [port]', `Prometheus metrics HTTP port (default: ${DEFAULT_METRICS_PORT}, 0 to disable)`), + flag('--metrics-host [host]', `Prometheus metrics HTTP bind address (default: ${DEFAULT_METRICS_HOST}; use 0.0.0.0 to expose on all interfaces)`), async function ({ flags }) { const logger = createLogger() @@ -86,8 +96,30 @@ const 
runCmd = command('run', config.setAutobaseKey(IdEnc.normalize(service.base.key)) config.setRegistryCoreKey(IdEnc.normalize(service.registryCoreKey)) + const metricsPort = flags.metricsPort !== undefined + ? parseInt(flags.metricsPort, 10) + : DEFAULT_METRICS_PORT + + if (Number.isNaN(metricsPort) || metricsPort < 0) { + throw new Error('--metrics-port must be a non-negative integer (0 to disable)') + } + + const metricsHost = flags.metricsHost || DEFAULT_METRICS_HOST + + let metricsServer = null + if (metricsPort > 0) { + const qvacMetrics = new QvacMetrics(service, { logger }) + service.metrics = qvacMetrics + + HypercoreStats.fromCorestore(store).registerPrometheusMetrics(promClient) + new HyperswarmStats(swarm).registerPrometheusMetrics(promClient) // eslint-disable-line no-new + + metricsServer = new MetricsServer(promClient.register, { port: metricsPort, host: metricsHost, logger }) + await metricsServer.ready() + } + logServiceInfo(logger, service) - registerShutdown(logger, service, swarm, store) + registerShutdown(logger, service, swarm, store, metricsServer) } ) @@ -155,13 +187,14 @@ const cmd = command('registry', runCmd, initWriter, syncModelsCmd) cmd.parse() -function registerShutdown (logger, service, swarm, store) { +function registerShutdown (logger, service, swarm, store, metricsServer) { let closing = false const shutdown = async () => { if (closing) return closing = true logger.info('Shutting down gracefully…') try { + if (metricsServer) await metricsServer.close() await service.close() await swarm.destroy() await store.close() diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js new file mode 100644 index 0000000000..1386e2ee73 --- /dev/null +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ -0,0 +1,249 @@ +'use strict' + +const test = require('brittle') +const Corestore = 
require('corestore') +const Hyperswarm = require('hyperswarm') +const http = require('http') +const IdEnc = require('hypercore-id-encoding') +const promClient = require('prom-client') + +const RegistryService = require('../../lib/registry-service') +const RegistryConfig = require('../../lib/config') +const MetricsServer = require('../../lib/metrics-server') +const QvacMetrics = require('../../lib/metrics') +const { AUTOBASE_NAMESPACE } = require('../../shared/constants') +const { createTempStorage } = require('../helpers/test-utils') + +const noopLogger = { + info () {}, + debug () {}, + error () {}, + warn () {} +} + +async function createServiceWithMetrics (t, opts = {}) { + const basePath = await createTempStorage(t) + const store = new Corestore(basePath) + await store.ready() + + const swarm = new Hyperswarm({ bootstrap: [] }) + const config = new RegistryConfig({ logger: noopLogger }) + + const service = new RegistryService( + store.namespace(AUTOBASE_NAMESPACE), + swarm, + config, + { + logger: noopLogger, + ackInterval: 5, + skipStorageCheck: true + } + ) + + await service.ready() + + // Fresh registry per test to avoid metric name collisions + const registry = new promClient.Registry() + promClient.register.clear() + + const qvacMetrics = new QvacMetrics(service, { logger: noopLogger }) + service.metrics = qvacMetrics + + const port = opts.port || 0 + const metricsServer = new MetricsServer(promClient.register, { + port, + logger: noopLogger + }) + await metricsServer.ready() + + const actualPort = metricsServer._server.address().port + + return { service, store, swarm, metricsServer, qvacMetrics, port: actualPort, registry } +} + +async function cleanup (ctx) { + if (ctx.metricsServer) await ctx.metricsServer.close().catch(() => {}) + if (ctx.service && ctx.service.opened) await ctx.service.close().catch(() => {}) + if (ctx.swarm) await ctx.swarm.destroy().catch(() => {}) + if (ctx.store) await ctx.store.close().catch(() => {}) + 
promClient.register.clear() +} + +function httpGet (port, path) { + return new Promise((resolve, reject) => { + const req = http.get(`http://127.0.0.1:${port}${path}`, (res) => { + let body = '' + res.on('data', (chunk) => { body += chunk }) + res.on('end', () => resolve({ status: res.statusCode, body, headers: res.headers })) + }) + req.on('error', reject) + }) +} + +test('MetricsServer serves Prometheus text at /metrics', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + const res = await httpGet(ctx.port, '/metrics') + t.is(res.status, 200, 'returns 200') + t.ok(res.headers['content-type'].includes('text/plain') || res.headers['content-type'].includes('openmetrics'), 'correct content type') + t.ok(res.body.length > 0, 'body is non-empty') + } finally { + await cleanup(ctx) + } +}) + +test('MetricsServer returns 404 for non-metrics paths', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + const res = await httpGet(ctx.port, '/health') + t.is(res.status, 404, 'returns 404') + } finally { + await cleanup(ctx) + } +}) + +test('/metrics includes QVAC custom gauges', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + const res = await httpGet(ctx.port, '/metrics') + const body = res.body + + t.ok(body.includes('qvac_registry_models_total'), 'has models_total') + t.ok(body.includes('qvac_registry_blob_cores_total'), 'has blob_cores_total') + t.ok(body.includes('qvac_registry_view_core_length'), 'has view_core_length') + t.ok(body.includes('qvac_registry_view_core_contiguous_length'), 'has view_core_contiguous_length') + t.ok(body.includes('qvac_registry_is_indexer'), 'has is_indexer') + t.ok(body.includes('qvac_registry_blind_peers_connected'), 'has blind_peers_connected') + t.ok(body.includes('qvac_registry_blind_peer_connected'), 'has blind_peer_connected') + t.ok(body.includes('qvac_registry_rpc_requests_total'), 'has rpc_requests_total') + t.ok(body.includes('qvac_registry_rpc_errors_total'), 
'has rpc_errors_total') + } finally { + await cleanup(ctx) + } +}) + +test('RPC metrics counters increment', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + ctx.qvacMetrics.recordRpcRequest('ping') + ctx.qvacMetrics.recordRpcRequest('ping') + ctx.qvacMetrics.recordRpcRequest('add-model') + ctx.qvacMetrics.recordRpcError('add-model') + + const res = await httpGet(ctx.port, '/metrics') + const body = res.body + + const pingLine = body.split('\n').find(l => l.includes('qvac_registry_rpc_requests_total') && l.includes('ping')) + t.ok(pingLine, 'has ping request counter line') + t.ok(pingLine.includes('2'), 'ping counter is 2') + + const errorLine = body.split('\n').find(l => l.includes('qvac_registry_rpc_errors_total') && l.includes('add-model')) + t.ok(errorLine, 'has add-model error counter line') + t.ok(errorLine.includes('1'), 'error counter is 1') + } finally { + await cleanup(ctx) + } +}) + +test('blind peer metrics track configured peers with active connections', async (t) => { + const blindPeerKeys = [ + IdEnc.normalize(Buffer.alloc(32, 1)), + IdEnc.normalize(Buffer.alloc(32, 2)) + ] + const ctx = await createServiceWithMetrics(t) + + try { + ctx.service.blindPeerKeys = blindPeerKeys + ctx.service._trackPeerConnection(blindPeerKeys[0]) + ctx.service._trackPeerConnection(blindPeerKeys[0]) + ctx.service._trackPeerConnection('writer-peer') + + let res = await httpGet(ctx.port, '/metrics') + let body = res.body + + const connectedPeerLine = body.split('\n') + .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[0]}"}`)) + t.ok(connectedPeerLine, 'has connected blind peer series') + t.ok(connectedPeerLine.endsWith(' 1'), 'connected blind peer is reported as 1') + + const disconnectedPeerLine = body.split('\n') + .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[1]}"}`)) + t.ok(disconnectedPeerLine, 'has disconnected blind peer series') + 
t.ok(disconnectedPeerLine.endsWith(' 0'), 'disconnected blind peer is reported as 0') + + ctx.service._untrackPeerConnection(blindPeerKeys[0]) + ctx.service._untrackPeerConnection(blindPeerKeys[0]) + + res = await httpGet(ctx.port, '/metrics') + body = res.body + + const afterCloseCountLine = body.split('\n') + .find(line => line.startsWith('qvac_registry_blind_peers_connected ')) + t.ok(afterCloseCountLine.endsWith(' 0'), 'blind peer count drops after connection closes') + + const afterClosePeerLine = body.split('\n') + .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[0]}"}`)) + t.ok(afterClosePeerLine.endsWith(' 0'), 'blind peer status drops after connection closes') + } finally { + await cleanup(ctx) + } +}) + +test('MetricsServer closes cleanly', async (t) => { + const ctx = await createServiceWithMetrics(t) + + await ctx.metricsServer.close() + ctx.metricsServer = null + + try { + await httpGet(ctx.port, '/metrics') + t.fail('should not connect after close') + } catch (err) { + t.ok(err.code === 'ECONNREFUSED', 'connection refused after close') + } finally { + await cleanup(ctx) + } +}) + +test('MetricsServer binds to custom host', async (t) => { + const basePath = await createTempStorage(t) + const store = new Corestore(basePath) + await store.ready() + + const swarm = new Hyperswarm({ bootstrap: [] }) + const config = new RegistryConfig({ logger: noopLogger }) + + const service = new RegistryService( + store.namespace(AUTOBASE_NAMESPACE), + swarm, + config, + { logger: noopLogger, ackInterval: 5, skipStorageCheck: true } + ) + await service.ready() + + promClient.register.clear() + + const metricsServer = new MetricsServer(promClient.register, { + port: 0, + host: '127.0.0.1', + logger: noopLogger + }) + + const ctx = { service, store, swarm, metricsServer } + + try { + await metricsServer.ready() + + const address = metricsServer._server.address() + t.is(address.address, '127.0.0.1', 'bound to requested host') + + 
const res = await httpGet(address.port, '/metrics') + t.is(res.status, 200, 'reachable on requested host') + } finally { + await cleanup(ctx) + } +}) From 0aed93db1c8a9d025113aeb697c6baa54bce57d2 Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Tue, 21 Apr 2026 18:18:58 +0600 Subject: [PATCH 2/8] feat: replace per-model size gauge with view-derived total blob bytes (#1689) --- .../docs/DEPLOYMENT_GUIDE.md | 14 ++-- .../docs/grafana/REGISTRY_DASHBOARD.json | 61 +------------- .../qvac-lib-registry-server/lib/metrics.js | 72 ++++++----------- .../lib/registry-service.js | 81 ++++++++++++++----- .../integration/metrics.integration.test.js | 14 ++++ 5 files changed, 108 insertions(+), 134 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index 0fa924a141..bda3a54294 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -558,10 +558,13 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 | Metric | Type | Description | |--------|------|-------------| -| `qvac_registry_models_total` | Gauge | Total models in the registry | -| `qvac_registry_blob_cores_total` | Gauge | Number of blob cores | +| `qvac_registry_models_total` | Gauge | Total models in the registry (refreshed every 5 min and on local writes) | +| `qvac_registry_total_blob_bytes` | Gauge | Sum of `blobBinding.byteLength` across every model record in the view | +| `qvac_registry_totals_refreshed_age_seconds` | Gauge | Seconds since `total_blob_bytes` / `models_total` were last recomputed (-1 if never) | +| `qvac_registry_blob_cores_total` | Gauge | Number of blob cores opened locally on this node | | `qvac_registry_blob_core_peers` | Gauge | Connected peers per blob core | | `qvac_registry_blob_core_fully_downloaded` | Gauge | Whether each blob core is fully replicated | +| `qvac_registry_blob_core_byte_length` | Gauge 
| Byte length per locally-opened blob core | | `qvac_registry_view_core_length` | Gauge | View core length (total blocks) | | `qvac_registry_view_core_contiguous_length` | Gauge | View core contiguous length (gap indicates replication lag) | | `qvac_registry_rpc_requests_total` | Counter | RPC requests by method | @@ -569,8 +572,8 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 | `qvac_registry_is_indexer` | Gauge | Whether this node is an indexer | | `qvac_registry_blind_peers_connected` | Gauge | Number of configured blind peers with an active connection | | `qvac_registry_blind_peer_connected` | Gauge | Per-blind-peer connection status (labeled by `peer_key`) | -| `qvac_registry_blob_core_byte_length` | Gauge | Byte length per blob core | -| `qvac_registry_model_size_bytes` | Gauge | Size of each model blob (labeled by path, engine, quantization) | + +`qvac_registry_total_blob_bytes` is derived from the view, not from the on-disk blob cores, so it reports the logical registry size consistently on every node (indexers that do not store blobs locally still report the same value). 
**Prometheus scrape config (local Prometheus, loopback bind):** @@ -652,9 +655,10 @@ Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashbo **Add QVAC-specific panels for:** - **Model availability:** `qvac_registry_models_total`, `hyper_health_peers_with_all_data_total` -- **Storage breakdown:** `qvac_registry_model_size_bytes` by engine/quantization, `sum(qvac_registry_blob_core_byte_length)` +- **Storage:** `qvac_registry_total_blob_bytes` (view-derived logical size), `sum(qvac_registry_blob_core_byte_length)` (on-disk per node) - **RPC activity:** `rate(qvac_registry_rpc_requests_total[5m])`, error ratio - **Cluster health:** `qvac_registry_is_indexer` across nodes, `qvac_registry_view_core_length` vs `qvac_registry_view_core_contiguous_length` +- **Metric freshness:** `qvac_registry_totals_refreshed_age_seconds` — alert if it exceeds 15 minutes (background refresh runs every 5) **Import the baseline dashboard:** diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json index 92a8f92304..3289be4d48 100644 --- a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -1373,7 +1373,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "sum(qvac_registry_blob_core_byte_length)", + "expr": "qvac_registry_total_blob_bytes", "legendFormat": "", "refId": "A" } @@ -1743,65 +1743,6 @@ "title": "RPC Error Rate", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-GrYlRd" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 57 - }, - "id": 26, - "options": { - "displayMode": "gradient", - 
"maxVizHeight": 300, - "minVizHeight": 16, - "minVizWidth": 8, - "namePlacement": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "sizing": "auto", - "valueMode": "color" - }, - "pluginVersion": "12.4.0", - "targets": [ - { - "expr": "qvac_registry_model_size_bytes", - "legendFormat": "{{path}}", - "refId": "A" - } - ], - "title": "Model Size Breakdown", - "type": "bargauge" - }, { "datasource": { "type": "loki" diff --git a/packages/qvac-lib-registry-server/lib/metrics.js b/packages/qvac-lib-registry-server/lib/metrics.js index e39d4ce17b..ffc51c4eee 100644 --- a/packages/qvac-lib-registry-server/lib/metrics.js +++ b/packages/qvac-lib-registry-server/lib/metrics.js @@ -2,16 +2,11 @@ const promClient = require('prom-client') -const MODEL_CACHE_TTL_MS = 15000 - class QvacMetrics { constructor (service, opts = {}) { this._service = service this._logger = opts.logger || console - this._modelCache = null - this._modelCacheExpiry = 0 - this._rpcRequests = new promClient.Counter({ name: 'qvac_registry_rpc_requests_total', help: 'Total RPC requests by method', @@ -42,9 +37,29 @@ class QvacMetrics { new promClient.Gauge({ name: 'qvac_registry_models_total', help: 'Total number of models in the registry', - async collect () { - const models = await self._getCachedModels() - this.set(models.length) + collect () { + this.set(self._service.modelCount) + } + }) + + // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_total_blob_bytes', + help: 'Total bytes across all model blobs (sum of blobBinding.byteLength across view records)', + collect () { + this.set(self._service.totalModelBytes) + } + }) + + // Derived from totalModelBytes via a background refresh; expose staleness so + // operators can alert when the refresh stalls. 
+ // eslint-disable-next-line no-new + new promClient.Gauge({ + name: 'qvac_registry_totals_refreshed_age_seconds', + help: 'Seconds since qvac_registry_total_blob_bytes and qvac_registry_models_total were last recomputed (-1 if never)', + collect () { + const ts = self._service.totalsRefreshedAt + this.set(ts ? (Date.now() - ts) / 1000 : -1) } }) @@ -141,7 +156,7 @@ class QvacMetrics { // eslint-disable-next-line no-new new promClient.Gauge({ name: 'qvac_registry_blob_core_byte_length', - help: 'Byte length of each blob core', + help: 'Byte length of each blob core (only populated on nodes that opened the blob core locally)', labelNames: ['core_name'], collect () { this.reset() @@ -150,45 +165,6 @@ class QvacMetrics { } } }) - - // eslint-disable-next-line no-new - new promClient.Gauge({ - name: 'qvac_registry_model_size_bytes', - help: 'Size in bytes of each model blob', - labelNames: ['path', 'engine', 'quantization'], - async collect () { - const models = await self._getCachedModels() - this.reset() - for (const m of models) { - if (m.blobBinding && m.blobBinding.byteLength > 0) { - this.set({ - path: m.path, - engine: m.engine || '', - quantization: m.quantization || '' - }, m.blobBinding.byteLength) - } - } - } - }) - } - - async _getCachedModels () { - const now = Date.now() - if (this._modelCache && now < this._modelCacheExpiry) { - return this._modelCache - } - - try { - const view = this._service.view - if (!view || !view.opened) return this._modelCache || [] - const models = await view.findModelsByPath({}).toArray() - this._modelCache = models - this._modelCacheExpiry = now + MODEL_CACHE_TTL_MS - return models - } catch (err) { - this._logger.warn({ err: err.message }, 'QvacMetrics: failed to query models') - return this._modelCache || [] - } } } diff --git a/packages/qvac-lib-registry-server/lib/registry-service.js b/packages/qvac-lib-registry-server/lib/registry-service.js index 74c30a27d0..27b4fff385 100644 --- 
a/packages/qvac-lib-registry-server/lib/registry-service.js +++ b/packages/qvac-lib-registry-server/lib/registry-service.js @@ -46,6 +46,8 @@ const DISPATCH_DELETE_MODEL = `@${QVAC_MAIN_REGISTRY}/delete-model` const BLOB_CORE_NAME = 'models' +const MODEL_TOTALS_REFRESH_INTERVAL_MS = 5 * 60 * 1000 + class RegistryService extends ReadyResource { constructor (store, swarm, config, opts = {}) { super() @@ -77,6 +79,11 @@ class RegistryService extends ReadyResource { this.reseedTracker = null this.metrics = null + this._totalModelBytes = 0 + this._modelCount = 0 + this._totalModelBytesRefreshedAt = 0 + this._totalsRefreshTimer = null + this._registerApplyHandlers() this.base = new Autobase(this.store, this.autobaseBootstrap, { @@ -145,10 +152,6 @@ class RegistryService extends ReadyResource { this.view = this.base.view await this.view.ready() - this._logAvailableModels().catch(err => { - this.logger.error({ err }, 'RegistryService: Failed to log available models') - }) - this.swarm.on('connection', (conn, peerInfo) => { const peerKey = peerInfo?.publicKey ? 
IdEnc.normalize(peerInfo.publicKey) : null @@ -259,12 +262,25 @@ class RegistryService extends ReadyResource { this._startCompactionInterval() } + await this._refreshTotals() + this._totalsRefreshTimer = setInterval(() => { + this._refreshTotals().catch(err => { + this.logger.warn({ err: err.message }, 'RegistryService: failed to refresh model totals') + }) + }, MODEL_TOTALS_REFRESH_INTERVAL_MS) + if (this._totalsRefreshTimer.unref) this._totalsRefreshTimer.unref() + this.logger.info('RegistryService: swarm joined and flushed') } async _close () { this.logger.info('RegistryService: closing') + if (this._totalsRefreshTimer) { + clearInterval(this._totalsRefreshTimer) + this._totalsRefreshTimer = null + } + if (this._compactionInterval) { clearInterval(this._compactionInterval) this._compactionInterval = null @@ -704,6 +720,8 @@ class RegistryService extends ReadyResource { await this._appendOperation(DISPATCH_PUT_MODEL, modelData) + this._scheduleTotalsRefresh() + if (this.reseedTracker) { await this.reseedTracker.waitForComplete() this.logger.info({ @@ -1132,6 +1150,8 @@ class RegistryService extends ReadyResource { await this._appendOperation(DISPATCH_DELETE_MODEL, { path, source }) + this._scheduleTotalsRefresh() + this.logger.info({ path, source }, 'deleteModel: completed') return { success: true, path, source } @@ -1212,26 +1232,45 @@ class RegistryService extends ReadyResource { } } - async _logAvailableModels () { - if (!this.view) return + async _refreshTotals () { + if (!this.view || !this.view.opened) return - try { - if (!this.view.opened) await this.view.ready() - const models = await this.view.findModelsByPath({}).toArray() + const startedAt = Date.now() + let total = 0 + let count = 0 - if (models.length === 0) { - this.logger.info('RegistryService: No models in registry yet') - } else { - const modelsToLog = models.length > 5 ? 
models.slice(-5) : models - this.logger.info({ - count: models.length, - showing: modelsToLog.length, - models: modelsToLog.map(m => `${m.path} [${m.engine}]`) - }, 'RegistryService: models available') - } - } catch (err) { - this.logger.error({ err }, 'RegistryService: Failed to log models') + for await (const model of this.view.findModelsByPath({})) { + total += model.blobBinding?.byteLength || 0 + count++ } + + this._totalModelBytes = total + this._modelCount = count + this._totalModelBytesRefreshedAt = Date.now() + + this.logger.debug({ + totalBytes: total, + models: count, + durationMs: Date.now() - startedAt + }, 'RegistryService: refreshed model totals') + } + + _scheduleTotalsRefresh () { + this._refreshTotals().catch(err => { + this.logger.warn({ err: err.message }, 'RegistryService: failed to refresh model totals') + }) + } + + get totalModelBytes () { + return this._totalModelBytes + } + + get modelCount () { + return this._modelCount + } + + get totalsRefreshedAt () { + return this._totalModelBytesRefreshedAt } _normalizeKey (key) { diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js index 1386e2ee73..bf59508eb5 100644 --- a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ -112,6 +112,8 @@ test('/metrics includes QVAC custom gauges', async (t) => { const body = res.body t.ok(body.includes('qvac_registry_models_total'), 'has models_total') + t.ok(body.includes('qvac_registry_total_blob_bytes'), 'has total_blob_bytes') + t.ok(body.includes('qvac_registry_totals_refreshed_age_seconds'), 'has totals_refreshed_age_seconds') t.ok(body.includes('qvac_registry_blob_cores_total'), 'has blob_cores_total') t.ok(body.includes('qvac_registry_view_core_length'), 'has view_core_length') 
t.ok(body.includes('qvac_registry_view_core_contiguous_length'), 'has view_core_contiguous_length') @@ -120,6 +122,18 @@ test('/metrics includes QVAC custom gauges', async (t) => { t.ok(body.includes('qvac_registry_blind_peer_connected'), 'has blind_peer_connected') t.ok(body.includes('qvac_registry_rpc_requests_total'), 'has rpc_requests_total') t.ok(body.includes('qvac_registry_rpc_errors_total'), 'has rpc_errors_total') + + const totalBytesLine = body.split('\n') + .find(line => line.startsWith('qvac_registry_total_blob_bytes ')) + t.ok(totalBytesLine, 'exports total_blob_bytes as a single series') + t.ok(totalBytesLine.endsWith(' 0'), 'total_blob_bytes is 0 on an empty registry') + + const modelsTotalLine = body.split('\n') + .find(line => line.startsWith('qvac_registry_models_total ')) + t.ok(modelsTotalLine, 'exports models_total as a single series') + t.ok(modelsTotalLine.endsWith(' 0'), 'models_total is 0 on an empty registry') + + t.absent(body.includes('qvac_registry_model_size_bytes'), 'per-path model_size_bytes metric is removed') } finally { await cleanup(ctx) } From ad15947b5dc5ce2ac91dfcf20f0596a7eecd9b83 Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Wed, 22 Apr 2026 17:32:22 +0800 Subject: [PATCH 3/8] feat[bc]: rename gauges, add seeder metrics, and eagerly open blob core on indexers (#1692) * feat[bc]: rename gauge metrics off _total suffix and pre-initialise rpc counters * feat: add core seeder metrics and eagerly open blob core on indexers * style: drop eslint-disable directives via helper function for gauge registration * refactor[bc]: drop core_name label from blob core metrics and use median for view-derived stat panels * style: drop noisy comment above registerGauge helper --- .../docs/DEPLOYMENT_GUIDE.md | 23 ++- .../docs/grafana/REGISTRY_DASHBOARD.json | 14 +- .../qvac-lib-registry-server/lib/metrics.js | 134 +++++++++++------- .../lib/registry-service.js | 13 ++ .../integration/metrics.integration.test.js | 31 +++- 5 files changed, 147 
insertions(+), 68 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index bda3a54294..ca92e20540 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -558,15 +558,17 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 | Metric | Type | Description | |--------|------|-------------| -| `qvac_registry_models_total` | Gauge | Total models in the registry (refreshed every 5 min and on local writes) | +| `qvac_registry_model_count` | Gauge | Number of models in the registry (refreshed every 5 min and on local writes) | | `qvac_registry_total_blob_bytes` | Gauge | Sum of `blobBinding.byteLength` across every model record in the view | -| `qvac_registry_totals_refreshed_age_seconds` | Gauge | Seconds since `total_blob_bytes` / `models_total` were last recomputed (-1 if never) | -| `qvac_registry_blob_cores_total` | Gauge | Number of blob cores opened locally on this node | -| `qvac_registry_blob_core_peers` | Gauge | Connected peers per blob core | -| `qvac_registry_blob_core_fully_downloaded` | Gauge | Whether each blob core is fully replicated | -| `qvac_registry_blob_core_byte_length` | Gauge | Byte length per locally-opened blob core | +| `qvac_registry_totals_refreshed_age_seconds` | Gauge | Seconds since `total_blob_bytes` / `model_count` were last recomputed (-1 if never) | +| `qvac_registry_blob_core_count` | Gauge | Number of blob cores opened locally on this node | +| `qvac_registry_blob_core_peers` | Gauge | Peers connected to this node's local blob core (may be partial replicas) | +| `qvac_registry_blob_core_seeders` | Gauge | Peers holding this node's local blob core fully and uploading (full replicas) | +| `qvac_registry_blob_core_fully_downloaded` | Gauge | Whether this node's local blob core is fully replicated (1/0) | +| `qvac_registry_blob_core_byte_length` | 
Gauge | Byte length of this node's local blob core | | `qvac_registry_view_core_length` | Gauge | View core length (total blocks) | | `qvac_registry_view_core_contiguous_length` | Gauge | View core contiguous length (gap indicates replication lag) | +| `qvac_registry_view_core_seeders` | Gauge | Peers holding the view core fully and willing to upload (full replicas in the swarm) | | `qvac_registry_rpc_requests_total` | Counter | RPC requests by method | | `qvac_registry_rpc_errors_total` | Counter | RPC errors by method | | `qvac_registry_is_indexer` | Gauge | Whether this node is an indexer | @@ -575,6 +577,12 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 `qvac_registry_total_blob_bytes` is derived from the view, not from the on-disk blob cores, so it reports the logical registry size consistently on every node (indexers that do not store blobs locally still report the same value). +`qvac_registry_blob_core_*` metrics are populated on writer/indexer nodes — the blob core is opened eagerly at startup. Reader-only nodes that don't hold writer state do not open the blob core locally and will export `0` for these gauges. Each indexer owns exactly one writable blob core namespaced to its own primary key, so these metrics are single-series per node; Prometheus's automatic `instance` label distinguishes nodes at scrape time. + +**Multi-indexer dashboards:** view-derived metrics (`qvac_registry_model_count`, `qvac_registry_total_blob_bytes`, `qvac_registry_totals_refreshed_age_seconds`) report the same value on every indexer because the view is authoritative and identical cluster-wide. For single-stat panels use `quantile(0.5, …)` or `avg(…)` to collapse to one value without triple-counting. On-disk metrics (`qvac_registry_blob_core_byte_length`, `qvac_registry_blob_core_peers`, `qvac_registry_blob_core_seeders`) are per-node and should be displayed per `instance` or summed for cluster totals. 
+ +`*_seeders` count peers whose replication handshake has completed, who advertise `remoteUploading`, and whose `remoteContiguousLength` covers the local core length. For the view core they converge to the number of connected replicating peers within an RTT because the view is small (a few MB of autobase metadata); for blob cores the gap `peers - seeders` indicates peers currently downloading rather than serving. + **Prometheus scrape config (local Prometheus, loopback bind):** ```yaml @@ -654,8 +662,9 @@ Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashbo **Add QVAC-specific panels for:** -- **Model availability:** `qvac_registry_models_total`, `hyper_health_peers_with_all_data_total` +- **Model availability:** `qvac_registry_model_count`, `hyper_health_peers_with_all_data_total` - **Storage:** `qvac_registry_total_blob_bytes` (view-derived logical size), `sum(qvac_registry_blob_core_byte_length)` (on-disk per node) +- **Replication durability:** `qvac_registry_view_core_seeders`, `qvac_registry_blob_core_seeders` — alert when either drops below a redundancy floor (e.g. `< 2`). Gap between `blob_core_peers` and `blob_core_seeders` surfaces peers mid-download. 
- **RPC activity:** `rate(qvac_registry_rpc_requests_total[5m])`, error ratio - **Cluster health:** `qvac_registry_is_indexer` across nodes, `qvac_registry_view_core_length` vs `qvac_registry_view_core_contiguous_length` - **Metric freshness:** `qvac_registry_totals_refreshed_age_seconds` — alert if it exceeds 15 minutes (background refresh runs every 5) diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json index 3289be4d48..3030064db9 100644 --- a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -1062,7 +1062,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_models_total", + "expr": "quantile(0.5, qvac_registry_model_count)", "legendFormat": "", "refId": "A" } @@ -1246,7 +1246,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_blob_cores_total", + "expr": "qvac_registry_blob_core_count", "legendFormat": "", "refId": "A" } @@ -1373,7 +1373,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_total_blob_bytes", + "expr": "quantile(0.5, qvac_registry_total_blob_bytes)", "legendFormat": "", "refId": "A" } @@ -1461,17 +1461,17 @@ "targets": [ { "expr": "qvac_registry_view_core_length", - "legendFormat": "length", + "legendFormat": "{{instance}} length", "refId": "A" }, { "expr": "qvac_registry_view_core_contiguous_length", - "legendFormat": "contiguous", + "legendFormat": "{{instance}} contiguous", "refId": "B" }, { "expr": "qvac_registry_view_core_length - qvac_registry_view_core_contiguous_length", - "legendFormat": "gap (replication lag)", + "legendFormat": "{{instance}} gap", "refId": "C" } ], @@ -1558,7 +1558,7 @@ "targets": [ { "expr": "qvac_registry_blob_core_peers", - "legendFormat": "{{core_name}}", + "legendFormat": "{{instance}}", "refId": "A" } ], diff --git 
a/packages/qvac-lib-registry-server/lib/metrics.js b/packages/qvac-lib-registry-server/lib/metrics.js index ffc51c4eee..f041eab69e 100644 --- a/packages/qvac-lib-registry-server/lib/metrics.js +++ b/packages/qvac-lib-registry-server/lib/metrics.js @@ -2,6 +2,17 @@ const promClient = require('prom-client') +// RPC methods that the registry service exposes. Pre-initialising counter +// series at zero for each method means `rate()` returns 0 (instead of NaN) +// from the first scrape, so dashboards do not appear empty on a fresh start. +const RPC_METHODS = Object.freeze([ + 'add-model', + 'put-license', + 'update-model-metadata', + 'delete-model', + 'ping' +]) + class QvacMetrics { constructor (service, opts = {}) { this._service = service @@ -19,6 +30,11 @@ class QvacMetrics { labelNames: ['method'] }) + for (const method of RPC_METHODS) { + this._rpcRequests.inc({ method }, 0) + this._rpcErrors.inc({ method }, 0) + } + this._registerGauges() } @@ -33,17 +49,15 @@ class QvacMetrics { _registerGauges () { const self = this - // eslint-disable-next-line no-new - new promClient.Gauge({ - name: 'qvac_registry_models_total', - help: 'Total number of models in the registry', + registerGauge({ + name: 'qvac_registry_model_count', + help: 'Number of models in the registry', collect () { this.set(self._service.modelCount) } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_total_blob_bytes', help: 'Total bytes across all model blobs (sum of blobBinding.byteLength across view records)', collect () { @@ -53,54 +67,46 @@ class QvacMetrics { // Derived from totalModelBytes via a background refresh; expose staleness so // operators can alert when the refresh stalls. 
- // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_totals_refreshed_age_seconds', - help: 'Seconds since qvac_registry_total_blob_bytes and qvac_registry_models_total were last recomputed (-1 if never)', + help: 'Seconds since qvac_registry_total_blob_bytes and qvac_registry_model_count were last recomputed (-1 if never)', collect () { const ts = self._service.totalsRefreshedAt this.set(ts ? (Date.now() - ts) / 1000 : -1) } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ - name: 'qvac_registry_blob_cores_total', - help: 'Number of blob cores', + registerGauge({ + name: 'qvac_registry_blob_core_count', + help: 'Number of blob cores opened locally on this node', collect () { this.set(self._service.blobsCores.size) } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + // Each indexer owns exactly one writable blob core (namespaced on its + // own primary key), so per-node blob-core metrics are single-series and + // don't need an extra label - Prometheus's automatic `instance` label + // distinguishes nodes at scrape time. + registerGauge({ name: 'qvac_registry_blob_core_peers', - help: 'Number of connected peers per blob core', - labelNames: ['core_name'], + help: 'Number of peers connected to this node\'s local blob core (may be partial replicas)', collect () { - this.reset() - for (const [name, { core }] of self._service.blobsCores) { - this.set({ core_name: name }, core.peers.length) - } + this.set(firstBlobCore(self._service)?.peers.length ?? 
0) } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_blob_core_fully_downloaded', - help: 'Whether each blob core is fully downloaded (1=yes, 0=no)', - labelNames: ['core_name'], + help: 'Whether this node\'s local blob core is fully downloaded (1=yes, 0=no)', collect () { - this.reset() - for (const [name, { core }] of self._service.blobsCores) { - const full = core.contiguousLength === core.length && core.length > 0 ? 1 : 0 - this.set({ core_name: name }, full) - } + const core = firstBlobCore(self._service) + if (!core || core.length === 0) { this.set(0); return } + this.set(core.contiguousLength === core.length ? 1 : 0) } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_view_core_length', help: 'View core length (total blocks)', collect () { @@ -109,8 +115,7 @@ class QvacMetrics { } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_view_core_contiguous_length', help: 'View core contiguous length (gap = length - contiguous indicates replication lag)', collect () { @@ -119,8 +124,15 @@ class QvacMetrics { } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ + name: 'qvac_registry_view_core_seeders', + help: 'Peers that hold the view core fully and are willing to upload (full replicas available in the swarm)', + collect () { + this.set(countSeeders(self._service.view?.core)) + } + }) + + registerGauge({ name: 'qvac_registry_is_indexer', help: 'Whether this node is an indexer (1=yes, 0=no)', collect () { @@ -128,8 +140,7 @@ class QvacMetrics { } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_blind_peers_connected', help: 'Number of configured blind peers with an active connection', collect () { @@ -137,8 +148,7 @@ class QvacMetrics { } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + 
registerGauge({ name: 'qvac_registry_blind_peer_connected', help: 'Whether each configured blind peer currently has an active connection (1=yes, 0=no)', labelNames: ['peer_key'], @@ -153,19 +163,47 @@ class QvacMetrics { } }) - // eslint-disable-next-line no-new - new promClient.Gauge({ + registerGauge({ name: 'qvac_registry_blob_core_byte_length', - help: 'Byte length of each blob core (only populated on nodes that opened the blob core locally)', - labelNames: ['core_name'], + help: 'Byte length of this node\'s local blob core (only populated on nodes that opened the blob core locally)', collect () { - this.reset() - for (const [name, { core }] of self._service.blobsCores) { - this.set({ core_name: name }, core.byteLength) - } + this.set(firstBlobCore(self._service)?.byteLength ?? 0) } }) + + registerGauge({ + name: 'qvac_registry_blob_core_seeders', + help: 'Peers holding this node\'s local blob core fully and willing to upload (full replicas)', + collect () { + this.set(countSeeders(firstBlobCore(self._service))) + } + }) + } +} + +// Each indexer owns at most one writable blob core; this helper returns it +// or null when the node is a reader that hasn't opened the core locally. +function firstBlobCore (service) { + const iter = service.blobsCores.values().next() + return iter.done ? null : iter.value.core +} + +function registerGauge (opts) { + return new promClient.Gauge(opts) +} + +// A peer is a "seeder" for a core when the replication handshake has opened, +// the remote has advertised willingness to upload, and the remote's contiguous +// length covers the core's current length. `remoteContiguousLength` is zero +// until the handshake completes, so the `remoteOpened` check avoids counting +// partially-initialised peers as full replicas. 
+function countSeeders (core) { + if (!core || !Array.isArray(core.peers) || core.length === 0) return 0 + let n = 0 + for (const p of core.peers) { + if (p.remoteOpened && p.remoteUploading && p.remoteContiguousLength >= core.length) n++ } + return n } module.exports = QvacMetrics diff --git a/packages/qvac-lib-registry-server/lib/registry-service.js b/packages/qvac-lib-registry-server/lib/registry-service.js index 27b4fff385..8fb831f5de 100644 --- a/packages/qvac-lib-registry-server/lib/registry-service.js +++ b/packages/qvac-lib-registry-server/lib/registry-service.js @@ -258,6 +258,19 @@ class RegistryService extends ReadyResource { await this._setupBlindPeering() } + // Eagerly open the blob core on writer/indexer nodes so that per-core + // metrics (peers, seeders, byte length) and on-disk replication health + // are observable immediately, without waiting for the first addModel RPC + // or for blind-peering setup to run. On reader-only nodes we skip this, + // because `writable: true` would create a local core with the wrong key. 
+ if (this.base.isIndexer || this.base.localWriter) { + try { + await this._getOrCreateBlobsCore(BLOB_CORE_NAME) + } catch (err) { + this.logger.warn({ err: err.message }, 'RegistryService: failed to open blob core at startup') + } + } + if (this.compactionIntervalMs > 0) { this._startCompactionInterval() } diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js index bf59508eb5..9c4f8c6025 100644 --- a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ -111,12 +111,14 @@ test('/metrics includes QVAC custom gauges', async (t) => { const res = await httpGet(ctx.port, '/metrics') const body = res.body - t.ok(body.includes('qvac_registry_models_total'), 'has models_total') + t.ok(body.includes('qvac_registry_model_count'), 'has model_count') t.ok(body.includes('qvac_registry_total_blob_bytes'), 'has total_blob_bytes') t.ok(body.includes('qvac_registry_totals_refreshed_age_seconds'), 'has totals_refreshed_age_seconds') - t.ok(body.includes('qvac_registry_blob_cores_total'), 'has blob_cores_total') + t.ok(body.includes('qvac_registry_blob_core_count'), 'has blob_core_count') + t.ok(body.includes('qvac_registry_blob_core_seeders'), 'has blob_core_seeders') t.ok(body.includes('qvac_registry_view_core_length'), 'has view_core_length') t.ok(body.includes('qvac_registry_view_core_contiguous_length'), 'has view_core_contiguous_length') + t.ok(body.includes('qvac_registry_view_core_seeders'), 'has view_core_seeders') t.ok(body.includes('qvac_registry_is_indexer'), 'has is_indexer') t.ok(body.includes('qvac_registry_blind_peers_connected'), 'has blind_peers_connected') t.ok(body.includes('qvac_registry_blind_peer_connected'), 'has blind_peer_connected') @@ -128,12 +130,29 @@ test('/metrics includes QVAC custom gauges', async (t) => { 
t.ok(totalBytesLine, 'exports total_blob_bytes as a single series') t.ok(totalBytesLine.endsWith(' 0'), 'total_blob_bytes is 0 on an empty registry') - const modelsTotalLine = body.split('\n') - .find(line => line.startsWith('qvac_registry_models_total ')) - t.ok(modelsTotalLine, 'exports models_total as a single series') - t.ok(modelsTotalLine.endsWith(' 0'), 'models_total is 0 on an empty registry') + const modelCountLine = body.split('\n') + .find(line => line.startsWith('qvac_registry_model_count ')) + t.ok(modelCountLine, 'exports model_count as a single series') + t.ok(modelCountLine.endsWith(' 0'), 'model_count is 0 on an empty registry') + t.absent(body.includes('qvac_registry_models_total'), 'legacy models_total name is removed') + t.absent(body.includes('qvac_registry_blob_cores_total'), 'legacy blob_cores_total name is removed') t.absent(body.includes('qvac_registry_model_size_bytes'), 'per-path model_size_bytes metric is removed') + + const viewSeedersLine = body.split('\n') + .find(line => line.startsWith('qvac_registry_view_core_seeders ')) + t.ok(viewSeedersLine, 'exports view_core_seeders as a single series') + t.ok(viewSeedersLine.endsWith(' 0'), 'view_core_seeders is 0 with no connected peers') + + const rpcPingRequests = body.split('\n') + .find(line => line.startsWith('qvac_registry_rpc_requests_total{method="ping"}')) + t.ok(rpcPingRequests, 'rpc_requests_total{method="ping"} series is pre-initialised') + t.ok(rpcPingRequests.endsWith(' 0'), 'rpc_requests_total{method="ping"} starts at 0') + + const rpcPingErrors = body.split('\n') + .find(line => line.startsWith('qvac_registry_rpc_errors_total{method="add-model"}')) + t.ok(rpcPingErrors, 'rpc_errors_total{method="add-model"} series is pre-initialised') + t.ok(rpcPingErrors.endsWith(' 0'), 'rpc_errors_total{method="add-model"} starts at 0') } finally { await cleanup(ctx) } From 1de851bbf710b3d9db4e9359be99758017e8f2f9 Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Wed, 22 Apr 2026 20:04:36 
+0800 Subject: [PATCH 4/8] feat[bc]: replace blob_core_fully_downloaded with length/contiguous_length pair and drop blind-peer metrics (#1702) --- .../docs/DEPLOYMENT_GUIDE.md | 5 +- .../docs/grafana/REGISTRY_DASHBOARD.json | 68 ++----------------- .../qvac-lib-registry-server/lib/metrics.js | 39 +++-------- .../integration/metrics.integration.test.js | 53 +-------------- 4 files changed, 20 insertions(+), 145 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index ca92e20540..0cd0225e7b 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -564,7 +564,8 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 | `qvac_registry_blob_core_count` | Gauge | Number of blob cores opened locally on this node | | `qvac_registry_blob_core_peers` | Gauge | Peers connected to this node's local blob core (may be partial replicas) | | `qvac_registry_blob_core_seeders` | Gauge | Peers holding this node's local blob core fully and uploading (full replicas) | -| `qvac_registry_blob_core_fully_downloaded` | Gauge | Whether this node's local blob core is fully replicated (1/0) | +| `qvac_registry_blob_core_length` | Gauge | This node's local blob core length in blocks | +| `qvac_registry_blob_core_contiguous_length` | Gauge | Blob core contiguous length in blocks (gap indicates missing blocks on disk) | | `qvac_registry_blob_core_byte_length` | Gauge | Byte length of this node's local blob core | | `qvac_registry_view_core_length` | Gauge | View core length (total blocks) | | `qvac_registry_view_core_contiguous_length` | Gauge | View core contiguous length (gap indicates replication lag) | @@ -572,8 +573,6 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 | `qvac_registry_rpc_requests_total` | Counter | RPC requests by method | | `qvac_registry_rpc_errors_total` | Counter | RPC 
errors by method | | `qvac_registry_is_indexer` | Gauge | Whether this node is an indexer | -| `qvac_registry_blind_peers_connected` | Gauge | Number of configured blind peers with an active connection | -| `qvac_registry_blind_peer_connected` | Gauge | Per-blind-peer connection status (labeled by `peer_key`) | `qvac_registry_total_blob_bytes` is derived from the view, not from the on-disk blob cores, so it reports the logical registry size consistently on every node (indexers that do not store blobs locally still report the same value). diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json index 3030064db9..eb766a7b57 100644 --- a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -1142,64 +1142,6 @@ "title": "Indexer", "type": "stat" }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "yellow", - "value": 0 - }, - { - "color": "green", - "value": 1 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 37 - }, - "id": 18, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.4.0", - "targets": [ - { - "expr": "qvac_registry_blind_peers_connected", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Blind Peers", - "type": "stat" - }, { "datasource": { "type": "prometheus" @@ -1222,7 +1164,7 @@ "gridPos": { "h": 4, "w": 4, - "x": 12, + "x": 8, "y": 37 }, "id": 19, @@ -1294,7 +1236,7 @@ "gridPos": { "h": 4, "w": 4, - 
"x": 16, + "x": 12, "y": 37 }, "id": 20, @@ -1318,12 +1260,12 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "min(qvac_registry_blob_core_fully_downloaded)", + "expr": "min(qvac_registry_blob_core_contiguous_length == bool qvac_registry_blob_core_length) and min(qvac_registry_blob_core_length) > bool 0", "legendFormat": "", "refId": "A" } ], - "title": "Blobs Synced", + "title": "Blob Core Contiguous", "type": "stat" }, { @@ -1349,7 +1291,7 @@ "gridPos": { "h": 4, "w": 4, - "x": 20, + "x": 16, "y": 37 }, "id": 21, diff --git a/packages/qvac-lib-registry-server/lib/metrics.js b/packages/qvac-lib-registry-server/lib/metrics.js index f041eab69e..f0411cc235 100644 --- a/packages/qvac-lib-registry-server/lib/metrics.js +++ b/packages/qvac-lib-registry-server/lib/metrics.js @@ -97,12 +97,18 @@ class QvacMetrics { }) registerGauge({ - name: 'qvac_registry_blob_core_fully_downloaded', - help: 'Whether this node\'s local blob core is fully downloaded (1=yes, 0=no)', + name: 'qvac_registry_blob_core_length', + help: 'This node\'s local blob core length (total blocks)', collect () { - const core = firstBlobCore(self._service) - if (!core || core.length === 0) { this.set(0); return } - this.set(core.contiguousLength === core.length ? 1 : 0) + this.set(firstBlobCore(self._service)?.length ?? 0) + } + }) + + registerGauge({ + name: 'qvac_registry_blob_core_contiguous_length', + help: 'This node\'s local blob core contiguous length (gap = length - contiguous indicates missing blocks on disk)', + collect () { + this.set(firstBlobCore(self._service)?.contiguousLength ?? 
0) } }) @@ -140,29 +146,6 @@ class QvacMetrics { } }) - registerGauge({ - name: 'qvac_registry_blind_peers_connected', - help: 'Number of configured blind peers with an active connection', - collect () { - this.set(self._service.getConnectedBlindPeerKeys().length) - } - }) - - registerGauge({ - name: 'qvac_registry_blind_peer_connected', - help: 'Whether each configured blind peer currently has an active connection (1=yes, 0=no)', - labelNames: ['peer_key'], - collect () { - this.reset() - for (const peerKey of self._service.getConfiguredBlindPeerKeys()) { - this.set( - { peer_key: peerKey }, - self._service.isBlindPeerConnected(peerKey) ? 1 : 0 - ) - } - } - }) - registerGauge({ name: 'qvac_registry_blob_core_byte_length', help: 'Byte length of this node\'s local blob core (only populated on nodes that opened the blob core locally)', diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js index 9c4f8c6025..7be99521be 100644 --- a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ -4,7 +4,6 @@ const test = require('brittle') const Corestore = require('corestore') const Hyperswarm = require('hyperswarm') const http = require('http') -const IdEnc = require('hypercore-id-encoding') const promClient = require('prom-client') const RegistryService = require('../../lib/registry-service') @@ -116,12 +115,12 @@ test('/metrics includes QVAC custom gauges', async (t) => { t.ok(body.includes('qvac_registry_totals_refreshed_age_seconds'), 'has totals_refreshed_age_seconds') t.ok(body.includes('qvac_registry_blob_core_count'), 'has blob_core_count') t.ok(body.includes('qvac_registry_blob_core_seeders'), 'has blob_core_seeders') + t.ok(body.includes('qvac_registry_blob_core_length'), 'has blob_core_length') + 
t.ok(body.includes('qvac_registry_blob_core_contiguous_length'), 'has blob_core_contiguous_length') t.ok(body.includes('qvac_registry_view_core_length'), 'has view_core_length') t.ok(body.includes('qvac_registry_view_core_contiguous_length'), 'has view_core_contiguous_length') t.ok(body.includes('qvac_registry_view_core_seeders'), 'has view_core_seeders') t.ok(body.includes('qvac_registry_is_indexer'), 'has is_indexer') - t.ok(body.includes('qvac_registry_blind_peers_connected'), 'has blind_peers_connected') - t.ok(body.includes('qvac_registry_blind_peer_connected'), 'has blind_peer_connected') t.ok(body.includes('qvac_registry_rpc_requests_total'), 'has rpc_requests_total') t.ok(body.includes('qvac_registry_rpc_errors_total'), 'has rpc_errors_total') @@ -135,10 +134,6 @@ test('/metrics includes QVAC custom gauges', async (t) => { t.ok(modelCountLine, 'exports model_count as a single series') t.ok(modelCountLine.endsWith(' 0'), 'model_count is 0 on an empty registry') - t.absent(body.includes('qvac_registry_models_total'), 'legacy models_total name is removed') - t.absent(body.includes('qvac_registry_blob_cores_total'), 'legacy blob_cores_total name is removed') - t.absent(body.includes('qvac_registry_model_size_bytes'), 'per-path model_size_bytes metric is removed') - const viewSeedersLine = body.split('\n') .find(line => line.startsWith('qvac_registry_view_core_seeders ')) t.ok(viewSeedersLine, 'exports view_core_seeders as a single series') @@ -182,50 +177,6 @@ test('RPC metrics counters increment', async (t) => { } }) -test('blind peer metrics track configured peers with active connections', async (t) => { - const blindPeerKeys = [ - IdEnc.normalize(Buffer.alloc(32, 1)), - IdEnc.normalize(Buffer.alloc(32, 2)) - ] - const ctx = await createServiceWithMetrics(t) - - try { - ctx.service.blindPeerKeys = blindPeerKeys - ctx.service._trackPeerConnection(blindPeerKeys[0]) - ctx.service._trackPeerConnection(blindPeerKeys[0]) - 
ctx.service._trackPeerConnection('writer-peer') - - let res = await httpGet(ctx.port, '/metrics') - let body = res.body - - const connectedPeerLine = body.split('\n') - .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[0]}"}`)) - t.ok(connectedPeerLine, 'has connected blind peer series') - t.ok(connectedPeerLine.endsWith(' 1'), 'connected blind peer is reported as 1') - - const disconnectedPeerLine = body.split('\n') - .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[1]}"}`)) - t.ok(disconnectedPeerLine, 'has disconnected blind peer series') - t.ok(disconnectedPeerLine.endsWith(' 0'), 'disconnected blind peer is reported as 0') - - ctx.service._untrackPeerConnection(blindPeerKeys[0]) - ctx.service._untrackPeerConnection(blindPeerKeys[0]) - - res = await httpGet(ctx.port, '/metrics') - body = res.body - - const afterCloseCountLine = body.split('\n') - .find(line => line.startsWith('qvac_registry_blind_peers_connected ')) - t.ok(afterCloseCountLine.endsWith(' 0'), 'blind peer count drops after connection closes') - - const afterClosePeerLine = body.split('\n') - .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[0]}"}`)) - t.ok(afterClosePeerLine.endsWith(' 0'), 'blind peer status drops after connection closes') - } finally { - await cleanup(ctx) - } -}) - test('MetricsServer closes cleanly', async (t) => { const ctx = await createServiceWithMetrics(t) From 4a19b329d3dae482b9baddca6df34fb4b073f07f Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Thu, 23 Apr 2026 15:42:31 +0800 Subject: [PATCH 5/8] feat: expand Grafana dashboard with blob-core replication, seeders, and Holepunch P2P panels (#1716) * feat: expand Grafana dashboard with blob-core replication, seeders, and Holepunch P2P panels Made-with: Cursor * fix: use vm_name label in QVAC and Holepunch panel legends instead of raw instance IP:port Made-with: Cursor * fix: apply $vm template 
filter to QVAC and Holepunch selectors for consistent per-node filtering Made-with: Cursor --- .../docs/grafana/REGISTRY_DASHBOARD.json | 1304 ++++++++++++++++- 1 file changed, 1287 insertions(+), 17 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json index eb766a7b57..e922f68638 100644 --- a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -1062,7 +1062,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "quantile(0.5, qvac_registry_model_count)", + "expr": "quantile(0.5, qvac_registry_model_count{vm_name=~\"$vm\"})", "legendFormat": "", "refId": "A" } @@ -1134,7 +1134,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_is_indexer", + "expr": "qvac_registry_is_indexer{vm_name=~\"$vm\"}", "legendFormat": "", "refId": "A" } @@ -1188,7 +1188,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_blob_core_count", + "expr": "qvac_registry_blob_core_count{vm_name=~\"$vm\"}", "legendFormat": "", "refId": "A" } @@ -1260,7 +1260,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "min(qvac_registry_blob_core_contiguous_length == bool qvac_registry_blob_core_length) and min(qvac_registry_blob_core_length) > bool 0", + "expr": "min(qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"} == bool qvac_registry_blob_core_length{vm_name=~\"$vm\"}) and min(qvac_registry_blob_core_length{vm_name=~\"$vm\"}) > bool 0", "legendFormat": "", "refId": "A" } @@ -1315,7 +1315,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "quantile(0.5, qvac_registry_total_blob_bytes)", + "expr": "quantile(0.5, qvac_registry_total_blob_bytes{vm_name=~\"$vm\"})", "legendFormat": "", "refId": "A" } @@ -1402,18 +1402,18 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_view_core_length", - "legendFormat": 
"{{instance}} length", + "expr": "qvac_registry_view_core_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} length", "refId": "A" }, { - "expr": "qvac_registry_view_core_contiguous_length", - "legendFormat": "{{instance}} contiguous", + "expr": "qvac_registry_view_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} contiguous", "refId": "B" }, { - "expr": "qvac_registry_view_core_length - qvac_registry_view_core_contiguous_length", - "legendFormat": "{{instance}} gap", + "expr": "qvac_registry_view_core_length{vm_name=~\"$vm\"} - qvac_registry_view_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} gap", "refId": "C" } ], @@ -1499,8 +1499,8 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "qvac_registry_blob_core_peers", - "legendFormat": "{{instance}}", + "expr": "qvac_registry_blob_core_peers{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", "refId": "A" } ], @@ -1586,7 +1586,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "rate(qvac_registry_rpc_requests_total[5m])", + "expr": "rate(qvac_registry_rpc_requests_total{vm_name=~\"$vm\"}[5m])", "legendFormat": "{{method}}", "refId": "A" } @@ -1677,7 +1677,7 @@ "pluginVersion": "12.4.0", "targets": [ { - "expr": "rate(qvac_registry_rpc_errors_total[5m])", + "expr": "rate(qvac_registry_rpc_errors_total{vm_name=~\"$vm\"}[5m])", "legendFormat": "{{method}}", "refId": "A" } @@ -1697,7 +1697,7 @@ "h": 10, "w": 24, "x": 0, - "y": 65 + "y": 102 }, "id": 13, "options": { @@ -1733,7 +1733,7 @@ "h": 10, "w": 24, "x": 0, - "y": 75 + "y": 112 }, "id": 14, "options": { @@ -1756,6 +1756,1276 @@ ], "title": "Registry Error Logs", "type": "logs" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "-1": { + "color": "red", + "text": "never" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": 
"yellow", + "value": 300 + }, + { + "color": "red", + "value": 600 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 37 + }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "max(qvac_registry_totals_refreshed_age_seconds{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Totals Refresh Age", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + 
"expr": "qvac_registry_blob_core_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} length", + "refId": "A" + }, + { + "expr": "qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} contiguous", + "refId": "B" + }, + { + "expr": "qvac_registry_blob_core_length{vm_name=~\"$vm\"} - qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} gap", + "refId": "C" + } + ], + "title": "Blob Core Replication", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_view_core_seeders{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} view", + "refId": "A" + }, + { + "expr": "qvac_registry_blob_core_seeders{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }} 
blob", + "refId": "B" + } + ], + "title": "Core Seeders (full replicas)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_byte_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Bytes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 73 + }, + "id": 30, + "panels": [], + "title": "Holepunch P2P Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "green", + 
"value": 3 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 74 + }, + "id": 31, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(hyperswarm_nr_peers{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Swarm Peers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 74 + }, + "id": 32, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(dht_is_firewalled{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Firewalled Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 74 + }, + "id": 33, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + 
"percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(hypercore_invalid_data{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Hypercore Invalid Data", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 74 + }, + "id": 34, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(hypercore_invalid_requests{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Hypercore Invalid Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 74 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": 
"auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(udx_packets_dropped_total{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "UDX Packet Drops", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 74 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "avg(hyperswarm_avg_congestion_window{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Avg Congestion Window", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + 
"overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "hyperswarm_nr_peers{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Swarm Peers Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "hypercore_round_trip_time_avg_seconds{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Replication RTT", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(hyperswarm_client_connections_opened{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} client-opened", + "refId": "A" + }, + { + "expr": "rate(hyperswarm_server_connections_opened{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} server-opened", + "refId": "B" + }, + { + "expr": "rate(hyperswarm_client_connections_closed{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} client-closed", + "refId": "C" + }, + { + "expr": "rate(hyperswarm_server_connections_closed{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} server-closed", + "refId": "D" + } + ], + "title": "Swarm Connection Churn (per-second)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(hypercore_total_wire_data_transmitted{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "A" + }, + { + "expr": "rate(hypercore_total_wire_data_received{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "B" + } + ], + "title": "Hypercore Wire Data Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, 
+ "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(udx_total_bytes_transmitted{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "A" + }, + { + "expr": "rate(udx_total_bytes_received{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "B" + } + ], + "title": "UDX Bytes (DHT transport)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(dht_total_queries{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} queries", + "refId": "A" + }, + { + "expr": "rate(dht_total_requests{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} requests", + "refId": "B" + } + ], + "title": "DHT Query & Request Rate", + "type": "timeseries" } ], "preload": false, From 7ae407b697593c69c7104819343ca1e8de783377 Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Thu, 23 Apr 2026 17:18:30 +0800 Subject: [PATCH 6/8] chore[docs]: tighten registry Grafana dashboard panels based on staging review (#1718) * chore[docs]: tighten registry Grafana dashboard panels based on staging review * chore[docs]: drop redundant Blob Core Contiguous stat, cluster blob panels near the top --- .../docs/grafana/REGISTRY_DASHBOARD.json | 374 ++++++++++-------- 1 file changed, 207 insertions(+), 167 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json index e922f68638..1d5f089ccc 100644 --- a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -1067,7 +1067,7 @@ "refId": "A" } ], - "title": "Models", + "title": "Model Files", "type": "stat" }, { @@ -1118,7 +1118,7 @@ "colorMode": "background", "graphMode": "none", "justifyMode": "auto", - "orientation": "auto", + "orientation": "vertical", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ @@ -1128,14 +1128,14 @@ "values": false }, "showPercentChange": false, - 
"textMode": "auto", + "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "12.4.0", "targets": [ { "expr": "qvac_registry_is_indexer{vm_name=~\"$vm\"}", - "legendFormat": "", + "legendFormat": "{{vm_name}}", "refId": "A" } ], @@ -1172,7 +1172,7 @@ "colorMode": "value", "graphMode": "none", "justifyMode": "auto", - "orientation": "auto", + "orientation": "vertical", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ @@ -1182,92 +1182,20 @@ "values": false }, "showPercentChange": false, - "textMode": "auto", + "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "12.4.0", "targets": [ { "expr": "qvac_registry_blob_core_count{vm_name=~\"$vm\"}", - "legendFormat": "", + "legendFormat": "{{vm_name}}", "refId": "A" } ], "title": "Blob Cores", "type": "stat" }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "0": { - "color": "red", - "text": "No" - }, - "1": { - "color": "green", - "text": "Yes" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": 0 - }, - { - "color": "green", - "value": 1 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 37 - }, - "id": 20, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.4.0", - "targets": [ - { - "expr": "min(qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"} == bool qvac_registry_blob_core_length{vm_name=~\"$vm\"}) and min(qvac_registry_blob_core_length{vm_name=~\"$vm\"}) > bool 0", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Blob Core Contiguous", - "type": "stat" - }, { 
"datasource": { "type": "prometheus" @@ -1291,7 +1219,7 @@ "gridPos": { "h": 4, "w": 4, - "x": 16, + "x": 12, "y": 37 }, "id": 21, @@ -1382,8 +1310,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 41 + "x": 12, + "y": 57 }, "id": 22, "options": { @@ -1479,8 +1407,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 41 + "x": 0, + "y": 49 }, "id": 23, "options": { @@ -1567,7 +1495,7 @@ "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 65 }, "id": 24, "options": { @@ -1658,7 +1586,7 @@ "h": 8, "w": 12, "x": 12, - "y": 49 + "y": 65 }, "id": 25, "options": { @@ -1798,7 +1726,7 @@ "gridPos": { "h": 4, "w": 4, - "x": 20, + "x": 16, "y": 37 }, "id": 26, @@ -1888,9 +1816,9 @@ }, "gridPos": { "h": 8, - "w": 12, + "w": 8, "x": 0, - "y": 57 + "y": 41 }, "id": 27, "options": { @@ -1910,21 +1838,196 @@ "targets": [ { "expr": "qvac_registry_blob_core_length{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} length", + "legendFormat": "{{ vm_name }}", "refId": "A" + } + ], + "title": "Blob Core Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, 
+ "y": 41 + }, + "id": 43, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ { "expr": "qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} contiguous", - "refId": "B" + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Contiguous Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 1000000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 41 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ { "expr": "qvac_registry_blob_core_length{vm_name=~\"$vm\"} - 
qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} gap", - "refId": "C" + "legendFormat": "{{ vm_name }}", + "refId": "A" } ], - "title": "Blob Core Replication", + "title": "Blob Core Gap (length - contiguous)", "type": "timeseries" }, { @@ -1987,7 +2090,7 @@ "h": 8, "w": 12, "x": 12, - "y": 57 + "y": 49 }, "id": 28, "options": { @@ -2005,18 +2108,13 @@ }, "pluginVersion": "12.4.0", "targets": [ - { - "expr": "qvac_registry_view_core_seeders{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} view", - "refId": "A" - }, { "expr": "qvac_registry_blob_core_seeders{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} blob", - "refId": "B" + "legendFormat": "{{ vm_name }}", + "refId": "A" } ], - "title": "Core Seeders (full replicas)", + "title": "Blob Core Seeders (full replicas)", "type": "timeseries" }, { @@ -2078,9 +2176,9 @@ }, "gridPos": { "h": 8, - "w": 24, + "w": 12, "x": 0, - "y": 65 + "y": 57 }, "id": 29, "options": { @@ -2211,64 +2309,6 @@ "x": 4, "y": 74 }, - "id": 32, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "12.4.0", - "targets": [ - { - "expr": "sum(dht_is_firewalled{vm_name=~\"$vm\"})", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Firewalled Nodes", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": 0 - }, - { - "color": "red", - "value": 1 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 74 - }, "id": 33, "options": { "colorMode": "background", @@ -2324,7 +2364,7 @@ "gridPos": { "h": 4, "w": 
4, - "x": 12, + "x": 8, "y": 74 }, "id": 34, @@ -2386,7 +2426,7 @@ "gridPos": { "h": 4, "w": 4, - "x": 16, + "x": 12, "y": 74 }, "id": 35, @@ -2440,7 +2480,7 @@ "gridPos": { "h": 4, "w": 4, - "x": 20, + "x": 16, "y": 74 }, "id": 36, From 31e10d88fbd80cb1a86066cb3fa5846bb47e9b06 Mon Sep 17 00:00:00 2001 From: Yury Samarin Date: Thu, 23 Apr 2026 18:36:35 +0800 Subject: [PATCH 7/8] chore[docs]: promote View Core Replication and Blob Core Bytes to the top of the metrics section (#1719) * chore[docs]: promote View Core Replication and Blob Core Bytes to the top of the metrics section * chore[docs]: split View Core Replication into length, contiguous, and gap panels --- .../docs/grafana/REGISTRY_DASHBOARD.json | 239 +++++++++++++++--- 1 file changed, 207 insertions(+), 32 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json index 1d5f089ccc..9af1254c6c 100644 --- a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -1309,9 +1309,9 @@ }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 57 + "w": 8, + "x": 0, + "y": 41 }, "id": 22, "options": { @@ -1331,21 +1331,196 @@ "targets": [ { "expr": "qvac_registry_view_core_length{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} length", + "legendFormat": "{{ vm_name }}", "refId": "A" + } + ], + "title": "View Core Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 41 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ { "expr": "qvac_registry_view_core_contiguous_length{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} contiguous", - "refId": "B" + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "View Core Contiguous Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + 
"value": 1000000 + } + ] + } }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 41 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ { "expr": "qvac_registry_view_core_length{vm_name=~\"$vm\"} - qvac_registry_view_core_contiguous_length{vm_name=~\"$vm\"}", - "legendFormat": "{{ vm_name }} gap", - "refId": "C" + "legendFormat": "{{ vm_name }}", + "refId": "A" } ], - "title": "View Core Replication", + "title": "View Core Gap (length - contiguous)", "type": "timeseries" }, { @@ -1408,7 +1583,7 @@ "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 57 }, "id": 23, "options": { @@ -1495,7 +1670,7 @@ "h": 8, "w": 12, "x": 0, - "y": 65 + "y": 73 }, "id": 24, "options": { @@ -1586,7 +1761,7 @@ "h": 8, "w": 12, "x": 12, - "y": 65 + "y": 73 }, "id": 25, "options": { @@ -1625,7 +1800,7 @@ "h": 10, "w": 24, "x": 0, - "y": 102 + "y": 110 }, "id": 13, "options": { @@ -1661,7 +1836,7 @@ "h": 10, "w": 24, "x": 0, - "y": 112 + "y": 120 }, "id": 14, "options": { @@ -1818,7 +1993,7 @@ "h": 8, "w": 8, "x": 0, - "y": 41 + "y": 49 }, "id": 27, "options": { @@ -1905,7 +2080,7 @@ "h": 8, "w": 8, "x": 8, - "y": 41 + "y": 49 }, "id": 43, "options": { @@ -2000,7 +2175,7 @@ "h": 8, "w": 8, "x": 16, - "y": 41 + "y": 49 }, "id": 44, "options": { @@ -2090,7 +2265,7 @@ "h": 8, "w": 12, "x": 12, - "y": 49 + "y": 57 }, "id": 28, "options": { @@ -2176,9 +2351,9 @@ }, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, - "y": 57 + "y": 65 }, "id": 29, "options": { @@ -2211,7 +2386,7 @@ "h": 1, "w": 24, "x": 0, - "y": 73 + "y": 81 }, "id": 30, "panels": [], @@ -2249,7 +2424,7 @@ "h": 4, "w": 4, "x": 0, - "y": 74 + "y": 82 }, "id": 31, "options": { @@ -2307,7 +2482,7 @@ "h": 4, "w": 4, "x": 4, - "y": 74 + "y": 82 }, "id": 33, "options": { @@ 
-2365,7 +2540,7 @@ "h": 4, "w": 4, "x": 8, - "y": 74 + "y": 82 }, "id": 34, "options": { @@ -2427,7 +2602,7 @@ "h": 4, "w": 4, "x": 12, - "y": 74 + "y": 82 }, "id": 35, "options": { @@ -2481,7 +2656,7 @@ "h": 4, "w": 4, "x": 16, - "y": 74 + "y": 82 }, "id": 36, "options": { @@ -2572,7 +2747,7 @@ "h": 8, "w": 12, "x": 0, - "y": 78 + "y": 86 }, "id": 37, "options": { @@ -2660,7 +2835,7 @@ "h": 8, "w": 12, "x": 12, - "y": 78 + "y": 86 }, "id": 38, "options": { @@ -2747,7 +2922,7 @@ "h": 8, "w": 12, "x": 0, - "y": 86 + "y": 94 }, "id": 39, "options": { @@ -2850,7 +3025,7 @@ "h": 8, "w": 12, "x": 12, - "y": 86 + "y": 94 }, "id": 40, "options": { @@ -2943,7 +3118,7 @@ "h": 8, "w": 12, "x": 0, - "y": 94 + "y": 102 }, "id": 41, "options": { @@ -3035,7 +3210,7 @@ "h": 8, "w": 12, "x": 12, - "y": 94 + "y": 102 }, "id": 42, "options": { From a7ccb8008223213639a3202cfe071314aa32412d Mon Sep 17 00:00:00 2001 From: yuranich Date: Thu, 23 Apr 2026 18:15:38 +0600 Subject: [PATCH 8/8] chore: remove dead blind-peer helpers and fix stale metrics docs - Drop unreferenced getConnectedBlindPeerKeys / getConfiguredBlindPeerKeys / isBlindPeerConnected chain and the _peerConnectionCounts map that only existed to back isBlindPeerConnected. Left over from the dropped blob_core_blind_peers gauge (1de851b). - Fix DEPLOYMENT_GUIDE.md: default metrics port is 9210, not 9090; drop the hypermetrics reference since it is not a dependency (abandoned, incompatible with Hypercore v11) and per-core visibility is provided by the registry_blob_core_* / registry_view_core_* gauges. 
Made-with: Cursor --- .../docs/DEPLOYMENT_GUIDE.md | 6 ++-- .../lib/registry-service.js | 33 ------------------- 2 files changed, 3 insertions(+), 36 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index 0cd0225e7b..a3946fb151 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -539,10 +539,10 @@ Four layers of operational visibility, each independently deployable. The registry server exposes Prometheus metrics via an HTTP endpoint bound to `127.0.0.1`. -**Start with metrics enabled (default port 9090):** +**Start with metrics enabled (default port 9210):** ```bash -node scripts/bin.js run --storage ./corestore --metrics-port 9090 +node scripts/bin.js run --storage ./corestore --metrics-port 9210 ``` **Or disable metrics:** @@ -553,7 +553,7 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 **What is exposed:** -- **Holepunch P2P metrics** (via `hypercore-stats`, `hyperswarm-stats`, `hypermetrics`): core stats, swarm connections, DHT, UDX bytes/packets, per-core upload/download counters. +- **Holepunch P2P metrics** (via `hypercore-stats`, `hyperswarm-stats`): aggregate core stats, swarm connections, DHT activity, UDX bytes/packets. Per-core labeled metrics are not exposed — `hypermetrics` is abandoned and incompatible with Hypercore v11, so per-core visibility is provided by the QVAC-specific `registry_blob_core_*` / `registry_view_core_*` gauges below. 
- **QVAC-specific metrics:** | Metric | Type | Description | diff --git a/packages/qvac-lib-registry-server/lib/registry-service.js b/packages/qvac-lib-registry-server/lib/registry-service.js index 8fb831f5de..8e8c9e76b9 100644 --- a/packages/qvac-lib-registry-server/lib/registry-service.js +++ b/packages/qvac-lib-registry-server/lib/registry-service.js @@ -72,7 +72,6 @@ class RegistryService extends ReadyResource { this.blobsStore = this.store.namespace('blobs') this.blobsCores = new Map() - this._peerConnectionCounts = new Map() this._indexerMonitor = null this._mirroredCoreIds = new Set() this.blindPeering = null @@ -155,11 +154,8 @@ class RegistryService extends ReadyResource { this.swarm.on('connection', (conn, peerInfo) => { const peerKey = peerInfo?.publicKey ? IdEnc.normalize(peerInfo.publicKey) : null - if (peerKey) this._trackPeerConnection(peerKey) - this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection opened') conn.on('close', () => { - if (peerKey) this._untrackPeerConnection(peerKey) this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection closed') }) @@ -343,7 +339,6 @@ class RegistryService extends ReadyResource { }) } this.blobsCores.clear() - this._peerConnectionCounts.clear() this._mirroredCoreIds.clear() this.logger.info('RegistryService: closed') @@ -888,34 +883,6 @@ class RegistryService extends ReadyResource { this.logger.info('Indexer status confirmed') } - _trackPeerConnection (peerKey) { - const current = this._peerConnectionCounts.get(peerKey) || 0 - this._peerConnectionCounts.set(peerKey, current + 1) - } - - _untrackPeerConnection (peerKey) { - const current = this._peerConnectionCounts.get(peerKey) || 0 - if (current <= 1) { - this._peerConnectionCounts.delete(peerKey) - return - } - - this._peerConnectionCounts.set(peerKey, current - 1) - } - - getConfiguredBlindPeerKeys () { - return [...new Set(this.blindPeerKeys)] - } - - isBlindPeerConnected (peerKey) { - return (this._peerConnectionCounts.get(peerKey) || 
0) > 0 - } - - getConnectedBlindPeerKeys () { - return this.getConfiguredBlindPeerKeys() - .filter(peerKey => this.isBlindPeerConnected(peerKey)) - } - async _downloadArtifact (sourceInfo, localPath) { switch (sourceInfo.protocol) { case 'hf':