From ef5f9acc11a3f773af2ab724eadd5179c27bf970 Mon Sep 17 00:00:00 2001 From: yuranich Date: Wed, 15 Apr 2026 16:48:34 +0600 Subject: [PATCH 1/3] feat: add Prometheus metrics monitoring to registry server Made-with: Cursor --- .../client/lib/client.js | 1 + .../docs/DEPLOYMENT_GUIDE.md | 111 + .../docs/grafana/REGISTRY_DASHBOARD.json | 1917 +++++++++++++++++ .../ecosystem.config.js | 22 + .../lib/metrics-server.js | 64 + .../qvac-lib-registry-server/lib/metrics.js | 195 ++ .../lib/registry-service.js | 224 +- .../qvac-lib-registry-server/package.json | 4 + .../scripts/add-model.js | 2 + .../qvac-lib-registry-server/scripts/bin.js | 29 +- .../integration/metrics.integration.test.js | 210 ++ 11 files changed, 2697 insertions(+), 82 deletions(-) create mode 100644 packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json create mode 100644 packages/qvac-lib-registry-server/ecosystem.config.js create mode 100644 packages/qvac-lib-registry-server/lib/metrics-server.js create mode 100644 packages/qvac-lib-registry-server/lib/metrics.js create mode 100644 packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js diff --git a/packages/qvac-lib-registry-server/client/lib/client.js b/packages/qvac-lib-registry-server/client/lib/client.js index 9f30329ffb..963cb9eb41 100644 --- a/packages/qvac-lib-registry-server/client/lib/client.js +++ b/packages/qvac-lib-registry-server/client/lib/client.js @@ -53,6 +53,7 @@ class QVACRegistryClient extends ReadyResource { this.hyperswarm.on('connection', this._connectionHandler) this._metadataReady = this._connectMetadataCore() + await this._metadataReady } async _connectMetadataCore () { diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index 079f3aedb7..a0fd9dafa9 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -531,6 +531,116 @@ 
node scripts/bin.js run --storage ./new-writer --bootstrap --skip-storage- | Admin command retries | May need 1-2 retries | Usually works first try | | Writer coordination | Manual timing recommended | Automated/scripted works | +## Monitoring + +Four layers of operational visibility, each independently deployable. + +### Layer 1: In-Process Prometheus /metrics Endpoint + +The registry server exposes Prometheus metrics via an HTTP endpoint bound to `127.0.0.1`. + +**Start with metrics enabled (default port 9090):** + +```bash +node scripts/bin.js run --storage ./corestore --metrics-port 9090 +``` + +**Or disable metrics:** + +```bash +node scripts/bin.js run --storage ./corestore --metrics-port 0 +``` + +**What is exposed:** + +- **Holepunch P2P metrics** (via `hypercore-stats`, `hyperswarm-stats`, `hypermetrics`): core stats, swarm connections, DHT, UDX bytes/packets, per-core upload/download counters. +- **QVAC-specific metrics:** + +| Metric | Type | Description | +|--------|------|-------------| +| `qvac_registry_models_total` | Gauge | Total models in the registry | +| `qvac_registry_blob_cores_total` | Gauge | Number of blob cores | +| `qvac_registry_blob_core_peers` | Gauge | Connected peers per blob core | +| `qvac_registry_blob_core_fully_downloaded` | Gauge | Whether each blob core is fully replicated | +| `qvac_registry_view_core_length` | Gauge | View core length (total blocks) | +| `qvac_registry_view_core_contiguous_length` | Gauge | View core contiguous length (gap indicates replication lag) | +| `qvac_registry_rpc_requests_total` | Counter | RPC requests by method | +| `qvac_registry_rpc_errors_total` | Counter | RPC errors by method | +| `qvac_registry_is_indexer` | Gauge | Whether this node is an indexer | +| `qvac_registry_blind_peers_connected` | Gauge | Number of configured blind peers with an active connection | +| `qvac_registry_blind_peer_connected` | Gauge | Per-blind-peer connection status (labeled by `peer_key`) | +| 
`qvac_registry_blob_core_byte_length` | Gauge | Byte length per blob core | +| `qvac_registry_model_size_bytes` | Gauge | Size of each model blob (labeled by path, engine, quantization) | + +**Prometheus scrape config:** + +```yaml +scrape_configs: + - job_name: 'qvac-registry' + scrape_interval: 30s + static_configs: + - targets: ['127.0.0.1:9090'] +``` + +**Security:** The metrics endpoint binds to `127.0.0.1` by default, so only Prometheus scrapers running on the same host can reach it. To scrape from another machine, forward the port over a private network (for example, an SSH tunnel or VPN); do not expose it to the public internet. + +### Layer 2: hyper-health-check Sidecar + +Run [hyper-health-check](https://github.com/holepunchto/hyper-health-check) as a separate PM2 process to independently verify that cores are discoverable and downloadable from the swarm. The server might report healthy internals while peers cannot actually reach it. + +```bash +pm2 start node_modules/.bin/hyper-health-check -- run \ + --core :registry-view \ + --core :blob-models \ + --port 9091 \ + --grace-period 600000 +``` + +The 10-minute grace period accommodates replication lag after model additions — blind peers need time to download multi-GB blobs before being flagged as unhealthy. + +**Exposed metrics (on port 9091):** + +- `hyper_health_peers_total` — peers swarming each core +- `hyper_health_peers_with_all_data_total` — peers with full replication +- `hyper_health_ips_with_all_data_total` — unique IPs with full data (geographic diversity) + +### Layer 3: PM2 Ecosystem Config + +The repository includes `ecosystem.config.js` for standardized PM2 process management: + +```bash +pm2 start ecosystem.config.js +``` + +This starts both the registry server (with metrics on port 9090) and the health-check sidecar (on port 9091). + +**Per-deployment customization:** Override `--core` flags for the health-check app via PM2 environment variables or by editing the `args` field. 
+ +**Process-level metrics:** Install `pm2-prometheus-exporter` for CPU, memory, heap, event loop latency, restarts, and uptime metrics: + +```bash +pm2 install pm2-prometheus-exporter +``` + +This exposes process metrics on `localhost:9209` alongside the application-level metrics from Layers 1 and 2. + +### Layer 4: Grafana Dashboard + +Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashboards/22313-hypercore-hyperswarm/) (ID: 22313) as a baseline. It includes panels for Hypercore, Hyperswarm, HyperDHT, UDX, and Node.js process stats. + +**Add QVAC-specific panels for:** + +- **Model availability:** `qvac_registry_models_total`, `hyper_health_peers_with_all_data_total` +- **Storage breakdown:** `qvac_registry_model_size_bytes` by engine/quantization, `sum(qvac_registry_blob_core_byte_length)` +- **RPC activity:** `rate(qvac_registry_rpc_requests_total[5m])`, error ratio +- **Cluster health:** `qvac_registry_is_indexer` across nodes, `qvac_registry_view_core_length` vs `qvac_registry_view_core_contiguous_length` + +**Import the baseline dashboard:** + +1. Add Prometheus as a data source in Grafana, using the Prometheus server's own URL (e.g. `http://<prometheus-host>:9090`) — not the registry's metrics endpoint, which also defaults to port 9090 and must use a different port if Prometheus runs on the same host +2. Import dashboard ID `22313` +3. 
Add custom panels for QVAC metrics + ## Reference ### Environment Variables @@ -554,6 +664,7 @@ node scripts/bin.js run --storage ./new-writer --bootstrap --skip-storage- | `node scripts/bin.js run --storage ` | Start a writer | | `node scripts/bin.js run --bootstrap ` | Join existing cluster | | `node scripts/bin.js run --blind-peers ` | Enable blind peer replication | +| `node scripts/bin.js run --metrics-port ` | Prometheus metrics port (default: 9090, 0 to disable) | | `node scripts/bin.js run --skip-storage-check` | Bypass storage/bootstrap key mismatch check | | `node scripts/bin.js init-writer --storage ` | Initialize/authorize a writer client | | `node scripts/bin.js sync-models --file ` | Sync models from JSON config | diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json new file mode 100644 index 0000000000..92a8f92304 --- /dev/null +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -0,0 +1,1917 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", 
+ "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_up{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Process Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "green", + "value": 86400 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_uptime{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_restarts{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Restarts", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_memory{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_cpu{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "100 - (avg by (vm_name) (rate(node_cpu_seconds_total{mode=\"idle\", vm_name=~\"$vm\"}[5m])) * 100)", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "VM CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 7, + 
"options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "(1 - (node_memory_MemAvailable_bytes{vm_name=~\"$vm\"} / node_memory_MemTotal_bytes{vm_name=~\"$vm\"})) * 100", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "VM Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{mountpoint=\"/\", vm_name=~\"$vm\"} / node_filesystem_size_bytes{mountpoint=\"/\", vm_name=~\"$vm\"}) * 100)", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Disk Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 5000000000 + }, + { + "color": "green", + "value": 20000000000 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + 
"justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "node_filesystem_avail_bytes{mountpoint=\"/\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Disk Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{device!~\"lo|tailscale.*\", vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "A" + }, + { + "expr": 
"-rate(node_network_transmit_bytes_total{device!~\"lo|tailscale.*\", vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} read", + "refId": "A" + }, + { + "expr": "-rate(node_disk_written_bytes_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} write", + "refId": "B" + } + ], + "title": "Disk I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "node_load15{vm_name=~\"$vm\"} / count without (cpu, mode) (node_cpu_seconds_total{mode=\"idle\", vm_name=~\"$vm\"})", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 15, + "panels": [], + "title": "QVAC Registry Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 37 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + 
"orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_models_total", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Models", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 37 + }, + "id": 17, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_is_indexer", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Indexer", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "yellow", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 37 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + 
}, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blind_peers_connected", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blind Peers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 37 + }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_cores_total", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blob Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "No" + }, + "1": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 37 + }, + "id": 20, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": 
"min(qvac_registry_blob_core_fully_downloaded)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blobs Synced", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 37 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(qvac_registry_blob_core_byte_length)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blob Storage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 22, + 
"options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_view_core_length", + "legendFormat": "length", + "refId": "A" + }, + { + "expr": "qvac_registry_view_core_contiguous_length", + "legendFormat": "contiguous", + "refId": "B" + }, + { + "expr": "qvac_registry_view_core_length - qvac_registry_view_core_contiguous_length", + "legendFormat": "gap (replication lag)", + "refId": "C" + } + ], + "title": "View Core Replication", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_peers", + "legendFormat": 
"{{core_name}}", + "refId": "A" + } + ], + "title": "Blob Core Peers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(qvac_registry_rpc_requests_total[5m])", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "RPC Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(qvac_registry_rpc_errors_total[5m])", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "RPC Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 26, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_model_size_bytes", + "legendFormat": "{{path}}", + "refId": "A" + } + ], + "title": "Model Size Breakdown", + "type": "bargauge" + }, + { + "datasource": { + "type": "loki" + }, + "fieldConfig": { + "defaults": {}, + 
"overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 13, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "{job=\"registry\"}", + "refId": "A" + } + ], + "title": "Registry Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 75 + }, + "id": 14, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "{job=\"registry\", level=\"error\"}", + "refId": "A" + } + ], + "title": "Registry Error Logs", + "type": "logs" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [ + "registry", + "pm2", + "qvac" + ], + "templating": { + "list": [ + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus" + }, + "includeAll": true, + "multi": true, + "name": "vm", + "options": [], + "query": "label_values(pm2_up{name=\"registry\"}, vm_name)", + "refresh": 1, + "regexApplyTo": "value", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "QVAC Registry Overview", + "uid": "f43aa479-9d22-4b3b-baa6-f527a2615981", + "version": 1, + "weekStart": "" +} diff --git a/packages/qvac-lib-registry-server/ecosystem.config.js b/packages/qvac-lib-registry-server/ecosystem.config.js new file mode 100644 index 0000000000..af22b6054e --- /dev/null +++ 
b/packages/qvac-lib-registry-server/ecosystem.config.js
@@ -0,0 +1,22 @@
+'use strict'
+
+module.exports = {
+  apps: [
+    {
+      name: 'registry',
+      script: 'scripts/bin.js',
+      args: 'run --storage ./corestore --metrics-port 9090',
+      env: {
+        NODE_ENV: 'production'
+      }
+    },
+    {
+      name: 'health-check',
+      script: 'node_modules/.bin/hyper-health-check',
+      args: 'run --port 9091 --grace-period 600000',
+      env: {
+        NODE_ENV: 'production'
+      }
+    }
+  ]
+}
diff --git a/packages/qvac-lib-registry-server/lib/metrics-server.js b/packages/qvac-lib-registry-server/lib/metrics-server.js
new file mode 100644
index 0000000000..ff86e2756e
--- /dev/null
+++ b/packages/qvac-lib-registry-server/lib/metrics-server.js
@@ -0,0 +1,85 @@
+'use strict'
+
+const http = require('http')
+const ReadyResource = require('ready-resource')
+
+const DEFAULT_PORT = 9090
+const DEFAULT_HOST = '127.0.0.1'
+
+/**
+ * Minimal HTTP server that serves a prom-client style registry at GET /metrics.
+ * Any other path or method is answered with 404.
+ */
+class MetricsServer extends ReadyResource {
+  /**
+   * @param {object} promRegister - registry exposing `metrics()` and `contentType`
+   * @param {object} [opts]
+   * @param {number} [opts.port] - listen port (default 9090); 0 selects an ephemeral port
+   * @param {string} [opts.host] - bind address (default 127.0.0.1)
+   * @param {object} [opts.logger] - logger with `info` and `error` (default console)
+   */
+  constructor (promRegister, opts = {}) {
+    super()
+
+    this._register = promRegister
+    // `??` instead of `||`: port 0 is a valid request for an OS-assigned port
+    // and must not be silently replaced by the default.
+    this._port = opts.port ?? DEFAULT_PORT
+    this._host = opts.host || DEFAULT_HOST
+    this._logger = opts.logger || console
+    this._server = null
+  }
+
+  async _open () {
+    this._server = http.createServer(async (req, res) => {
+      // Compare the path only so scrapers that append a query string still match
+      const pathname = (req.url || '').split('?')[0]
+      if (pathname === '/metrics' && req.method === 'GET') {
+        try {
+          const metrics = await this._register.metrics()
+          res.writeHead(200, { 'Content-Type': this._register.contentType })
+          res.end(metrics)
+        } catch (err) {
+          this._logger.error({ err }, 'MetricsServer: failed to collect metrics')
+          res.writeHead(500)
+          res.end('Internal Server Error')
+        }
+        return
+      }
+
+      res.writeHead(404)
+      res.end('Not Found')
+    })
+
+    await new Promise((resolve, reject) => {
+      // Startup failures (e.g. EADDRINUSE) reject ready(); the listener is
+      // attached before listen() and swapped out once the socket is bound.
+      this._server.once('error', reject)
+      this._server.listen(this._port, this._host, () => {
+        this._server.removeListener('error', reject)
+        this._server.on('error', (err) => {
+          this._logger.error({ err }, 'MetricsServer: server error')
+        })
+        this._logger.info({
+          host: this._host,
+          port: this._server.address()?.port ?? this._port
+        }, 'MetricsServer: listening')
+        resolve()
+      })
+    })
+  }
+
+  async _close () {
+    if (!this._server) return
+
+    await new Promise((resolve) => {
+      this._server.close(() => {
+        this._logger.info('MetricsServer: closed')
+        resolve()
+      })
+    })
+    this._server = null
+  }
+}
+
+module.exports = MetricsServer
diff --git a/packages/qvac-lib-registry-server/lib/metrics.js b/packages/qvac-lib-registry-server/lib/metrics.js
new file mode 100644
index 0000000000..e39d4ce17b
--- /dev/null
+++ b/packages/qvac-lib-registry-server/lib/metrics.js
@@ -0,0 +1,235 @@
+'use strict'
+
+const promClient = require('prom-client')
+
+const MODEL_CACHE_TTL_MS = 15000
+
+/**
+ * QVAC-specific Prometheus metrics for a registry service instance.
+ *
+ * Every metric is created on prom-client's default registry, so that registry
+ * has to be cleared before a second instance is constructed in the same
+ * process (the integration tests do this) to avoid duplicate metric names.
+ */
+class QvacMetrics {
+  /**
+   * @param {object} service - registry service read by the collectors
+   * @param {object} [opts]
+   * @param {object} [opts.logger] - logger with `warn` (default console)
+   */
+  constructor (service, opts = {}) {
+    this._service = service
+    this._logger = opts.logger || console
+
+    this._modelCache = null
+    this._modelCacheExpiry = 0
+    // In-flight model query shared by collectors that run in the same scrape
+    this._modelQuery = null
+
+    this._rpcRequests = new promClient.Counter({
+      name: 'qvac_registry_rpc_requests_total',
+      help: 'Total RPC requests by method',
+      labelNames: ['method']
+    })
+
+    this._rpcErrors = new promClient.Counter({
+      name: 'qvac_registry_rpc_errors_total',
+      help: 'Total RPC errors by method',
+      labelNames: ['method']
+    })
+
+    this._registerGauges()
+  }
+
+  /** Count one incoming RPC request for `method`. */
+  recordRpcRequest (method) {
+    this._rpcRequests.inc({ method })
+  }
+
+  /** Count one failed RPC request for `method`. */
+  recordRpcError (method) {
+    this._rpcErrors.inc({ method })
+  }
+
+  /**
+   * Create the gauges. Each one computes its value in `collect()`, where `this`
+   * is the gauge itself, hence the `self` alias for the metrics instance.
+   * Labeled gauges call `reset()` first so series for removed entries disappear.
+   */
+  _registerGauges () {
+    const self = this
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_models_total',
+      help: 'Total number of models in the registry',
+      async collect () {
+        const models = await self._getCachedModels()
+        this.set(models.length)
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_blob_cores_total',
+      help: 'Number of blob cores',
+      collect () {
+        this.set(self._service.blobsCores.size)
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_blob_core_peers',
+      help: 'Number of connected peers per blob core',
+      labelNames: ['core_name'],
+      collect () {
+        this.reset()
+        for (const [name, { core }] of self._service.blobsCores) {
+          this.set({ core_name: name }, core.peers.length)
+        }
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_blob_core_fully_downloaded',
+      help: 'Whether each blob core is fully downloaded (1=yes, 0=no)',
+      labelNames: ['core_name'],
+      collect () {
+        this.reset()
+        for (const [name, { core }] of self._service.blobsCores) {
+          const full = core.contiguousLength === core.length && core.length > 0 ? 1 : 0
+          this.set({ core_name: name }, full)
+        }
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_view_core_length',
+      help: 'View core length (total blocks)',
+      collect () {
+        const viewCore = self._service.view?.core
+        this.set(viewCore ? viewCore.length : 0)
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_view_core_contiguous_length',
+      help: 'View core contiguous length (gap = length - contiguous indicates replication lag)',
+      collect () {
+        const viewCore = self._service.view?.core
+        this.set(viewCore ? viewCore.contiguousLength : 0)
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_is_indexer',
+      help: 'Whether this node is an indexer (1=yes, 0=no)',
+      collect () {
+        this.set(self._service.base?.isIndexer ? 1 : 0)
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_blind_peers_connected',
+      help: 'Number of configured blind peers with an active connection',
+      collect () {
+        this.set(self._service.getConnectedBlindPeerKeys().length)
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_blind_peer_connected',
+      help: 'Whether each configured blind peer currently has an active connection (1=yes, 0=no)',
+      labelNames: ['peer_key'],
+      collect () {
+        this.reset()
+        for (const peerKey of self._service.getConfiguredBlindPeerKeys()) {
+          this.set(
+            { peer_key: peerKey },
+            self._service.isBlindPeerConnected(peerKey) ? 1 : 0
+          )
+        }
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_blob_core_byte_length',
+      help: 'Byte length of each blob core',
+      labelNames: ['core_name'],
+      collect () {
+        this.reset()
+        for (const [name, { core }] of self._service.blobsCores) {
+          this.set({ core_name: name }, core.byteLength)
+        }
+      }
+    })
+
+    // eslint-disable-next-line no-new
+    new promClient.Gauge({
+      name: 'qvac_registry_model_size_bytes',
+      help: 'Size in bytes of each model blob',
+      labelNames: ['path', 'engine', 'quantization'],
+      async collect () {
+        const models = await self._getCachedModels()
+        this.reset()
+        for (const m of models) {
+          if (m.blobBinding && m.blobBinding.byteLength > 0) {
+            this.set({
+              path: m.path,
+              engine: m.engine || '',
+              quantization: m.quantization || ''
+            }, m.blobBinding.byteLength)
+          }
+        }
+      }
+    })
+  }
+
+  /**
+   * Return the model list, re-querying the view at most once per
+   * MODEL_CACHE_TTL_MS. Collectors that miss the cache at the same time share
+   * a single in-flight query instead of each scanning the view.
+   */
+  async _getCachedModels () {
+    const now = Date.now()
+    if (this._modelCache && now < this._modelCacheExpiry) {
+      return this._modelCache
+    }
+
+    if (!this._modelQuery) {
+      this._modelQuery = this._queryModels(now).finally(() => {
+        this._modelQuery = null
+      })
+    }
+    return this._modelQuery
+  }
+
+  /**
+   * Query every model from the view and refresh the cache. When the view is
+   * not opened, or the query fails (logged as a warning), the last cached list
+   * or an empty list is returned so the scrape still produces output.
+   */
+  async _queryModels (now) {
+    try {
+      const view = this._service.view
+      if (!view || !view.opened) return this._modelCache || []
+      const models = await view.findModelsByPath({}).toArray()
+      this._modelCache = models
+      this._modelCacheExpiry = now + MODEL_CACHE_TTL_MS
+      return models
+    } catch (err) {
+      this._logger.warn({ err: err.message }, 'QvacMetrics: failed to query models')
+      return this._modelCache || []
+    }
+  }
+}
+
+module.exports = QvacMetrics
diff --git a/packages/qvac-lib-registry-server/lib/registry-service.js b/packages/qvac-lib-registry-server/lib/registry-service.js
index 63794b2bea..b373aea697 100644
--- a/packages/qvac-lib-registry-server/lib/registry-service.js
+++ b/packages/qvac-lib-registry-server/lib/registry-service.js
@@ -70,10 +70,12 @@ class RegistryService extends ReadyResource {
 
     this.blobsStore = this.store.namespace('blobs')
     this.blobsCores = new Map()
+    this._peerConnectionCounts = new Map()
     this._indexerMonitor = null
     this._mirroredCoreIds = new Set()
this.blindPeering = null this.reseedTracker = null + this.metrics = null this._registerApplyHandlers() @@ -148,10 +150,15 @@ class RegistryService extends ReadyResource { }) this.swarm.on('connection', (conn, peerInfo) => { - const peerKey = peerInfo?.publicKey ? IdEnc.normalize(peerInfo.publicKey) : 'unknown' + const peerKey = peerInfo?.publicKey ? IdEnc.normalize(peerInfo.publicKey) : null - this.logger.info({ peer: peerKey }, 'Swarm connection opened') - conn.on('close', () => this.logger.info({ peer: peerKey }, 'Swarm connection closed')) + if (peerKey) this._trackPeerConnection(peerKey) + + this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection opened') + conn.on('close', () => { + if (peerKey) this._untrackPeerConnection(peerKey) + this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection closed') + }) this._setupRpc(conn) @@ -307,6 +314,7 @@ class RegistryService extends ReadyResource { }) } this.blobsCores.clear() + this._peerConnectionCounts.clear() this._mirroredCoreIds.clear() this.logger.info('RegistryService: closed') @@ -468,28 +476,34 @@ class RegistryService extends ReadyResource { rpc.respond( 'add-model', async (entry) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('add-model') + try { + ensureWriterAccess() - if (!this.opened) await this.ready() - await this._ensureIndexer() + if (!this.opened) await this.ready() + await this._ensureIndexer() - const skipExisting = entry.skipExisting || false - const modelEntry = { ...entry } - delete modelEntry.skipExisting + const skipExisting = entry.skipExisting || false + const modelEntry = { ...entry } + delete modelEntry.skipExisting - const result = await this.addModel(modelEntry, { skipExisting }) - - this.logger.info({ - path: result.path, - source: result.source - }, 'RPC: add-model completed') + const result = await this.addModel(modelEntry, { skipExisting }) - return { - success: true, - model: { + this.logger.info({ path: result.path, source: 
result.source + }, 'RPC: add-model completed') + + return { + success: true, + model: { + path: result.path, + source: result.source + } } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('add-model') + throw err } } ) @@ -497,19 +511,25 @@ class RegistryService extends ReadyResource { rpc.respond( 'put-license', async (licenseRecord) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('put-license') + try { + ensureWriterAccess() - if (!this.opened) await this.ready() - await this._ensureIndexer() - await this.putLicense(licenseRecord) + if (!this.opened) await this.ready() + await this._ensureIndexer() + await this.putLicense(licenseRecord) - this.logger.info({ - spdxId: licenseRecord.spdxId - }, 'RPC: put-license completed') + this.logger.info({ + spdxId: licenseRecord.spdxId + }, 'RPC: put-license completed') - return { - success: true, - message: 'License operation appended' + return { + success: true, + message: 'License operation appended' + } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('put-license') + throw err } } ) @@ -517,45 +537,51 @@ class RegistryService extends ReadyResource { rpc.respond( 'update-model-metadata', async (data) => { - ensureWriterAccess() - - if (!data.path) throw new TypeError('path is required') - if (!data.source) throw new TypeError('source is required') - - if (!this.opened) await this.ready() - await this._ensureIndexer() - - const existing = await this.getModelByKey({ path: data.path, source: data.source }) - if (!existing) throw new Error(`Model not found: ${data.path}`) - - // If explicitly undeprecating, clear deprecation fields - const isUndeprecating = data.deprecated === false - - const updated = { - ...existing, - engine: data.engine ?? existing.engine, - licenseId: data.licenseId ?? existing.licenseId, - description: data.description ?? existing.description, - quantization: data.quantization ?? existing.quantization, - params: data.params ?? 
existing.params, - notes: data.notes ?? existing.notes, - tags: data.tags ?? existing.tags, - deprecated: data.deprecated !== undefined ? data.deprecated : existing.deprecated, - deprecatedAt: isUndeprecating ? '' : (data.deprecatedAt ?? existing.deprecatedAt), - replacedBy: isUndeprecating ? '' : (data.replacedBy ?? existing.replacedBy), - deprecationReason: isUndeprecating ? '' : (data.deprecationReason ?? existing.deprecationReason) - } + if (this.metrics) this.metrics.recordRpcRequest('update-model-metadata') + try { + ensureWriterAccess() + + if (!data.path) throw new TypeError('path is required') + if (!data.source) throw new TypeError('source is required') + + if (!this.opened) await this.ready() + await this._ensureIndexer() + + const existing = await this.getModelByKey({ path: data.path, source: data.source }) + if (!existing) throw new Error(`Model not found: ${data.path}`) + + // If explicitly undeprecating, clear deprecation fields + const isUndeprecating = data.deprecated === false + + const updated = { + ...existing, + engine: data.engine ?? existing.engine, + licenseId: data.licenseId ?? existing.licenseId, + description: data.description ?? existing.description, + quantization: data.quantization ?? existing.quantization, + params: data.params ?? existing.params, + notes: data.notes ?? existing.notes, + tags: data.tags ?? existing.tags, + deprecated: data.deprecated !== undefined ? data.deprecated : existing.deprecated, + deprecatedAt: isUndeprecating ? '' : (data.deprecatedAt ?? existing.deprecatedAt), + replacedBy: isUndeprecating ? '' : (data.replacedBy ?? existing.replacedBy), + deprecationReason: isUndeprecating ? '' : (data.deprecationReason ?? existing.deprecationReason) + } - await this._appendOperation(DISPATCH_PUT_MODEL, updated) + await this._appendOperation(DISPATCH_PUT_MODEL, updated) - const viewLength = this.view?.core?.length ?? 0 - const viewContiguous = this.view?.core?.contiguousLength ?? 
0 - const viewSigned = this.view?.core?.signedLength ?? 0 - this.logger.info({ path: data.path, viewLength, viewContiguous, viewSigned }, 'RPC: update-model-metadata completed') + const viewLength = this.view?.core?.length ?? 0 + const viewContiguous = this.view?.core?.contiguousLength ?? 0 + const viewSigned = this.view?.core?.signedLength ?? 0 + this.logger.info({ path: data.path, viewLength, viewContiguous, viewSigned }, 'RPC: update-model-metadata completed') - return { - success: true, - model: updated + return { + success: true, + model: updated + } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('update-model-metadata') + throw err } } ) @@ -563,31 +589,42 @@ class RegistryService extends ReadyResource { rpc.respond( 'delete-model', async (data) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('delete-model') + try { + ensureWriterAccess() - if (!data.path) throw new TypeError('path is required') - if (!data.source) throw new TypeError('source is required') + if (!data.path) throw new TypeError('path is required') + if (!data.source) throw new TypeError('source is required') - if (!this.opened) await this.ready() - await this._ensureIndexer() + if (!this.opened) await this.ready() + await this._ensureIndexer() - const result = await this.deleteModel({ path: data.path, source: data.source }) + const result = await this.deleteModel({ path: data.path, source: data.source }) - this.logger.info({ - path: data.path, - source: data.source - }, 'RPC: delete-model completed') - - return result + this.logger.info({ + path: data.path, + source: data.source + }, 'RPC: delete-model completed') + + return result + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('delete-model') + throw err + } } ) - // Server identification endpoint - allows RPC clients to verify they connected - // to the actual server and not a blind peer (which won't have RPC responders) rpc.respond('ping', async () => { + if (this.metrics) 
this.metrics.recordRpcRequest('ping') return { role: 'registry-server', - timestamp: Date.now() + timestamp: Date.now(), + isIndexer: this.base?.isIndexer ?? false, + modelCount: this.view?.core?.length ?? 0, + peerCount: this.swarm?.connections?.size ?? 0, + blobCoresCount: this.blobsCores.size, + viewCoreLength: this.view?.core?.length ?? 0, + viewCoreContiguousLength: this.view?.core?.contiguousLength ?? 0 } }) @@ -826,6 +863,34 @@ class RegistryService extends ReadyResource { this.logger.info('Indexer status confirmed') } + _trackPeerConnection (peerKey) { + const current = this._peerConnectionCounts.get(peerKey) || 0 + this._peerConnectionCounts.set(peerKey, current + 1) + } + + _untrackPeerConnection (peerKey) { + const current = this._peerConnectionCounts.get(peerKey) || 0 + if (current <= 1) { + this._peerConnectionCounts.delete(peerKey) + return + } + + this._peerConnectionCounts.set(peerKey, current - 1) + } + + getConfiguredBlindPeerKeys () { + return [...new Set(this.blindPeerKeys)] + } + + isBlindPeerConnected (peerKey) { + return (this._peerConnectionCounts.get(peerKey) || 0) > 0 + } + + getConnectedBlindPeerKeys () { + return this.getConfiguredBlindPeerKeys() + .filter(peerKey => this.isBlindPeerConnected(peerKey)) + } + async _downloadArtifact (sourceInfo, localPath) { switch (sourceInfo.protocol) { case 'hf': @@ -969,7 +1034,6 @@ class RegistryService extends ReadyResource { discoveryKey: core.discoveryKey.toString('hex') }, 'Hyperblobs core ready') - // Caller is responsible for mirroring after data is added return entry } diff --git a/packages/qvac-lib-registry-server/package.json b/packages/qvac-lib-registry-server/package.json index 3330611838..34ecf056d1 100644 --- a/packages/qvac-lib-registry-server/package.json +++ b/packages/qvac-lib-registry-server/package.json @@ -67,15 +67,19 @@ "crypto": "npm:bare-node-crypto", "http": "npm:bare-node-http", "https": "npm:bare-node-https", + "hyper-health-check": "^1.3.0", "hyperblobs": "^2.8.0", 
"hypercore-id-encoding": "^1.3.0", + "hypercore-stats": "^2.4.0", "hyperdb": "^4.16.0", "hyperdispatch": "^1.4.0", "hyperschema": "^1.13.0", "hyperswarm": "^4.10.5", + "hyperswarm-stats": "^1.3.0", "paparam": "^1.8.6", "pino": "^9.9.4", "pino-pretty": "^13.1.1", + "prom-client": "^15.1.3", "protomux-rpc": "^1.7.0", "readline": "npm:bare-node-readline", "ready-resource": "^1.2.0", diff --git a/packages/qvac-lib-registry-server/scripts/add-model.js b/packages/qvac-lib-registry-server/scripts/add-model.js index 076f18dd6d..116b10ed40 100644 --- a/packages/qvac-lib-registry-server/scripts/add-model.js +++ b/packages/qvac-lib-registry-server/scripts/add-model.js @@ -24,6 +24,8 @@ async function addModel () { primaryKey = args[++i] } else if (args[i] === '--models-file' || args[i] === '-f') { modelsFile = args[++i] + } else if (args[i] === '--') { + continue } else if (!canonicalSource && args[i] !== '--skip-existing') { canonicalSource = args[i] } diff --git a/packages/qvac-lib-registry-server/scripts/bin.js b/packages/qvac-lib-registry-server/scripts/bin.js index c01f21d53a..78954e67ce 100644 --- a/packages/qvac-lib-registry-server/scripts/bin.js +++ b/packages/qvac-lib-registry-server/scripts/bin.js @@ -14,8 +14,15 @@ const fs = require('fs') const RegistryService = require('../lib/registry-service') const RegistryConfig = require('../lib/config') +const MetricsServer = require('../lib/metrics-server') +const QvacMetrics = require('../lib/metrics') +const HypercoreStats = require('hypercore-stats') +const HyperswarmStats = require('hyperswarm-stats') +const promClient = require('prom-client') const { AUTOBASE_NAMESPACE } = require('@qvac/registry-schema') +const DEFAULT_METRICS_PORT = 9090 + const DEFAULT_STORAGE = './corestore' const DEFAULT_WRITER_STORAGE = './writer-storage' @@ -36,6 +43,7 @@ const runCmd = command('run', flag('--clear-after-reseed', 'Clear blob blocks after successful replication to blind peers'), flag('--compaction-interval [ms]', `Periodic 
RocksDB compaction interval in ms (default: ${DEFAULT_COMPACTION_INTERVAL_MS}, 0 to disable)`), flag('--skip-storage-check', 'Skip storage/bootstrap key mismatch check (use when joining existing cluster with fresh storage)'), + flag('--metrics-port [port]', `Prometheus metrics HTTP port (default: ${DEFAULT_METRICS_PORT}, 0 to disable)`), async function ({ flags }) { const logger = createLogger() @@ -86,8 +94,24 @@ const runCmd = command('run', config.setAutobaseKey(IdEnc.normalize(service.base.key)) config.setRegistryCoreKey(IdEnc.normalize(service.registryCoreKey)) + const metricsPort = flags.metricsPort !== undefined + ? parseInt(flags.metricsPort, 10) + : DEFAULT_METRICS_PORT + + let metricsServer = null + if (metricsPort > 0) { + const qvacMetrics = new QvacMetrics(service, { logger }) + service.metrics = qvacMetrics + + HypercoreStats.fromCorestore(store).registerPrometheusMetrics(promClient) + new HyperswarmStats(swarm).registerPrometheusMetrics(promClient) // eslint-disable-line no-new + + metricsServer = new MetricsServer(promClient.register, { port: metricsPort, logger }) + await metricsServer.ready() + } + logServiceInfo(logger, service) - registerShutdown(logger, service, swarm, store) + registerShutdown(logger, service, swarm, store, metricsServer) } ) @@ -155,13 +179,14 @@ const cmd = command('registry', runCmd, initWriter, syncModelsCmd) cmd.parse() -function registerShutdown (logger, service, swarm, store) { +function registerShutdown (logger, service, swarm, store, metricsServer) { let closing = false const shutdown = async () => { if (closing) return closing = true logger.info('Shutting down gracefully…') try { + if (metricsServer) await metricsServer.close() await service.close() await swarm.destroy() await store.close() diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js new file mode 100644 index 0000000000..e2c5b99005 --- 
/dev/null +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ -0,0 +1,210 @@ +'use strict' + +const test = require('brittle') +const Corestore = require('corestore') +const Hyperswarm = require('hyperswarm') +const http = require('http') +const IdEnc = require('hypercore-id-encoding') +const promClient = require('prom-client') + +const RegistryService = require('../../lib/registry-service') +const RegistryConfig = require('../../lib/config') +const MetricsServer = require('../../lib/metrics-server') +const QvacMetrics = require('../../lib/metrics') +const { AUTOBASE_NAMESPACE } = require('../../shared/constants') +const { createTempStorage } = require('../helpers/test-utils') + +const noopLogger = { + info () {}, + debug () {}, + error () {}, + warn () {} +} + +async function createServiceWithMetrics (t, opts = {}) { + const basePath = await createTempStorage(t) + const store = new Corestore(basePath) + await store.ready() + + const swarm = new Hyperswarm({ bootstrap: [] }) + const config = new RegistryConfig({ logger: noopLogger }) + + const service = new RegistryService( + store.namespace(AUTOBASE_NAMESPACE), + swarm, + config, + { + logger: noopLogger, + ackInterval: 5, + skipStorageCheck: true + } + ) + + await service.ready() + + // Fresh registry per test to avoid metric name collisions + const registry = new promClient.Registry() + promClient.register.clear() + + const qvacMetrics = new QvacMetrics(service, { logger: noopLogger }) + service.metrics = qvacMetrics + + const port = opts.port || 0 + const metricsServer = new MetricsServer(promClient.register, { + port, + logger: noopLogger + }) + await metricsServer.ready() + + const actualPort = metricsServer._server.address().port + + return { service, store, swarm, metricsServer, qvacMetrics, port: actualPort, registry } +} + +async function cleanup (ctx) { + if (ctx.metricsServer) await ctx.metricsServer.close().catch(() => {}) + if (ctx.service && ctx.service.opened) 
await ctx.service.close().catch(() => {}) + if (ctx.swarm) await ctx.swarm.destroy().catch(() => {}) + if (ctx.store) await ctx.store.close().catch(() => {}) + promClient.register.clear() +} + +function httpGet (port, path) { + return new Promise((resolve, reject) => { + const req = http.get(`http://127.0.0.1:${port}${path}`, (res) => { + let body = '' + res.on('data', (chunk) => { body += chunk }) + res.on('end', () => resolve({ status: res.statusCode, body, headers: res.headers })) + }) + req.on('error', reject) + }) +} + +test('MetricsServer serves Prometheus text at /metrics', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + const res = await httpGet(ctx.port, '/metrics') + t.is(res.status, 200, 'returns 200') + t.ok(res.headers['content-type'].includes('text/plain') || res.headers['content-type'].includes('openmetrics'), 'correct content type') + t.ok(res.body.length > 0, 'body is non-empty') + } finally { + await cleanup(ctx) + } +}) + +test('MetricsServer returns 404 for non-metrics paths', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + const res = await httpGet(ctx.port, '/health') + t.is(res.status, 404, 'returns 404') + } finally { + await cleanup(ctx) + } +}) + +test('/metrics includes QVAC custom gauges', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + const res = await httpGet(ctx.port, '/metrics') + const body = res.body + + t.ok(body.includes('qvac_registry_models_total'), 'has models_total') + t.ok(body.includes('qvac_registry_blob_cores_total'), 'has blob_cores_total') + t.ok(body.includes('qvac_registry_view_core_length'), 'has view_core_length') + t.ok(body.includes('qvac_registry_view_core_contiguous_length'), 'has view_core_contiguous_length') + t.ok(body.includes('qvac_registry_is_indexer'), 'has is_indexer') + t.ok(body.includes('qvac_registry_blind_peers_connected'), 'has blind_peers_connected') + t.ok(body.includes('qvac_registry_blind_peer_connected'), 'has 
blind_peer_connected') + t.ok(body.includes('qvac_registry_rpc_requests_total'), 'has rpc_requests_total') + t.ok(body.includes('qvac_registry_rpc_errors_total'), 'has rpc_errors_total') + } finally { + await cleanup(ctx) + } +}) + +test('RPC metrics counters increment', async (t) => { + const ctx = await createServiceWithMetrics(t) + + try { + ctx.qvacMetrics.recordRpcRequest('ping') + ctx.qvacMetrics.recordRpcRequest('ping') + ctx.qvacMetrics.recordRpcRequest('add-model') + ctx.qvacMetrics.recordRpcError('add-model') + + const res = await httpGet(ctx.port, '/metrics') + const body = res.body + + const pingLine = body.split('\n').find(l => l.includes('qvac_registry_rpc_requests_total') && l.includes('ping')) + t.ok(pingLine, 'has ping request counter line') + t.ok(pingLine.includes('2'), 'ping counter is 2') + + const errorLine = body.split('\n').find(l => l.includes('qvac_registry_rpc_errors_total') && l.includes('add-model')) + t.ok(errorLine, 'has add-model error counter line') + t.ok(errorLine.includes('1'), 'error counter is 1') + } finally { + await cleanup(ctx) + } +}) + +test('blind peer metrics track configured peers with active connections', async (t) => { + const blindPeerKeys = [ + IdEnc.normalize(Buffer.alloc(32, 1)), + IdEnc.normalize(Buffer.alloc(32, 2)) + ] + const ctx = await createServiceWithMetrics(t) + + try { + ctx.service.blindPeerKeys = blindPeerKeys + ctx.service._trackPeerConnection(blindPeerKeys[0]) + ctx.service._trackPeerConnection(blindPeerKeys[0]) + ctx.service._trackPeerConnection('writer-peer') + + let res = await httpGet(ctx.port, '/metrics') + let body = res.body + + const connectedPeerLine = body.split('\n') + .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[0]}"}`)) + t.ok(connectedPeerLine, 'has connected blind peer series') + t.ok(connectedPeerLine.endsWith(' 1'), 'connected blind peer is reported as 1') + + const disconnectedPeerLine = body.split('\n') + .find(line => 
line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[1]}"}`)) + t.ok(disconnectedPeerLine, 'has disconnected blind peer series') + t.ok(disconnectedPeerLine.endsWith(' 0'), 'disconnected blind peer is reported as 0') + + ctx.service._untrackPeerConnection(blindPeerKeys[0]) + ctx.service._untrackPeerConnection(blindPeerKeys[0]) + + res = await httpGet(ctx.port, '/metrics') + body = res.body + + const afterCloseCountLine = body.split('\n') + .find(line => line.startsWith('qvac_registry_blind_peers_connected ')) + t.ok(afterCloseCountLine.endsWith(' 0'), 'blind peer count drops after connection closes') + + const afterClosePeerLine = body.split('\n') + .find(line => line.startsWith(`qvac_registry_blind_peer_connected{peer_key="${blindPeerKeys[0]}"}`)) + t.ok(afterClosePeerLine.endsWith(' 0'), 'blind peer status drops after connection closes') + } finally { + await cleanup(ctx) + } +}) + +test('MetricsServer closes cleanly', async (t) => { + const ctx = await createServiceWithMetrics(t) + + await ctx.metricsServer.close() + ctx.metricsServer = null + + try { + await httpGet(ctx.port, '/metrics') + t.fail('should not connect after close') + } catch (err) { + t.ok(err.code === 'ECONNREFUSED', 'connection refused after close') + } finally { + await cleanup(ctx) + } +}) From a7fbfb9360926f24d5a6961c116234aa437fba00 Mon Sep 17 00:00:00 2001 From: yuranich Date: Mon, 20 Apr 2026 18:00:04 +0600 Subject: [PATCH 2/3] fix: restrict registry ping RPC to role and timestamp to avoid exposing operational data --- packages/qvac-lib-registry-server/README.md | 2 +- packages/qvac-lib-registry-server/lib/registry-service.js | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/packages/qvac-lib-registry-server/README.md b/packages/qvac-lib-registry-server/README.md index 25fe9e68a5..a24dd04871 100644 --- a/packages/qvac-lib-registry-server/README.md +++ b/packages/qvac-lib-registry-server/README.md @@ -302,7 +302,7 @@ Regenerate specs with 
`npm run build:spec` and restart the service. node scripts/check-peers.js [--key ] ``` -**`ping-server.js`**: Pings a running registry server via RPC to check availability and retrieve server status (role, view key, lengths, connected peers). +**`ping-server.js`**: Pings a running registry server via RPC to verify availability and confirm the connected peer is the indexer rather than a blind relay. Returns `role` and `timestamp` only — operational metrics (model count, view core lag, peer counts, etc.) are exposed via the Prometheus `/metrics` endpoint instead. ```bash node scripts/ping-server.js [--peer ] diff --git a/packages/qvac-lib-registry-server/lib/registry-service.js b/packages/qvac-lib-registry-server/lib/registry-service.js index b373aea697..74c30a27d0 100644 --- a/packages/qvac-lib-registry-server/lib/registry-service.js +++ b/packages/qvac-lib-registry-server/lib/registry-service.js @@ -618,13 +618,7 @@ class RegistryService extends ReadyResource { if (this.metrics) this.metrics.recordRpcRequest('ping') return { role: 'registry-server', - timestamp: Date.now(), - isIndexer: this.base?.isIndexer ?? false, - modelCount: this.view?.core?.length ?? 0, - peerCount: this.swarm?.connections?.size ?? 0, - blobCoresCount: this.blobsCores.size, - viewCoreLength: this.view?.core?.length ?? 0, - viewCoreContiguousLength: this.view?.core?.contiguousLength ?? 
0 + timestamp: Date.now() } }) From 234babf06a3ca2fef702229e0ab94ec0d0780be5 Mon Sep 17 00:00:00 2001 From: yuranich Date: Mon, 20 Apr 2026 19:53:32 +0600 Subject: [PATCH 3/3] fix: make metrics bind host configurable and move off port 9090 --- .../docs/DEPLOYMENT_GUIDE.md | 34 +++++++++++++--- .../ecosystem.config.js | 2 +- .../lib/metrics-server.js | 2 +- .../qvac-lib-registry-server/scripts/bin.js | 12 +++++- .../integration/metrics.integration.test.js | 39 +++++++++++++++++++ 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index a0fd9dafa9..0fa924a141 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -572,17 +572,38 @@ node scripts/bin.js run --storage ./corestore --metrics-port 0 | `qvac_registry_blob_core_byte_length` | Gauge | Byte length per blob core | | `qvac_registry_model_size_bytes` | Gauge | Size of each model blob (labeled by path, engine, quantization) | -**Prometheus scrape config:** +**Prometheus scrape config (local Prometheus, loopback bind):** ```yaml scrape_configs: - job_name: 'qvac-registry' scrape_interval: 30s static_configs: - - targets: ['127.0.0.1:9090'] + - targets: ['127.0.0.1:9210'] ``` -**Security:** The metrics endpoint binds to `127.0.0.1` by default. Only Prometheus scrapers on the same host or private network should reach the port. Do not expose to the public internet. +**Prometheus scrape config (central Prometheus scraping multiple registry VMs):** + +Run the registry with `--metrics-host 0.0.0.0` (or the private-network NIC address) so a remote Prometheus can reach the endpoint. Attach matching labels across jobs (`node-exporter`, `pm2-prometheus-exporter`, `qvac-registry`) so Grafana template variables work uniformly. 
+ +```yaml +scrape_configs: + - job_name: 'qvac-registry' + scrape_interval: 30s + static_configs: + - targets: [':9210'] + labels: + vm_name: '' + network: '' + zone: '' + - targets: [':9210'] + labels: + vm_name: '' + network: '' + zone: '' +``` + +**Security:** Port 9210 is chosen to avoid confusion with Prometheus's own port 9090 and to sit next to pm2-prometheus-exporter on 9209. The endpoint binds to `127.0.0.1` by default. When exposing on a private network via `--metrics-host`, restrict access with firewall rules, VPN/overlay network ACLs (WireGuard, Tailscale, Nebula), or a VPC security group. Do not expose to the public internet. ### Layer 2: hyper-health-check Sidecar @@ -612,7 +633,7 @@ The repository includes `ecosystem.config.js` for standardized PM2 process manag pm2 start ecosystem.config.js ``` -This starts both the registry server (with metrics on port 9090) and the health-check sidecar (on port 9091). +This starts both the registry server (with metrics on port 9210, loopback by default) and the health-check sidecar (on port 9091). For remote Prometheus scraping, edit the `args` field to add `--metrics-host ` and ensure the port is firewalled to trusted scrapers only. **Per-deployment customization:** Override `--core` flags for the health-check app via PM2 environment variables or by editing the `args` field. @@ -637,7 +658,7 @@ Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashbo **Import the baseline dashboard:** -1. Add Prometheus as a data source in Grafana (URL: `http://127.0.0.1:9090`) +1. Add Prometheus as a data source in Grafana (URL of the Prometheus server itself, e.g. `http://prometheus-vm:9090`) 2. Import dashboard ID `22313` 3. 
Add custom panels for QVAC metrics @@ -664,7 +685,8 @@ Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashbo | `node scripts/bin.js run --storage ` | Start a writer | | `node scripts/bin.js run --bootstrap ` | Join existing cluster | | `node scripts/bin.js run --blind-peers ` | Enable blind peer replication | -| `node scripts/bin.js run --metrics-port ` | Prometheus metrics port (default: 9090, 0 to disable) | +| `node scripts/bin.js run --metrics-port ` | Prometheus metrics port (default: 9210, 0 to disable) | +| `node scripts/bin.js run --metrics-host ` | Prometheus metrics bind address (default: 127.0.0.1; use 0.0.0.0 or a private NIC IP to expose) | | `node scripts/bin.js run --skip-storage-check` | Bypass storage/bootstrap key mismatch check | | `node scripts/bin.js init-writer --storage ` | Initialize/authorize a writer client | | `node scripts/bin.js sync-models --file ` | Sync models from JSON config | diff --git a/packages/qvac-lib-registry-server/ecosystem.config.js b/packages/qvac-lib-registry-server/ecosystem.config.js index af22b6054e..526cdd6f0d 100644 --- a/packages/qvac-lib-registry-server/ecosystem.config.js +++ b/packages/qvac-lib-registry-server/ecosystem.config.js @@ -5,7 +5,7 @@ module.exports = { { name: 'registry', script: 'scripts/bin.js', - args: 'run --storage ./corestore --metrics-port 9090', + args: 'run --storage ./corestore --metrics-port 9210', env: { NODE_ENV: 'production' } diff --git a/packages/qvac-lib-registry-server/lib/metrics-server.js b/packages/qvac-lib-registry-server/lib/metrics-server.js index ff86e2756e..a01a6b3302 100644 --- a/packages/qvac-lib-registry-server/lib/metrics-server.js +++ b/packages/qvac-lib-registry-server/lib/metrics-server.js @@ -3,7 +3,7 @@ const http = require('http') const ReadyResource = require('ready-resource') -const DEFAULT_PORT = 9090 +const DEFAULT_PORT = 9210 const DEFAULT_HOST = '127.0.0.1' class MetricsServer extends ReadyResource { diff --git 
a/packages/qvac-lib-registry-server/scripts/bin.js b/packages/qvac-lib-registry-server/scripts/bin.js index 78954e67ce..c9cf4069d2 100644 --- a/packages/qvac-lib-registry-server/scripts/bin.js +++ b/packages/qvac-lib-registry-server/scripts/bin.js @@ -21,7 +21,8 @@ const HyperswarmStats = require('hyperswarm-stats') const promClient = require('prom-client') const { AUTOBASE_NAMESPACE } = require('@qvac/registry-schema') -const DEFAULT_METRICS_PORT = 9090 +const DEFAULT_METRICS_PORT = 9210 +const DEFAULT_METRICS_HOST = '127.0.0.1' const DEFAULT_STORAGE = './corestore' const DEFAULT_WRITER_STORAGE = './writer-storage' @@ -44,6 +45,7 @@ const runCmd = command('run', flag('--compaction-interval [ms]', `Periodic RocksDB compaction interval in ms (default: ${DEFAULT_COMPACTION_INTERVAL_MS}, 0 to disable)`), flag('--skip-storage-check', 'Skip storage/bootstrap key mismatch check (use when joining existing cluster with fresh storage)'), flag('--metrics-port [port]', `Prometheus metrics HTTP port (default: ${DEFAULT_METRICS_PORT}, 0 to disable)`), + flag('--metrics-host [host]', `Prometheus metrics HTTP bind address (default: ${DEFAULT_METRICS_HOST}; use 0.0.0.0 to expose on all interfaces)`), async function ({ flags }) { const logger = createLogger() @@ -98,6 +100,12 @@ const runCmd = command('run', ? 
parseInt(flags.metricsPort, 10) : DEFAULT_METRICS_PORT + if (Number.isNaN(metricsPort) || metricsPort < 0) { + throw new Error('--metrics-port must be a non-negative integer (0 to disable)') + } + + const metricsHost = flags.metricsHost || DEFAULT_METRICS_HOST + let metricsServer = null if (metricsPort > 0) { const qvacMetrics = new QvacMetrics(service, { logger }) @@ -106,7 +114,7 @@ const runCmd = command('run', HypercoreStats.fromCorestore(store).registerPrometheusMetrics(promClient) new HyperswarmStats(swarm).registerPrometheusMetrics(promClient) // eslint-disable-line no-new - metricsServer = new MetricsServer(promClient.register, { port: metricsPort, logger }) + metricsServer = new MetricsServer(promClient.register, { port: metricsPort, host: metricsHost, logger }) await metricsServer.ready() } diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js index e2c5b99005..1386e2ee73 100644 --- a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ -208,3 +208,42 @@ test('MetricsServer closes cleanly', async (t) => { await cleanup(ctx) } }) + +test('MetricsServer binds to custom host', async (t) => { + const basePath = await createTempStorage(t) + const store = new Corestore(basePath) + await store.ready() + + const swarm = new Hyperswarm({ bootstrap: [] }) + const config = new RegistryConfig({ logger: noopLogger }) + + const service = new RegistryService( + store.namespace(AUTOBASE_NAMESPACE), + swarm, + config, + { logger: noopLogger, ackInterval: 5, skipStorageCheck: true } + ) + await service.ready() + + promClient.register.clear() + + const metricsServer = new MetricsServer(promClient.register, { + port: 0, + host: '127.0.0.1', + logger: noopLogger + }) + + const ctx = { service, store, swarm, metricsServer } + + try { + await 
metricsServer.ready() + + const address = metricsServer._server.address() + t.is(address.address, '127.0.0.1', 'bound to requested host') + + const res = await httpGet(address.port, '/metrics') + t.is(res.status, 200, 'reachable on requested host') + } finally { + await cleanup(ctx) + } +})