diff --git a/packages/qvac-lib-registry-server/README.md b/packages/qvac-lib-registry-server/README.md index 25fe9e68a5..a24dd04871 100644 --- a/packages/qvac-lib-registry-server/README.md +++ b/packages/qvac-lib-registry-server/README.md @@ -302,7 +302,7 @@ Regenerate specs with `npm run build:spec` and restart the service. node scripts/check-peers.js [--key ] ``` -**`ping-server.js`**: Pings a running registry server via RPC to check availability and retrieve server status (role, view key, lengths, connected peers). +**`ping-server.js`**: Pings a running registry server via RPC to verify availability and confirm the connected peer is the indexer rather than a blind relay. Returns `role` and `timestamp` only — operational metrics (model count, view core lag, peer counts, etc.) are exposed via the Prometheus `/metrics` endpoint instead. ```bash node scripts/ping-server.js [--peer ] diff --git a/packages/qvac-lib-registry-server/client/lib/client.js b/packages/qvac-lib-registry-server/client/lib/client.js index 6acac3e751..27e8e44cec 100644 --- a/packages/qvac-lib-registry-server/client/lib/client.js +++ b/packages/qvac-lib-registry-server/client/lib/client.js @@ -56,6 +56,7 @@ class QVACRegistryClient extends ReadyResource { this.hyperswarm.on('connection', this._connectionHandler) this._metadataReady = this._connectMetadataCore() + await this._metadataReady } async _connectMetadataCore () { diff --git a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md index 079f3aedb7..a3946fb151 100644 --- a/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md +++ b/packages/qvac-lib-registry-server/docs/DEPLOYMENT_GUIDE.md @@ -531,6 +531,149 @@ node scripts/bin.js run --storage ./new-writer --bootstrap --skip-storage- | Admin command retries | May need 1-2 retries | Usually works first try | | Writer coordination | Manual timing recommended | Automated/scripted works | +## Monitoring + +Four layers of 
operational visibility, each independently deployable. + +### Layer 1: In-Process Prometheus /metrics Endpoint + +The registry server exposes Prometheus metrics via an HTTP endpoint bound to `127.0.0.1`. + +**Start with metrics enabled (default port 9210):** + +```bash +node scripts/bin.js run --storage ./corestore --metrics-port 9210 +``` + +**Or disable metrics:** + +```bash +node scripts/bin.js run --storage ./corestore --metrics-port 0 +``` + +**What is exposed:** + +- **Holepunch P2P metrics** (via `hypercore-stats`, `hyperswarm-stats`): aggregate core stats, swarm connections, DHT activity, UDX bytes/packets. Per-core labeled metrics are not exposed — `hypermetrics` is abandoned and incompatible with Hypercore v11, so per-core visibility is provided by the QVAC-specific `qvac_registry_blob_core_*` / `qvac_registry_view_core_*` gauges below. +- **QVAC-specific metrics:** + +| Metric | Type | Description | +|--------|------|-------------| +| `qvac_registry_model_count` | Gauge | Number of models in the registry (refreshed every 5 min and on local writes) | +| `qvac_registry_total_blob_bytes` | Gauge | Sum of `blobBinding.byteLength` across every model record in the view | +| `qvac_registry_totals_refreshed_age_seconds` | Gauge | Seconds since `total_blob_bytes` / `model_count` were last recomputed (-1 if never) | +| `qvac_registry_blob_core_count` | Gauge | Number of blob cores opened locally on this node | +| `qvac_registry_blob_core_peers` | Gauge | Peers connected to this node's local blob core (may be partial replicas) | +| `qvac_registry_blob_core_seeders` | Gauge | Peers holding this node's local blob core fully and uploading (full replicas) | +| `qvac_registry_blob_core_length` | Gauge | This node's local blob core length in blocks | +| `qvac_registry_blob_core_contiguous_length` | Gauge | Blob core contiguous length in blocks (gap indicates missing blocks on disk) | +| `qvac_registry_blob_core_byte_length` | Gauge | Byte length of this node's local blob core | +| 
`qvac_registry_view_core_length` | Gauge | View core length (total blocks) | +| `qvac_registry_view_core_contiguous_length` | Gauge | View core contiguous length (gap indicates replication lag) | +| `qvac_registry_view_core_seeders` | Gauge | Peers holding the view core fully and willing to upload (full replicas in the swarm) | +| `qvac_registry_rpc_requests_total` | Counter | RPC requests by method | +| `qvac_registry_rpc_errors_total` | Counter | RPC errors by method | +| `qvac_registry_is_indexer` | Gauge | Whether this node is an indexer | + +`qvac_registry_total_blob_bytes` is derived from the view, not from the on-disk blob cores, so it reports the logical registry size consistently on every node (indexers that do not store blobs locally still report the same value). + +`qvac_registry_blob_core_*` metrics are populated on writer/indexer nodes — the blob core is opened eagerly at startup. Reader-only nodes that don't hold writer state do not open the blob core locally and will export `0` for these gauges. Each indexer owns exactly one writable blob core namespaced to its own primary key, so these metrics are single-series per node; Prometheus's automatic `instance` label distinguishes nodes at scrape time. + +**Multi-indexer dashboards:** view-derived metrics (`qvac_registry_model_count`, `qvac_registry_total_blob_bytes`, `qvac_registry_totals_refreshed_age_seconds`) report the same value on every indexer because the view is authoritative and identical cluster-wide. For single-stat panels use `quantile(0.5, …)` or `avg(…)` to collapse to one value without triple-counting. On-disk metrics (`qvac_registry_blob_core_byte_length`, `qvac_registry_blob_core_peers`, `qvac_registry_blob_core_seeders`) are per-node and should be displayed per `instance` or summed for cluster totals. + +`*_seeders` count peers whose replication handshake has completed, who advertise `remoteUploading`, and whose `remoteContiguousLength` covers the local core length. 
For the view core they converge to the number of connected replicating peers within an RTT because the view is small (a few MB of autobase metadata); for blob cores the gap `peers - seeders` indicates peers currently downloading rather than serving. + +**Prometheus scrape config (local Prometheus, loopback bind):** + +```yaml +scrape_configs: + - job_name: 'qvac-registry' + scrape_interval: 30s + static_configs: + - targets: ['127.0.0.1:9210'] +``` + +**Prometheus scrape config (central Prometheus scraping multiple registry VMs):** + +Run the registry with `--metrics-host 0.0.0.0` (or the private-network NIC address) so a remote Prometheus can reach the endpoint. Attach matching labels across jobs (`node-exporter`, `pm2-prometheus-exporter`, `qvac-registry`) so Grafana template variables work uniformly. + +```yaml +scrape_configs: + - job_name: 'qvac-registry' + scrape_interval: 30s + static_configs: + - targets: ['<vm-1-private-ip>:9210'] + labels: + vm_name: '<vm-1-name>' + network: '<network>' + zone: '<zone>' + - targets: ['<vm-2-private-ip>:9210'] + labels: + vm_name: '<vm-2-name>' + network: '<network>' + zone: '<zone>' +``` + +**Security:** Port 9210 is chosen to avoid confusion with Prometheus's own port 9090 and to sit next to pm2-prometheus-exporter on 9209. The endpoint binds to `127.0.0.1` by default. When exposing on a private network via `--metrics-host`, restrict access with firewall rules, VPN/overlay network ACLs (WireGuard, Tailscale, Nebula), or a VPC security group. Do not expose to the public internet. + +### Layer 2: hyper-health-check Sidecar + +Run [hyper-health-check](https://github.com/holepunchto/hyper-health-check) as a separate PM2 process to independently verify that cores are discoverable and downloadable from the swarm. The server might report healthy internals while peers cannot actually reach it. 
+ +```bash +pm2 start node_modules/.bin/hyper-health-check -- run \ + --core <view-core-key>:registry-view \ + --core <blob-core-key>:blob-models \ + --port 9091 \ + --grace-period 600000 +``` + +The 10-minute grace period accommodates replication lag after model additions — blind peers need time to download multi-GB blobs before being flagged as unhealthy. + +**Exposed metrics (on port 9091):** + +- `hyper_health_peers_total` — peers swarming each core +- `hyper_health_peers_with_all_data_total` — peers with full replication +- `hyper_health_ips_with_all_data_total` — unique IPs with full data (geographic diversity) + +### Layer 3: PM2 Ecosystem Config + +The repository includes `ecosystem.config.js` for standardized PM2 process management: + +```bash +pm2 start ecosystem.config.js +``` + +This starts both the registry server (with metrics on port 9210, loopback by default) and the health-check sidecar (on port 9091). For remote Prometheus scraping, edit the `args` field to add `--metrics-host <private-ip>` and ensure the port is firewalled to trusted scrapers only. + +**Per-deployment customization:** Override `--core` flags for the health-check app via PM2 environment variables or by editing the `args` field. + +**Process-level metrics:** Install `pm2-prometheus-exporter` for CPU, memory, heap, event loop latency, restarts, and uptime metrics: + +```bash +pm2 install pm2-prometheus-exporter +``` + +This exposes process metrics on `localhost:9209` alongside the application-level metrics from Layers 1 and 2. + +### Layer 4: Grafana Dashboard + +Use Holepunch's pre-built [Grafana dashboard](https://grafana.com/grafana/dashboards/22313-hypercore-hyperswarm/) (ID: 22313) as a baseline. It includes panels for Hypercore, Hyperswarm, HyperDHT, UDX, and Node.js process stats. 
+ +**Add QVAC-specific panels for:** + +- **Model availability:** `qvac_registry_model_count`, `hyper_health_peers_with_all_data_total` +- **Storage:** `qvac_registry_total_blob_bytes` (view-derived logical size), `sum(qvac_registry_blob_core_byte_length)` (on-disk per node) +- **Replication durability:** `qvac_registry_view_core_seeders`, `qvac_registry_blob_core_seeders` — alert when either drops below a redundancy floor (e.g. `< 2`). Gap between `blob_core_peers` and `blob_core_seeders` surfaces peers mid-download. +- **RPC activity:** `rate(qvac_registry_rpc_requests_total[5m])`, error ratio +- **Cluster health:** `qvac_registry_is_indexer` across nodes, `qvac_registry_view_core_length` vs `qvac_registry_view_core_contiguous_length` +- **Metric freshness:** `qvac_registry_totals_refreshed_age_seconds` — alert if it exceeds 15 minutes (background refresh runs every 5) + +**Import the baseline dashboard:** + +1. Add Prometheus as a data source in Grafana (URL of the Prometheus server itself, e.g. `http://prometheus-vm:9090`) +2. Import dashboard ID `22313` +3. 
Add custom panels for QVAC metrics + ## Reference ### Environment Variables @@ -554,6 +697,8 @@ node scripts/bin.js run --storage ./new-writer --bootstrap --skip-storage- | `node scripts/bin.js run --storage ` | Start a writer | | `node scripts/bin.js run --bootstrap ` | Join existing cluster | | `node scripts/bin.js run --blind-peers ` | Enable blind peer replication | +| `node scripts/bin.js run --metrics-port ` | Prometheus metrics port (default: 9210, 0 to disable) | +| `node scripts/bin.js run --metrics-host ` | Prometheus metrics bind address (default: 127.0.0.1; use 0.0.0.0 or a private NIC IP to expose) | | `node scripts/bin.js run --skip-storage-check` | Bypass storage/bootstrap key mismatch check | | `node scripts/bin.js init-writer --storage ` | Initialize/authorize a writer client | | `node scripts/bin.js sync-models --file ` | Sync models from JSON config | diff --git a/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json new file mode 100644 index 0000000000..9af1254c6c --- /dev/null +++ b/packages/qvac-lib-registry-server/docs/grafana/REGISTRY_DASHBOARD.json @@ -0,0 +1,3285 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + 
"h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_up{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Process Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 3600 + }, + { + "color": "green", + "value": 86400 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_uptime{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_restarts{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Restarts", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_memory{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry Memory Usage", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "pm2_cpu{name=\"registry\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Registry CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "100 - (avg by (vm_name) (rate(node_cpu_seconds_total{mode=\"idle\", vm_name=~\"$vm\"}[5m])) * 100)", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "VM CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 
80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "(1 - (node_memory_MemAvailable_bytes{vm_name=~\"$vm\"} / node_memory_MemTotal_bytes{vm_name=~\"$vm\"})) * 100", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "VM Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{mountpoint=\"/\", vm_name=~\"$vm\"} / node_filesystem_size_bytes{mountpoint=\"/\", vm_name=~\"$vm\"}) * 100)", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Disk Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 5000000000 + }, + { + "color": "green", + "value": 20000000000 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "node_filesystem_avail_bytes{mountpoint=\"/\", vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Disk Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": 
"rate(node_network_receive_bytes_total{device!~\"lo|tailscale.*\", vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "A" + }, + { + "expr": "-rate(node_network_transmit_bytes_total{device!~\"lo|tailscale.*\", vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} read", + "refId": "A" + }, + { + "expr": "-rate(node_disk_written_bytes_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} write", + "refId": "B" + } + ], + "title": "Disk I/O", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "node_load15{vm_name=~\"$vm\"} / count without (cpu, mode) (node_cpu_seconds_total{mode=\"idle\", vm_name=~\"$vm\"})", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 15, + "panels": [], + "title": "QVAC Registry Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 37 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "quantile(0.5, qvac_registry_model_count{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Model Files", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + }, + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 37 + }, + "id": 17, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "vertical", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value_and_name", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_is_indexer{vm_name=~\"$vm\"}", + "legendFormat": "{{vm_name}}", + "refId": "A" + } + ], + "title": "Indexer", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 37 + }, + "id": 19, + "options": { + "colorMode": "value", + 
"graphMode": "none", + "justifyMode": "auto", + "orientation": "vertical", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value_and_name", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_count{vm_name=~\"$vm\"}", + "legendFormat": "{{vm_name}}", + "refId": "A" + } + ], + "title": "Blob Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 37 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "quantile(0.5, qvac_registry_total_blob_bytes{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Blob Storage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 41 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_view_core_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "View Core Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 41 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + 
"sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_view_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "View Core Contiguous Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 1000000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 41 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_view_core_length{vm_name=~\"$vm\"} - qvac_registry_view_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "View Core Gap (length - contiguous)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": 
{ + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_peers{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Peers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(qvac_registry_rpc_requests_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "RPC Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": 
"single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(qvac_registry_rpc_errors_total{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "RPC Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 110 + }, + "id": 13, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "{job=\"registry\"}", + "refId": "A" + } + ], + "title": "Registry Logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 120 + }, + "id": 14, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "{job=\"registry\", level=\"error\"}", + "refId": "A" + } + ], + "title": "Registry Error Logs", + "type": "logs" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "-1": { + "color": "red", + "text": "never" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 300 + }, + { + "color": "red", + "value": 600 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 37 + }, + "id": 26, + "options": 
{ + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "max(qvac_registry_totals_refreshed_age_seconds{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Totals Refresh Age", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 49 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Length", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 49 + }, + "id": 43, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Contiguous Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 1000000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 49 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_length{vm_name=~\"$vm\"} - qvac_registry_blob_core_contiguous_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Gap (length - contiguous)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_seeders{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Seeders (full replicas)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "qvac_registry_blob_core_byte_length{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Blob Core Bytes", + "type": "timeseries" + }, + { + "collapsed": 
false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 81 + }, + "id": 30, + "panels": [], + "title": "Holepunch P2P Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "green", + "value": 3 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 82 + }, + "id": 31, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(hyperswarm_nr_peers{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Swarm Peers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 82 + }, + "id": 33, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(hypercore_invalid_data{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Hypercore Invalid Data", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, 
+ "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 82 + }, + "id": 34, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(hypercore_invalid_requests{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Hypercore Invalid Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 82 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "sum(udx_packets_dropped_total{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "UDX Packet Drops", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + 
}, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 82 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "avg(hyperswarm_avg_congestion_window{vm_name=~\"$vm\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Avg Congestion Window", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "hyperswarm_nr_peers{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Swarm 
Peers Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "hypercore_round_trip_time_avg_seconds{vm_name=~\"$vm\"}", + "legendFormat": "{{ vm_name }}", + "refId": "A" + } + ], + "title": "Replication RTT", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 94 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(hyperswarm_client_connections_opened{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} client-opened", + "refId": "A" + }, + { + "expr": "rate(hyperswarm_server_connections_opened{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} server-opened", + "refId": "B" + }, + { + "expr": "rate(hyperswarm_client_connections_closed{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} client-closed", + "refId": "C" + }, + { + "expr": "rate(hyperswarm_server_connections_closed{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} server-closed", + "refId": "D" + } + ], + "title": "Swarm Connection Churn (per-second)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, 
+ "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 94 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(hypercore_total_wire_data_transmitted{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "A" + }, + { + "expr": "rate(hypercore_total_wire_data_received{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "B" + } + ], + "title": "Hypercore Wire Data Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "Bps" + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 102 + }, + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(udx_total_bytes_transmitted{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} tx", + "refId": "A" + }, + { + "expr": "rate(udx_total_bytes_received{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} rx", + "refId": "B" + } + ], + "title": "UDX Bytes (DHT transport)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 102 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.0", + "targets": [ + { + "expr": "rate(dht_total_queries{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ 
vm_name }} queries", + "refId": "A" + }, + { + "expr": "rate(dht_total_requests{vm_name=~\"$vm\"}[5m])", + "legendFormat": "{{ vm_name }} requests", + "refId": "B" + } + ], + "title": "DHT Query & Request Rate", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [ + "registry", + "pm2", + "qvac" + ], + "templating": { + "list": [ + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus" + }, + "includeAll": true, + "multi": true, + "name": "vm", + "options": [], + "query": "label_values(pm2_up{name=\"registry\"}, vm_name)", + "refresh": 1, + "regexApplyTo": "value", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "QVAC Registry Overview", + "uid": "f43aa479-9d22-4b3b-baa6-f527a2615981", + "version": 1, + "weekStart": "" +} diff --git a/packages/qvac-lib-registry-server/ecosystem.config.js b/packages/qvac-lib-registry-server/ecosystem.config.js new file mode 100644 index 0000000000..526cdd6f0d --- /dev/null +++ b/packages/qvac-lib-registry-server/ecosystem.config.js @@ -0,0 +1,22 @@ +'use strict' + +module.exports = { + apps: [ + { + name: 'registry', + script: 'scripts/bin.js', + args: 'run --storage ./corestore --metrics-port 9210', + env: { + NODE_ENV: 'production' + } + }, + { + name: 'health-check', + script: 'node_modules/.bin/hyper-health-check', + args: 'run --port 9091 --grace-period 600000', + env: { + NODE_ENV: 'production' + } + } + ] +} diff --git a/packages/qvac-lib-registry-server/lib/metrics-server.js b/packages/qvac-lib-registry-server/lib/metrics-server.js new file mode 100644 index 0000000000..a01a6b3302 --- /dev/null +++ b/packages/qvac-lib-registry-server/lib/metrics-server.js @@ -0,0 +1,64 @@ +'use strict' + +const http = require('http') +const ReadyResource = require('ready-resource') + +const DEFAULT_PORT = 9210 +const DEFAULT_HOST = '127.0.0.1' + +class 
/**
 * Minimal HTTP server that exposes a prom-client registry at `GET /metrics`.
 *
 * Lifecycle follows ReadyResource: `ready()` starts listening (`_open`) and
 * `close()` stops the listener (`_close`). Any other path or method gets 404.
 */
class MetricsServer extends ReadyResource {
  /**
   * @param {object} promRegister - Object exposing `metrics()` and `contentType`
   *   (for example `promClient.register`).
   * @param {object} [opts]
   * @param {number} [opts.port] - TCP port; defaults to DEFAULT_PORT. `0` asks
   *   the OS for an ephemeral port.
   * @param {string} [opts.host] - Bind address; defaults to DEFAULT_HOST.
   * @param {object} [opts.logger] - Logger with `info` / `error`; defaults to `console`.
   */
  constructor (promRegister, opts = {}) {
    super()

    this._register = promRegister
    // `??` instead of `||`: an explicit port 0 (ephemeral bind) must not be
    // silently replaced by the default port.
    this._port = opts.port ?? DEFAULT_PORT
    this._host = opts.host || DEFAULT_HOST
    this._logger = opts.logger || console
    this._server = null
  }

  async _open () {
    const server = http.createServer((req, res) => this._handleRequest(req, res))
    this._server = server

    await new Promise((resolve, reject) => {
      // Only startup failures (e.g. EADDRINUSE) should reject `ready()`.
      const onStartupError = (err) => reject(err)
      server.once('error', onStartupError)

      server.listen(this._port, this._host, () => {
        server.removeListener('error', onStartupError)
        // Keep a listener attached so later server errors are logged instead
        // of being swallowed by an already-settled promise.
        server.on('error', (err) => {
          this._logger.error({ err }, 'MetricsServer: server error')
        })

        // Report the port actually bound, which differs from the requested
        // one when the caller asked for port 0.
        const bound = server.address()
        const port = bound && typeof bound === 'object' ? bound.port : this._port

        this._logger.info({
          host: this._host,
          port
        }, 'MetricsServer: listening')
        resolve()
      })
    })
  }

  // Serves one request: the registry dump for `GET /metrics`, 404 otherwise.
  async _handleRequest (req, res) {
    // Ignore any query string so `/metrics?foo=bar` is still served.
    const path = (req.url || '').split('?', 1)[0]

    if (path !== '/metrics' || req.method !== 'GET') {
      res.writeHead(404)
      res.end('Not Found')
      return
    }

    try {
      const metrics = await this._register.metrics()
      res.writeHead(200, { 'Content-Type': this._register.contentType })
      res.end(metrics)
    } catch (err) {
      this._logger.error({ err }, 'MetricsServer: failed to collect metrics')
      res.writeHead(500)
      res.end('Internal Server Error')
    }
  }

  async _close () {
    if (!this._server) return

    await new Promise((resolve) => {
      this._server.close(() => {
        this._logger.info('MetricsServer: closed')
        resolve()
      })
    })
    this._server = null
  }
}
// RPC methods whose counter series are created at zero up front, so the
// series exist from the very first scrape.
const RPC_METHODS = Object.freeze([
  'add-model',
  'put-license',
  'update-model-metadata',
  'delete-model',
  'ping'
])

/**
 * Registers the QVAC-specific Prometheus metrics for a registry service.
 *
 * Two labelled counters track RPC requests and errors; the gauges read their
 * value from the service at scrape time through prom-client's `collect` hook.
 */
class QvacMetrics {
  /**
   * @param {object} service - Object the gauges read from at scrape time
   *   (`modelCount`, `totalModelBytes`, `totalsRefreshedAt`, `blobsCores`,
   *   `view`, `base`).
   * @param {object} [opts]
   * @param {object} [opts.logger] - Logger; defaults to `console`.
   * @param {object} [opts.register] - prom-client registry to register the
   *   metrics on. When omitted, prom-client's default (global) registry is
   *   used, exactly as before.
   */
  constructor (service, opts = {}) {
    this._service = service
    this._logger = opts.logger || console
    // Only pass `registers` when a registry was supplied, so the default
    // path hands prom-client the same configuration it always received.
    this._registryOpts = opts.register ? { registers: [opts.register] } : {}

    this._rpcRequests = new promClient.Counter({
      name: 'qvac_registry_rpc_requests_total',
      help: 'Total RPC requests by method',
      labelNames: ['method'],
      ...this._registryOpts
    })

    this._rpcErrors = new promClient.Counter({
      name: 'qvac_registry_rpc_errors_total',
      help: 'Total RPC errors by method',
      labelNames: ['method'],
      ...this._registryOpts
    })

    for (const method of RPC_METHODS) {
      this._rpcRequests.inc({ method }, 0)
      this._rpcErrors.inc({ method }, 0)
    }

    this._registerGauges()
  }

  /** Increment the request counter for `method`. */
  recordRpcRequest (method) {
    this._rpcRequests.inc({ method })
  }

  /** Increment the error counter for `method`. */
  recordRpcError (method) {
    this._rpcErrors.inc({ method })
  }

  // Registers every gauge. Each `read` runs at scrape time; the arrows close
  // over this QvacMetrics instance, while `collect` is invoked by prom-client
  // with the gauge itself as `this`.
  _registerGauges () {
    const service = () => this._service
    const viewCore = () => service().view?.core
    const blobCore = () => firstBlobCore(service())

    // Order is preserved from the original registrations, since it determines
    // the order of the families in the /metrics output.
    const gauges = [
      {
        name: 'qvac_registry_model_count',
        help: 'Number of models in the registry',
        read: () => service().modelCount
      },
      {
        name: 'qvac_registry_total_blob_bytes',
        help: 'Total bytes across all model blobs (sum of blobBinding.byteLength across view records)',
        read: () => service().totalModelBytes
      },
      {
        // Exposes staleness of the two totals above so a stalled refresh is
        // visible to operators.
        name: 'qvac_registry_totals_refreshed_age_seconds',
        help: 'Seconds since qvac_registry_total_blob_bytes and qvac_registry_model_count were last recomputed (-1 if never)',
        read: () => {
          const ts = service().totalsRefreshedAt
          return ts ? (Date.now() - ts) / 1000 : -1
        }
      },
      {
        name: 'qvac_registry_blob_core_count',
        help: 'Number of blob cores opened locally on this node',
        read: () => service().blobsCores?.size ?? 0
      },
      // Per-node blob-core gauges are single-series: they describe the first
      // (expected to be the only) locally opened blob core and fall back to 0
      // when none is open.
      {
        name: 'qvac_registry_blob_core_peers',
        help: 'Number of peers connected to this node\'s local blob core (may be partial replicas)',
        read: () => blobCore()?.peers?.length ?? 0
      },
      {
        name: 'qvac_registry_blob_core_length',
        help: 'This node\'s local blob core length (total blocks)',
        read: () => blobCore()?.length ?? 0
      },
      {
        name: 'qvac_registry_blob_core_contiguous_length',
        help: 'This node\'s local blob core contiguous length (gap = length - contiguous indicates missing blocks on disk)',
        read: () => blobCore()?.contiguousLength ?? 0
      },
      {
        name: 'qvac_registry_view_core_length',
        help: 'View core length (total blocks)',
        read: () => viewCore()?.length ?? 0
      },
      {
        name: 'qvac_registry_view_core_contiguous_length',
        help: 'View core contiguous length (gap = length - contiguous indicates replication lag)',
        read: () => viewCore()?.contiguousLength ?? 0
      },
      {
        name: 'qvac_registry_view_core_seeders',
        help: 'Peers that hold the view core fully and are willing to upload (full replicas available in the swarm)',
        read: () => countSeeders(viewCore())
      },
      {
        name: 'qvac_registry_is_indexer',
        help: 'Whether this node is an indexer (1=yes, 0=no)',
        read: () => (service().base?.isIndexer ? 1 : 0)
      },
      {
        name: 'qvac_registry_blob_core_byte_length',
        help: 'Byte length of this node\'s local blob core (only populated on nodes that opened the blob core locally)',
        read: () => blobCore()?.byteLength ?? 0
      },
      {
        name: 'qvac_registry_blob_core_seeders',
        help: 'Peers holding this node\'s local blob core fully and willing to upload (full replicas)',
        read: () => countSeeders(blobCore())
      }
    ]

    for (const { name, help, read } of gauges) {
      registerGauge({
        name,
        help,
        collect () {
          this.set(read())
        },
        ...this._registryOpts
      })
    }
  }
}

/**
 * Returns the `core` of the first entry in `service.blobsCores`, or null when
 * there is no entry (or no map, or the entry has no core yet). Returning null
 * instead of throwing keeps a single gauge from failing the whole scrape.
 */
function firstBlobCore (service) {
  const cores = service?.blobsCores
  if (!cores || typeof cores.values !== 'function') return null

  const first = cores.values().next()
  return first.done ? null : (first.value?.core ?? null)
}

// Thin wrapper around the prom-client Gauge constructor.
function registerGauge (opts) {
  return new promClient.Gauge(opts)
}
/**
 * Counts the peers of `core` that qualify as seeders (full replicas).
 *
 * A peer qualifies when all three hold: its `remoteOpened` flag is set, its
 * `remoteUploading` flag is set, and its `remoteContiguousLength` is at least
 * the core's current `length`.
 *
 * @param {object|null|undefined} core - Object with `peers` (array) and `length`.
 * @returns {number} Number of qualifying peers; 0 when `core` is missing, has
 *   no `peers` array, or has length 0.
 */
function countSeeders (core) {
  const peers = core?.peers
  if (!Array.isArray(peers)) return 0

  const target = core.length
  if (target === 0) return 0

  const isSeeder = (peer) =>
    peer.remoteOpened && peer.remoteUploading && peer.remoteContiguousLength >= target

  return peers.filter(isSeeder).length
}
IdEnc.normalize(peerInfo.publicKey) : null - this.logger.info({ peer: peerKey }, 'Swarm connection opened') - conn.on('close', () => this.logger.info({ peer: peerKey }, 'Swarm connection closed')) + this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection opened') + conn.on('close', () => { + this.logger.info({ peer: peerKey || 'unknown' }, 'Swarm connection closed') + }) this._setupRpc(conn) @@ -248,16 +254,42 @@ class RegistryService extends ReadyResource { await this._setupBlindPeering() } + // Eagerly open the blob core on writer/indexer nodes so that per-core + // metrics (peers, seeders, byte length) and on-disk replication health + // are observable immediately, without waiting for the first addModel RPC + // or for blind-peering setup to run. On reader-only nodes we skip this, + // because `writable: true` would create a local core with the wrong key. + if (this.base.isIndexer || this.base.localWriter) { + try { + await this._getOrCreateBlobsCore(BLOB_CORE_NAME) + } catch (err) { + this.logger.warn({ err: err.message }, 'RegistryService: failed to open blob core at startup') + } + } + if (this.compactionIntervalMs > 0) { this._startCompactionInterval() } + await this._refreshTotals() + this._totalsRefreshTimer = setInterval(() => { + this._refreshTotals().catch(err => { + this.logger.warn({ err: err.message }, 'RegistryService: failed to refresh model totals') + }) + }, MODEL_TOTALS_REFRESH_INTERVAL_MS) + if (this._totalsRefreshTimer.unref) this._totalsRefreshTimer.unref() + this.logger.info('RegistryService: swarm joined and flushed') } async _close () { this.logger.info('RegistryService: closing') + if (this._totalsRefreshTimer) { + clearInterval(this._totalsRefreshTimer) + this._totalsRefreshTimer = null + } + if (this._compactionInterval) { clearInterval(this._compactionInterval) this._compactionInterval = null @@ -468,28 +500,34 @@ class RegistryService extends ReadyResource { rpc.respond( 'add-model', async (entry) => { - 
ensureWriterAccess() - - if (!this.opened) await this.ready() - await this._ensureIndexer() + if (this.metrics) this.metrics.recordRpcRequest('add-model') + try { + ensureWriterAccess() - const skipExisting = entry.skipExisting || false - const modelEntry = { ...entry } - delete modelEntry.skipExisting + if (!this.opened) await this.ready() + await this._ensureIndexer() - const result = await this.addModel(modelEntry, { skipExisting }) + const skipExisting = entry.skipExisting || false + const modelEntry = { ...entry } + delete modelEntry.skipExisting - this.logger.info({ - path: result.path, - source: result.source - }, 'RPC: add-model completed') + const result = await this.addModel(modelEntry, { skipExisting }) - return { - success: true, - model: { + this.logger.info({ path: result.path, source: result.source + }, 'RPC: add-model completed') + + return { + success: true, + model: { + path: result.path, + source: result.source + } } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('add-model') + throw err } } ) @@ -497,19 +535,25 @@ class RegistryService extends ReadyResource { rpc.respond( 'put-license', async (licenseRecord) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('put-license') + try { + ensureWriterAccess() - if (!this.opened) await this.ready() - await this._ensureIndexer() - await this.putLicense(licenseRecord) + if (!this.opened) await this.ready() + await this._ensureIndexer() + await this.putLicense(licenseRecord) - this.logger.info({ - spdxId: licenseRecord.spdxId - }, 'RPC: put-license completed') + this.logger.info({ + spdxId: licenseRecord.spdxId + }, 'RPC: put-license completed') - return { - success: true, - message: 'License operation appended' + return { + success: true, + message: 'License operation appended' + } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('put-license') + throw err } } ) @@ -517,45 +561,51 @@ class RegistryService extends ReadyResource { rpc.respond( 
'update-model-metadata', async (data) => { - ensureWriterAccess() - - if (!data.path) throw new TypeError('path is required') - if (!data.source) throw new TypeError('source is required') - - if (!this.opened) await this.ready() - await this._ensureIndexer() - - const existing = await this.getModelByKey({ path: data.path, source: data.source }) - if (!existing) throw new Error(`Model not found: ${data.path}`) - - // If explicitly undeprecating, clear deprecation fields - const isUndeprecating = data.deprecated === false - - const updated = { - ...existing, - engine: data.engine ?? existing.engine, - licenseId: data.licenseId ?? existing.licenseId, - description: data.description ?? existing.description, - quantization: data.quantization ?? existing.quantization, - params: data.params ?? existing.params, - notes: data.notes ?? existing.notes, - tags: data.tags ?? existing.tags, - deprecated: data.deprecated !== undefined ? data.deprecated : existing.deprecated, - deprecatedAt: isUndeprecating ? '' : (data.deprecatedAt ?? existing.deprecatedAt), - replacedBy: isUndeprecating ? '' : (data.replacedBy ?? existing.replacedBy), - deprecationReason: isUndeprecating ? '' : (data.deprecationReason ?? existing.deprecationReason) - } + if (this.metrics) this.metrics.recordRpcRequest('update-model-metadata') + try { + ensureWriterAccess() + + if (!data.path) throw new TypeError('path is required') + if (!data.source) throw new TypeError('source is required') + + if (!this.opened) await this.ready() + await this._ensureIndexer() + + const existing = await this.getModelByKey({ path: data.path, source: data.source }) + if (!existing) throw new Error(`Model not found: ${data.path}`) + + // If explicitly undeprecating, clear deprecation fields + const isUndeprecating = data.deprecated === false + + const updated = { + ...existing, + engine: data.engine ?? existing.engine, + licenseId: data.licenseId ?? existing.licenseId, + description: data.description ?? 
existing.description, + quantization: data.quantization ?? existing.quantization, + params: data.params ?? existing.params, + notes: data.notes ?? existing.notes, + tags: data.tags ?? existing.tags, + deprecated: data.deprecated !== undefined ? data.deprecated : existing.deprecated, + deprecatedAt: isUndeprecating ? '' : (data.deprecatedAt ?? existing.deprecatedAt), + replacedBy: isUndeprecating ? '' : (data.replacedBy ?? existing.replacedBy), + deprecationReason: isUndeprecating ? '' : (data.deprecationReason ?? existing.deprecationReason) + } - await this._appendOperation(DISPATCH_PUT_MODEL, updated) + await this._appendOperation(DISPATCH_PUT_MODEL, updated) - const viewLength = this.view?.core?.length ?? 0 - const viewContiguous = this.view?.core?.contiguousLength ?? 0 - const viewSigned = this.view?.core?.signedLength ?? 0 - this.logger.info({ path: data.path, viewLength, viewContiguous, viewSigned }, 'RPC: update-model-metadata completed') + const viewLength = this.view?.core?.length ?? 0 + const viewContiguous = this.view?.core?.contiguousLength ?? 0 + const viewSigned = this.view?.core?.signedLength ?? 
0 + this.logger.info({ path: data.path, viewLength, viewContiguous, viewSigned }, 'RPC: update-model-metadata completed') - return { - success: true, - model: updated + return { + success: true, + model: updated + } + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('update-model-metadata') + throw err } } ) @@ -563,28 +613,33 @@ class RegistryService extends ReadyResource { rpc.respond( 'delete-model', async (data) => { - ensureWriterAccess() + if (this.metrics) this.metrics.recordRpcRequest('delete-model') + try { + ensureWriterAccess() - if (!data.path) throw new TypeError('path is required') - if (!data.source) throw new TypeError('source is required') + if (!data.path) throw new TypeError('path is required') + if (!data.source) throw new TypeError('source is required') - if (!this.opened) await this.ready() - await this._ensureIndexer() + if (!this.opened) await this.ready() + await this._ensureIndexer() - const result = await this.deleteModel({ path: data.path, source: data.source }) + const result = await this.deleteModel({ path: data.path, source: data.source }) - this.logger.info({ - path: data.path, - source: data.source - }, 'RPC: delete-model completed') - - return result + this.logger.info({ + path: data.path, + source: data.source + }, 'RPC: delete-model completed') + + return result + } catch (err) { + if (this.metrics) this.metrics.recordRpcError('delete-model') + throw err + } } ) - // Server identification endpoint - allows RPC clients to verify they connected - // to the actual server and not a blind peer (which won't have RPC responders) rpc.respond('ping', async () => { + if (this.metrics) this.metrics.recordRpcRequest('ping') return { role: 'registry-server', timestamp: Date.now() @@ -673,6 +728,8 @@ class RegistryService extends ReadyResource { await this._appendOperation(DISPATCH_PUT_MODEL, modelData) + this._scheduleTotalsRefresh() + if (this.reseedTracker) { await this.reseedTracker.waitForComplete() this.logger.info({ @@ 
-969,7 +1026,6 @@ class RegistryService extends ReadyResource { discoveryKey: core.discoveryKey.toString('hex') }, 'Hyperblobs core ready') - // Caller is responsible for mirroring after data is added return entry } @@ -1074,6 +1130,8 @@ class RegistryService extends ReadyResource { await this._appendOperation(DISPATCH_DELETE_MODEL, { path, source }) + this._scheduleTotalsRefresh() + this.logger.info({ path, source }, 'deleteModel: completed') return { success: true, path, source } @@ -1154,26 +1212,45 @@ class RegistryService extends ReadyResource { } } - async _logAvailableModels () { - if (!this.view) return + async _refreshTotals () { + if (!this.view || !this.view.opened) return - try { - if (!this.view.opened) await this.view.ready() - const models = await this.view.findModelsByPath({}).toArray() + const startedAt = Date.now() + let total = 0 + let count = 0 - if (models.length === 0) { - this.logger.info('RegistryService: No models in registry yet') - } else { - const modelsToLog = models.length > 5 ? 
models.slice(-5) : models - this.logger.info({ - count: models.length, - showing: modelsToLog.length, - models: modelsToLog.map(m => `${m.path} [${m.engine}]`) - }, 'RegistryService: models available') - } - } catch (err) { - this.logger.error({ err }, 'RegistryService: Failed to log models') + for await (const model of this.view.findModelsByPath({})) { + total += model.blobBinding?.byteLength || 0 + count++ } + + this._totalModelBytes = total + this._modelCount = count + this._totalModelBytesRefreshedAt = Date.now() + + this.logger.debug({ + totalBytes: total, + models: count, + durationMs: Date.now() - startedAt + }, 'RegistryService: refreshed model totals') + } + + _scheduleTotalsRefresh () { + this._refreshTotals().catch(err => { + this.logger.warn({ err: err.message }, 'RegistryService: failed to refresh model totals') + }) + } + + get totalModelBytes () { + return this._totalModelBytes + } + + get modelCount () { + return this._modelCount + } + + get totalsRefreshedAt () { + return this._totalModelBytesRefreshedAt } _normalizeKey (key) { diff --git a/packages/qvac-lib-registry-server/package.json b/packages/qvac-lib-registry-server/package.json index 3330611838..34ecf056d1 100644 --- a/packages/qvac-lib-registry-server/package.json +++ b/packages/qvac-lib-registry-server/package.json @@ -67,15 +67,19 @@ "crypto": "npm:bare-node-crypto", "http": "npm:bare-node-http", "https": "npm:bare-node-https", + "hyper-health-check": "^1.3.0", "hyperblobs": "^2.8.0", "hypercore-id-encoding": "^1.3.0", + "hypercore-stats": "^2.4.0", "hyperdb": "^4.16.0", "hyperdispatch": "^1.4.0", "hyperschema": "^1.13.0", "hyperswarm": "^4.10.5", + "hyperswarm-stats": "^1.3.0", "paparam": "^1.8.6", "pino": "^9.9.4", "pino-pretty": "^13.1.1", + "prom-client": "^15.1.3", "protomux-rpc": "^1.7.0", "readline": "npm:bare-node-readline", "ready-resource": "^1.2.0", diff --git a/packages/qvac-lib-registry-server/scripts/add-model.js b/packages/qvac-lib-registry-server/scripts/add-model.js index 
076f18dd6d..116b10ed40 100644 --- a/packages/qvac-lib-registry-server/scripts/add-model.js +++ b/packages/qvac-lib-registry-server/scripts/add-model.js @@ -24,6 +24,8 @@ async function addModel () { primaryKey = args[++i] } else if (args[i] === '--models-file' || args[i] === '-f') { modelsFile = args[++i] + } else if (args[i] === '--') { + continue } else if (!canonicalSource && args[i] !== '--skip-existing') { canonicalSource = args[i] } diff --git a/packages/qvac-lib-registry-server/scripts/bin.js b/packages/qvac-lib-registry-server/scripts/bin.js index c01f21d53a..c9cf4069d2 100644 --- a/packages/qvac-lib-registry-server/scripts/bin.js +++ b/packages/qvac-lib-registry-server/scripts/bin.js @@ -14,8 +14,16 @@ const fs = require('fs') const RegistryService = require('../lib/registry-service') const RegistryConfig = require('../lib/config') +const MetricsServer = require('../lib/metrics-server') +const QvacMetrics = require('../lib/metrics') +const HypercoreStats = require('hypercore-stats') +const HyperswarmStats = require('hyperswarm-stats') +const promClient = require('prom-client') const { AUTOBASE_NAMESPACE } = require('@qvac/registry-schema') +const DEFAULT_METRICS_PORT = 9210 +const DEFAULT_METRICS_HOST = '127.0.0.1' + const DEFAULT_STORAGE = './corestore' const DEFAULT_WRITER_STORAGE = './writer-storage' @@ -36,6 +44,8 @@ const runCmd = command('run', flag('--clear-after-reseed', 'Clear blob blocks after successful replication to blind peers'), flag('--compaction-interval [ms]', `Periodic RocksDB compaction interval in ms (default: ${DEFAULT_COMPACTION_INTERVAL_MS}, 0 to disable)`), flag('--skip-storage-check', 'Skip storage/bootstrap key mismatch check (use when joining existing cluster with fresh storage)'), + flag('--metrics-port [port]', `Prometheus metrics HTTP port (default: ${DEFAULT_METRICS_PORT}, 0 to disable)`), + flag('--metrics-host [host]', `Prometheus metrics HTTP bind address (default: ${DEFAULT_METRICS_HOST}; use 0.0.0.0 to expose on all 
interfaces)`), async function ({ flags }) { const logger = createLogger() @@ -86,8 +96,30 @@ const runCmd = command('run', config.setAutobaseKey(IdEnc.normalize(service.base.key)) config.setRegistryCoreKey(IdEnc.normalize(service.registryCoreKey)) + const metricsPort = flags.metricsPort !== undefined + ? parseInt(flags.metricsPort, 10) + : DEFAULT_METRICS_PORT + + if (Number.isNaN(metricsPort) || metricsPort < 0) { + throw new Error('--metrics-port must be a non-negative integer (0 to disable)') + } + + const metricsHost = flags.metricsHost || DEFAULT_METRICS_HOST + + let metricsServer = null + if (metricsPort > 0) { + const qvacMetrics = new QvacMetrics(service, { logger }) + service.metrics = qvacMetrics + + HypercoreStats.fromCorestore(store).registerPrometheusMetrics(promClient) + new HyperswarmStats(swarm).registerPrometheusMetrics(promClient) // eslint-disable-line no-new + + metricsServer = new MetricsServer(promClient.register, { port: metricsPort, host: metricsHost, logger }) + await metricsServer.ready() + } + logServiceInfo(logger, service) - registerShutdown(logger, service, swarm, store) + registerShutdown(logger, service, swarm, store, metricsServer) } ) @@ -155,13 +187,14 @@ const cmd = command('registry', runCmd, initWriter, syncModelsCmd) cmd.parse() -function registerShutdown (logger, service, swarm, store) { +function registerShutdown (logger, service, swarm, store, metricsServer) { let closing = false const shutdown = async () => { if (closing) return closing = true logger.info('Shutting down gracefully…') try { + if (metricsServer) await metricsServer.close() await service.close() await swarm.destroy() await store.close() diff --git a/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js new file mode 100644 index 0000000000..7be99521be --- /dev/null +++ b/packages/qvac-lib-registry-server/tests/integration/metrics.integration.test.js @@ 
'use strict'

// Integration tests for the Prometheus metrics endpoint: MetricsServer (HTTP
// listener) wired to QvacMetrics (QVAC gauges and counters) on top of a real
// RegistryService with no DHT bootstrap nodes.

const test = require('brittle')
const Corestore = require('corestore')
const Hyperswarm = require('hyperswarm')
const http = require('http')
const promClient = require('prom-client')

const RegistryService = require('../../lib/registry-service')
const RegistryConfig = require('../../lib/config')
const MetricsServer = require('../../lib/metrics-server')
const QvacMetrics = require('../../lib/metrics')
const { AUTOBASE_NAMESPACE } = require('../../shared/constants')
const { createTempStorage } = require('../helpers/test-utils')

const noopLogger = {
  info () {},
  debug () {},
  error () {},
  warn () {}
}

// Builds a ready RegistryService with QvacMetrics attached and a
// MetricsServer listening on `opts.port` (0 by default, i.e. OS-assigned).
// The returned `port` is the port the server actually bound.
async function createServiceWithMetrics (t, opts = {}) {
  const basePath = await createTempStorage(t)
  const store = new Corestore(basePath)
  await store.ready()

  const swarm = new Hyperswarm({ bootstrap: [] })
  const config = new RegistryConfig({ logger: noopLogger })

  const service = new RegistryService(
    store.namespace(AUTOBASE_NAMESPACE),
    swarm,
    config,
    {
      logger: noopLogger,
      ackInterval: 5,
      skipStorageCheck: true
    }
  )

  await service.ready()

  // Metrics are registered on prom-client's default registry, so it is
  // cleared before each construction to avoid name collisions across tests.
  promClient.register.clear()

  const qvacMetrics = new QvacMetrics(service, { logger: noopLogger })
  service.metrics = qvacMetrics

  const port = opts.port ?? 0
  const metricsServer = new MetricsServer(promClient.register, {
    port,
    logger: noopLogger
  })
  await metricsServer.ready()

  const actualPort = metricsServer._server.address().port

  return { service, store, swarm, metricsServer, qvacMetrics, port: actualPort }
}

// Best-effort teardown: every step is attempted even if an earlier one fails,
// so one broken resource cannot leak the others into the next test.
async function cleanup (ctx) {
  if (ctx.metricsServer) await ctx.metricsServer.close().catch(() => {})
  if (ctx.service && ctx.service.opened) await ctx.service.close().catch(() => {})
  if (ctx.swarm) await ctx.swarm.destroy().catch(() => {})
  if (ctx.store) await ctx.store.close().catch(() => {})
  promClient.register.clear()
}

// GETs `path` from 127.0.0.1:`port` and resolves with status, body and headers.
function httpGet (port, path) {
  return new Promise((resolve, reject) => {
    const req = http.get(`http://127.0.0.1:${port}${path}`, (res) => {
      let body = ''
      res.setEncoding('utf8')
      res.on('data', (chunk) => { body += chunk })
      res.on('error', reject)
      res.on('end', () => resolve({ status: res.statusCode, body, headers: res.headers }))
    })
    req.on('error', reject)
  })
}

// Returns the first exposition line starting with `prefix`, or undefined.
// Callers include the trailing space or label set in `prefix`, so the
// `# HELP` / `# TYPE` lines and longer metric names never match.
function findSeries (body, prefix) {
  return body.split('\n').find(line => line.startsWith(prefix))
}

test('MetricsServer serves Prometheus text at /metrics', async (t) => {
  const ctx = await createServiceWithMetrics(t)

  try {
    const res = await httpGet(ctx.port, '/metrics')
    const contentType = res.headers['content-type']
    t.is(res.status, 200, 'returns 200')
    t.ok(contentType.includes('text/plain') || contentType.includes('openmetrics'), 'correct content type')
    t.ok(res.body.length > 0, 'body is non-empty')
  } finally {
    await cleanup(ctx)
  }
})

test('MetricsServer returns 404 for non-metrics paths', async (t) => {
  const ctx = await createServiceWithMetrics(t)

  try {
    const res = await httpGet(ctx.port, '/health')
    t.is(res.status, 404, 'returns 404')
  } finally {
    await cleanup(ctx)
  }
})

test('/metrics includes QVAC custom gauges', async (t) => {
  const ctx = await createServiceWithMetrics(t)

  try {
    const res = await httpGet(ctx.port, '/metrics')
    const body = res.body

    t.ok(body.includes('qvac_registry_model_count'), 'has model_count')
    t.ok(body.includes('qvac_registry_total_blob_bytes'), 'has total_blob_bytes')
    t.ok(body.includes('qvac_registry_totals_refreshed_age_seconds'), 'has totals_refreshed_age_seconds')
    t.ok(body.includes('qvac_registry_blob_core_count'), 'has blob_core_count')
    t.ok(body.includes('qvac_registry_blob_core_seeders'), 'has blob_core_seeders')
    t.ok(body.includes('qvac_registry_blob_core_length'), 'has blob_core_length')
    t.ok(body.includes('qvac_registry_blob_core_contiguous_length'), 'has blob_core_contiguous_length')
    t.ok(body.includes('qvac_registry_view_core_length'), 'has view_core_length')
    t.ok(body.includes('qvac_registry_view_core_contiguous_length'), 'has view_core_contiguous_length')
    t.ok(body.includes('qvac_registry_view_core_seeders'), 'has view_core_seeders')
    t.ok(body.includes('qvac_registry_is_indexer'), 'has is_indexer')
    t.ok(body.includes('qvac_registry_rpc_requests_total'), 'has rpc_requests_total')
    t.ok(body.includes('qvac_registry_rpc_errors_total'), 'has rpc_errors_total')

    const totalBytesLine = findSeries(body, 'qvac_registry_total_blob_bytes ')
    t.ok(totalBytesLine, 'exports total_blob_bytes as a single series')
    t.ok(totalBytesLine.endsWith(' 0'), 'total_blob_bytes is 0 on an empty registry')

    const modelCountLine = findSeries(body, 'qvac_registry_model_count ')
    t.ok(modelCountLine, 'exports model_count as a single series')
    t.ok(modelCountLine.endsWith(' 0'), 'model_count is 0 on an empty registry')

    const viewSeedersLine = findSeries(body, 'qvac_registry_view_core_seeders ')
    t.ok(viewSeedersLine, 'exports view_core_seeders as a single series')
    t.ok(viewSeedersLine.endsWith(' 0'), 'view_core_seeders is 0 with no connected peers')

    const rpcPingRequests = findSeries(body, 'qvac_registry_rpc_requests_total{method="ping"}')
    t.ok(rpcPingRequests, 'rpc_requests_total{method="ping"} series is pre-initialised')
    t.ok(rpcPingRequests.endsWith(' 0'), 'rpc_requests_total{method="ping"} starts at 0')

    const rpcAddModelErrors = findSeries(body, 'qvac_registry_rpc_errors_total{method="add-model"}')
    t.ok(rpcAddModelErrors, 'rpc_errors_total{method="add-model"} series is pre-initialised')
    t.ok(rpcAddModelErrors.endsWith(' 0'), 'rpc_errors_total{method="add-model"} starts at 0')
  } finally {
    await cleanup(ctx)
  }
})

test('RPC metrics counters increment', async (t) => {
  const ctx = await createServiceWithMetrics(t)

  try {
    ctx.qvacMetrics.recordRpcRequest('ping')
    ctx.qvacMetrics.recordRpcRequest('ping')
    ctx.qvacMetrics.recordRpcRequest('add-model')
    ctx.qvacMetrics.recordRpcError('add-model')

    const res = await httpGet(ctx.port, '/metrics')
    const body = res.body

    // Match the exact series and its exact value: a substring check such as
    // includes('2') would also accept 12, 20, and so on.
    const pingLine = findSeries(body, 'qvac_registry_rpc_requests_total{method="ping"}')
    t.ok(pingLine, 'has ping request counter line')
    t.ok(pingLine.endsWith(' 2'), 'ping counter is 2')

    const addModelLine = findSeries(body, 'qvac_registry_rpc_requests_total{method="add-model"}')
    t.ok(addModelLine, 'has add-model request counter line')
    t.ok(addModelLine.endsWith(' 1'), 'add-model counter is 1')

    const errorLine = findSeries(body, 'qvac_registry_rpc_errors_total{method="add-model"}')
    t.ok(errorLine, 'has add-model error counter line')
    t.ok(errorLine.endsWith(' 1'), 'error counter is 1')
  } finally {
    await cleanup(ctx)
  }
})

test('MetricsServer closes cleanly', async (t) => {
  const ctx = await createServiceWithMetrics(t)

  await ctx.metricsServer.close()
  // Already closed above; stop cleanup() from closing it a second time.
  ctx.metricsServer = null

  try {
    await httpGet(ctx.port, '/metrics')
    t.fail('should not connect after close')
  } catch (err) {
    t.ok(err.code === 'ECONNREFUSED', 'connection refused after close')
  } finally {
    await cleanup(ctx)
  }
})

test('MetricsServer binds to custom host', async (t) => {
  const basePath = await createTempStorage(t)
  const store = new Corestore(basePath)
  await store.ready()

  const swarm = new Hyperswarm({ bootstrap: [] })
  const config = new RegistryConfig({ logger: noopLogger })

  const service = new RegistryService(
    store.namespace(AUTOBASE_NAMESPACE),
    swarm,
    config,
    { logger: noopLogger, ackInterval: 5, skipStorageCheck: true }
  )
  await service.ready()

  promClient.register.clear()

  const metricsServer = new MetricsServer(promClient.register, {
    port: 0,
    host: '127.0.0.1',
    logger: noopLogger
  })

  const ctx = { service, store, swarm, metricsServer }

  try {
    await metricsServer.ready()

    const address = metricsServer._server.address()
    t.is(address.address, '127.0.0.1', 'bound to requested host')

    const res = await httpGet(address.port, '/metrics')
    t.is(res.status, 200, 'reachable on requested host')
  } finally {
    await cleanup(ctx)
  }
})