From 4277e6445dade71e482f38199eff2cefcf495468 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Thu, 9 Jan 2025 16:24:05 +0000 Subject: [PATCH 01/10] feat: monitor event loop utilization --- .../src/event_loop_monitor.ts | 30 ++++++++++++++----- yarn-project/telemetry-client/src/metrics.ts | 1 + 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index ee6f6a68b6c9..b66cc675581a 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -1,20 +1,29 @@ import { promiseWithResolvers } from '@aztec/foundation/promise'; import { Timer } from '@aztec/foundation/timer'; -import { EVENT_LOOP_LAG } from './metrics.js'; -import { type Meter, type ObservableGauge, type ObservableResult, ValueType } from './telemetry.js'; +import { type EventLoopUtilization, performance } from 'node:perf_hooks'; + +import { EVENT_LOOP_LAG, EVENT_LOOP_UTILIZATION } from './metrics.js'; +import { type BatchObservableResult, type Meter, type ObservableGauge, ValueType } from './telemetry.js'; /** * Detector for custom Aztec attributes */ export class EventLoopMonitor { private eventLoopLag: ObservableGauge; + private eventLoopUilization: ObservableGauge; private started = false; - constructor(meter: Meter) { + private lastELU: EventLoopUtilization | undefined; + + constructor(private meter: Meter) { this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, { unit: 'us', valueType: ValueType.INT, + description: 'Latency to execute a macro task', + }); + this.eventLoopUilization = meter.createObservableGauge(EVENT_LOOP_UTILIZATION, { + valueType: ValueType.DOUBLE, description: 'How busy is the event loop', }); } @@ -23,17 +32,23 @@ export class EventLoopMonitor { if (this.started) { return; } - this.eventLoopLag.addCallback(this.measureLag); + + this.lastELU = performance.eventLoopUtilization(); + this.meter.addBatchObservableCallback(this.measureLag, [this.eventLoopUilization, this.eventLoopLag]); } stop(): void { if (!this.started) { return; } - this.eventLoopLag.removeCallback(this.measureLag); + this.meter.removeBatchObservableCallback(this.measureLag, [this.eventLoopUilization, this.eventLoopLag]); } - private measureLag = async (obs: ObservableResult): Promise => { + private measureLag = async (obs: BatchObservableResult): Promise => { + const newELU = performance.eventLoopUtilization(); + const delta = performance.eventLoopUtilization(newELU, this.lastELU); + this.lastELU = newELU; + const timer = new Timer(); const { promise, resolve } = promiseWithResolvers(); // how long does it take to schedule the next macro task? @@ -44,6 +59,7 @@ export class EventLoopMonitor { }); const lag = await promise; - obs.observe(Math.floor(lag)); + obs.observe(this.eventLoopLag, Math.floor(lag)); + obs.observe(this.eventLoopUilization, delta.utilization); }; } diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index 1561d4a9b606..27adad90306c 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -126,4 +126,5 @@ export const PROOF_VERIFIER_COUNT = 'aztec.proof_verifier.count'; export const VALIDATOR_RE_EXECUTION_TIME = 'aztec.validator.re_execution_time'; export const VALIDATOR_FAILED_REEXECUTION_COUNT = 'aztec.validator.failed_reexecution_count'; +export const EVENT_LOOP_UTILIZATION = 'aztec.event_loop_utilization'; export const EVENT_LOOP_LAG = 'aztec.event_loop_lag'; From 9a09f3d8045f44e9a384cbdf866a4d902c55b3af Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Thu, 9 Jan 2025 17:25:59 +0000 Subject: [PATCH 02/10] feat: add GCP/K8s resource detection --- yarn-project/telemetry-client/package.json | 2 +- .../telemetry-client/src/otel_resource.ts | 10 +- yarn-project/yarn.lock | 104 +++++++++++++++++- 3 files changed, 109 insertions(+), 7 deletions(-) diff --git a/yarn-project/telemetry-client/package.json b/yarn-project/telemetry-client/package.json index 7face42b10c4..042f14a062d1 100644 --- a/yarn-project/telemetry-client/package.json +++ b/yarn-project/telemetry-client/package.json @@ -37,7 +37,7 @@ "@opentelemetry/exporter-trace-otlp-http": "^0.55.0", "@opentelemetry/host-metrics": "^0.35.4", "@opentelemetry/otlp-exporter-base": "^0.55.0", - "@opentelemetry/resource-detector-aws": "^1.8.0", + "@opentelemetry/resource-detector-gcp": "^0.32.0", "@opentelemetry/resources": "^1.28.0", "@opentelemetry/sdk-logs": "^0.55.0", "@opentelemetry/sdk-metrics": "^1.28.0", diff --git a/yarn-project/telemetry-client/src/otel_resource.ts b/yarn-project/telemetry-client/src/otel_resource.ts index d0a8ec7677c2..83b14383b512 100644 --- a/yarn-project/telemetry-client/src/otel_resource.ts +++ b/yarn-project/telemetry-client/src/otel_resource.ts @@ -1,3 +1,4 @@ +import { gcpDetector } from '@opentelemetry/resource-detector-gcp'; import { type IResource, detectResourcesSync, @@ -11,7 +12,14 @@ import { aztecDetector } from './aztec_resource_detector.js'; export async function getOtelResource(): Promise { const resource = detectResourcesSync({ - detectors: [osDetectorSync, envDetectorSync, processDetectorSync, serviceInstanceIdDetectorSync, aztecDetector], + detectors: [ + osDetectorSync, + envDetectorSync, + processDetectorSync, + serviceInstanceIdDetectorSync, + gcpDetector, + aztecDetector, + ], }); if (resource.asyncAttributesPending) { diff --git a/yarn-project/yarn.lock b/yarn-project/yarn.lock index cf4a8c93dd9a..6fd514cc76bc 100644 --- a/yarn-project/yarn.lock +++ b/yarn-project/yarn.lock @@ -1286,7 +1286,7 @@ __metadata: "@opentelemetry/exporter-trace-otlp-http": "npm:^0.55.0" "@opentelemetry/host-metrics": "npm:^0.35.4" "@opentelemetry/otlp-exporter-base": "npm:^0.55.0" - "@opentelemetry/resource-detector-aws": "npm:^1.8.0" + "@opentelemetry/resource-detector-gcp": "npm:^0.32.0" "@opentelemetry/resources": "npm:^1.28.0" "@opentelemetry/sdk-logs": "npm:^0.55.0" "@opentelemetry/sdk-metrics": "npm:^1.28.0" @@ -4107,16 +4107,17 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/resource-detector-aws@npm:^1.8.0": - version: 1.8.0 - resolution: "@opentelemetry/resource-detector-aws@npm:1.8.0" +"@opentelemetry/resource-detector-gcp@npm:^0.32.0": + version: 0.32.0 + resolution: "@opentelemetry/resource-detector-gcp@npm:0.32.0" dependencies: "@opentelemetry/core": "npm:^1.0.0" "@opentelemetry/resources": "npm:^1.10.0" "@opentelemetry/semantic-conventions": "npm:^1.27.0" + gcp-metadata: "npm:^6.0.0" peerDependencies: "@opentelemetry/api": ^1.0.0 - checksum: 10/077b37288659b441e958e73386696eef27a01ca1ba2f1f46706e0feef9e3f59976ce034d435198adacf6b2b6575f2ade115f05a30ddedd3a0d4b17811fcae5a0 + checksum: 10/d7c29b239462bb96bcb58a41dd70c0fa1a4766e08f8f4395e48dcc799b222c0bba6a7afe884d4a46d430b43ab6a9d5d33a63f7431f1fe29bb9f9cfaf1338898d languageName: node linkType: hard @@ -7323,6 +7324,13 @@ __metadata: languageName: node linkType: hard +"bignumber.js@npm:^9.0.0": + version: 9.1.2 + resolution: "bignumber.js@npm:9.1.2" + checksum: 10/d89b8800a987225d2c00dcbf8a69dc08e92aa0880157c851c287b307d31ceb2fc2acb0c62c3e3a3d42b6c5fcae9b004035f13eb4386e56d529d7edac18d5c9d8 + languageName: node + linkType: hard + "binary-extensions@npm:^2.0.0": version: 2.3.0 resolution: "binary-extensions@npm:2.3.0" @@ -10791,6 +10799,13 @@ __metadata: languageName: node linkType: hard +"extend@npm:^3.0.2": + version: 3.0.2 + resolution: "extend@npm:3.0.2" + checksum: 10/59e89e2dc798ec0f54b36d82f32a27d5f6472c53974f61ca098db5d4648430b725387b53449a34df38fd0392045434426b012f302b3cc049a6500ccf82877e4e + languageName: node + linkType: hard + "external-editor@npm:^3.1.0": version: 3.1.0 resolution: "external-editor@npm:3.1.0" @@ -11275,6 +11290,29 @@ __metadata: languageName: node linkType: hard +"gaxios@npm:^6.0.0": + version: 6.7.1 + resolution: "gaxios@npm:6.7.1" + dependencies: + extend: "npm:^3.0.2" + https-proxy-agent: "npm:^7.0.1" + is-stream: "npm:^2.0.0" + node-fetch: "npm:^2.6.9" + uuid: "npm:^9.0.1" + checksum: 10/c85599162208884eadee91215ebbfa1faa412551df4044626cb561300e15193726e8f23d63b486533e066dadad130f58ed872a23acab455238d8d48b531a0695 + languageName: node + linkType: hard + +"gcp-metadata@npm:^6.0.0": + version: 6.1.0 + resolution: "gcp-metadata@npm:6.1.0" + dependencies: + gaxios: "npm:^6.0.0" + json-bigint: "npm:^1.0.0" + checksum: 10/a0d12a9cb7499fdb9de0fff5406aa220310c1326b80056be8d9b747aae26414f99d14bd795c0ec52ef7d0473eef9d61bb657b8cd3d8186c8a84c4ddbff025fe9 + languageName: node + linkType: hard + "gensync@npm:^1.0.0-beta.2": version: 1.0.0-beta.2 resolution: "gensync@npm:1.0.0-beta.2" @@ -13638,6 +13676,15 @@ __metadata: languageName: node linkType: hard +"json-bigint@npm:^1.0.0": + version: 1.0.0 + resolution: "json-bigint@npm:1.0.0" + dependencies: + bignumber.js: "npm:^9.0.0" + checksum: 10/cd3973b88e5706f8f89d2a9c9431f206ef385bd5c584db1b258891a5e6642507c32316b82745239088c697f5ddfe967351e1731f5789ba7855aed56ad5f70e1f + languageName: node + linkType: hard + "json-buffer@npm:3.0.1": version: 3.0.1 resolution: "json-buffer@npm:3.0.1" @@ -15251,6 +15298,20 @@ __metadata: languageName: node linkType: hard +"node-fetch@npm:^2.6.9": + version: 2.7.0 + resolution: "node-fetch@npm:2.7.0" + dependencies: + whatwg-url: "npm:^5.0.0" + peerDependencies: + encoding: ^0.1.0 + peerDependenciesMeta: + encoding: + optional: true + checksum: 10/b24f8a3dc937f388192e59bcf9d0857d7b6940a2496f328381641cb616efccc9866e89ec43f2ec956bbd6c3d3ee05524ce77fe7b29ccd34692b3a16f237d6676 + languageName: node + linkType: hard + "node-forge@npm:^1, node-forge@npm:^1.1.0": version: 1.3.1 resolution: "node-forge@npm:1.3.1" @@ -18741,6 +18802,13 @@ __metadata: languageName: node linkType: hard +"tr46@npm:~0.0.3": + version: 0.0.3 + resolution: "tr46@npm:0.0.3" + checksum: 10/8f1f5aa6cb232f9e1bdc86f485f916b7aa38caee8a778b378ffec0b70d9307873f253f5cbadbe2955ece2ac5c83d0dc14a77513166ccd0a0c7fe197e21396695 + languageName: node + linkType: hard + "transform-ast@npm:^2.4.0": version: 2.4.4 resolution: "transform-ast@npm:2.4.4" @@ -19540,6 +19608,15 @@ __metadata: languageName: node linkType: hard +"uuid@npm:^9.0.1": + version: 9.0.1 + resolution: "uuid@npm:9.0.1" + bin: + uuid: dist/bin/uuid + checksum: 10/9d0b6adb72b736e36f2b1b53da0d559125ba3e39d913b6072f6f033e0c87835b414f0836b45bcfaf2bdf698f92297fea1c3cc19b0b258bc182c9c43cc0fab9f2 + languageName: node + linkType: hard + "v8-compile-cache-lib@npm:^3.0.1": version: 3.0.1 resolution: "v8-compile-cache-lib@npm:3.0.1" @@ -19708,6 +19785,13 @@ __metadata: languageName: node linkType: hard +"webidl-conversions@npm:^3.0.0": + version: 3.0.1 + resolution: "webidl-conversions@npm:3.0.1" + checksum: 10/b65b9f8d6854572a84a5c69615152b63371395f0c5dcd6729c45789052296df54314db2bc3e977df41705eacb8bc79c247cee139a63fa695192f95816ed528ad + languageName: node + linkType: hard + "webidl-conversions@npm:^7.0.0": version: 7.0.0 resolution: "webidl-conversions@npm:7.0.0" @@ -19930,6 +20014,16 @@ __metadata: languageName: node linkType: hard +"whatwg-url@npm:^5.0.0": + version: 5.0.0 + resolution: "whatwg-url@npm:5.0.0" + dependencies: + tr46: "npm:~0.0.3" + webidl-conversions: "npm:^3.0.0" + checksum: 10/f95adbc1e80820828b45cc671d97da7cd5e4ef9deb426c31bcd5ab00dc7103042291613b3ef3caec0a2335ed09e0d5ed026c940755dbb6d404e2b27f940fdf07 + languageName: node + linkType: hard + "wherearewe@npm:^2.0.1": version: 2.0.1 resolution: "wherearewe@npm:2.0.1" From c18208e05d62d80b028cc6432cc1870d1295889d Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Thu, 9 Jan 2025 18:06:05 +0000 Subject: [PATCH 03/10] fix: capture service_name and service_instance_id --- spartan/aztec-network/templates/boot-node.yaml | 2 ++ spartan/aztec-network/templates/faucet.yaml | 10 ++++++++++ spartan/aztec-network/templates/prover-agent.yaml | 2 ++ spartan/aztec-network/templates/prover-broker.yaml | 2 ++ spartan/aztec-network/templates/prover-node.yaml | 2 ++ spartan/aztec-network/templates/pxe.yaml | 2 ++ spartan/aztec-network/templates/transaction-bot.yaml | 2 ++ spartan/aztec-network/templates/validator.yaml | 2 ++ .../telemetry-client/src/aztec_resource_detector.ts | 8 ++++++-- 9 files changed, 30 insertions(+), 2 deletions(-) diff --git a/spartan/aztec-network/templates/boot-node.yaml b/spartan/aztec-network/templates/boot-node.yaml index 6506d22bf304..0a883c24eaa3 100644 --- a/spartan/aztec-network/templates/boot-node.yaml +++ b/spartan/aztec-network/templates/boot-node.yaml @@ -123,6 +123,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: boot-node {{- end }} containers: - name: boot-node diff --git a/spartan/aztec-network/templates/faucet.yaml b/spartan/aztec-network/templates/faucet.yaml index 5053999d0a6a..ffeeb0e1e4c8 100644 --- a/spartan/aztec-network/templates/faucet.yaml +++ b/spartan/aztec-network/templates/faucet.yaml @@ -81,6 +81,16 @@ spec: value: "1" - name: LOG_LEVEL value: "{{ .Values.faucet.logLevel }}" + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: faucet ports: - name: http containerPort: {{ .Values.faucet.service.nodePort }} diff --git a/spartan/aztec-network/templates/prover-agent.yaml b/spartan/aztec-network/templates/prover-agent.yaml index ef7399d2e7ff..7f288af3bfbc 100644 --- a/spartan/aztec-network/templates/prover-agent.yaml +++ b/spartan/aztec-network/templates/prover-agent.yaml @@ -88,6 +88,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: prover-agent - name: AZTEC_PORT value: "{{ .Values.proverAgent.service.nodePort }}" - name: LOG_LEVEL diff --git a/spartan/aztec-network/templates/prover-broker.yaml b/spartan/aztec-network/templates/prover-broker.yaml index c1857eb40aa0..315caf21e8e0 100644 --- a/spartan/aztec-network/templates/prover-broker.yaml +++ b/spartan/aztec-network/templates/prover-broker.yaml @@ -87,6 +87,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: prover-broker - name: NODE_OPTIONS value: "--max-old-space-size={{ .Values.proverBroker.maxOldSpaceSize}}" - name: AZTEC_PORT diff --git a/spartan/aztec-network/templates/prover-node.yaml b/spartan/aztec-network/templates/prover-node.yaml index f33e483180db..7420fb920b1d 100644 --- a/spartan/aztec-network/templates/prover-node.yaml +++ b/spartan/aztec-network/templates/prover-node.yaml @@ -130,6 +130,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: prover-node - name: POD_IP valueFrom: fieldRef: diff --git a/spartan/aztec-network/templates/pxe.yaml b/spartan/aztec-network/templates/pxe.yaml index e91bbd2a10a4..be0d0321d3a5 100644 --- a/spartan/aztec-network/templates/pxe.yaml +++ b/spartan/aztec-network/templates/pxe.yaml @@ -89,6 +89,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: pxe - name: AZTEC_PORT value: "{{ .Values.pxe.service.nodePort }}" - name: LOG_JSON diff --git a/spartan/aztec-network/templates/transaction-bot.yaml b/spartan/aztec-network/templates/transaction-bot.yaml index 7d4837ad40f7..fc573dfb723c 100644 --- a/spartan/aztec-network/templates/transaction-bot.yaml +++ b/spartan/aztec-network/templates/transaction-bot.yaml @@ -83,6 +83,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: bot - name: AZTEC_PORT value: "{{ .Values.bot.service.nodePort }}" - name: LOG_JSON diff --git a/spartan/aztec-network/templates/validator.yaml b/spartan/aztec-network/templates/validator.yaml index 02331567b02c..17d25e6a3957 100644 --- a/spartan/aztec-network/templates/validator.yaml +++ b/spartan/aztec-network/templates/validator.yaml @@ -158,6 +158,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: validator - name: POD_IP valueFrom: fieldRef: diff --git a/yarn-project/telemetry-client/src/aztec_resource_detector.ts b/yarn-project/telemetry-client/src/aztec_resource_detector.ts index bb438dd49a91..39297e5a8e44 100644 --- a/yarn-project/telemetry-client/src/aztec_resource_detector.ts +++ b/yarn-project/telemetry-client/src/aztec_resource_detector.ts @@ -1,5 +1,9 @@ import { type DetectorSync, type IResource, Resource } from '@opentelemetry/resources'; -import { ATTR_K8S_POD_NAME, ATTR_K8S_POD_UID } from '@opentelemetry/semantic-conventions/incubating'; +import { + ATTR_K8S_POD_NAME, + ATTR_K8S_POD_UID, + ATTR_SERVICE_INSTANCE_ID, +} from '@opentelemetry/semantic-conventions/incubating'; import { NETWORK_NAME } from './attributes.js'; import { getConfigEnvVars } from './config.js'; @@ -12,9 +16,9 @@ class AztecDetector implements DetectorSync { const config = getConfigEnvVars(); return new Resource({ - [NETWORK_NAME]: config.networkName, [ATTR_K8S_POD_UID]: config.k8sPodUid, [ATTR_K8S_POD_NAME]: config.k8sPodName, + [ATTR_SERVICE_INSTANCE_ID]: config.k8sPodUid, }); } } From 9623ba671c353a83597e514b05be01c022d7ee72 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Fri, 10 Jan 2025 16:30:38 +0000 Subject: [PATCH 04/10] chore: fix merge conflicts --- spartan/aztec-network/templates/boot-node.yaml | 4 ++-- .../src/aztec_resource_detector.ts | 2 +- .../telemetry-client/src/event_loop_monitor.ts | 14 ++++++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/spartan/aztec-network/templates/boot-node.yaml b/spartan/aztec-network/templates/boot-node.yaml index 0a883c24eaa3..ad0122920ae8 100644 --- a/spartan/aztec-network/templates/boot-node.yaml +++ b/spartan/aztec-network/templates/boot-node.yaml @@ -123,8 +123,6 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name - - name: OTEL_SERVICE_NAME - value: boot-node {{- end }} containers: - name: boot-node @@ -181,6 +179,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: OTEL_SERVICE_NAME + value: boot-node - name: NODE_OPTIONS value: "--max-old-space-size={{ .Values.bootNode.maxOldSpaceSize}}" - name: AZTEC_PORT diff --git a/yarn-project/telemetry-client/src/aztec_resource_detector.ts b/yarn-project/telemetry-client/src/aztec_resource_detector.ts index 39297e5a8e44..a16ee9ca1ec7 100644 --- a/yarn-project/telemetry-client/src/aztec_resource_detector.ts +++ b/yarn-project/telemetry-client/src/aztec_resource_detector.ts @@ -5,7 +5,6 @@ import { ATTR_SERVICE_INSTANCE_ID, } from '@opentelemetry/semantic-conventions/incubating'; -import { NETWORK_NAME } from './attributes.js'; import { getConfigEnvVars } from './config.js'; /** @@ -18,6 +17,7 @@ class AztecDetector implements DetectorSync { return new Resource({ [ATTR_K8S_POD_UID]: config.k8sPodUid, [ATTR_K8S_POD_NAME]: config.k8sPodName, + // this will get set by serviceInstanceIdDetector if not running in K8s [ATTR_SERVICE_INSTANCE_ID]: config.k8sPodUid, }); } diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index b66cc675581a..e41dd08f5e9d 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -34,17 +34,17 @@ export class EventLoopMonitor { } this.lastELU = performance.eventLoopUtilization(); - this.meter.addBatchObservableCallback(this.measureLag, [this.eventLoopUilization, this.eventLoopLag]); + this.meter.addBatchObservableCallback(this.measure, [this.eventLoopUilization, this.eventLoopLag]); } stop(): void { if (!this.started) { return; } - this.meter.removeBatchObservableCallback(this.measureLag, [this.eventLoopUilization, this.eventLoopLag]); + this.meter.removeBatchObservableCallback(this.measure, [this.eventLoopUilization, this.eventLoopLag]); } - private measureLag = async (obs: BatchObservableResult): Promise => { + private measure = async (obs: BatchObservableResult): Promise => { const newELU = performance.eventLoopUtilization(); const delta = performance.eventLoopUtilization(newELU, this.lastELU); this.lastELU = newELU; @@ -53,13 +53,19 @@ export class EventLoopMonitor { const { promise, resolve } = promiseWithResolvers(); // how long does it take to schedule the next macro task? // if this number spikes then we're (1) either blocking the event loop with long running sync code - // or (2) spamming the event loop with micro tasks + // (2) spamming the event loop with micro or macro tasks setImmediate(() => { resolve(timer.us()); }); const lag = await promise; obs.observe(this.eventLoopLag, Math.floor(lag)); + // `utilization` [0,1] represents how much the event loop is busy vs waiting for new events to come in + // This should be corelated with CPU usage to gauge the performance characteristics of services + // 100% utilization leads to high latency because the event loop is _always_ busy, there's no breathing room for events to be processed quickly. + // Docs and examples: + // - https://nodesource.com/blog/event-loop-utilization-nodejs + // - https://youtu.be/WetXnEPraYM obs.observe(this.eventLoopUilization, delta.utilization); }; } From a370c303dc1b79e63b117b340620909e74de2d64 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Fri, 10 Jan 2025 16:49:17 +0000 Subject: [PATCH 05/10] refactor: use monitorEventLoopDelay instead of custom solution --- .../src/event_loop_monitor.ts | 49 +++++++++++-------- yarn-project/telemetry-client/src/metrics.ts | 1 + 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index e41dd08f5e9d..6a97d5840dca 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -1,9 +1,6 @@ -import { promiseWithResolvers } from '@aztec/foundation/promise'; -import { Timer } from '@aztec/foundation/timer'; +import { type EventLoopUtilization, IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks'; -import { type EventLoopUtilization, performance } from 'node:perf_hooks'; - -import { EVENT_LOOP_LAG, EVENT_LOOP_UTILIZATION } from './metrics.js'; +import { EVENT_LOOP_LAG, EVENT_LOOP_LAG_MAX, EVENT_LOOP_UTILIZATION } from './metrics.js'; import { type BatchObservableResult, type Meter, type ObservableGauge, ValueType } from './telemetry.js'; /** @@ -11,21 +8,29 @@ import { type BatchObservableResult, type Meter, type ObservableGauge, ValueType */ export class EventLoopMonitor { private eventLoopLag: ObservableGauge; + private eventLoopLagMax: ObservableGauge; private eventLoopUilization: ObservableGauge; private started = false; private lastELU: EventLoopUtilization | undefined; + private eventLoopDelay: IntervalHistogram; constructor(private meter: Meter) { this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, { - unit: 'us', + unit: 'ns', + valueType: ValueType.INT, + description: 'Mean event loop delay', + }); + this.eventLoopLagMax = meter.createObservableGauge(EVENT_LOOP_LAG_MAX, { + unit: 'ns', valueType: ValueType.INT, - description: 'Latency to execute a macro task', + description: 'Max event loop delay', }); this.eventLoopUilization = meter.createObservableGauge(EVENT_LOOP_UTILIZATION, { valueType: ValueType.DOUBLE, description: 'How busy is the event loop', }); + this.eventLoopDelay = monitorEventLoopDelay(); } start(): void { @@ -34,14 +39,25 @@ export class EventLoopMonitor { } this.lastELU = performance.eventLoopUtilization(); - this.meter.addBatchObservableCallback(this.measure, [this.eventLoopUilization, this.eventLoopLag]); + this.meter.addBatchObservableCallback(this.measure, [ + this.eventLoopUilization, + this.eventLoopLag, + this.eventLoopLagMax, + ]); + this.eventLoopDelay.enable(); } stop(): void { if (!this.started) { return; } - this.meter.removeBatchObservableCallback(this.measure, [this.eventLoopUilization, this.eventLoopLag]); + this.meter.removeBatchObservableCallback(this.measure, [ + this.eventLoopUilization, + this.eventLoopLag, + this.eventLoopLagMax, + ]); + this.eventLoopDelay.disable(); + this.eventLoopDelay.reset(); } private measure = async (obs: BatchObservableResult): Promise => { @@ -49,17 +65,6 @@ export class EventLoopMonitor { const delta = performance.eventLoopUtilization(newELU, this.lastELU); this.lastELU = newELU; - const timer = new Timer(); - const { promise, resolve } = promiseWithResolvers(); - // how long does it take to schedule the next macro task? - // if this number spikes then we're (1) either blocking the event loop with long running sync code - // (2) spamming the event loop with micro or macro tasks - setImmediate(() => { - resolve(timer.us()); - }); - - const lag = await promise; - obs.observe(this.eventLoopLag, Math.floor(lag)); // `utilization` [0,1] represents how much the event loop is busy vs waiting for new events to come in // This should be corelated with CPU usage to gauge the performance characteristics of services // 100% utilization leads to high latency because the event loop is _always_ busy, there's no breathing room for events to be processed quickly. @@ -67,5 +72,9 @@ export class EventLoopMonitor { // - https://nodesource.com/blog/event-loop-utilization-nodejs // - https://youtu.be/WetXnEPraYM obs.observe(this.eventLoopUilization, delta.utilization); + + obs.observe(this.eventLoopLag, Math.floor(this.eventLoopDelay.mean)); + obs.observe(this.eventLoopLagMax, Math.floor(this.eventLoopDelay.max)); + this.eventLoopDelay.reset(); }; } diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index 27adad90306c..e078e99d6bb4 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -128,3 +128,4 @@ export const VALIDATOR_FAILED_REEXECUTION_COUNT = 'aztec.validator.failed_reexec export const EVENT_LOOP_UTILIZATION = 'aztec.event_loop_utilization'; export const EVENT_LOOP_LAG = 'aztec.event_loop_lag'; +export const EVENT_LOOP_LAG_MAX = 'aztec.event_loop_lag_max'; From f7111262903d2f87b510d9ef67269b23b0746bd7 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Fri, 10 Jan 2025 18:32:08 +0000 Subject: [PATCH 06/10] feat: implement recommended OTEL Nodejs metrics --- .../telemetry-client/src/attributes.ts | 2 + .../src/event_loop_monitor.ts | 79 ++++++++++++++----- yarn-project/telemetry-client/src/metrics.ts | 13 ++- 3 files changed, 71 insertions(+), 23 deletions(-) diff --git a/yarn-project/telemetry-client/src/attributes.ts b/yarn-project/telemetry-client/src/attributes.ts index 6fa7d7c32677..56b1105052b7 100644 --- a/yarn-project/telemetry-client/src/attributes.ts +++ b/yarn-project/telemetry-client/src/attributes.ts @@ -105,3 +105,5 @@ export const REVERTIBILITY = 'aztec.revertibility'; export const GAS_DIMENSION = 'aztec.gas_dimension'; export const WORLD_STATE_REQUEST_TYPE = 'aztec.world_state_request'; + +export const NODEJS_EVENT_LOOP_STATE = 'nodejs.eventloop.state'; diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index 6a97d5840dca..041367840849 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -1,35 +1,66 @@ import { type EventLoopUtilization, IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks'; -import { EVENT_LOOP_LAG, EVENT_LOOP_LAG_MAX, EVENT_LOOP_UTILIZATION } from './metrics.js'; -import { type BatchObservableResult, type Meter, type ObservableGauge, ValueType } from './telemetry.js'; +import { NODEJS_EVENT_LOOP_STATE } from './attributes.js'; +import * as Metrics from './metrics.js'; +import { + type BatchObservableResult, + type Meter, + type ObservableGauge, + ObservableUpDownCounter, + ValueType, +} from './telemetry.js'; /** * Detector for custom Aztec attributes */ export class EventLoopMonitor { - private eventLoopLag: ObservableGauge; - private eventLoopLagMax: ObservableGauge; + private eventLoopDelayGauges: { + min: ObservableGauge; + max: ObservableGauge; + mean: ObservableGauge; + stddev: ObservableGauge; + p50: ObservableGauge; + p90: ObservableGauge; + p99: ObservableGauge; + }; + private eventLoopUilization: ObservableGauge; + private eventLoopTime: ObservableUpDownCounter; + private started = false; private lastELU: EventLoopUtilization | undefined; private eventLoopDelay: IntervalHistogram; constructor(private meter: Meter) { - this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, { - unit: 'ns', - valueType: ValueType.INT, - description: 'Mean event loop delay', + const nsObsGauge = (name: (typeof Metrics)[keyof typeof Metrics], description: string) => + meter.createObservableGauge(name, { + unit: 'ns', + valueType: ValueType.INT, + description, + }); + + this.eventLoopDelayGauges = { + min: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MIN, 'Minimum delay of the event loop'), + mean: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MEAN, 'Mean delay of the event loop'), + max: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MAX, 'Max delay of the event loop'), + stddev: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_STDDEV, 'Stddev delay of the event loop'), + p50: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P50, 'P50 delay of the event loop'), + p90: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P90, 'P90 delay of the event loop'), + p99: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P99, 'P99 delay of the event loop'), + }; + + this.eventLoopUilization = meter.createObservableGauge(Metrics.NODEJS_EVENT_LOOP_UTILIZATION, { + valueType: ValueType.DOUBLE, + description: 'How busy is the event loop', }); - this.eventLoopLagMax = meter.createObservableGauge(EVENT_LOOP_LAG_MAX, { - unit: 'ns', + + this.eventLoopTime = meter.createObservableUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, { + unit: 'ms', valueType: ValueType.INT, - description: 'Max event loop delay', - }); - this.eventLoopUilization = meter.createObservableGauge(EVENT_LOOP_UTILIZATION, { - valueType: ValueType.DOUBLE, description: 'How busy is the event loop', }); + this.eventLoopDelay = monitorEventLoopDelay(); } @@ -41,8 +72,8 @@ export class EventLoopMonitor { this.lastELU = performance.eventLoopUtilization(); this.meter.addBatchObservableCallback(this.measure, [ this.eventLoopUilization, - this.eventLoopLag, - this.eventLoopLagMax, + this.eventLoopTime, + ...Object.values(this.eventLoopDelayGauges), ]); this.eventLoopDelay.enable(); } @@ -53,8 +84,8 @@ export class EventLoopMonitor { } this.meter.removeBatchObservableCallback(this.measure, [ this.eventLoopUilization, - this.eventLoopLag, - this.eventLoopLagMax, + this.eventLoopTime, + ...Object.values(this.eventLoopDelayGauges), ]); this.eventLoopDelay.disable(); this.eventLoopDelay.reset(); @@ -72,9 +103,17 @@ export class EventLoopMonitor { // - https://nodesource.com/blog/event-loop-utilization-nodejs // - https://youtu.be/WetXnEPraYM obs.observe(this.eventLoopUilization, delta.utilization); + obs.observe(this.eventLoopTime, Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' }); + obs.observe(this.eventLoopTime, Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' }); + + obs.observe(this.eventLoopDelayGauges.min, Math.floor(this.eventLoopDelay.min)); + obs.observe(this.eventLoopDelayGauges.mean, Math.floor(this.eventLoopDelay.mean)); + obs.observe(this.eventLoopDelayGauges.max, Math.floor(this.eventLoopDelay.max)); + obs.observe(this.eventLoopDelayGauges.stddev, Math.floor(this.eventLoopDelay.stddev)); + obs.observe(this.eventLoopDelayGauges.p50, Math.floor(this.eventLoopDelay.percentile(50))); + obs.observe(this.eventLoopDelayGauges.p90, Math.floor(this.eventLoopDelay.percentile(90))); + obs.observe(this.eventLoopDelayGauges.p99, Math.floor(this.eventLoopDelay.percentile(99))); - obs.observe(this.eventLoopLag, Math.floor(this.eventLoopDelay.mean)); - obs.observe(this.eventLoopLagMax, Math.floor(this.eventLoopDelay.max)); this.eventLoopDelay.reset(); }; } diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index e078e99d6bb4..657bd7e0c043 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -126,6 +126,13 @@ export const PROOF_VERIFIER_COUNT = 'aztec.proof_verifier.count'; export const VALIDATOR_RE_EXECUTION_TIME = 'aztec.validator.re_execution_time'; export const VALIDATOR_FAILED_REEXECUTION_COUNT = 'aztec.validator.failed_reexecution_count'; -export const EVENT_LOOP_UTILIZATION = 'aztec.event_loop_utilization'; -export const EVENT_LOOP_LAG = 'aztec.event_loop_lag'; -export const EVENT_LOOP_LAG_MAX = 'aztec.event_loop_lag_max'; +export const NODEJS_EVENT_LOOP_DELAY_MIN = 'nodejs.eventloop.delay.min'; +export const NODEJS_EVENT_LOOP_DELAY_MEAN = 'nodejs.eventloop.delay.mean'; +export const NODEJS_EVENT_LOOP_DELAY_MAX = 'nodejs.eventloop.delay.max'; +export const NODEJS_EVENT_LOOP_DELAY_STDDEV = 'nodejs.eventloop.delay.stddev'; +export const NODEJS_EVENT_LOOP_DELAY_P50 = 'nodejs.eventloop.delay.p50'; +export const NODEJS_EVENT_LOOP_DELAY_P90 = 'nodejs.eventloop.delay.p90'; +export const NODEJS_EVENT_LOOP_DELAY_P99 = 'nodejs.eventloop.delay.p99'; + +export const NODEJS_EVENT_LOOP_UTILIZATION = 'nodejs.eventloop.utilization'; +export const NODEJS_EVENT_LOOP_TIME = 'nodejs.eventloop.time'; From 13b17b44e07a42808f1ad3c7d2c0fd8c15535745 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Sat, 11 Jan 2025 09:30:43 +0000 Subject: [PATCH 07/10] refactor: event loop time is a counter --- .../telemetry-client/src/event_loop_monitor.ts | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index 041367840849..3755c08d5ed6 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -6,7 +6,7 @@ import { type BatchObservableResult, type Meter, type ObservableGauge, - ObservableUpDownCounter, + type UpDownCounter, ValueType, } from './telemetry.js'; @@ -25,7 +25,7 @@ export class EventLoopMonitor { }; private eventLoopUilization: ObservableGauge; - private eventLoopTime: ObservableUpDownCounter; + private eventLoopTime: UpDownCounter; private started = false; @@ -55,7 +55,7 @@ export class EventLoopMonitor { description: 'How busy is the event loop', }); - this.eventLoopTime = meter.createObservableUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, { + this.eventLoopTime = meter.createUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, { unit: 'ms', valueType: ValueType.INT, description: 'How busy is the event loop', @@ -72,7 +72,6 @@ export class EventLoopMonitor { this.lastELU = performance.eventLoopUtilization(); this.meter.addBatchObservableCallback(this.measure, [ this.eventLoopUilization, - this.eventLoopTime, ...Object.values(this.eventLoopDelayGauges), ]); this.eventLoopDelay.enable(); @@ -84,7 +83,6 @@ export class EventLoopMonitor { } this.meter.removeBatchObservableCallback(this.measure, [ this.eventLoopUilization, - this.eventLoopTime, ...Object.values(this.eventLoopDelayGauges), ]); this.eventLoopDelay.disable(); @@ -103,8 +101,9 @@ export class EventLoopMonitor { // - https://nodesource.com/blog/event-loop-utilization-nodejs // - https://youtu.be/WetXnEPraYM obs.observe(this.eventLoopUilization, delta.utilization); - obs.observe(this.eventLoopTime, Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' }); - obs.observe(this.eventLoopTime, Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' }); + + this.eventLoopTime.add(Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' }); + this.eventLoopTime.add(Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' }); obs.observe(this.eventLoopDelayGauges.min, Math.floor(this.eventLoopDelay.min)); obs.observe(this.eventLoopDelayGauges.mean, Math.floor(this.eventLoopDelay.mean)); From 961053e298760b1dd84c89e703b34896365e6e72 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Sun, 12 Jan 2025 18:23:34 +0000 Subject: [PATCH 08/10] chore: lint --- yarn-project/telemetry-client/src/event_loop_monitor.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index 3755c08d5ed6..384a023712b7 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -1,4 +1,4 @@ -import { type EventLoopUtilization, IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks'; +import { type EventLoopUtilization, type IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks'; import { NODEJS_EVENT_LOOP_STATE } from './attributes.js'; import * as Metrics from './metrics.js'; @@ -89,7 +89,7 @@ export class EventLoopMonitor { this.eventLoopDelay.reset(); } - private measure = async (obs: BatchObservableResult): Promise => { + private measure = (obs: BatchObservableResult): void => { const newELU = performance.eventLoopUtilization(); const delta = performance.eventLoopUtilization(newELU, this.lastELU); this.lastELU = newELU; From 45d2ed9df4b158acc248d1fcd2488c0bd54dd7ee Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Tue, 14 Jan 2025 15:15:38 +0000 Subject: [PATCH 09/10] chore: description --- yarn-project/telemetry-client/src/event_loop_monitor.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yarn-project/telemetry-client/src/event_loop_monitor.ts b/yarn-project/telemetry-client/src/event_loop_monitor.ts index 384a023712b7..748b893fcb8b 100644 --- a/yarn-project/telemetry-client/src/event_loop_monitor.ts +++ b/yarn-project/telemetry-client/src/event_loop_monitor.ts @@ -58,7 +58,7 @@ export class EventLoopMonitor { this.eventLoopTime = meter.createUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, { unit: 'ms', valueType: ValueType.INT, - description: 'How busy is the event loop', + description: 'How much time the event loop has spent in a given state', }); this.eventLoopDelay = monitorEventLoopDelay(); @@ -70,11 +70,11 @@ export class EventLoopMonitor { } this.lastELU = performance.eventLoopUtilization(); + this.eventLoopDelay.enable(); this.meter.addBatchObservableCallback(this.measure, [ this.eventLoopUilization, ...Object.values(this.eventLoopDelayGauges), ]); - this.eventLoopDelay.enable(); } stop(): void { @@ -87,6 +87,7 @@ export class EventLoopMonitor { ]); this.eventLoopDelay.disable(); this.eventLoopDelay.reset(); + this.lastELU = undefined; } private measure = (obs: BatchObservableResult): void => { From c56a813eea7b11eacf0a32a628d274af7558df57 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Tue, 14 Jan 2025 15:47:27 +0000 Subject: [PATCH 10/10] chore: disable gcp resource detector temporarily --- yarn-project/telemetry-client/src/otel_resource.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarn-project/telemetry-client/src/otel_resource.ts b/yarn-project/telemetry-client/src/otel_resource.ts index 83b14383b512..9b777d09aebc 100644 --- a/yarn-project/telemetry-client/src/otel_resource.ts +++ b/yarn-project/telemetry-client/src/otel_resource.ts @@ -1,4 +1,4 @@ -import { gcpDetector } from '@opentelemetry/resource-detector-gcp'; +// import { gcpDetector } from '@opentelemetry/resource-detector-gcp'; import { type IResource, detectResourcesSync, @@ -17,7 +17,7 @@ export async function getOtelResource(): Promise { envDetectorSync, processDetectorSync, serviceInstanceIdDetectorSync, - gcpDetector, + // gcpDetector, aztecDetector, ], });