Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/boot-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: boot-node
Comment on lines +182 to +183

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get this from the metadata.name? No need to change now.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could but that would be the pod name iirc (including random hash at the end)

- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.bootNode.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand Down
10 changes: 10 additions & 0 deletions spartan/aztec-network/templates/faucet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ spec:
value: "1"
- name: LOG_LEVEL
value: "{{ .Values.faucet.logLevel }}"
- name: K8S_POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
- name: K8S_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: faucet
ports:
- name: http
containerPort: {{ .Values.faucet.service.nodePort }}
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/prover-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-agent
- name: AZTEC_PORT
value: "{{ .Values.proverAgent.service.nodePort }}"
- name: LOG_LEVEL
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/prover-broker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-broker
- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.proverBroker.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/prover-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-node
- name: POD_IP
valueFrom:
fieldRef:
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/pxe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: pxe
- name: AZTEC_PORT
value: "{{ .Values.pxe.service.nodePort }}"
- name: LOG_JSON
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/transaction-bot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: bot
- name: AZTEC_PORT
value: "{{ .Values.bot.service.nodePort }}"
- name: LOG_JSON
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: validator
- name: POD_IP
valueFrom:
fieldRef:
Expand Down
2 changes: 1 addition & 1 deletion yarn-project/telemetry-client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"@opentelemetry/exporter-trace-otlp-http": "^0.55.0",
"@opentelemetry/host-metrics": "^0.35.4",
"@opentelemetry/otlp-exporter-base": "^0.55.0",
"@opentelemetry/resource-detector-aws": "^1.8.0",
"@opentelemetry/resource-detector-gcp": "^0.32.0",
"@opentelemetry/resources": "^1.28.0",
"@opentelemetry/sdk-logs": "^0.55.0",
"@opentelemetry/sdk-metrics": "^1.28.0",
Expand Down
2 changes: 2 additions & 0 deletions yarn-project/telemetry-client/src/attributes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,5 @@ export const REVERTIBILITY = 'aztec.revertibility';
export const GAS_DIMENSION = 'aztec.gas_dimension';

export const WORLD_STATE_REQUEST_TYPE = 'aztec.world_state_request';

export const NODEJS_EVENT_LOOP_STATE = 'nodejs.eventloop.state';
10 changes: 7 additions & 3 deletions yarn-project/telemetry-client/src/aztec_resource_detector.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import { type DetectorSync, type IResource, Resource } from '@opentelemetry/resources';
import { ATTR_K8S_POD_NAME, ATTR_K8S_POD_UID } from '@opentelemetry/semantic-conventions/incubating';
import {
ATTR_K8S_POD_NAME,
ATTR_K8S_POD_UID,
ATTR_SERVICE_INSTANCE_ID,
} from '@opentelemetry/semantic-conventions/incubating';

import { NETWORK_NAME } from './attributes.js';
import { getConfigEnvVars } from './config.js';

/**
Expand All @@ -12,9 +15,10 @@ class AztecDetector implements DetectorSync {
const config = getConfigEnvVars();

return new Resource({
[NETWORK_NAME]: config.networkName,
[ATTR_K8S_POD_UID]: config.k8sPodUid,
[ATTR_K8S_POD_NAME]: config.k8sPodName,
// this will get set by serviceInstanceIdDetector if not running in K8s
[ATTR_SERVICE_INSTANCE_ID]: config.k8sPodUid,
});
}
}
Expand Down
114 changes: 92 additions & 22 deletions yarn-project/telemetry-client/src/event_loop_monitor.ts

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for this PR, but do we know whether this event loop monitoring causes extra overhead? Should we have an env var to toggle this monitoring?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all done on the C++ side and should have minimal impact.

Original file line number Diff line number Diff line change
@@ -1,49 +1,119 @@
import { promiseWithResolvers } from '@aztec/foundation/promise';
import { Timer } from '@aztec/foundation/timer';
import { type EventLoopUtilization, type IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks';

import { EVENT_LOOP_LAG } from './metrics.js';
import { type Meter, type ObservableGauge, type ObservableResult, ValueType } from './telemetry.js';
import { NODEJS_EVENT_LOOP_STATE } from './attributes.js';
import * as Metrics from './metrics.js';
import {
type BatchObservableResult,
type Meter,
type ObservableGauge,
type UpDownCounter,
ValueType,
} from './telemetry.js';

/**
* Detector for custom Aztec attributes
*/
export class EventLoopMonitor {
private eventLoopLag: ObservableGauge;
private eventLoopDelayGauges: {
min: ObservableGauge;
max: ObservableGauge;
mean: ObservableGauge;
stddev: ObservableGauge;
p50: ObservableGauge;
p90: ObservableGauge;
p99: ObservableGauge;
};

private eventLoopUilization: ObservableGauge;
private eventLoopTime: UpDownCounter;

private started = false;

constructor(meter: Meter) {
this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, {
unit: 'us',
valueType: ValueType.INT,
private lastELU: EventLoopUtilization | undefined;
private eventLoopDelay: IntervalHistogram;

constructor(private meter: Meter) {
const nsObsGauge = (name: (typeof Metrics)[keyof typeof Metrics], description: string) =>
meter.createObservableGauge(name, {
unit: 'ns',
valueType: ValueType.INT,
description,
});

this.eventLoopDelayGauges = {
min: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MIN, 'Minimum delay of the event loop'),
mean: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MEAN, 'Mean delay of the event loop'),
max: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MAX, 'Max delay of the event loop'),
stddev: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_STDDEV, 'Stddev delay of the event loop'),
p50: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P50, 'P50 delay of the event loop'),
p90: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P90, 'P90 delay of the event loop'),
p99: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P99, 'P99 delay of the event loop'),
};
Comment on lines +43 to +51

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious: is it better to have multiple gauges, or a single gauge where we differentiate measurements using an attribute?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it depends on what we're measuring, in this case I feel like min/max/mean/percentiles measure different things even though the source of data is the same


this.eventLoopUilization = meter.createObservableGauge(Metrics.NODEJS_EVENT_LOOP_UTILIZATION, {
valueType: ValueType.DOUBLE,
description: 'How busy is the event loop',
});

this.eventLoopTime = meter.createUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, {
unit: 'ms',
valueType: ValueType.INT,
description: 'How much time the event loop has spent in a given state',
});

this.eventLoopDelay = monitorEventLoopDelay();
}

start(): void {
if (this.started) {
return;
}
this.eventLoopLag.addCallback(this.measureLag);

this.lastELU = performance.eventLoopUtilization();
this.eventLoopDelay.enable();
this.meter.addBatchObservableCallback(this.measure, [
this.eventLoopUilization,
...Object.values(this.eventLoopDelayGauges),
]);
}

stop(): void {
if (!this.started) {
return;
}
this.eventLoopLag.removeCallback(this.measureLag);
this.meter.removeBatchObservableCallback(this.measure, [
this.eventLoopUilization,
...Object.values(this.eventLoopDelayGauges),
]);
this.eventLoopDelay.disable();
this.eventLoopDelay.reset();
this.lastELU = undefined;
}

private measureLag = async (obs: ObservableResult): Promise<void> => {
const timer = new Timer();
const { promise, resolve } = promiseWithResolvers<number>();
// how long does it take to schedule the next macro task?
// if this number spikes then we're (1) either blocking the event loop with long running sync code
// or (2) spamming the event loop with micro tasks
setImmediate(() => {
resolve(timer.us());
});
private measure = (obs: BatchObservableResult): void => {
const newELU = performance.eventLoopUtilization();
const delta = performance.eventLoopUtilization(newELU, this.lastELU);
this.lastELU = newELU;

// `utilization` [0,1] represents how much the event loop is busy vs waiting for new events to come in
// This should be corelated with CPU usage to gauge the performance characteristics of services
// 100% utilization leads to high latency because the event loop is _always_ busy, there's no breathing room for events to be processed quickly.
// Docs and examples:
// - https://nodesource.com/blog/event-loop-utilization-nodejs
// - https://youtu.be/WetXnEPraYM
obs.observe(this.eventLoopUilization, delta.utilization);

this.eventLoopTime.add(Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' });
this.eventLoopTime.add(Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' });

obs.observe(this.eventLoopDelayGauges.min, Math.floor(this.eventLoopDelay.min));
obs.observe(this.eventLoopDelayGauges.mean, Math.floor(this.eventLoopDelay.mean));
obs.observe(this.eventLoopDelayGauges.max, Math.floor(this.eventLoopDelay.max));
obs.observe(this.eventLoopDelayGauges.stddev, Math.floor(this.eventLoopDelay.stddev));
obs.observe(this.eventLoopDelayGauges.p50, Math.floor(this.eventLoopDelay.percentile(50)));
obs.observe(this.eventLoopDelayGauges.p90, Math.floor(this.eventLoopDelay.percentile(90)));
obs.observe(this.eventLoopDelayGauges.p99, Math.floor(this.eventLoopDelay.percentile(99)));

const lag = await promise;
obs.observe(Math.floor(lag));
this.eventLoopDelay.reset();
};
}
11 changes: 10 additions & 1 deletion yarn-project/telemetry-client/src/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,13 @@ export const PROOF_VERIFIER_COUNT = 'aztec.proof_verifier.count';
export const VALIDATOR_RE_EXECUTION_TIME = 'aztec.validator.re_execution_time';
export const VALIDATOR_FAILED_REEXECUTION_COUNT = 'aztec.validator.failed_reexecution_count';

export const EVENT_LOOP_LAG = 'aztec.event_loop_lag';
export const NODEJS_EVENT_LOOP_DELAY_MIN = 'nodejs.eventloop.delay.min';
export const NODEJS_EVENT_LOOP_DELAY_MEAN = 'nodejs.eventloop.delay.mean';
export const NODEJS_EVENT_LOOP_DELAY_MAX = 'nodejs.eventloop.delay.max';
export const NODEJS_EVENT_LOOP_DELAY_STDDEV = 'nodejs.eventloop.delay.stddev';
export const NODEJS_EVENT_LOOP_DELAY_P50 = 'nodejs.eventloop.delay.p50';
export const NODEJS_EVENT_LOOP_DELAY_P90 = 'nodejs.eventloop.delay.p90';
export const NODEJS_EVENT_LOOP_DELAY_P99 = 'nodejs.eventloop.delay.p99';

export const NODEJS_EVENT_LOOP_UTILIZATION = 'nodejs.eventloop.utilization';
export const NODEJS_EVENT_LOOP_TIME = 'nodejs.eventloop.time';
10 changes: 9 additions & 1 deletion yarn-project/telemetry-client/src/otel_resource.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { gcpDetector } from '@opentelemetry/resource-detector-gcp';
import {
type IResource,
detectResourcesSync,
Expand All @@ -11,7 +12,14 @@ import { aztecDetector } from './aztec_resource_detector.js';

export async function getOtelResource(): Promise<IResource> {
const resource = detectResourcesSync({
detectors: [osDetectorSync, envDetectorSync, processDetectorSync, serviceInstanceIdDetectorSync, aztecDetector],
detectors: [
osDetectorSync,
envDetectorSync,
processDetectorSync,
serviceInstanceIdDetectorSync,
gcpDetector,

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we gate this gcp detector behind Mitch's GCP observability flag, once that's merged?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can comment it out for now. cc @just-mitch

aztecDetector,
],
});

if (resource.asyncAttributesPending) {
Expand Down
Loading