-
Notifications
You must be signed in to change notification settings - Fork 615
feat: track nodejs runtime metrics #11160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
4277e64
9a09f3d
c18208e
9623ba6
a370c30
f711126
13b17b4
961053e
45d2ed9
c56a813
9635cb9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not for this PR, but do we know whether this event loop monitoring causes extra overhead? Should we have an env var to toggle this monitoring?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is all done on the C++ side and should have minimal impact. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,49 +1,119 @@ | ||
| import { promiseWithResolvers } from '@aztec/foundation/promise'; | ||
| import { Timer } from '@aztec/foundation/timer'; | ||
| import { type EventLoopUtilization, type IntervalHistogram, monitorEventLoopDelay, performance } from 'node:perf_hooks'; | ||
|
|
||
| import { EVENT_LOOP_LAG } from './metrics.js'; | ||
| import { type Meter, type ObservableGauge, type ObservableResult, ValueType } from './telemetry.js'; | ||
| import { NODEJS_EVENT_LOOP_STATE } from './attributes.js'; | ||
| import * as Metrics from './metrics.js'; | ||
| import { | ||
| type BatchObservableResult, | ||
| type Meter, | ||
| type ObservableGauge, | ||
| type UpDownCounter, | ||
| ValueType, | ||
| } from './telemetry.js'; | ||
|
|
||
| /** | ||
| * Detector for custom Aztec attributes | ||
| */ | ||
| export class EventLoopMonitor { | ||
| private eventLoopLag: ObservableGauge; | ||
| private eventLoopDelayGauges: { | ||
| min: ObservableGauge; | ||
| max: ObservableGauge; | ||
| mean: ObservableGauge; | ||
| stddev: ObservableGauge; | ||
| p50: ObservableGauge; | ||
| p90: ObservableGauge; | ||
| p99: ObservableGauge; | ||
| }; | ||
|
|
||
| private eventLoopUilization: ObservableGauge; | ||
| private eventLoopTime: UpDownCounter; | ||
|
|
||
| private started = false; | ||
|
|
||
| constructor(meter: Meter) { | ||
| this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, { | ||
| unit: 'us', | ||
| valueType: ValueType.INT, | ||
| private lastELU: EventLoopUtilization | undefined; | ||
| private eventLoopDelay: IntervalHistogram; | ||
|
|
||
| constructor(private meter: Meter) { | ||
| const nsObsGauge = (name: (typeof Metrics)[keyof typeof Metrics], description: string) => | ||
| meter.createObservableGauge(name, { | ||
| unit: 'ns', | ||
| valueType: ValueType.INT, | ||
| description, | ||
| }); | ||
|
|
||
| this.eventLoopDelayGauges = { | ||
| min: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MIN, 'Minimum delay of the event loop'), | ||
| mean: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MEAN, 'Mean delay of the event loop'), | ||
| max: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_MAX, 'Max delay of the event loop'), | ||
| stddev: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_STDDEV, 'Stddev delay of the event loop'), | ||
| p50: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P50, 'P50 delay of the event loop'), | ||
| p90: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P90, 'P90 delay of the event loop'), | ||
| p99: nsObsGauge(Metrics.NODEJS_EVENT_LOOP_DELAY_P99, 'P99 delay of the event loop'), | ||
| }; | ||
|
Comment on lines
+43
to
+51
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Curious: is it better to have multiple gauges, or a single gauge where we differentiate measurements using an attribute?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it depends on what we're measuring, in this case I feel like min/max/mean/percentiles measure different things even though the source of data is the same |
||
|
|
||
| this.eventLoopUilization = meter.createObservableGauge(Metrics.NODEJS_EVENT_LOOP_UTILIZATION, { | ||
| valueType: ValueType.DOUBLE, | ||
| description: 'How busy is the event loop', | ||
| }); | ||
|
|
||
| this.eventLoopTime = meter.createUpDownCounter(Metrics.NODEJS_EVENT_LOOP_TIME, { | ||
| unit: 'ms', | ||
| valueType: ValueType.INT, | ||
| description: 'How much time the event loop has spent in a given state', | ||
| }); | ||
|
|
||
| this.eventLoopDelay = monitorEventLoopDelay(); | ||
| } | ||
|
|
||
| start(): void { | ||
| if (this.started) { | ||
| return; | ||
| } | ||
| this.eventLoopLag.addCallback(this.measureLag); | ||
|
|
||
| this.lastELU = performance.eventLoopUtilization(); | ||
| this.eventLoopDelay.enable(); | ||
| this.meter.addBatchObservableCallback(this.measure, [ | ||
| this.eventLoopUilization, | ||
| ...Object.values(this.eventLoopDelayGauges), | ||
| ]); | ||
| } | ||
|
|
||
| stop(): void { | ||
| if (!this.started) { | ||
| return; | ||
| } | ||
| this.eventLoopLag.removeCallback(this.measureLag); | ||
| this.meter.removeBatchObservableCallback(this.measure, [ | ||
| this.eventLoopUilization, | ||
| ...Object.values(this.eventLoopDelayGauges), | ||
| ]); | ||
| this.eventLoopDelay.disable(); | ||
| this.eventLoopDelay.reset(); | ||
| this.lastELU = undefined; | ||
| } | ||
|
|
||
| private measureLag = async (obs: ObservableResult): Promise<void> => { | ||
| const timer = new Timer(); | ||
| const { promise, resolve } = promiseWithResolvers<number>(); | ||
| // how long does it take to schedule the next macro task? | ||
| // if this number spikes then we're (1) either blocking the event loop with long running sync code | ||
| // or (2) spamming the event loop with micro tasks | ||
| setImmediate(() => { | ||
| resolve(timer.us()); | ||
| }); | ||
| private measure = (obs: BatchObservableResult): void => { | ||
| const newELU = performance.eventLoopUtilization(); | ||
| const delta = performance.eventLoopUtilization(newELU, this.lastELU); | ||
| this.lastELU = newELU; | ||
|
|
||
| // `utilization` [0,1] represents how much the event loop is busy vs waiting for new events to come in | ||
| // This should be corelated with CPU usage to gauge the performance characteristics of services | ||
| // 100% utilization leads to high latency because the event loop is _always_ busy, there's no breathing room for events to be processed quickly. | ||
| // Docs and examples: | ||
| // - https://nodesource.com/blog/event-loop-utilization-nodejs | ||
| // - https://youtu.be/WetXnEPraYM | ||
| obs.observe(this.eventLoopUilization, delta.utilization); | ||
|
|
||
| this.eventLoopTime.add(Math.floor(delta.idle), { [NODEJS_EVENT_LOOP_STATE]: 'idle' }); | ||
| this.eventLoopTime.add(Math.floor(delta.active), { [NODEJS_EVENT_LOOP_STATE]: 'active' }); | ||
|
|
||
| obs.observe(this.eventLoopDelayGauges.min, Math.floor(this.eventLoopDelay.min)); | ||
| obs.observe(this.eventLoopDelayGauges.mean, Math.floor(this.eventLoopDelay.mean)); | ||
| obs.observe(this.eventLoopDelayGauges.max, Math.floor(this.eventLoopDelay.max)); | ||
| obs.observe(this.eventLoopDelayGauges.stddev, Math.floor(this.eventLoopDelay.stddev)); | ||
| obs.observe(this.eventLoopDelayGauges.p50, Math.floor(this.eventLoopDelay.percentile(50))); | ||
| obs.observe(this.eventLoopDelayGauges.p90, Math.floor(this.eventLoopDelay.percentile(90))); | ||
| obs.observe(this.eventLoopDelayGauges.p99, Math.floor(this.eventLoopDelay.percentile(99))); | ||
|
|
||
| const lag = await promise; | ||
| obs.observe(Math.floor(lag)); | ||
| this.eventLoopDelay.reset(); | ||
| }; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import { gcpDetector } from '@opentelemetry/resource-detector-gcp'; | ||
| import { | ||
| type IResource, | ||
| detectResourcesSync, | ||
|
|
@@ -11,7 +12,14 @@ import { aztecDetector } from './aztec_resource_detector.js'; | |
|
|
||
| export async function getOtelResource(): Promise<IResource> { | ||
| const resource = detectResourcesSync({ | ||
| detectors: [osDetectorSync, envDetectorSync, processDetectorSync, serviceInstanceIdDetectorSync, aztecDetector], | ||
| detectors: [ | ||
| osDetectorSync, | ||
| envDetectorSync, | ||
| processDetectorSync, | ||
| serviceInstanceIdDetectorSync, | ||
| gcpDetector, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we gate this gcp detector behind Mitch's GCP observability flag, once that's merged?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can comment it out for now. cc @just-mitch |
||
| aztecDetector, | ||
| ], | ||
| }); | ||
|
|
||
| if (resource.asyncAttributesPending) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we get this from the
metadata.name? No need to change now.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could but that would be the pod name iirc (including random hash at the end)