Merge pull request #2115 from josemore/jmore/nvidia-gpu
feat: add nvidia gpu monitoring quickstart
Showing 7 changed files with 274 additions and 0 deletions.
@@ -0,0 +1,27 @@
name: High GPU Memory Utilization

description: |+
  This alert is triggered when the Nvidia GPU memory utilization is above 90%.
type: STATIC
nrql:
  query: "SELECT latest(utilization.memory.percent) FROM NvidiaGpuSample"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 90
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
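For illustration, the same condition schema can be pointed at other NvidiaGpuSample attributes that the dashboard below queries. The following is a hedged sketch only and is not part of this change: the metric (temperature.gpu) is taken from the dashboard's "Temps" widget, while the threshold and duration are assumed example values that should be tuned to the specific GPU model.

name: High GPU Temperature

description: |+
  Illustrative only: triggers when the reported GPU temperature stays above the chosen threshold.
type: STATIC
nrql:
  # temperature.gpu is the attribute the dashboard's "Temps" widget reads from NvidiaGpuSample
  query: "SELECT latest(temperature.gpu) FROM NvidiaGpuSample"

valueFunction: SINGLE_VALUE

terms:
  - priority: CRITICAL
    operator: ABOVE
    # Assumed example value; pick a limit appropriate for the GPU model in use
    threshold: 90
    thresholdDuration: 300
    thresholdOccurrences: ALL

violationTimeLimitSeconds: 86400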
@@ -0,0 +1,190 @@
{
  "name": "Nvidia GPU Monitoring",
  "description": null,
  "pages": [
    {
      "name": "Nvidia GPU Monitoring",
      "description": null,
      "widgets": [
        {
          "visualization": {
            "id": "viz.markdown"
          },
          "layout": {
            "column": 1,
            "row": 1,
            "height": 1,
            "width": 4
          },
          "title": "",
          "rawConfiguration": {
            "text": "[![NVIDIA SMI](https://logos-download.com/wp-content/uploads/2016/10/Nvidia_logo.png)](https://developer.nvidia.com/nvidia-system-management-interface)\n"
          }
        },
        {
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 5,
            "row": 1,
            "height": 3,
            "width": 2
          },
          "title": "Current Clock Speeds",
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(clocks.current.memory.MHz) as 'Memory MHz', latest(clocks.current.graphics.MHz) as 'Graphics MHz', latest(clocks.current.video.MHz) as 'Video MHz', latest(clocks.current.sm.MHz) as 'SM MHz'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "visualization": {
            "id": "viz.line"
          },
          "layout": {
            "column": 7,
            "row": 1,
            "height": 3,
            "width": 6
          },
          "title": "Current Clock MHz",
          "rawConfiguration": {
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(clocks.current.memory.MHz) as 'Memory MHz', latest(clocks.current.graphics.MHz) as 'Graphics MHz', latest(clocks.current.video.MHz) as 'Video MHz', latest(clocks.current.sm.MHz) as 'SM MHz' TIMESERIES"
              }
            ],
            "yAxisLeft": {
              "zero": true
            }
          }
        },
        {
          "visualization": {
            "id": "viz.bar"
          },
          "layout": {
            "column": 1,
            "row": 2,
            "height": 2,
            "width": 4
          },
          "title": "Select GPU",
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(name) FACET pci.device_id, hostname"
              }
            ]
          }
        },
        {
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 1,
            "row": 4,
            "height": 3,
            "width": 2
          },
          "title": "Temps",
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(temperature.gpu) as 'GPU Temp', latest(temperature.memory) as 'Memory Temp', latest(fan.speed.percent) as 'Fan speed %'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 3,
            "row": 4,
            "height": 3,
            "width": 2
          },
          "title": "Power Usage",
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(power.draw.watts) as 'Power Draw Watts', latest(`power.limit.watts`) as 'Power Limit Watts', latest(power.draw.watts)/latest(`power.limit.watts`) * 100 as 'Power usage %'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 5,
            "row": 4,
            "height": 3,
            "width": 2
          },
          "title": "Memory Usage",
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(memory.free.MiB) as 'Memory Free MiB', latest(memory.used.MiB) as 'Memory Used MiB', latest(memory.total.MiB) as 'Memory Total MiB'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "visualization": {
            "id": "viz.line"
          },
          "layout": {
            "column": 7,
            "row": 4,
            "height": 3,
            "width": 6
          },
          "title": "Utilization",
          "rawConfiguration": {
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(memory.used.MiB/memory.total.MiB) * 100 as 'Memory Used %', latest(utilization.gpu.percent) as 'GPU Utilization %', latest(power.draw.watts)/latest(`power.limit.watts`)*100 as 'Power Usage %', latest(fan.speed.percent) as 'Fan Speed %' TIMESERIES"
              }
            ],
            "yAxisLeft": {
              "zero": true
            }
          }
        }
      ]
    }
  ]
}
@@ -0,0 +1,19 @@
id: nvidia-gpu
displayName: Nvidia GPUs
description: |
  Monitor Nvidia GPUs based on the Nvidia SMI utility.
install:
  primary:
    link:
      url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-gpu-integration/

icon: logo.png

keywords:
  - infrastructure
  - nvidia
  - gpu

categoryTerms:
  - infrastructure
@@ -0,0 +1,38 @@
slug: nvidia-gpu

title: Nvidia GPU Monitoring

description: |
  Our NVIDIA GPU integration helps you monitor the status of your GPUs.
  It leverages our infrastructure agent and the Flex integration, which wraps NVIDIA's SMI utility.
  It provides a pre-built dashboard with crucial GPU metrics, including GPU utilization, ECC error counts,
  active compute processes, clock and performance states, temperature, and fan speed, as well as dynamic and static information about each supported device.
summary: |
  Monitor NVIDIA GPU performance and state.
level: Community

authors:
  - New Relic

keywords:
  - nvidia
  - gpu
  - infrastructure

documentation:
  - name: NVIDIA GPU integration
    url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-gpu-integration/
    description: Monitor the status and performance of NVIDIA GPUs.

icon: logo.png

dashboards:
  - nvidia-gpu

alertPolicies:
  - nvidia-gpu

dataSourceIds:
  - nvidia-gpu
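The quickstart description above notes that the data is collected by the infrastructure agent's Flex integration wrapping NVIDIA's SMI utility; the authoritative Flex config lives behind the documentation link, not in this PR. As a hedged sketch of the general shape such a config takes (the event_type and output attributes and the exact nvidia-smi query fields are assumptions here, not copied from this change):

integrations:
  - name: nri-flex
    config:
      name: NvidiaSMI
      apis:
        # Assumed: emit events under the NvidiaGpuSample type queried by the dashboard and alert above
        - event_type: NvidiaGpuSample
          commands:
            # Assumed field list; nvidia-smi --help-query-gpu lists the supported properties
            - run: nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,power.draw,power.limit,fan.speed --format=csv
              # Assumed parser attribute for CSV command output; verify against the linked integration docs
              output: csv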