Skip to content

Commit

Permalink
Merge pull request #2115 from josemore/jmore/nvidia-gpu
Browse files Browse the repository at this point in the history
feat: add nvidia gpu monitoring quickstart
  • Loading branch information
aswanson-nr authored Oct 19, 2023
2 parents eb3a223 + 882a0a7 commit 86b613a
Show file tree
Hide file tree
Showing 7 changed files with 274 additions and 0 deletions.
27 changes: 27 additions & 0 deletions alert-policies/nvidia-gpu/HighMemoryUtilization.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition for the Nvidia GPU quickstart: opens a CRITICAL violation
# when a GPU reports memory utilization above 90% for five minutes.
name: High GPU Memory Utilization

description: |+
  This alert is triggered when the Nvidia GPU memory utilization is above 90%.
type: STATIC
nrql:
  # FACET makes every GPU on every host its own signal. Without it, latest()
  # collapses all GPUs into a single value (whichever sample arrived last),
  # so one saturated GPU can be hidden by an idle one.
  query: "SELECT latest(utilization.memory.percent) FROM NvidiaGpuSample FACET hostname, pci.device_id"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 90
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
190 changes: 190 additions & 0 deletions dashboards/nvidia-gpu/nvidia-gpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
{
  "name": "Nvidia GPU Monitoring",
  "description": null,
  "pages": [
    {
      "name": "Nvidia GPU Monitoring",
      "description": null,
      "widgets": [
        {
          "title": "",
          "visualization": {
            "id": "viz.markdown"
          },
          "layout": {
            "column": 1,
            "row": 1,
            "height": 1,
            "width": 4
          },
          "rawConfiguration": {
            "text": "[![NVIDIA SMI](https://logos-download.com/wp-content/uploads/2016/10/Nvidia_logo.png)](https://developer.nvidia.com/nvidia-system-management-interface)\n"
          }
        },
        {
          "title": "Current Clock Speeds",
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 5,
            "row": 1,
            "height": 3,
            "width": 2
          },
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(clocks.current.memory.MHz) as 'Memory MHz', latest(clocks.current.graphics.MHz) as 'Graphics MHz', latest(clocks.current.video.MHz) as 'Video MHz', latest(clocks.current.sm.MHz) as 'SM MHz' "
              }
            ],
            "thresholds": []
          }
        },
        {
          "title": "Current Clock MHz",
          "visualization": {
            "id": "viz.line"
          },
          "layout": {
            "column": 7,
            "row": 1,
            "height": 3,
            "width": 6
          },
          "rawConfiguration": {
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(clocks.current.memory.MHz) as 'Memory MHz', latest(clocks.current.graphics.MHz) as 'Graphics MHz', latest(clocks.current.video.MHz) as 'Video MHz', latest(clocks.current.sm.MHz) as 'SM MHz' TIMESERIES"
              }
            ],
            "yAxisLeft": {
              "zero": true
            }
          }
        },
        {
          "title": "Select GPU",
          "visualization": {
            "id": "viz.bar"
          },
          "layout": {
            "column": 1,
            "row": 2,
            "height": 2,
            "width": 4
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(name) FACET pci.device_id, hostname "
              }
            ]
          }
        },
        {
          "title": "Temps",
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 1,
            "row": 4,
            "height": 3,
            "width": 2
          },
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(temperature.gpu) as 'GPU Temp', latest(temperature.memory) as 'Memory Temp', latest(fan.speed.percent) as 'Fan speed %'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "title": "Power Usage",
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 3,
            "row": 4,
            "height": 3,
            "width": 2
          },
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(power.draw.watts) as 'Power Draw Watts', latest(`power.limit.watts`) as 'Power Limit Watts', latest(power.draw.watts)/latest(`power.limit.watts`) * 100 as 'Power usage %'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "title": "Memory Usage",
          "visualization": {
            "id": "viz.billboard"
          },
          "layout": {
            "column": 5,
            "row": 4,
            "height": 3,
            "width": 2
          },
          "rawConfiguration": {
            "dataFormatters": [],
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(memory.free.MiB) as 'Memory Free MiB', latest(memory.used.MiB) as 'Memory Used MiB', latest(memory.total.MiB) as 'Memory Total MiB'"
              }
            ],
            "thresholds": []
          }
        },
        {
          "title": "Utilization",
          "visualization": {
            "id": "viz.line"
          },
          "layout": {
            "column": 7,
            "row": 4,
            "height": 3,
            "width": 6
          },
          "rawConfiguration": {
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountId": 123,
                "query": "FROM NvidiaGpuSample SELECT latest(memory.used.MiB/memory.total.MiB) * 100 as 'Memory Used %', latest(utilization.gpu.percent) as 'GPU Utilization %', latest(power.draw.watts)/latest(`power.limit.watts`)*100 as 'Power Usage %', latest(fan.speed.percent) as 'Fan Speed %' TIMESERIES"
              }
            ],
            "yAxisLeft": {
              "zero": true
            }
          }
        }
      ]
    }
  ]
}
Binary file added dashboards/nvidia-gpu/nvidia-gpu.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 19 additions & 0 deletions data-sources/nvidia-gpu/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Data source entry for Nvidia GPUs. The `id` below is the value the
# quickstart's `dataSourceIds` list refers to.
id: nvidia-gpu
displayName: Nvidia GPUs
description: |
  Monitor Nvidia GPUs based on the Nvidia SMI utility.

# Installation is documentation-driven: the primary install action is a link.
install:
  primary:
    link:
      url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-gpu-integration/

# Logo file shipped alongside this config (data-sources/nvidia-gpu/logo.png).
icon: logo.png

# Search keywords for this data source.
keywords:
  - infrastructure
  - nvidia
  - gpu

categoryTerms:
  - infrastructure
Binary file added data-sources/nvidia-gpu/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
38 changes: 38 additions & 0 deletions quickstarts/nvidia-gpu/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Quickstart definition that bundles the Nvidia GPU dashboard, alert policy
# and data source under a single slug.
slug: nvidia-gpu

title: Nvidia GPU Monitoring

description: |
  Our NVIDIA GPU integration assists you in monitoring the status of GPUs.
  This integration leverages our infrastructure agent and the Flex integration, which is seamlessly integrated with NVIDIA's SMI utility.
  It provides you with a pre-built dashboard containing crucial GPU metrics, including GPU utilization, ECC error counts,
  active compute processes, clock and performance states, temperature, fan speed, as well as dynamic and static information about each supported device.
summary: |
  Monitor NVIDIA GPU performance and state.
level: Community

authors:
  - New Relic

keywords:
  - nvidia
  - gpu
  - infrastructure

documentation:
  - name: NVIDIA GPU integration
    url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-gpu-integration/
    description: Monitor the status and performance of NVIDIA GPUs.

# Logo file shipped alongside this config (quickstarts/nvidia-gpu/logo.png).
icon: logo.png

# The entries below are directory names / ids of the components added in the
# same change: dashboards/nvidia-gpu, alert-policies/nvidia-gpu and the data
# source whose id is nvidia-gpu.
dashboards:
  - nvidia-gpu

alertPolicies:
  - nvidia-gpu

dataSourceIds:
  - nvidia-gpu
Binary file added quickstarts/nvidia-gpu/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 86b613a

Please sign in to comment.