
Commit 4f0cb56

Merge branch 'main' into NR-178810
2 parents 5fe84e7 + a68e477 commit 4f0cb56

8 files changed: +367 −0 lines changed
@@ -0,0 +1,27 @@
name: High GPU Temperature

description: |+
  This alert is triggered when the NVIDIA GPU temperature is above 90 °C.

type: STATIC
nrql:
  query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 90
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
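
The terms list above can hold both Critical and Warning thresholds, but the condition defines only a CRITICAL term. A minimal sketch of adding an early-warning term, using a hypothetical 80 °C threshold that is not part of the committed file:

# Sketch only: an optional WARNING term alongside the existing CRITICAL term.
# The threshold of 80 is a hypothetical example value, not from this commit.
terms:
  - priority: CRITICAL
    operator: ABOVE
    threshold: 90
    thresholdDuration: 300
    thresholdOccurrences: ALL
  - priority: WARNING
    operator: ABOVE
    threshold: 80
    thresholdDuration: 300
    thresholdOccurrences: ALL
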
+27

@@ -0,0 +1,27 @@
name: XID Error

description: |+
  This alert is triggered when the XID error value is higher than 3 for 5 minutes.

type: STATIC
nrql:
  query: "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 3
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
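
Both alert conditions evaluate a single aggregated signal across every GPU that reports the metric. If a separate signal per host is preferred, the NRQL query can be faceted; a sketch assuming the DCGM metrics carry the standard hostname attribute (an assumption, not verified against this commit):

# Sketch only: facet the query so each hostname is evaluated as its own signal.
# Assumes a 'hostname' attribute is present on the reported DCGM metrics.
nrql:
  query: "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_XID_ERRORS' FACET hostname"
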
+246

@@ -0,0 +1,246 @@
{
  "name": "NVIDIA",
  "description": null,
  "pages": [
    {
      "name": "Overview",
      "description": null,
      "widgets": [
        {
          "title": "",
          "layout": {
            "column": 1,
            "row": 1,
            "width": 2,
            "height": 2
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.markdown"
          },
          "rawConfiguration": {
            "text": "![NVIDIA DCGM](https://github-production-user-asset-6210df.s3.amazonaws.com/104448291/279630087-461421da-3f8b-4d71-bac7-2e20d58b4180.png)"
          }
        },
        {
          "title": "GPU Temperature",
          "layout": {
            "column": 3,
            "row": 1,
            "width": 4,
            "height": 3
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.area"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP' TIMESERIES"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            },
            "units": {
              "unit": "CELSIUS"
            }
          }
        },
        {
          "title": "Power usage(%)",
          "layout": {
            "column": 7,
            "row": 1,
            "width": 3,
            "height": 3
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.billboard"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT average(DCGM_FI_DEV_POWER_USAGE) AS 'usage' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_POWER_USAGE'"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            }
          }
        },
        {
          "title": "Total NVLink bandwidth",
          "layout": {
            "column": 10,
            "row": 1,
            "width": 3,
            "height": 3
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.area"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT latest(DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL) AS 'nvlink bandwidth' FROM Metric WHERE metricName like 'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL' TIMESERIES"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            }
          }
        },
        {
          "title": "",
          "layout": {
            "column": 1,
            "row": 3,
            "width": 2,
            "height": 2
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.markdown"
          },
          "rawConfiguration": {
            "text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com).\n\nInstrument NVIDIA DCGM with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/).\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=nvidia-dcgm) here and let us know how we can improve it for you."
          }
        },
        {
          "title": "Clocks(MHz)",
          "layout": {
            "column": 3,
            "row": 4,
            "width": 5,
            "height": 3
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.area"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "legend": {
              "enabled": true
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT latest(DCGM_FI_DEV_MEM_CLOCK) AS 'MEM Clock', latest(DCGM_FI_DEV_SM_CLOCK) AS 'SM Clock' FROM Metric TIMESERIES"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            }
          }
        },
        {
          "title": "Framebuffer free (bytes)",
          "layout": {
            "column": 8,
            "row": 4,
            "width": 3,
            "height": 3
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.billboard"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT latest(DCGM_FI_DEV_FB_FREE) AS 'Free', latest(DCGM_FI_DEV_FB_USED) AS 'Used' FROM Metric"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            }
          }
        },
        {
          "title": "XID errors",
          "layout": {
            "column": 11,
            "row": 4,
            "width": 2,
            "height": 3
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.billboard"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            }
          }
        },
        {
          "title": "GPU utilisation",
          "layout": {
            "column": 1,
            "row": 5,
            "width": 2,
            "height": 2
          },
          "linkedEntityGuids": null,
          "visualization": {
            "id": "viz.billboard"
          },
          "rawConfiguration": {
            "facet": {
              "showOtherSeries": false
            },
            "nrqlQueries": [
              {
                "accountIds": [],
                "query": "SELECT average(DCGM_FI_DEV_GPU_UTIL) AS 'gpu utilisation' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_UTIL'"
              }
            ],
            "platformOptions": {
              "ignoreTimeRange": false
            }
          }
        }
      ]
    }
  ],
  "variables": []
}
377 KB

data-sources/nvidia-dcgm/config.yml

+21
@@ -0,0 +1,21 @@
id: nvidia-dcgm
displayName: NVIDIA DCGM
description: |
  Monitor and analyze your NVIDIA DCGM infrastructure with New Relic.
install:
  primary:
    link:
      url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/
icon: logo.png
keywords:
  - NVIDIA DCGM
  - AI Acceleration
  - Machine Learning Acceleration
  - GPU Management
  - AI Management
  - Machine Learning Management
  - Deep Learning Performance
  - AI Performance
  - GPU Optimization
  - AI Optimization
  - NR1_addData

data-sources/nvidia-dcgm/logo.png

155 KB

quickstarts/nvidia-dcgm/config.yml

+46
@@ -0,0 +1,46 @@
id: 7f86bb7f-0502-418b-aae7-4ad327964874
slug: nvidia-dcgm
description: |
  ## Why monitor NVIDIA DCGM?
  Monitoring NVIDIA DCGM is essential for maintaining the health and efficiency of your GPU infrastructure in a data center. It helps with performance optimization, fault detection, resource management, energy efficiency, and overall data center health, while also aiding in troubleshooting, security, and compliance.

  ## Comprehensive monitoring quickstart for NVIDIA DCGM
  New Relic offers comprehensive monitoring of the GPU infrastructure in your data center. This setup allows you to monitor GPU performance and health while leveraging the capabilities of New Relic for data visualization, alerting, and analysis.

  ## What's included in this quickstart?
  The New Relic NVIDIA DCGM monitoring quickstart provides quality out-of-the-box reporting:
  - Dashboards (power usage, GPU utilisation, clocks, etc.)
  - Alerts for NVIDIA DCGM (GPU temperature, XID error)

summary: |
  Monitor and analyze your NVIDIA DCGM infrastructure with New Relic.
icon: logo.png
level: New Relic
authors:
  - New Relic
  - Ramana Reddy
title: NVIDIA DCGM
documentation:
  - name: NVIDIA DCGM integration documentation
    description: |
      Monitor and instrument your NVIDIA DCGM with New Relic to gain deep insights into your performance.
    url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/
keywords:
  - NVIDIA DCGM
  - AI Acceleration
  - Machine Learning Acceleration
  - GPU Management
  - AI Management
  - Machine Learning Management
  - Deep Learning Performance
  - AI Performance
  - GPU Optimization
  - AI Optimization
  - NR1_addData
dataSourceIds:
  - nvidia-dcgm
dashboards:
  - nvidia-dcgm
alertPolicies:
  - nvidia-dcgm

quickstarts/nvidia-dcgm/logo.png

155 KB

0 commit comments