You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
"query": "SELECT latest(DCGM_FI_DEV_GPU_TEMP ) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP' TIMESERIES "
48
+
}
49
+
],
50
+
"platformOptions": {
51
+
"ignoreTimeRange": false
52
+
},
53
+
"units": {
54
+
"unit": "CELSIUS"
55
+
}
56
+
}
57
+
},
58
+
{
59
+
"title": "Power usage(%)",
60
+
"layout": {
61
+
"column": 7,
62
+
"row": 1,
63
+
"width": 3,
64
+
"height": 3
65
+
},
66
+
"linkedEntityGuids": null,
67
+
"visualization": {
68
+
"id": "viz.billboard"
69
+
},
70
+
"rawConfiguration": {
71
+
"facet": {
72
+
"showOtherSeries": false
73
+
},
74
+
"nrqlQueries": [
75
+
{
76
+
"accountIds": [],
77
+
"query": "SELECT average(DCGM_FI_DEV_POWER_USAGE) AS 'usage' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_POWER_USAGE' "
78
+
}
79
+
],
80
+
"platformOptions": {
81
+
"ignoreTimeRange": false
82
+
}
83
+
}
84
+
},
85
+
{
86
+
"title": "Total NVLink bandwidth",
87
+
"layout": {
88
+
"column": 10,
89
+
"row": 1,
90
+
"width": 3,
91
+
"height": 3
92
+
},
93
+
"linkedEntityGuids": null,
94
+
"visualization": {
95
+
"id": "viz.area"
96
+
},
97
+
"rawConfiguration": {
98
+
"facet": {
99
+
"showOtherSeries": false
100
+
},
101
+
"legend": {
102
+
"enabled": true
103
+
},
104
+
"nrqlQueries": [
105
+
{
106
+
"accountIds": [],
107
+
"query": "SELECT latest(DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL) AS 'nvlink bandwidth' FROM Metric WHERE metricName like 'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL' TIMESERIES "
108
+
}
109
+
],
110
+
"platformOptions": {
111
+
"ignoreTimeRange": false
112
+
}
113
+
}
114
+
},
115
+
{
116
+
"title": "",
117
+
"layout": {
118
+
"column": 1,
119
+
"row": 3,
120
+
"width": 2,
121
+
"height": 2
122
+
},
123
+
"linkedEntityGuids": null,
124
+
"visualization": {
125
+
"id": "viz.markdown"
126
+
},
127
+
"rawConfiguration": {
128
+
"text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com).\n\nInstrument NVIDIA DCGM with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/).\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=nvidia-dcgm) here and let us know how we can improve it for you."
129
+
}
130
+
},
131
+
{
132
+
"title": "Clocks(MHz)",
133
+
"layout": {
134
+
"column": 3,
135
+
"row": 4,
136
+
"width": 5,
137
+
"height": 3
138
+
},
139
+
"linkedEntityGuids": null,
140
+
"visualization": {
141
+
"id": "viz.area"
142
+
},
143
+
"rawConfiguration": {
144
+
"facet": {
145
+
"showOtherSeries": false
146
+
},
147
+
"legend": {
148
+
"enabled": true
149
+
},
150
+
"nrqlQueries": [
151
+
{
152
+
"accountIds": [],
153
+
"query": "SELECT latest(DCGM_FI_DEV_MEM_CLOCK) AS 'MEM Clock', latest(DCGM_FI_DEV_SM_CLOCK) AS 'SM Clock' FROM Metric TIMESERIES"
154
+
}
155
+
],
156
+
"platformOptions": {
157
+
"ignoreTimeRange": false
158
+
}
159
+
}
160
+
},
161
+
{
162
+
"title": "Framebuffer free (bytes)",
163
+
"layout": {
164
+
"column": 8,
165
+
"row": 4,
166
+
"width": 3,
167
+
"height": 3
168
+
},
169
+
"linkedEntityGuids": null,
170
+
"visualization": {
171
+
"id": "viz.billboard"
172
+
},
173
+
"rawConfiguration": {
174
+
"facet": {
175
+
"showOtherSeries": false
176
+
},
177
+
"nrqlQueries": [
178
+
{
179
+
"accountIds": [],
180
+
"query": "SELECT latest(DCGM_FI_DEV_FB_FREE) AS 'Free', latest(DCGM_FI_DEV_FB_USED) AS 'Used' FROM Metric"
181
+
}
182
+
],
183
+
"platformOptions": {
184
+
"ignoreTimeRange": false
185
+
}
186
+
}
187
+
},
188
+
{
189
+
"title": "XID errors",
190
+
"layout": {
191
+
"column": 11,
192
+
"row": 4,
193
+
"width": 2,
194
+
"height": 3
195
+
},
196
+
"linkedEntityGuids": null,
197
+
"visualization": {
198
+
"id": "viz.billboard"
199
+
},
200
+
"rawConfiguration": {
201
+
"facet": {
202
+
"showOtherSeries": false
203
+
},
204
+
"nrqlQueries": [
205
+
{
206
+
"accountIds": [],
207
+
"query": "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'"
208
+
}
209
+
],
210
+
"platformOptions": {
211
+
"ignoreTimeRange": false
212
+
}
213
+
}
214
+
},
215
+
{
216
+
"title": "GPU utilisation ",
217
+
"layout": {
218
+
"column": 1,
219
+
"row": 5,
220
+
"width": 2,
221
+
"height": 2
222
+
},
223
+
"linkedEntityGuids": null,
224
+
"visualization": {
225
+
"id": "viz.billboard"
226
+
},
227
+
"rawConfiguration": {
228
+
"facet": {
229
+
"showOtherSeries": false
230
+
},
231
+
"nrqlQueries": [
232
+
{
233
+
"accountIds": [],
234
+
"query": "SELECT average(DCGM_FI_DEV_GPU_UTIL) AS 'gpu utilisation' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_UTIL'"
Monitoring NVIDIA DCGM is essential for maintaining the health and efficiency of the GPU infrastructure in your data center. It helps with performance optimization, fault detection, resource management, energy efficiency, and overall data center health, while also aiding troubleshooting, security, and compliance.
6
+
7
+
## Comprehensive monitoring quickstart for NVIDIA DCGM
8
+
New Relic provides comprehensive monitoring of the GPU infrastructure in your data center. This setup allows you to monitor GPU performance and health while leveraging New Relic's capabilities for data visualization, alerting, and analysis.
9
+
10
+
## What’s included in this quickstart?
11
+
The New Relic NVIDIA DCGM monitoring quickstart provides quality out-of-the-box reporting:
0 commit comments