diff --git a/alert-policies/nvidia-dcgm/HighTemperature.yml b/alert-policies/nvidia-dcgm/HighTemperature.yml
new file mode 100644
index 0000000000..60232b317c
--- /dev/null
+++ b/alert-policies/nvidia-dcgm/HighTemperature.yml
@@ -0,0 +1,27 @@
+name: High GPU Temperature
+
+description: |+
+  This alert is triggered when the NVIDIA GPU temperature is above 90 °C.
+
+type: STATIC
+nrql:
+  query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/nvidia-dcgm/XidError.yml b/alert-policies/nvidia-dcgm/XidError.yml
new file mode 100644
index 0000000000..e5346761c1
--- /dev/null
+++ b/alert-policies/nvidia-dcgm/XidError.yml
@@ -0,0 +1,27 @@
+name: XID Error
+
+description: |+
+  This alert is triggered when the latest XID error value stays above 3 for 5 minutes.
+
+type: STATIC
+nrql:
+  query: "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation
+    threshold: 3
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/dashboards/nvidia-dcgm/nvidia-dcgm.json b/dashboards/nvidia-dcgm/nvidia-dcgm.json
new file mode 100644
index 0000000000..983d116e7e
--- /dev/null
+++ b/dashboards/nvidia-dcgm/nvidia-dcgm.json
@@ -0,0 +1,246 @@
+{
+  "name": "NVIDIA",
+  "description": null,
+  "pages": [
+    {
+      "name": "Overview",
+      "description": null,
+      "widgets": [
+        {
+          "title": "",
+          "layout": {
+            "column": 1,
+            "row": 1,
+            "width": 2,
+            "height": 2
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.markdown"
+          },
+          "rawConfiguration": {
+            "text": "![NVIDIA DCGM](https://github-production-user-asset-6210df.s3.amazonaws.com/104448291/279630087-461421da-3f8b-4d71-bac7-2e20d58b4180.png)"
+          }
+        },
+        {
+          "title": "GPU Temperature",
+          "layout": {
+            "column": 3,
+            "row": 1,
+            "width": 4,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.area"
+          },
+          "rawConfiguration": {
+            "facet": {
+              "showOtherSeries": false
+            },
+            "legend": {
+              "enabled": true
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP' TIMESERIES"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            },
+            "units": {
+              "unit": "CELSIUS"
+            }
+          }
+        },
+        {
+          "title": "Power usage (W)",
+          "layout": {
+            "column": 7,
+            "row": 1,
+            "width": 3,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.billboard"
+          },
+          "rawConfiguration": {
+            "facet": {
+              "showOtherSeries": false
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "SELECT average(DCGM_FI_DEV_POWER_USAGE) AS 'usage' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_POWER_USAGE'"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "Total NVLink bandwidth",
+          "layout": {
+            "column": 10,
+            "row": 1,
+            "width": 3,
+            "height": 3
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.area"
+          },
+          "rawConfiguration": {
+            "facet": {
+              "showOtherSeries": false
+            },
+            "legend": {
+              "enabled": true
+            },
+            "nrqlQueries": [
+              {
+                "accountIds": [],
+                "query": "SELECT latest(DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL) AS 'nvlink bandwidth' FROM Metric WHERE metricName like 'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL' TIMESERIES"
+              }
+            ],
+            "platformOptions": {
+              "ignoreTimeRange": false
+            }
+          }
+        },
+        {
+          "title": "",
+          "layout": {
+            "column": 1,
+            "row": 3,
+            "width": 2,
+            "height": 2
+          },
+          "linkedEntityGuids": null,
+          "visualization": {
+            "id": "viz.markdown"
+          },
+          "rawConfiguration": {
+            "text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com).\n\nInstrument NVIDIA DCGM with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/).\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=nvidia-dcgm) here and let us know how we can improve it for you."
+ } + }, + { + "title": "Clocks(MHz)", + "layout": { + "column": 3, + "row": 4, + "width": 5, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(DCGM_FI_DEV_MEM_CLOCK) AS 'MEM Clock', latest(DCGM_FI_DEV_SM_CLOCK) AS 'SM Clock' FROM Metric TIMESERIES" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Framebuffer free (bytes)", + "layout": { + "column": 8, + "row": 4, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(DCGM_FI_DEV_FB_FREE) AS 'Free', latest(DCGM_FI_DEV_FB_USED) AS 'Used' FROM Metric" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "XID errors", + "layout": { + "column": 11, + "row": 4, + "width": 2, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "GPU utilisation ", + "layout": { + "column": 1, + "row": 5, + "width": 2, + "height": 2 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.billboard" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(DCGM_FI_DEV_GPU_UTIL) AS 'gpu utilisation' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_UTIL'" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + } + ], + "variables": [] +} \ No newline at end of file diff --git a/dashboards/nvidia-dcgm/nvidia-dcgm01.png b/dashboards/nvidia-dcgm/nvidia-dcgm01.png new file mode 100644 index 0000000000..8b9b404a6f Binary files /dev/null and b/dashboards/nvidia-dcgm/nvidia-dcgm01.png differ diff --git a/data-sources/nvidia-dcgm/config.yml b/data-sources/nvidia-dcgm/config.yml new file mode 100644 index 0000000000..19e71ce6ef --- /dev/null +++ b/data-sources/nvidia-dcgm/config.yml @@ -0,0 +1,21 @@ +id: nvidia-dcgm +displayName: NVIDIA DCGM +description: | + Monitor and analyze your NVIDIA DCGM infrastructure with New Relic. 
+install:
+  primary:
+    link:
+      url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/
+icon: logo.png
+keywords:
+  - NVIDIA DCGM
+  - AI Acceleration
+  - Machine Learning Acceleration
+  - GPU Management
+  - AI Management
+  - Machine Learning Management
+  - Deep Learning Performance
+  - AI Performance
+  - GPU Optimization
+  - AI Optimization
+  - NR1_addData
\ No newline at end of file
diff --git a/data-sources/nvidia-dcgm/logo.png b/data-sources/nvidia-dcgm/logo.png
new file mode 100644
index 0000000000..36d7822f35
Binary files /dev/null and b/data-sources/nvidia-dcgm/logo.png differ
diff --git a/quickstarts/nvidia-dcgm/config.yml b/quickstarts/nvidia-dcgm/config.yml
new file mode 100644
index 0000000000..906f0c787f
--- /dev/null
+++ b/quickstarts/nvidia-dcgm/config.yml
@@ -0,0 +1,45 @@
+slug: nvidia-dcgm
+description: |
+  ## Why monitor NVIDIA DCGM?
+  Monitoring NVIDIA DCGM is essential for maintaining the health and efficiency of the GPU infrastructure in your data center. It supports performance optimization, fault detection, resource management, energy efficiency, and overall data center health, while also aiding troubleshooting, security, and compliance.
+
+  ## Comprehensive monitoring quickstart for NVIDIA DCGM
+  New Relic provides comprehensive monitoring of the GPU infrastructure in your data center. This setup lets you monitor GPU performance and health while leveraging New Relic's data visualization, alerting, and analysis capabilities.
+
+  ## What’s included in this quickstart?
+  The New Relic NVIDIA DCGM monitoring quickstart provides quality out-of-the-box reporting:
+  - Dashboards (power usage, GPU utilisation, clocks, etc.)
+  - Alerts for NVIDIA DCGM (GPU temperature, XID error)
+
+
+summary: |
+  Monitor and analyze your NVIDIA DCGM infrastructure with New Relic.
+icon: logo.png
+level: New Relic
+authors:
+  - New Relic
+  - Ramana Reddy
+title: NVIDIA DCGM
+documentation:
+  - name: NVIDIA DCGM integration documentation
+    description: |
+      Monitor and instrument NVIDIA DCGM with New Relic to gain deep insights into GPU performance.
+    url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvidia-dcgm-integration/
+keywords:
+  - NVIDIA DCGM
+  - AI Acceleration
+  - Machine Learning Acceleration
+  - GPU Management
+  - AI Management
+  - Machine Learning Management
+  - Deep Learning Performance
+  - AI Performance
+  - GPU Optimization
+  - AI Optimization
+  - NR1_addData
+dataSourceIds:
+  - nvidia-dcgm
+dashboards:
+  - nvidia-dcgm
+alertPolicies:
+  - nvidia-dcgm
\ No newline at end of file
diff --git a/quickstarts/nvidia-dcgm/logo.png b/quickstarts/nvidia-dcgm/logo.png
new file mode 100644
index 0000000000..36d7822f35
Binary files /dev/null and b/quickstarts/nvidia-dcgm/logo.png differ