Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 5/16/24 #2412

Merged
merged 12 commits into from
May 16, 2024
37 changes: 37 additions & 0 deletions alert-policies/nvml/HighTemperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Static NRQL alert condition that watches the latest reported GPU temperature.
# A WARNING opens above 85 and a CRITICAL above 90, each sustained for 300 seconds.
name: High GPU Temperature

description: |+
  This alert is triggered when the GPU temperature exceeds 85 degrees Celsius (warning) or 90 degrees Celsius (critical) for 5 minutes.

type: STATIC
nrql:
  # NOTE(review): this policy is filed under alert-policies/nvml but queries the
  # DCGM metric DCGM_FI_DEV_GPU_TEMP from Metric, whereas the NVML dashboard reads
  # temperature_gpu from nvmlSample - confirm this is the intended data source.
  query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 90
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 85
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL


# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
354 changes: 354 additions & 0 deletions dashboards/nvml/nvml.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,354 @@
{
"name": "NVML",
"description": null,
"pages": [
{
"name": "NVML",
"description": null,
"widgets": [
{
"title": "",
"layout": {
"column": 1,
"row": 1,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.markdown"
},
"rawConfiguration": {
"text": "![NVML icon](https://raw.githubusercontent.com/newrelic/newrelic-quickstarts/main/quickstarts/nvidia-dcgm/logo.png)"
}
},
{
"title": "Device count",
"layout": {
"column": 4,
"row": 1,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(device_count) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "GPU Temperature",
"layout": {
"column": 7,
"row": 1,
"width": 4,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(temperature_gpu) FROM nvmlSample TIMESERIES "
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"units": {
"unit": "CELSIUS"
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "Clock memory (mhz)",
"layout": {
"column": 11,
"row": 1,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` clocks_max_memory_mhz`) as 'MAX Memory',latest( ` clocks_current_memory_mhz`) AS 'Current Memory' FROM nvmlSample "
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "",
"layout": {
"column": 1,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.markdown"
},
"rawConfiguration": {
"text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com/).\n\nInstrument NVML with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvml-integration/).\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=NVML) here and let us know how we can improve it for you."
}
},
{
"title": "Power usage (watts)",
"layout": {
"column": 3,
"row": 4,
"width": 5,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest( ` power_limit_w`), latest( ` power_draw_w`) FROM nvmlSample TIMESERIES "
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "Memory (mib)",
"layout": {
"column": 8,
"row": 4,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` memory_free_mib`) AS 'Free memory', latest(` memory_used_mib`) AS 'Used memory' FROM nvmlSample "
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Performance state",
"layout": {
"column": 11,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` pstate`) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Memory utilization",
"layout": {
"column": 1,
"row": 7,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` utilization_memory`) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Clock application graphics (mhz)",
"layout": {
"column": 4,
"row": 7,
"width": 4,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` clocks_applications_graphics_mhz`) FROM nvmlSample TIMESERIES "
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "GPU utilization",
"layout": {
"column": 8,
"row": 7,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` utilization_gpu`) FROM nvmlSample "
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Clocks throttle reasons active",
"layout": {
"column": 11,
"row": 7,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` clocks_throttle_reasons_active`) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
}
]
}
],
"variables": []
}
Binary file added dashboards/nvml/nvml01.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading