Skip to content

Commit

Permalink
Merge pull request #2447 from sjyothi54/NR-282144
Browse files Browse the repository at this point in the history
feat: NVIDIA Jetson Integration
  • Loading branch information
mdumpati authored Jun 25, 2024
2 parents 14433c0 + 62b1f19 commit 74d36e4
Show file tree
Hide file tree
Showing 13 changed files with 1,435 additions and 0 deletions.
36 changes: 36 additions & 0 deletions alert-policies/nvidia-jetson/EMCUsage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Static NRQL alert condition on the latest `emc_freq_pct` value reported in
# the `jetsonTegrastats` event type. Opens a CRITICAL violation above 85 and a
# WARNING violation above 80, each sustained for 300 seconds.
name: Excessive EMC usage

description: |+
  This alert is triggered when the EMC usage exceeds 85% for 5 minutes.
type: STATIC
nrql:
  # The aliased value returned by this query is what the terms below compare
  # against.
  query: "SELECT latest(emc_freq_pct) as 'EMC Used (%)' from jetsonTegrastats"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 85
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 80
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
36 changes: 36 additions & 0 deletions alert-policies/nvidia-jetson/GPUTemperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Static NRQL alert condition on the latest `gpu_temp` value reported in the
# `jetsonTegrastats` event type. Opens a CRITICAL violation above 85 and a
# WARNING violation above 80, each sustained for 300 seconds.
name: High GPU temperature

description: |+
  This alert is triggered when the GPU temperature exceeds 85°C for 5 minutes.
type: STATIC
nrql:
  # The aliased value returned by this query is what the terms below compare
  # against.
  query: "SELECT latest(gpu_temp) as 'GPU temperature (C)' from jetsonTegrastats"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 85
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 80
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
36 changes: 36 additions & 0 deletions alert-policies/nvidia-jetson/ThermalTemperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Static NRQL alert condition on the latest `thermal_temp` value reported in
# the `jetsonTegrastats` event type. Opens a CRITICAL violation above 80 and a
# WARNING violation above 70, each sustained for 300 seconds.
name: High thermal temperature

description: |+
  This alert is triggered when the thermal temperature exceeds 80°C for 5 minutes.
type: STATIC
nrql:
  # The aliased value returned by this query is what the terms below compare
  # against.
  query: "SELECT latest(thermal_temp) as 'Thermal temperature (C)' from jetsonTegrastats"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 80
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 70
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Loading

0 comments on commit 74d36e4

Please sign in to comment.