Skip to content

Commit

Permalink
Merge pull request #2383 from newrelic/release
Browse files Browse the repository at this point in the history
feat: Release 2024-04-25
  • Loading branch information
Andrew Anguiano authored Apr 25, 2024
2 parents 0e6ed56 + 54819b2 commit 1f50178
Show file tree
Hide file tree
Showing 139 changed files with 4,784 additions and 1,447 deletions.
37 changes: 37 additions & 0 deletions alert-policies/nvidia-triton/CpuUsedPercentage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Alert condition on host CPU usage, read from SystemSample.
name: CPU Utilization (%)

# The text quotes the 85% warning level; the critical term below uses 90%.
# `|+` keeps trailing blank lines, so none is left between the text and `type`.
description: |+
  This alert is triggered when the CPU utilization exceeds 85% for 5 minutes.
type: STATIC

# Query whose result is compared against every entry in `terms`.
nrql:
  query: "SELECT average(cpuPercent) AS `CPU used %` FROM SystemSample"

# Aggregation applied to the query value(s) before comparison with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition, one entry per priority.
terms:
  # Critical: above 90 on ALL data points for 300 seconds.
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 90
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

  # Warning: above 85 on ALL data points for 300 seconds.
  - priority: WARNING
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 85
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically;
# allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/nvidia-triton/RequestFailures.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on the nv_inference_request_failure metric.
name: HTTP Request Failures

# `|+` keeps trailing blank lines, so none is left between the text and `type`.
# NOTE(review): the name and text say "HTTP", but the metric name itself does
# not mention HTTP - confirm the metric is limited to HTTP requests.
description: |+
  This alert is triggered when HTTP Request Failures exceed 1 every 5 minutes.
type: STATIC

# Query whose result is compared against every entry in `terms`.
# NOTE(review): if nv_inference_request_failure is a running total, latest()
# would stay above the threshold once it has been crossed - confirm how the
# metric is reported before relying on the "every 5 minutes" wording.
nrql:
  query: "SELECT latest(nv_inference_request_failure) FROM Metric"

# Aggregation applied to the query value(s) before comparison with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition; only a critical term is defined.
terms:
  # Critical: above 1 on ALL data points for 300 seconds.
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 1
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically;
# allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
37 changes: 37 additions & 0 deletions alert-policies/nvidia-triton/StorageUsagePercentage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Alert condition on disk usage, read from StorageSample.
name: Storage Utilization (%)

# The text quotes the 85% warning level; the critical term below uses 90%.
# `|+` keeps trailing blank lines, so none is left between the text and `type`.
description: |+
  This alert is triggered when the storage utilization exceeds 85% for 5 minutes.
type: STATIC

# Query whose result is compared against every entry in `terms`.
nrql:
  query: "SELECT average(diskUsedPercent) AS `Storage used %` FROM StorageSample"

# Aggregation applied to the query value(s) before comparison with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition, one entry per priority.
terms:
  # Critical: above 90 on ALL data points for 300 seconds.
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 90
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

  # Warning: above 85 on ALL data points for 300 seconds.
  - priority: WARNING
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 85
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically;
# allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
31 changes: 31 additions & 0 deletions alert-policies/ray/ActiveNodes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Alert name
name: Ray Active Nodes

# Human-readable summary of the condition.
# `|+` keeps trailing blank lines, so none is left between the text and the
# comment that follows it.
description: |+
  This alert triggers when there are no active nodes for 5 minutes.
# Alert type
type: STATIC

# Query whose result is compared against every entry in `terms`.
nrql:
  query: "SELECT latest(ray_cluster_active_nodes) AS 'active nodes' FROM Metric"

# Aggregation applied to the query value(s) before comparison with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition; only a critical term is defined.
terms:
  # Critical: below 1 on ALL data points for 300 seconds.
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: BELOW
    # Value at which a violation opens
    threshold: 1
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically;
# allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/ray/FreeDiskPercentage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Name of the alert
name: Ray Free Disk Percentage

# Description and details.
# The text quotes the 10% critical level; the warning term below uses 15%.
# `|+` keeps trailing blank lines, so none is left between the text and the
# comment that follows it.
description: |+
  This alert is triggered if there is less than 10% of free disk space for 5 minutes.
# Type of alert
type: STATIC

# NRQL query: free / (usage + free) * 100, aliased as 'free disk %'.
# No scaling factor is applied to the individual terms: a factor common to the
# numerator and the denominator cancels in a ratio, so the percentage is the
# same with or without it.
nrql:
  query: "SELECT latest(ray_node_disk_free) / (latest(ray_node_disk_usage) + latest(ray_node_disk_free)) * 100 AS 'free disk %' FROM Metric"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  # Critical: below 10 on ALL data points for 300 seconds.
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: BELOW
    # Value that triggers a violation
    threshold: 10
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Warning: below 15 on ALL data points for 300 seconds.
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: BELOW
    # Value that triggers a violation
    threshold: 15
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/ray/RPCHealthCheckLatency.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Alert name
name: Ray RPC Health Check Latency

# Human-readable summary of the condition.
# The text quotes the 2 second critical level; the warning term uses 1.5.
# `|+` keeps trailing blank lines, so none is left between the text and the
# comment that follows it.
description: |+
  This alert is triggered if the RPC health check latency exceeds 2 seconds for 5 minutes.
# Alert type
type: STATIC

# Query whose result is compared against every entry in `terms`.
# The metric name carries an `_ms` suffix and the value is divided by 1000,
# which lines up with the thresholds being stated in seconds.
# NOTE(review): the metric is a `_bucket` series; latest() on a histogram
# bucket may not yield a latency value - confirm, and check whether a
# percentile-style function (e.g. bucketPercentile) is what is intended.
nrql:
  query: "SELECT latest(ray_health_check_rpc_latency_ms_bucket) / 1000 as 'rpc latency' FROM Metric"

# Aggregation applied to the query value(s) before comparison with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition, one entry per priority.
terms:
  # Critical: above 2 on ALL data points for 300 seconds.
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 2
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

  # Warning: above 1.5 on ALL data points for 300 seconds.
  - priority: WARNING
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value at which a violation opens
    threshold: 1.5
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Share of data points that must violate during the duration
    thresholdOccurrences: ALL

# Seconds after which an open violation is closed automatically;
# allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
2 changes: 1 addition & 1 deletion alert-policies/snowflake/FailedQueries.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: |+
type: STATIC
nrql:
query: "FROM SnowflakeVirtualWarehouse SELECT uniqueCount(QUERY_ID) AS 'Queries' WHERE EXECUTION_STATUS = 'FAIL'"
query: "FROM snowflakeLongestQueriesSample SELECT uniqueCount(QUERY_ID) AS 'Queries' WHERE EXECUTION_STATUS = 'FAIL'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/snowflake/QueuedQueries.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: |+
type: STATIC
nrql:
query: "FROM SnowflakeVirtualWarehouse SELECT latest(QUEUED_LOAD_AVERAGE) as 'Queued Queries'"
query: "FROM snowflakeWarehouseLoadHistorySample SELECT latest(QUEUED_LOAD_AVERAGE) as 'Queued Queries'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/snowflake/SpilledLocalStorage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: |+
type: STATIC
nrql:
query: "SELECT latest(BYTES_SPILLED_TO_LOCAL_STORAGE_AVERAGE) as 'Bytes Spilled to Local Storage' FROM SnowflakeVirtualWarehouse"
query: "SELECT latest(BYTES_SPILLED_TO_LOCAL_STORAGE_AVERAGE) as 'Bytes Spilled to Local Storage' FROM snowflakeQueryHistorySample"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/snowflake/SpilledRemoteStorage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: |+
type: STATIC
nrql:
query: "SELECT latest(BYTES_SPILLED_TO_REMOTE_STORAGE_AVERAGE) as 'Bytes Spilled to Remote Storage' FROM SnowflakeVirtualWarehouse"
query: "SELECT latest(BYTES_SPILLED_TO_REMOTE_STORAGE_AVERAGE) as 'Bytes Spilled to Remote Storage' FROM snowflakeQueryHistorySample"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
4 changes: 2 additions & 2 deletions dashboards/apm-signals/apm-signals.json
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT apdex(apm.service.apdex) as 'Apdex' FROM Metric WHERE appName LIKE '%' WHERE appName = 'Proxy-East' SINCE 10 minutes ago COMPARE WITH 1 day ago"
"query": "SELECT apdex(apm.service.apdex) as 'Apdex' FROM Metric WHERE appName LIKE '%' SINCE 10 minutes ago COMPARE WITH 1 day ago"
}
],
"platformOptions": {
Expand Down Expand Up @@ -1069,4 +1069,4 @@
}
],
"variables": []
}
}
Binary file added dashboards/nvidia-triton/nvidia-triton-01.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added dashboards/nvidia-triton/nvidia-triton-02.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added dashboards/nvidia-triton/nvidia-triton-03.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 1f50178

Please sign in to comment.