Skip to content

Commit

Permalink
Merge branch 'release' into lucia/update-mux-video-ds
Browse files Browse the repository at this point in the history
  • Loading branch information
RamanaReddy8801 authored May 30, 2024
2 parents 13a982e + 4245e2c commit cf14497
Show file tree
Hide file tree
Showing 28 changed files with 1,821 additions and 16 deletions.
41 changes: 41 additions & 0 deletions alert-policies/apache-mesos/errorTasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Alert condition for Apache Mesos: number of tasks reported in error state.
# Name of the alert
name: Error Tasks

# Description and details
description: |+
  This alert is triggered when the number of error tasks exceeds 3 for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
nrql:
  # Latest reported value of the `master/tasks_error` attribute
  query: "FROM apacheMesos SELECT latest(`master/tasks_error`) as 'Tasks error'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  # Critical: value above 3 for the whole 300-second window
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 3
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  # Warning: value above 1 for the whole 300-second window
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/apache-mesos/failedTasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Alert condition for Apache Mesos: number of failed tasks.
# Name of the alert
name: Failed Tasks

# Description and details
description: |+
  This alert is triggered when the number of failed tasks exceeds 3 for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
nrql:
  # Latest reported value of the `master/tasks_failed` attribute
  query: "FROM apacheMesos SELECT latest(`master/tasks_failed`) as 'Failed tasks'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  # Critical: value above 3 for the whole 300-second window
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 3
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  # Warning: value above 1 for the whole 300-second window
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/apache-mesos/lostTasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Alert condition for Apache Mesos: number of lost tasks.
# Name of the alert
name: Lost Tasks

# Description and details
description: |+
  This alert is triggered when the number of lost tasks exceeds 3 for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
nrql:
  # Latest reported value of the `master/tasks_lost` attribute
  query: "FROM apacheMesos SELECT latest(`master/tasks_lost`) as 'Tasks lost'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  # Critical: value above 3 for the whole 300-second window
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 3
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  # Warning: value above 1 for the whole 300-second window
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
29 changes: 29 additions & 0 deletions alert-policies/temporal-cloud/FailedWorkflows.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Alert condition for Temporal Cloud: failed workflows.
# Name of the alert
name: Failed Workflows

# Description and details
description: |+
  This alert is triggered if the Temporal cloud workflows fail once within a 5-minute window.
# Type of alert
type: STATIC

# NRQL query
nrql:
  # Latest `data.result-value1`, one series per `data.result-metric-__name__`
  query: "FROM temporalCloudWorkflowFailed SELECT latest(`data.result-value1`) FACET `data.result-metric-__name__`"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  # NOTE(review): the description says "fail once", but this term only
  # violates when the value is ABOVE 1 for ALL data points in the 300-second
  # window. Confirm whether the threshold/operator or the description
  # reflects the intended behavior.
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
30 changes: 30 additions & 0 deletions alert-policies/temporal-cloud/ServiceLatency.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Alert condition for Temporal Cloud: service latency.
# Name of the alert
name: Service Latency

# Description and details
description: |+
  This alert is triggered if the Temporal cloud service latency exceeds 5 seconds for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
nrql:
  # NOTE(review): this query reads the `temporalCloudWorkflowFailed` event,
  # whose name refers to workflow failures rather than latency. It looks like
  # a copy-paste; confirm the event type that carries service latency.
  query: "FROM temporalCloudWorkflowFailed SELECT latest(`data.result-value1`) FACET `data.result-metric-__name__`"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  # NOTE(review): the description says "exceeds 5 seconds", but the threshold
  # below is 1. Confirm the unit of the queried value and the intended limit.
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Loading

0 comments on commit cf14497

Please sign in to comment.