Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
nr-mlosier authored Aug 30, 2023
2 parents 83862f7 + bb8038e commit 8523693
Show file tree
Hide file tree
Showing 142 changed files with 5,602 additions and 2,589 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Name of the alert
name: Lower Availability Score

# Description and details
# ("|+" keeps trailing blank lines, so the next line follows the text directly.)
description: |+
  This alert occurs when the availability score of traffic stays below 100 (warning) or below 95 (critical) for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
nrql:
  query: "SELECT average(aws.internetmonitor.AvailabilityScore) from Metric"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition.
# Both terms use BELOW, so the warning threshold has to be the higher value:
# it is crossed first as the score degrades, and critical follows at the
# lower value. (Previously critical was 100 and warning 95, which meant the
# warning could never fire before the critical.)
# NOTE(review): confirm 95 is the intended critical level.
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: BELOW
    # Value that triggers a violation; float value
    threshold: 95
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  - priority: WARNING
    operator: BELOW
    threshold: 100
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400

Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Name of the alert
name: Lower Performance Score

# Description and details
# ("|+" keeps trailing blank lines, so the next line follows the text directly.)
description: |+
  This alert occurs when the performance score of traffic stays below 100 (warning) or below 95 (critical) for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
nrql:
  query: "SELECT average(aws.internetmonitor.PerformanceScore) from Metric"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition.
# Both terms use BELOW, so the warning threshold has to be the higher value:
# it is crossed first as the score degrades, and critical follows at the
# lower value. (Previously critical was 100 and warning 95, which meant the
# warning could never fire before the critical.)
# NOTE(review): confirm 95 is the intended critical level.
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: BELOW
    # Value that triggers a violation; float value
    threshold: 95
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  - priority: WARNING
    operator: BELOW
    threshold: 100
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
26 changes: 26 additions & 0 deletions alert-policies/amazon-codebuild/BuildsFailed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Alert condition on the count of failed CodeBuild builds.
name: Builds Failed

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when the Builds Failed Count is above 100 in 10 minutes.
type: STATIC

nrql:
  query: "SELECT count(`aws.codebuild.FailedBuilds`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 100  # value that triggers a violation
    thresholdDuration: 600  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
26 changes: 26 additions & 0 deletions alert-policies/amazon-codebuild/CPUUtilization.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Alert condition on CodeBuild CPU usage.
name: High CPU Utilization

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when the CPU Utilization is above 90%.
type: STATIC

# NOTE(review): the description speaks of a percentage, while the query
# scales a per-second rate of CPUUtilized by 100 - confirm the units line up.
nrql:
  query: "SELECT rate(sum(aws.codebuild.CPUUtilized), 1 second) * 100 as cpuUsage FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 90  # value that triggers a violation
    thresholdDuration: 300  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-elastictranscoder/JobsErrored.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on the count of errored Elastic Transcoder jobs.
name: Jobs Error

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when the Job Error Count is above 100 in 10 minutes.
type: STATIC

nrql:
  query: "SELECT count(`aws.elastictranscoder.JobsErrored`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 100  # value that triggers a violation
    thresholdDuration: 600  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
34 changes: 34 additions & 0 deletions alert-policies/amazon-inspector/HighAssessmentRunFindings.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Alert condition on the summed Inspector assessment-run findings.
name: High Assessment Run Findings

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered if the number of Assessment Run Findings exceeds 100 for 5 minutes.
type: STATIC

nrql:
  query: "SELECT sum(`aws.inspector.TotalAssessmentRunFindings`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition: a critical level plus an optional warning
# level that sits below it.
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 100  # value that triggers a violation
    thresholdDuration: 300  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

  - priority: WARNING
    operator: ABOVE
    threshold: 50
    thresholdDuration: 300
    thresholdOccurrences: ALL

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-sagemaker/HighEndpointsCPUUtil.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on average CPU utilization of SageMaker endpoints.
name: High CPU Utilization By Endpoints

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when the CPU Utilization by Endpoints is above 90%.
type: STATIC

# NOTE(review): the metric name contains a doubled dot ("aws..aws.sagemaker").
# Presumably deliberate - verify against the ingested metric names before
# changing it.
nrql:
  query: "SELECT average(`aws..aws.sagemaker.endpoints.CPUUtilization`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 90  # value that triggers a violation
    thresholdDuration: 300  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-sagemaker/HighEndpointsMemoryUtil.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on average memory utilization of SageMaker endpoints.
name: High Memory Utilization By Endpoints

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when the memory Utilization by Endpoints is above 90%.
type: STATIC

# NOTE(review): the metric name contains a doubled dot ("aws..aws.sagemaker").
# Presumably deliberate - verify against the ingested metric names before
# changing it.
nrql:
  query: "SELECT average(`aws..aws.sagemaker.endpoints.MemoryUtilization`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 90  # value that triggers a violation
    thresholdDuration: 300  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-sagemaker/HighModelInvocationErrors.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on the count of SageMaker model invocation errors.
name: High Model Invocation Errors

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when Model Invocation errors are above 100 in 10 minutes.
type: STATIC

nrql:
  query: "SELECT count(`aws.sagemaker.InvocationModelErrors`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 100  # value that triggers a violation
    thresholdDuration: 600  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-sagemaker/HighTrainingJobsCPUUtil.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on average CPU utilization of SageMaker training jobs.
name: High CPU Utilization By Training Jobs

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered when the CPU Utilization by Training Jobs is above 90%.
type: STATIC

# NOTE(review): the metric name contains a doubled dot ("aws..aws.sagemaker").
# Presumably deliberate - verify against the ingested metric names before
# changing it.
nrql:
  query: "SELECT average(`aws..aws.sagemaker.trainingjobs.CPUUtilization`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 90  # value that triggers a violation
    thresholdDuration: 300  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-workspaces/ConnectionFailure.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition on the count of failed WorkSpaces connections.
name: High Connection Failure

# "|+" keeps trailing blank lines, so the next key follows the text directly.
description: |+
  This alert is triggered if the number of failed connections exceeds 100 for 10 minutes.
type: STATIC

nrql:
  query: "SELECT count(`aws.workspaces.ConnectionFailure`) as 'Query' FROM Metric"

# Aggregation applied to the NRQL result before it is compared with
# terms[].threshold; SINGLE_VALUE is also the default.
valueFunction: SINGLE_VALUE

# Thresholds for the condition (only a critical level is defined here).
terms:
  - priority: CRITICAL
    operator: ABOVE  # comparison made against the threshold
    threshold: 100  # value that triggers a violation
    thresholdDuration: 600  # seconds; allowed range 120 - 3600
    thresholdOccurrences: ALL  # data points that must violate in the window

# Seconds after which an open violation closes on its own
# (allowed range 300 - 2592000; 86400 = 1 day, the default).
violationTimeLimitSeconds: 86400
6 changes: 3 additions & 3 deletions alert-policies/apache-hadoop/UsedDiskPercentage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Used Disk Percent

# Description and details
description: |+
This alert is triggered when used disk space exceeds 90% for at least 5 minutes.
This alert is triggered when disk usage exceeds 90% for at least 5 minutes.
# Type of alert
type: STATIC

Expand All @@ -19,7 +19,7 @@ valueFunction: SINGLE_VALUE
terms:
- priority: CRITICAL
# Operator used to compare against the threshold.
operator: BELOW
operator: ABOVE
# Value that triggers a violation
threshold: 90
# Time in seconds; 120 - 3600
Expand All @@ -28,7 +28,7 @@ terms:
thresholdOccurrences: ALL
- priority: WARNING
# Operator used to compare against the threshold.
operator: BELOW
operator: ABOVE
# Value that triggers a violation
threshold: 85
# Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
Expand Down
Loading

0 comments on commit 8523693

Please sign in to comment.