Skip to content

Commit

Permalink
Merge branch 'main' into ssaboo-sagemaker
Browse files Browse the repository at this point in the history
  • Loading branch information
mdumpati authored Sep 11, 2023
2 parents a2021a6 + 380df6a commit f229ef6
Show file tree
Hide file tree
Showing 32 changed files with 2,633 additions and 277 deletions.
40 changes: 40 additions & 0 deletions alert-policies/apache-traffic-server/CPUPercent.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alert display name
name: CPU Percent

# What the alert means and when it fires
description: |+
  This alert is triggered if the CPU usage exceeds 90% for 5 minutes.
# Condition type
type: STATIC

# NRQL query evaluated by the condition
nrql:
  query: "SELECT average(host.cpuPercent) AS 'CPU used %' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: value above 90 on every data point across 300 seconds
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 90
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Number of data points that must violate within the duration
    thresholdOccurrences: ALL
  # Warning: value above 85 on every data point across 300 seconds
  - priority: WARNING
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 85
    # Seconds; allowed range 120 - 3600 (multiple of 60 for Baseline conditions)
    thresholdDuration: 300
    # Number of data points that must violate within the duration
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
40 changes: 40 additions & 0 deletions alert-policies/apache-traffic-server/FreeCacheRAM.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alert display name
name: Free RAM Cache Percent

# What the alert means and when it fires
description: |+
  This alert is triggered if the free space in RAM memory cache is below 10% for 5 minutes.
# Condition type
type: STATIC

# NRQL query evaluated by the condition:
# (total_bytes - bytes_used) / total_bytes * 100, i.e. the free share of the RAM cache in percent
nrql:
  query: "SELECT (latest(global.proxy.process.cache.ram_cache.total_bytes) - latest(global.proxy.process.cache.ram_cache.bytes_used)) / latest(global.proxy.process.cache.ram_cache.total_bytes) * 100 FROM ATSCacheSampleMetrics"

# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: value below 10 on every data point across 300 seconds
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: BELOW
    # Value that opens a violation
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Number of data points that must violate within the duration
    thresholdOccurrences: ALL
  # Warning: value below 15 on every data point across 300 seconds
  - priority: WARNING
    # Comparison applied between the query value and the threshold
    operator: BELOW
    # Value that opens a violation
    threshold: 15
    # Seconds; allowed range 120 - 3600 (multiple of 60 for Baseline conditions)
    thresholdDuration: 300
    # Number of data points that must violate within the duration
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
40 changes: 40 additions & 0 deletions alert-policies/apache-traffic-server/MemoryUsagePercent.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alert display name
name: Memory Usage Percent

# What the alert means and when it fires
description: |+
  This alert is triggered if the memory usage exceeds 90% for 5 minutes.
# Condition type
type: STATIC

# NRQL query evaluated by the condition
nrql:
  query: "SELECT average(host.memoryUsedPercent) AS 'Memory used %' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: value above 90 on every data point across 300 seconds
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 90
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # Number of data points that must violate within the duration
    thresholdOccurrences: ALL
  # Warning: value above 85 on every data point across 300 seconds
  - priority: WARNING
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 85
    # Seconds; allowed range 120 - 3600 (multiple of 60 for Baseline conditions)
    thresholdDuration: 300
    # Number of data points that must violate within the duration
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
30 changes: 30 additions & 0 deletions alert-policies/apache-traffic-server/SSLExpiredCertificate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Name of the alert
name: SSL Expired Certificates

# Description and details
description: |+
  This alert is triggered when at least 1 SSL connection to an origin server with an expired original certificate is connected.
# Type of alert
type: STATIC

# NRQL query
nrql:
  query: "SELECT latest(global.proxy.process.ssl.origin_server_expired_cert) AS 'ssl expired' FROM ATSSampleMetrics"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation.
    # ABOVE is a strict comparison, so 0 is required for the alert to fire on
    # "at least 1" expired-certificate connection as the description states;
    # a threshold of 1 would only fire from 2 upwards.
    threshold: 0
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
13 changes: 11 additions & 2 deletions alert-policies/node-js/HighCpuUtilization.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: High CPU Utilization
name: CPU Utilization

description: |+
This alert is triggered when the CPU Utilization is above 90%.
This alert is triggered when the CPU utilization exceeds 90% for 5 minutes.
type: STATIC
nrql:
Expand All @@ -22,6 +22,15 @@ terms:
thresholdDuration: 300
# How many data points must be in violation for the duration
thresholdOccurrences: ALL
- priority: WARNING
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation.
threshold: 85
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
Expand Down
11 changes: 10 additions & 1 deletion alert-policies/node-js/HighMemoryUsage.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: Memory Usage

description: |+
This alert is triggered when Memory usage is above 90%
This alert is triggered when the Memory usage exceeds 90% for 5 minutes.
type: STATIC
nrql:
query: "SELECT average(memoryUsedBytes/memoryTotalBytes) * 100 FROM SystemSample"
Expand All @@ -20,6 +20,15 @@ terms:
thresholdDuration: 300
# How many data points must be in violation for the duration
thresholdOccurrences: ALL
- priority: WARNING
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation.
threshold: 85
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
Expand Down
4 changes: 2 additions & 2 deletions alert-policies/node-js/LowApdexScore.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name: Apdex Score

description: |+
This alert is triggered when the Apdex score is below 0.5 for 5 minutes
This alert is triggered when the Apdex score is below 0.5 for 5 minutes.
type: STATIC
nrql:
query: "SELECT apdex(duration, t: 0.5) FROM Transaction WHERE appName like '%'"
query: "SELECT apdex(duration, t: 0.5) FROM Transaction FACET appName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
16 changes: 12 additions & 4 deletions alert-policies/node-js/TransactionErrors.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
name: Transaction Errors
name: Web Transaction Errors

description: |+
This alert is triggered when the the transactions fail more than 10% of the time in 5 minutes.
This alert is triggered when the web transaction failure rate exceeds 10% for 5 minutes.
type: STATIC
nrql:
query:
"SELECT count(apm.service.error.count) / count(apm.service.transaction.duration) * 100
as 'Web errors' FROM Metric WHERE appName like '%'
AND (transactionType = 'Web')"
as 'Web errors' FROM Metric WHERE appName LIKE '%' AND (transactionType = 'Web') FACET appName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand All @@ -24,6 +23,15 @@ terms:
thresholdDuration: 300
# How many data points must be in violation for the duration
thresholdOccurrences: ALL
- priority: WARNING
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation.
threshold: 5
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration?
thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
Expand Down
Loading

0 comments on commit f229ef6

Please sign in to comment.