Skip to content

Commit

Permalink
Merge branch 'main' into nr-125012-adobe-commerce
Browse files: browse the repository at this point in the history
  • Loading branch information
pkosanam authored Sep 20, 2023
2 parents c48022c + 793dc86 commit de02597
Show file tree
Hide file tree
Showing 169 changed files with 9,666 additions and 666 deletions.
40 changes: 40 additions & 0 deletions alert-policies/apache-traffic-server/CPUPercent.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alert name shown in the policy
name: CPU Percent

# What the alert means and when it fires
description: |+
  This alert is triggered if the CPU usage exceeds 90% for 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition
# NOTE(review): the query has no WHERE clause, so it is not restricted to
# Apache Traffic Server hosts - confirm that this is intended.
nrql:
  query: "SELECT average(host.cpuPercent) AS 'CPU used %' FROM Metric"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: value above 90 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 90
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL
  # Warning: value above 85 for the full 300-second window
  - priority: WARNING
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 85
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
40 changes: 40 additions & 0 deletions alert-policies/apache-traffic-server/FreeCacheRAM.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alert name shown in the policy
name: Free RAM Cache Percent

# What the alert means and when it fires
description: |+
  This alert is triggered if the free space in RAM memory cache is below 10% for 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition:
# (total_bytes - bytes_used) / total_bytes * 100, i.e. the free share of the
# RAM cache expressed as a percentage.
nrql:
  query: "SELECT (latest(global.proxy.process.cache.ram_cache.total_bytes) - latest(global.proxy.process.cache.ram_cache.bytes_used)) / latest(global.proxy.process.cache.ram_cache.total_bytes) * 100 FROM ATSCacheSampleMetrics"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: free percentage below 10 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: BELOW
    # Value that opens a violation
    threshold: 10
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL
  # Warning: free percentage below 15 for the full 300-second window
  - priority: WARNING
    # Comparison applied against the threshold
    operator: BELOW
    # Value that opens a violation
    threshold: 15
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
40 changes: 40 additions & 0 deletions alert-policies/apache-traffic-server/MemoryUsagePercent.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Alert name shown in the policy
name: Memory Usage Percent

# What the alert means and when it fires
description: |+
  This alert is triggered if the memory usage exceeds 90% for 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition
# NOTE(review): the query has no WHERE clause, so it is not restricted to
# Apache Traffic Server hosts - confirm that this is intended.
nrql:
  query: "SELECT average(host.memoryUsedPercent) AS 'Memory used %' FROM Metric"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: value above 90 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 90
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL
  # Warning: value above 85 for the full 300-second window
  - priority: WARNING
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 85
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
30 changes: 30 additions & 0 deletions alert-policies/apache-traffic-server/SSLExpiredCertificate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Name of the alert
name: SSL Expired Certificates

# Description and details
description: |+
  This alert is triggered when at least 1 SSL connection to an origin server with an expired original certificate is connected.
# Type of alert
type: STATIC

# NRQL query: latest reported count of origin-server connections that
# presented an expired certificate
nrql:
  query: "SELECT latest(global.proxy.process.ssl.origin_server_expired_cert) AS 'ssl expired' FROM ATSSampleMetrics"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation.
    # ABOVE is a strict comparison, so the threshold must be 0 for the alert
    # to fire on "at least 1" expired certificate as the description states;
    # a threshold of 1 would only fire from 2 upwards.
    threshold: 0
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
33 changes: 33 additions & 0 deletions alert-policies/azure-machine-learning/Errors.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Alert name shown in the policy
name: Errors

# What the alert means and when it fires
description: |+
  This alert is triggered if the number of errors exceeds 20 within 10 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition
nrql:
  query: "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.Errors) AS 'Errors'"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: value above 20 for the full 600-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 20
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 600
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL
  # Warning (optional): value above 10 for the full 600-second window
  - priority: WARNING
    operator: ABOVE
    threshold: 10
    thresholdDuration: 600
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/bring-your-own-data/DataDrift.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert name shown in the policy
name: Data Drift

# What the alert means and when it fires
description: |+
  This alert is triggered if data drift exceeds 1 for 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition
nrql:
  query: "FROM Metric SELECT average(data_drift) where metricName = 'data_drift'"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition (Critical only)
terms:
  # Critical: value above 1 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 1
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/bring-your-own-data/ModelDrift.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert name shown in the policy
name: Model Drift

# What the alert means and when it fires
description: |+
  This alert is triggered if model drift exceeds 1 for 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition
# NOTE(review): this query aggregates with sum(), while the sibling Data Drift
# alert uses average() - confirm that the difference is intentional.
nrql:
  query: "FROM Metric SELECT sum(model_drift) where metricName = 'model_drift'"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition (Critical only)
terms:
  # Critical: value above 1 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 1
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
36 changes: 36 additions & 0 deletions alert-policies/bring-your-own-data/Predictions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Alert name shown in the policy
name: Predictions

# What the alert means and when it fires
description: |+
  This alert is triggered if prediction exceeds 1000 for 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition
nrql:
  query: "SELECT count(inference_id) as 'Predictions received' FROM InferenceData"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical: count above 1000 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 1000
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL
  # Warning: count above 850 for the full 300-second window
  - priority: WARNING
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 850
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
32 changes: 32 additions & 0 deletions alert-policies/langchain-vectordb/APIResponseTime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Name of the alert
name: API Response Time(seconds)

# Description and details
# The stated duration matches thresholdDuration below (120 seconds).
description: |+
  This alert is triggered when the response time exceeds 2 seconds for 2 minutes.
# Type of alert
type: STATIC

# NRQL query
# No TIMESERIES clause: alert conditions evaluate the signal over their own
# aggregation windows, and TIMESERIES is not accepted in a condition query.
nrql:
  query: "FROM LlmVectorSearch SELECT latest(response_time) WHERE applicationName IN ('LangChain observability trace')"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (response time in seconds)
    threshold: 2
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/langchain-vectordb/ZeroDocumentsRetrieved.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Name of the alert
name: Zero Documents Retrieved

# Description and details
# The description states the CRITICAL threshold (50%) and the configured
# thresholdDuration (120 seconds); the WARNING term below fires at 20%.
description: |+
  This alert is triggered when more than 50% of the searches return zero documents for 2 minutes.
# Type of alert
type: STATIC

# NRQL query: percentage of vector searches whose document_count is 0
nrql:
  query: "SELECT filter(count(document_count), WHERE document_count=0) / count(document_count) * 100 from LlmVectorSearch WHERE applicationName IN ('LangChain observability trace')"

# Function used to aggregate the NRQL query value(s) for comparison to the
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (percent of searches)
    threshold: 50
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (percent of searches)
    threshold: 20
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/langchain/error.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert name shown in the policy
name: Errors

# What the alert means and when it fires
description: |+
  This alert is triggered if the number of errors exceeds 2 within 5 minutes.
# Condition type
type: STATIC

# NRQL signal evaluated by the condition: count of events carrying an
# error_type across both LLM event types
nrql:
  query: "FROM LlmChatCompletionSummary, LlmCompletion SELECT count(error_type) AS 'Error count' WHERE error_type IS NOT null"

# How the NRQL result is aggregated before it is compared with
# terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Thresholds for the condition (Critical only)
terms:
  # Critical: count above 2 for the full 300-second window
  - priority: CRITICAL
    # Comparison applied against the threshold
    operator: ABOVE
    # Value that opens a violation
    threshold: 2
    # Seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points in the window must be in violation
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Loading

0 comments on commit de02597

Please sign in to comment.