Skip to content

Commit

Permalink
Merge branch 'main' into NR-149661-ATS
Browse files Browse the repository at this point in the history
  • Loading branch information
pkosanam authored Sep 11, 2023
2 parents e48dd8a + 978b27a commit 1029c3d
Show file tree
Hide file tree
Showing 97 changed files with 4,009 additions and 114 deletions.
27 changes: 27 additions & 0 deletions alert-policies/bring-your-own-data/DataDrift.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Display name of the alert condition
name: Data Drift

# Human-readable summary of when the condition fires
description: |+
  This alert is triggered if data drift exceeds 1 for 5 minutes.
# Condition type
type: STATIC

# NRQL query whose result is compared against the terms below
nrql:
  query: "FROM Metric SELECT average(data_drift) where metricName = 'data_drift'"

# Aggregation applied to the NRQL value(s) before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison made between the query value and the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Duration in seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation during that duration
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/bring-your-own-data/ModelDrift.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Display name of the alert condition
name: Model Drift

# Human-readable summary of when the condition fires
description: |+
  This alert is triggered if model drift exceeds 1 for 5 minutes.
# Condition type
type: STATIC

# NRQL query whose result is compared against the terms below
nrql:
  query: "FROM Metric SELECT sum(model_drift) where metricName = 'model_drift'"

# Aggregation applied to the NRQL value(s) before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison made between the query value and the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1
    # Duration in seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation during that duration
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
36 changes: 36 additions & 0 deletions alert-policies/bring-your-own-data/Predictions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Display name of the alert condition
name: Predictions

# Human-readable summary of when the condition fires
description: |+
  This alert is triggered if prediction exceeds 1000 for 5 minutes.
# Condition type
type: STATIC

# NRQL query whose result is compared against the terms below
nrql:
  query: "SELECT count(inference_id) as 'Predictions received' FROM InferenceData"

# Aggregation applied to the NRQL value(s) before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  # Critical level
  - priority: CRITICAL
    # Comparison made between the query value and the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1000
    # Duration in seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation during that duration
    thresholdOccurrences: ALL
  # Warning level
  - priority: WARNING
    # Comparison made between the query value and the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 850
    # Duration in seconds; allowed range 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # How many data points must be in violation during that duration
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
32 changes: 32 additions & 0 deletions alert-policies/langchain-vectordb/APIResponseTime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Name of the alert
name: API Response Time(seconds)

# Description and details
# Kept in sync with terms[].threshold and terms[].thresholdDuration (120 s = 2 minutes).
description: |+
  This alert is triggered when the response time exceeds 2 seconds for 2 minutes.
# Type of alert
type: STATIC

# NRQL query
# No TIMESERIES clause: alert conditions evaluate a streaming window, so the
# chart-only clause has been dropped from the query.
# NOTE(review): the threshold below is expressed in seconds; confirm that
# LlmVectorSearch.response_time is reported in seconds and not milliseconds.
nrql:
  query: "FROM LlmVectorSearch SELECT latest(response_time) WHERE applicationName IN ('LangChain observability trace')"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 2
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/langchain-vectordb/ZeroDocumentsRetrieved.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Name of the alert
name: Zero Documents Retrieved

# Description and details
# Kept in sync with the two terms below (50 critical, 20 warning, 120 s = 2 minutes).
description: |+
  This alert is triggered when more than 50% (critical) or 20% (warning) of the searches return zero documents for 2 minutes.
# Type of alert
type: STATIC

# NRQL query
# Yields a percentage: searches with document_count = 0 divided by all searches, times 100.
nrql:
  query: "SELECT filter(count(document_count), WHERE document_count=0) / count(document_count) * 100 from LlmVectorSearch WHERE applicationName IN ('LangChain observability trace')"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (percent of searches)
    threshold: 50
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (percent of searches)
    threshold: 20
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/langchain/error.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Display name of the alert condition
name: Errors

# Human-readable summary of when the condition fires
description: |+
  This alert is triggered if the number of errors exceeds 2 within 5 minutes.
# Condition type
type: STATIC

# NRQL query whose result is compared against the terms below
nrql:
  query: "FROM LlmChatCompletionSummary, LlmCompletion SELECT count(error_type) AS 'Error count' WHERE error_type IS NOT null"

# Aggregation applied to the NRQL value(s) before comparison with terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison made between the query value and the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 2
    # Duration in seconds; allowed range 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation during that duration
    thresholdOccurrences: ALL

# Seconds after which an open violation closes automatically
# Allowed range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
35 changes: 35 additions & 0 deletions alert-policies/langchain/requestsPerModel.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Name of the alert
name: Requests per model

# Description and details
description: |+
  This alert is triggered if requests per model exceeds 1000 in 5 minutes.
# Type of alert
type: STATIC

# NRQL query
# FACET request.model splits the count into one signal per model, so the
# thresholds apply to each model separately, as the name and description state.
nrql:
  query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT count(request.model) AS 'Requests per model' FACET request.model"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 1000
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 900
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
35 changes: 35 additions & 0 deletions alert-policies/langchain/responseTime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Name of the alert
name: Response time

# Description and details
# Kept in sync with the two terms below (15 critical, 10 warning, 300 s = 5 minutes).
description: |+
  This alert is triggered if response time exceeds 15 seconds (critical) or 10 seconds (warning) for 5 minutes.
# Type of alert
type: STATIC

# NRQL query
# The latest response_time is divided by 1000 before comparison with the thresholds.
# NOTE(review): presumably a milliseconds-to-seconds conversion; confirm the unit of response_time.
nrql:
  query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT latest(response_time)/1000"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 15
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 10
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/llm-application/APIResponseTime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Name of the alert
name: API Response Time(seconds)

# Description and details
# Kept in sync with the two terms below (15 critical, 10 warning, 120 s = 2 minutes).
description: |+
  This alert is triggered when the response time exceeds 15 seconds (critical) or 10 seconds (warning) for 2 minutes.
# Type of alert
type: STATIC

# NRQL query
# - The 95th percentile of response_time is divided by 1000 so it is compared in
#   the same unit as the second-based thresholds below, matching the /1000 that
#   alert-policies/langchain/responseTime.yml applies to the same event types.
#   NOTE(review): assumes response_time is reported in milliseconds; confirm.
# - No TIMESERIES clause: alert conditions evaluate a streaming window, so the
#   chart-only clause has been dropped from the query.
# - FACET entity.guid produces one signal per entity.
nrql:
  query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT percentile(response_time, 95) / 1000 FACET entity.guid"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 15
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 10
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
41 changes: 41 additions & 0 deletions alert-policies/llm-application/cost.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Name of the alert
name: Cost

# Description and details
# Kept in sync with the two terms below (10 critical, 8 warning, 120 s = 2 minutes).
description: |+
  This alert is triggered when the cost exceeds 10 USD (critical) or 8 USD (warning) for 2 minutes.
# Type of alert
type: STATIC

# NRQL query
# Each filter() multiplies a token sum by a per-1000-token rate for the
# response.model values it matches; the filters are added up into 'Cost (USD)'
# and faceted per entity.guid.
# The folded scalar (>-) joins the lines below with single spaces, so the
# query is still one single-line string.
# Fix: the gpt-4 completion filter previously read `respsonse.model` (typo),
# so its NOT LIKE '%32k%' exclusion referenced a non-existent attribute.
nrql:
  query: >-
    FROM LlmChatCompletionSummary, LlmCompletion SELECT
    filter(sum(usage.prompt_tokens) * 0.0300 / 1000, where response.model in ('gpt-4', 'gpt-4-0314', 'gpt-4-0613'))
    + filter(sum(usage.completion_tokens) * 0.0600 / 1000, where response.model LIKE '%gpt-4%' AND response.model NOT LIKE '%32k%')
    + filter(sum(usage.prompt_tokens) * 0.0600 / 1000, where response.model LIKE '%gpt-4-32k%')
    + filter(sum(usage.completion_tokens) * 0.1200 / 1000, where response.model LIKE '%gpt-4-32k%')
    + filter(sum(usage.total_tokens) * 0.0020 / 1000, where response.model LIKE '%gpt-3.5-turbo%')
    + filter(sum(usage.total_tokens) * 0.0200 / 1000, where response.model LIKE '%davinci%')
    + filter(sum(usage.total_tokens) * 0.0020 / 1000, where response.model LIKE '%curie%')
    + filter(sum(usage.total_tokens) * 0.0005 / 1000, where response.model LIKE '%babbage%')
    + filter(sum(usage.total_tokens) * 0.0004 / 1000, where response.model LIKE '%ada%') as 'Cost (USD)'
    FACET entity.guid

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (USD)
    threshold: 10
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL
  - priority: WARNING
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (USD)
    threshold: 8
    # Time in seconds; 120 - 3600
    thresholdDuration: 120
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 1029c3d

Please sign in to comment.