Merge branch 'main' into NR-149661-ATS

newrelic · Sep 11, 2023 · 1029c3d · 1029c3d
2 parents e48dd8a + 978b27a
commit 1029c3d
Show file tree

Hide file tree

Showing 97 changed files with 4,009 additions and 114 deletions.
diff --git a/alert-policies/bring-your-own-data/DataDrift.yml b/alert-policies/bring-your-own-data/DataDrift.yml
@@ -0,0 +1,27 @@
+name: Data Drift
+
+# Static-threshold condition on the reported data drift metric
+type: STATIC
+description: |+
+  This alert is triggered if data drift exceeds 1 for 5 minutes.
+nrql:
+  query: "FROM Metric SELECT average(data_drift) where metricName = 'data_drift'"
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds (Critical and/or Warning) evaluated for this condition
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A value past this limit counts as a violation
+    threshold: 1
+    # Share of data points within the duration that must be in violation
+    thresholdOccurrences: ALL
+    # Duration in seconds; allowed range 120 - 3600
+    thresholdDuration: 300
+
+# Violations still open after this many seconds are closed automatically
+# Allowed range in seconds: 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/bring-your-own-data/ModelDrift.yml b/alert-policies/bring-your-own-data/ModelDrift.yml
@@ -0,0 +1,27 @@
+name: Model Drift
+
+# Static-threshold condition on the reported model drift metric
+type: STATIC
+description: |+
+  This alert is triggered if model drift exceeds 1 for 5 minutes.
+nrql:
+  query: "FROM Metric SELECT sum(model_drift) where metricName = 'model_drift'"
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds (Critical and/or Warning) evaluated for this condition
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A value past this limit counts as a violation
+    threshold: 1
+    # Share of data points within the duration that must be in violation
+    thresholdOccurrences: ALL
+    # Duration in seconds; allowed range 120 - 3600
+    thresholdDuration: 300
+
+# Violations still open after this many seconds are closed automatically
+# Allowed range in seconds: 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/bring-your-own-data/Predictions.yml b/alert-policies/bring-your-own-data/Predictions.yml
@@ -0,0 +1,36 @@
+name: Predictions
+
+# Static-threshold condition on the count of received predictions
+type: STATIC
+description: |+
+  This alert is triggered if prediction exceeds 1000 for 5 minutes.
+nrql:
+  query: "SELECT count(inference_id) as 'Predictions received' FROM InferenceData"
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds (Critical and Warning) evaluated for this condition
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A value past this limit counts as a violation
+    threshold: 1000
+    # Share of data points within the duration that must be in violation
+    thresholdOccurrences: ALL
+    # Duration in seconds; allowed range 120 - 3600
+    thresholdDuration: 300
+  - priority: WARNING
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A value past this limit counts as a violation
+    threshold: 850
+    # Share of data points within the duration that must be in violation
+    thresholdOccurrences: ALL
+    # Duration in seconds; allowed range 120 - 3600
+    thresholdDuration: 300
+
+# Violations still open after this many seconds are closed automatically
+# Allowed range in seconds: 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/langchain-vectordb/APIResponseTime.yml b/alert-policies/langchain-vectordb/APIResponseTime.yml
@@ -0,0 +1,32 @@
+# Name of the alert
+name: API Response Time(seconds)
+
+# Description and details (the stated 2 minutes mirrors thresholdDuration: 120 below)
+description: |+
+  This alert is triggered when the response time exceeds 2 seconds for 2 minutes.
+
+# Type of alert
+type: STATIC
+
+# NRQL query (no TIMESERIES clause: NRQL alert conditions do not support it)
+nrql:
+  query: "FROM LlmVectorSearch SELECT latest(response_time) WHERE applicationName IN ('LangChain observability trace')"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (NOTE(review): assumes response_time is reported in seconds - confirm)
+    threshold: 2
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/langchain-vectordb/ZeroDocumentsRetrieved.yml b/alert-policies/langchain-vectordb/ZeroDocumentsRetrieved.yml
@@ -0,0 +1,41 @@
+# Name of the alert
+name: Zero Documents Retrieved
+
+# Description and details (the stated 2 minutes mirrors thresholdDuration: 120 below)
+description: |+
+  This alert is triggered when more than 20% of the searches return zero documents for 2 minutes.
+
+# Type of alert
+type: STATIC
+
+# NRQL query: percentage of vector searches whose document_count is 0
+nrql:
+  query: "SELECT filter(count(document_count), WHERE document_count=0) / count(document_count) * 100 from LlmVectorSearch WHERE applicationName IN ('LangChain observability trace')"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (percent of searches)
+    threshold: 50
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+  - priority: WARNING
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (percent of searches)
+    threshold: 20
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/langchain/error.yml b/alert-policies/langchain/error.yml
@@ -0,0 +1,27 @@
+name: Errors
+
+# Static-threshold condition on the count of events carrying an error_type
+type: STATIC
+description: |+
+  This alert is triggered if the number of errors exceeds 2 within 5 minutes.
+nrql:
+  query: "FROM LlmChatCompletionSummary, LlmCompletion SELECT count(error_type) AS 'Error count' WHERE error_type IS NOT null"
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds (Critical and/or Warning) evaluated for this condition
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A value past this limit counts as a violation
+    threshold: 2
+    # Share of data points within the duration that must be in violation
+    thresholdOccurrences: ALL
+    # Duration in seconds; allowed range 120 - 3600
+    thresholdDuration: 300
+
+# Violations still open after this many seconds are closed automatically
+# Allowed range in seconds: 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/langchain/requestsPerModel.yml b/alert-policies/langchain/requestsPerModel.yml
@@ -0,0 +1,35 @@
+name: Requests per model
+description: |+
+  This alert is triggered if requests per model exceeds 1000 in 5 minutes.
+type: STATIC
+# FACET request.model gives every model its own count, so the thresholds apply per model
+nrql:
+  query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT count(request.model) AS 'Requests per model' FACET request.model"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (request count for a single model)
+    threshold: 1000
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+  - priority: WARNING
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (request count for a single model)
+    threshold: 900
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/langchain/responseTime.yml b/alert-policies/langchain/responseTime.yml
@@ -0,0 +1,35 @@
+name: Response time
+description: |+
+  This alert is triggered if response time exceeds 10 seconds (warning) or 15 seconds (critical) for 5 minutes.
+type: STATIC
+# NOTE(review): the /1000 presumably converts response_time from milliseconds to seconds - confirm
+nrql:
+  query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT latest(response_time)/1000"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (compared with the query result after the /1000)
+    threshold: 15
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+  - priority: WARNING
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (compared with the query result after the /1000)
+    threshold: 10
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/llm-application/APIResponseTime.yml b/alert-policies/llm-application/APIResponseTime.yml
@@ -0,0 +1,41 @@
+# Name of the alert
+name: API Response Time(seconds)
+
+# Description and details (the stated 2 minutes mirrors thresholdDuration: 120 below)
+description: |+
+  This alert is triggered when the response time exceeds 15 seconds for 2 minutes.
+
+# Type of alert
+type: STATIC
+
+# NRQL query (no TIMESERIES: unsupported in alert conditions; /1000 assumes response_time is in ms - TODO confirm)
+nrql:
+  query: "FROM LlmChatCompletionSummary, LlmCompletion, LlmEmbedding SELECT percentile(response_time, 95) / 1000 FACET entity.guid"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (seconds, after the /1000 in the query)
+    threshold: 15
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+  - priority: WARNING
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (seconds, after the /1000 in the query)
+    threshold: 10
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/llm-application/cost.yml b/alert-policies/llm-application/cost.yml
@@ -0,0 +1,41 @@
+# Name of the alert
+name: Cost
+
+# Description and details (the stated 2 minutes mirrors thresholdDuration: 120 below)
+description: |+
+  This alert is triggered when the cost exceeds 10 USD for 2 minutes.
+
+# Type of alert
+type: STATIC
+
+# NRQL query: per-model token sums multiplied by hard-coded USD prices per 1000 tokens, added into one cost value
+nrql:
+  query: "FROM LlmChatCompletionSummary, LlmCompletion SELECT filter(sum(usage.prompt_tokens) * 0.0300 / 1000, where response.model in ('gpt-4', 'gpt-4-0314', 'gpt-4-0613')) + filter(sum(usage.completion_tokens) * 0.0600 / 1000, where response.model LIKE '%gpt-4%' AND response.model NOT LIKE '%32k%') + filter(sum(usage.prompt_tokens) * 0.0600 / 1000, where response.model LIKE '%gpt-4-32k%') + filter(sum(usage.completion_tokens) * 0.1200 / 1000, where response.model LIKE '%gpt-4-32k%') + filter(sum(usage.total_tokens) * 0.0020 / 1000, where response.model LIKE '%gpt-3.5-turbo%') + filter(sum(usage.total_tokens) * 0.0200 / 1000, where response.model LIKE '%davinci%') + filter(sum(usage.total_tokens) * 0.0020 / 1000, where response.model LIKE '%curie%') + filter(sum(usage.total_tokens) * 0.0005 / 1000, where response.model LIKE '%babbage%') + filter(sum(usage.total_tokens) * 0.0004 / 1000, where response.model LIKE '%ada%') as 'Cost (USD)' FACET entity.guid"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (USD, per the 'Cost (USD)' alias in the query)
+    threshold: 10
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+  - priority: WARNING
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (USD, per the 'Cost (USD)' alias in the query)
+    threshold: 8
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 120
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/dashboards/langchain-vectordb/langchain-vectordb-01.png b/dashboards/langchain-vectordb/langchain-vectordb-01.png
diff --git a/dashboards/langchain-vectordb/langchain-vectordb-02.png b/dashboards/langchain-vectordb/langchain-vectordb-02.png
diff --git a/dashboards/langchain-vectordb/langchain-vectordb-03.png b/dashboards/langchain-vectordb/langchain-vectordb-03.png
diff --git a/dashboards/langchain-vectordb/langchain-vectordb-04.png b/dashboards/langchain-vectordb/langchain-vectordb-04.png
diff --git a/dashboards/langchain-vectordb/langchain-vectordb-05.png b/dashboards/langchain-vectordb/langchain-vectordb-05.png