Merge pull request #2245 from newrelic/release

Release - 01/18/2024
newrelic · Jan 18, 2024 · aa2aebc · aa2aebc
2 parents 55aa613 + 27fa8a2
commit aa2aebc
Show file tree

Hide file tree

Showing 168 changed files with 2,422 additions and 417 deletions.
diff --git a/.github/workflows/pr-open.yml b/.github/workflows/pr-open.yml
diff --git a/alert-policies/amazon-backup/HighBackupJobFailure.yml b/alert-policies/amazon-backup/HighBackupJobFailure.yml
@@ -0,0 +1,34 @@
+name: High Backup Job Failure
+
+description: |+
+  This alert is triggered if the number of Backup Job Failure exceeds 10 for 10 minutes.
+
+type: STATIC
+nrql:
+  query: >-
+    SELECT sum(`aws.backup.NumberOfBackupJobsFailed`) as 'Query' FROM Metric
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds evaluated for this condition, one entry per priority
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A violation opens once the value passes this number
+    threshold: 10
+    # Evaluation window in seconds; 120 - 3600
+    thresholdDuration: 600
+    # Which data points inside the window have to be in violation
+    thresholdOccurrences: ALL
+
+  # Optional Warning tier: same window, lower threshold
+  - priority: WARNING
+    operator: ABOVE
+    threshold: 5
+    thresholdDuration: 600
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/amazon-backup/HighCopyJobsFailure.yml b/alert-policies/amazon-backup/HighCopyJobsFailure.yml
@@ -0,0 +1,34 @@
+name: High Copy Job Failure
+
+description: |+
+  This alert is triggered if the number of Copy Job Failure exceeds 10 for 10 minutes.
+
+type: STATIC
+nrql:
+  query: >-
+    SELECT sum(`aws.backup.NumberOfCopyJobsFailed`) as 'Query' FROM Metric
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds evaluated for this condition, one entry per priority
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A violation opens once the value passes this number
+    threshold: 10
+    # Evaluation window in seconds; 120 - 3600
+    thresholdDuration: 600
+    # Which data points inside the window have to be in violation
+    thresholdOccurrences: ALL
+
+  # Optional Warning tier: same window, lower threshold
+  - priority: WARNING
+    operator: ABOVE
+    threshold: 5
+    thresholdDuration: 600
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/amazon-backup/HighRestoreJobsFailure.yml b/alert-policies/amazon-backup/HighRestoreJobsFailure.yml
@@ -0,0 +1,34 @@
+name: High Restore Job Failure
+
+description: |+
+  This alert is triggered if the number of Restore Job Failure exceeds 10 for 10 minutes.
+
+type: STATIC
+nrql:
+  query: >-
+    SELECT sum(`aws.backup.NumberOfRestoreJobsFailed`) as 'Query' FROM Metric
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds evaluated for this condition, one entry per priority
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A violation opens once the value passes this number
+    threshold: 10
+    # Evaluation window in seconds; 120 - 3600
+    thresholdDuration: 600
+    # Which data points inside the window have to be in violation
+    thresholdOccurrences: ALL
+
+  # Optional Warning tier: same window, lower threshold
+  - priority: WARNING
+    operator: ABOVE
+    threshold: 5
+    thresholdDuration: 600
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/aws-iot-twinmaker/HighComponentTypeCreationFailure.yml b/alert-policies/aws-iot-twinmaker/HighComponentTypeCreationFailure.yml
@@ -0,0 +1,34 @@
+name: High Component Type Creation Failure
+
+description: |+
+  This alert is triggered if the number of component type creation failure exceeds 10 for 5 minutes.
+
+type: STATIC
+nrql:
+  query: >-
+    SELECT sum(`aws.iottwinmaker.ComponentTypeCreationFailure`) as 'Query' FROM Metric
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds evaluated for this condition, one entry per priority
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A violation opens once the value passes this number
+    threshold: 10
+    # Evaluation window in seconds; 120 - 3600
+    thresholdDuration: 300
+    # Which data points inside the window have to be in violation
+    thresholdOccurrences: ALL
+
+  # Optional Warning tier: same window, lower threshold
+  - priority: WARNING
+    operator: ABOVE
+    threshold: 5
+    thresholdDuration: 300
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/aws-iot-twinmaker/HighEntityCreationFailure.yml b/alert-policies/aws-iot-twinmaker/HighEntityCreationFailure.yml
@@ -0,0 +1,34 @@
+name: High Entity Creation Failure
+
+description: |+
+  This alert is triggered if the number of entity creation failure exceeds 10 for 5 minutes.
+
+type: STATIC
+nrql:
+  query: >-
+    SELECT sum(`aws.iottwinmaker.EntityCreationFailure`) as 'Query' FROM Metric
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds evaluated for this condition, one entry per priority
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A violation opens once the value passes this number
+    threshold: 10
+    # Evaluation window in seconds; 120 - 3600
+    thresholdDuration: 300
+    # Which data points inside the window have to be in violation
+    thresholdOccurrences: ALL
+
+  # Optional Warning tier: same window, lower threshold
+  - priority: WARNING
+    operator: ABOVE
+    threshold: 5
+    thresholdDuration: 300
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/pinecone-prometheus/PineconeIndexFullness.yml b/alert-policies/pinecone-prometheus/PineconeIndexFullness.yml
@@ -0,0 +1,28 @@
+name: Pinecone Index Fullness
+
+description: |+
+  - This metric indicates the index's fullness on a scale from 0 to 1
+  - An alert is triggered if the value exceeds the 80% threshold
+  - Resolution: If it surpasses 80%, we need to add another replica or increase the pod size
+type: STATIC
+# Averages pinecone_index_fullness over metrics tagged as Prometheus remote-write
+nrql:
+  query: "FROM Metric SELECT average(pinecone_index_fullness) AS '(%) index fullness' WHERE instrumentation.name = 'remote-write' and instrumentation.provider = 'prometheus' LIMIT MAX"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation; fullness is on a 0 to 1 scale, so 0.8 is the 80% named in the description
+    threshold: 0.8
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/pinecone-prometheus/PineconeRequestsErrors.yml b/alert-policies/pinecone-prometheus/PineconeRequestsErrors.yml
@@ -0,0 +1,27 @@
+name: Pinecone Request Errors
+
+description: |+
+  - This metric displays the total count of data plane calls executed by clients that resulted in errors
+  - An alert is triggered if the value exceeds 0
+type: STATIC
+# Reads the latest pinecone_request_error_count_total from metrics tagged as Prometheus remote-write
+nrql:
+  query: "FROM Metric SELECT latest(pinecone_request_error_count_total) AS 'request errors' WHERE instrumentation.name = 'remote-write' and instrumentation.provider = 'prometheus' LIMIT MAX"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation; 0 so that a single error alerts, as the description states ("exceeds 0")
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/pinecone-prometheus/PineconeRequestsLatency.yml b/alert-policies/pinecone-prometheus/PineconeRequestsLatency.yml
@@ -0,0 +1,29 @@
+name: Pinecone Request Latency
+
+description: |+
+  - This metric illustrates the server-side processing latency distribution for
+  Pinecone data plane calls
+  - An alert is triggered if the 50th percentile exceeds 100 ms
+  - Resolution: If it surpasses 100 ms, we need to add another replica
+type: STATIC
+nrql:
+  query: >-
+    FROM Metric SELECT percentile(pinecone_request_latency_seconds, 50) * 1000 as 'requests latency' WHERE instrumentation.name = 'remote-write' and instrumentation.provider = 'prometheus' LIMIT MAX
+
+# How the NRQL result is reduced before it is compared with terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# Thresholds evaluated for this condition, one entry per priority
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold
+    operator: ABOVE
+    # A violation opens once the value passes this number (the query multiplies seconds by 1000)
+    threshold: 100
+    # Evaluation window in seconds; 120 - 3600
+    thresholdDuration: 300
+    # Which data points inside the window have to be in violation
+    thresholdOccurrences: ALL
+
+# Seconds after which a violation closes automatically; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/dashboards/adobe-commerce-business-insights/adobe-commerce-business-insights.json b/dashboards/adobe-commerce-business-insights/adobe-commerce-business-insights.json
@@ -3544,7 +3544,7 @@
 			  "nrqlQueries": [
 				{
 				  "accountIds": [],
-				  "query": "SELECT latest(`apache.server.scoreboard.totalWorkers`) * uniqueCount(entity.name) as 'Total Workers', latest(`apache.server.idleWorkers`) * uniqueCount(entity.name) as 'Idle Workers', latest(`apache.server.busyWorkers`) * uniqueCount(entity.name) as 'Busy Workers' FROM Metric WHERE `metricName` IN ('apache.server.scoreboard.totalWorkers', 'apache.server.idleWorkers', 'apache.server.busyWorkers') TIMESERIES"
+				  "query": "SELECT (latest(server.scoreboard.totalWorkers) * uniqueCount(entityName)) AS `Total Workers`, (latest(server.idleWorkers) * uniqueCount(entityName)) AS `Idle Workers`, (latest(server.busyWorkers) * uniqueCount(entityName)) AS `Busy Workers` FROM ApacheSample TIMESERIES"
 				}
 			  ],
 			  "platformOptions": {
@@ -3593,7 +3593,7 @@
 			  "nrqlQueries": [
 				{
 				  "accountId": 0,
-				  "query": "SELECT latest(host.cpuPercent) AS 'CPU used %' FROM Metric TIMESERIES"
+				  "query": "SELECT latest(cpuPercent) AS `CPU used %` FROM SystemSample TIMESERIES"
 				}
 			  ],
 			  "platformOptions": {
@@ -3623,7 +3623,7 @@
 			  "nrqlQueries": [
 				{
 				  "accountId": 0,
-				  "query": "SELECT latest(host.memoryUsedPercent) AS 'Memory used %' FROM Metric TIMESERIES"
+				  "query": "SELECT latest(memoryUsedPercent) AS `Memory used %` FROM SystemSample TIMESERIES"
 				}
 			  ],
 			  "platformOptions": {
@@ -3653,7 +3653,7 @@
 			  "nrqlQueries": [
 				{
 				  "accountId": 0,
-				  "query": "SELECT latest(host.disk.usedPercent) as 'Storage used %' FROM Metric TIMESERIES"
+				  "query": "SELECT latest(diskUsedPercent) AS `Storage used %` FROM StorageSample TIMESERIES"
 				}
 			  ],
 			  "platformOptions": {
@@ -3680,7 +3680,7 @@
 			  "nrqlQueries": [
 				{
 				  "accountId": 0,
-				  "query": "SELECT latest(host.disk.usedPercent) as 'Used %' FROM Metric FACET device"
+				  "query": "SELECT latest(diskUsedPercent) AS `Used %` FROM StorageSample FACET device"
 				}
 			  ],
 			  "platformOptions": {