Merge branch 'main' into add-redis-quickstart

newrelic · Sep 28, 2023 · d684d27 · d684d27
2 parents 24e4175 + 8ff6a39
commit d684d27
Show file tree

Hide file tree

Showing 35 changed files with 3,437 additions and 1,467 deletions.
diff --git a/alert-policies/amazon-appstream/HighCapacityUtilization.yml b/alert-policies/amazon-appstream/HighCapacityUtilization.yml
@@ -0,0 +1,31 @@
+name: High Capacity Utilization
+
+description: |+
+  This alert is triggered when the Capacity Utilization is above 90%.
+
+type: STATIC
+nrql:
+  # Averages the AppStream CapacityUtilization metric under the alias 'Query'.
+  query: >-
+    SELECT average(`aws.appstream.CapacityUtilization`) as 'Query'
+    FROM Metric
+
+# How the NRQL result is reduced to the value that is compared with
+# terms.threshold (SINGLE_VALUE is the default).
+valueFunction: SINGLE_VALUE
+
+# Thresholds for the condition; only a CRITICAL term is defined here.
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold.
+    operator: ABOVE
+    # Utilization percentage that triggers a violation when exceeded.
+    threshold: 90
+    # Length of the evaluation period in seconds (allowed: 120 - 3600).
+    thresholdDuration: 300
+    # Every data point in the duration must be in violation.
+    thresholdOccurrences: ALL
+
+# Seconds after which an open violation is closed automatically
+# (allowed: 300 - 2592000; default: 86400, i.e. 1 day).
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/amazon-appstream/HighInsufficientCapacityErrors.yml b/alert-policies/amazon-appstream/HighInsufficientCapacityErrors.yml
@@ -0,0 +1,27 @@
+name: High Insufficient Capacity Errors
+
+description: |+
+  This alert is triggered when Insufficient Capacity Errors are above 10 in 10 minutes.
+
+type: STATIC
+nrql:
+  query: "SELECT count(`aws.appstream.InsufficientCapacityError`) as 'Query' FROM Metric"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation; kept equal to the "above 10" stated in the description
+    threshold: 10
+    # Time in seconds; 120 - 3600 (600 = the 10 minutes stated in the description)
+    thresholdDuration: 600
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/azure-machine-learning/ModelDeployFailed.yml b/alert-policies/azure-machine-learning/ModelDeployFailed.yml
@@ -0,0 +1,38 @@
+name: Model Deployment Failed
+
+description: |+
+  This alert is triggered if the number of Failure exceeds 20 within 10 minutes.
+type: STATIC
+nrql:
+  # Sums the workspace ModelDeployFailed metric under the alias 'ModelDeployFailed'.
+  query: >-
+    FROM Metric
+    SELECT sum(azure.machinelearningservices.workspaces.ModelDeployFailed)
+    AS 'ModelDeployFailed'
+
+# How the NRQL result is reduced to the value that is compared with
+# terms.threshold (SINGLE_VALUE is the default).
+valueFunction: SINGLE_VALUE
+
+# Thresholds for the condition: one CRITICAL term and one WARNING term.
+terms:
+  - priority: CRITICAL
+    # Comparison applied between the query value and the threshold.
+    operator: ABOVE
+    # Failure count that triggers a violation when exceeded.
+    threshold: 20
+    # Length of the evaluation period in seconds (allowed: 120 - 3600).
+    thresholdDuration: 600
+    # Every data point in the duration must be in violation.
+    thresholdOccurrences: ALL
+
+  # The WARNING term is optional; it uses a lower threshold over the same duration.
+  - priority: WARNING
+    operator: ABOVE
+    threshold: 10
+    thresholdDuration: 600
+    thresholdOccurrences: ALL
+
+# Seconds after which an open violation is closed automatically
+# (allowed: 300 - 2592000; default: 86400, i.e. 1 day).
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/vmware-vsphere/cluster-overall-status.yml b/alert-policies/vmware-vsphere/cluster-overall-status.yml
@@ -0,0 +1,29 @@
+name: vSphere Cluster overallStatus = 'red'
+description: |+
+  This alert fires when a vSphere Cluster has an overall status = 'red' for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts cluster samples with overallStatus = 'red', faceted by datacenterName and displayName.
+  query: "FROM VSphereClusterSample SELECT count(*) FACET datacenterName, displayName WHERE overallStatus = 'red'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/datacenter-overall-status.yml b/alert-policies/vmware-vsphere/datacenter-overall-status.yml
@@ -0,0 +1,29 @@
+name: vSphere Datacenter overallStatus = 'red'
+description: |+
+  This alert fires when a vSphere Datacenter has an overall status = 'red' for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts datacenter samples with overallStatus = 'red', faceted by datacenterName.
+  query: "FROM VSphereDatacenterSample SELECT count(*) FACET datacenterName WHERE overallStatus = 'red'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/datastore-accessible.yml b/alert-policies/vmware-vsphere/datastore-accessible.yml
@@ -0,0 +1,29 @@
+name: vSphere Datastore is not accessible
+description: |+
+  This alert fires when a vSphere Datastore is not accessible for longer than 5 minutes, indicating a loss of connectivity.
+type: STATIC
+nrql:
+  # Counts datastore samples with accessible = 'false', faceted by datacenterName, name and displayName.
+  query: "FROM VSphereDatastoreSample SELECT count(*) FACET datacenterName, name, displayName WHERE accessible = 'false'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/datastore-capacity-percent.yml b/alert-policies/vmware-vsphere/datastore-capacity-percent.yml
@@ -0,0 +1,29 @@
+name: vSphere Datastore high Capacity Utilization
+description: |+
+  This alert fires when a vSphere Datastore has capacity utilization % > 90 for longer than 10 minutes.
+type: STATIC
+nrql:
+  # Used space as a percentage of capacity: (capacity - freeSpace) / capacity * 100, per datastore facet.
+  query: "FROM VSphereDatastoreSample SELECT ((max(capacity) - max(freeSpace)) / max(capacity)) * 100 FACET datacenterName, name, displayName"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (percentage computed by the query)
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 600
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/datastore-overall-status.yml b/alert-policies/vmware-vsphere/datastore-overall-status.yml
@@ -0,0 +1,29 @@
+name: vSphere Datastore overallStatus = 'red'
+description: |+
+  This alert fires when a vSphere Datastore has an overall status = 'red' for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts datastore samples with overallStatus = 'red', faceted by datacenterName, name and displayName.
+  query: "FROM VSphereDatastoreSample SELECT count(*) FACET datacenterName, name, displayName WHERE overallStatus = 'red'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/host-connection-state.yml b/alert-policies/vmware-vsphere/host-connection-state.yml
@@ -0,0 +1,29 @@
+name: vSphere Host connection lost
+description: |+
+  This alert fires when a vSphere Host is not responding to heartbeats for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts host samples with connectionState = 'notResponding', faceted by datacenter, cluster and host.
+  query: "FROM VSphereHostSample SELECT count(*) FACET datacenterName, clusterName, hypervisorHostname WHERE connectionState = 'notResponding'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/host-cpu-percent.yml b/alert-policies/vmware-vsphere/host-cpu-percent.yml
@@ -0,0 +1,29 @@
+name: vSphere Host high CPU Utilization
+description: |+
+  This alert fires when a vSphere Host has a CPU utilization % > 90 for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Maximum cpu.percent reported per host, faceted by datacenter, cluster and host.
+  query: "FROM VSphereHostSample SELECT max(cpu.percent) FACET datacenterName, clusterName, hypervisorHostname"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (CPU utilization percentage)
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/host-memory-percent.yml b/alert-policies/vmware-vsphere/host-memory-percent.yml
@@ -0,0 +1,29 @@
+name: vSphere Host high Memory Utilization
+description: |+
+  This alert fires when a vSphere Host has memory utilization % > 90 for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Memory usage as a percentage of memory size: mem.usage / mem.size * 100, per host facet.
+  query: "FROM VSphereHostSample SELECT (max(mem.usage) / max(mem.size)) * 100 FACET datacenterName, clusterName, hypervisorHostname"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (percentage computed by the query)
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/host-overall-status.yml b/alert-policies/vmware-vsphere/host-overall-status.yml
@@ -0,0 +1,29 @@
+name: vSphere Host overallStatus = 'red'
+description: |+
+  This alert fires when a vSphere Host has an overall status = 'red' for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts host samples with overallStatus = 'red', faceted by datacenter, cluster and host.
+  query: "FROM VSphereHostSample SELECT count(*) FACET datacenterName, clusterName, hypervisorHostname WHERE overallStatus = 'red'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/resourcepool-cpu-percent.yml b/alert-policies/vmware-vsphere/resourcepool-cpu-percent.yml
@@ -0,0 +1,29 @@
+name: vSphere Resource Pool high CPU Utilization
+description: |+
+  This alert fires when a vSphere Resource Pool has a CPU utilization % > 90 for longer than 5 minutes.
+type: STATIC
+nrql:
+  # CPU usage as a percentage of total MHz: cpu.overallUsage / cpu.totalMHz * 100, per resource pool facet.
+  query: "FROM VSphereResourcePoolSample SELECT (max(cpu.overallUsage) / max(cpu.totalMHz)) * 100 FACET datacenterName, clusterName, resourcePoolName"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (percentage computed by the query)
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/resourcepool-memory-percent.yml b/alert-policies/vmware-vsphere/resourcepool-memory-percent.yml
@@ -0,0 +1,29 @@
+name: vSphere Resource Pool high Memory Utilization
+description: |+
+  This alert fires when a vSphere Resource Pool has memory utilization % > 90 for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Memory usage as a percentage of memory size: mem.usage / mem.size * 100, per resource pool facet.
+  query: "FROM VSphereResourcePoolSample SELECT (max(mem.usage) / max(mem.size)) * 100 FACET datacenterName, clusterName, resourcePoolName"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (percentage computed by the query)
+    threshold: 90
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/resourcepool-overall-status.yml b/alert-policies/vmware-vsphere/resourcepool-overall-status.yml
@@ -0,0 +1,29 @@
+name: vSphere Resource Pool overallStatus = 'red'
+description: |+
+  This alert fires when a vSphere Resource Pool has an overall status = 'red' for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts resource pool samples with overallStatus = 'red', faceted by datacenter, cluster and pool.
+  query: "FROM VSphereResourcePoolSample SELECT count(*) FACET datacenterName, clusterName, resourcePoolName WHERE overallStatus = 'red'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200
diff --git a/alert-policies/vmware-vsphere/vm-overall-status.yml b/alert-policies/vmware-vsphere/vm-overall-status.yml
@@ -0,0 +1,29 @@
+name: vSphere Virtual Machine overallStatus = 'red'
+description: |+
+  This alert fires when a vSphere Virtual Machine has an overall status = 'red' for longer than 5 minutes.
+type: STATIC
+nrql:
+  # Counts VM samples with overallStatus = 'red', faceted by datacenterName, clusterName and displayName.
+  query: "FROM VSphereVmSample SELECT count(*) FACET datacenterName, clusterName, displayName WHERE overallStatus = 'red'"
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation (any matching sample pushes the count above 0)
+    threshold: 0
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+# Signal aggregation settings for the condition
+signal:
+  aggregationDelay: 120
+  aggregationMethod: EVENT_FLOW
+  aggregationWindow: 60
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 259200