From 40be71066c57f116e5a0a5a3329ba521ef5ce0f8 Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Tue, 17 Sep 2024 12:22:41 -0500 Subject: [PATCH] feat: Add new quickstart and alert policies for k8s otel --- .../ContainerCPUThrottling.yaml | 76 ++++++++++++++++++ .../ContainerHighCPUUtil.yaml | 76 ++++++++++++++++++ .../ContainerHighMemUtil.yaml | 76 ++++++++++++++++++ .../ContainerRestarting.yaml | 73 ++++++++++++++++++ .../ContainerWaiting.yaml | 73 ++++++++++++++++++ .../DaemonsetPodsMissing.yaml | 73 ++++++++++++++++++ .../DeploymentPodsMissing.yaml | 73 ++++++++++++++++++ .../JobFailedOtel.yaml | 77 +++++++++++++++++++ .../NodeHighAllocatableCPUUtil.yaml | 76 ++++++++++++++++++ .../NodeHighAllocatableMemUtil.yaml | 76 ++++++++++++++++++ .../NodeHighFSCapacityUtil.yaml | 76 ++++++++++++++++++ .../NodeIsNotReady.yaml | 77 +++++++++++++++++++ .../NodePodCapacity.yaml | 77 +++++++++++++++++++ .../PersistentVolumeErrors.yaml | 73 ++++++++++++++++++ .../kubernetes-opentelemetry/PodNotReady.yaml | 77 +++++++++++++++++++ .../PodNotScheduled.yaml | 76 ++++++++++++++++++ .../PodsFailingNamespace.yaml | 73 ++++++++++++++++++ .../StatefulsetPodsMissing.yaml | 73 ++++++++++++++++++ .../kubernetes-opentelemetry/config.yml | 54 +++++++++++++ quickstarts/kubernetes-opentelemetry/logo.svg | 1 + 20 files changed, 1406 insertions(+) create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml create mode 100644 
alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PodNotReady.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml create mode 100644 quickstarts/kubernetes-opentelemetry/config.yml create mode 100644 quickstarts/kubernetes-opentelemetry/logo.svg diff --git a/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml b/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml new file mode 100644 index 0000000000..896c786d39 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml @@ -0,0 +1,76 @@ +name: Container cpu throttling is high +# Description and details +description: | + Alert when container is being throttled > 90% of the time for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(container_cpu_cfs_throttled_periods_total) / latest(container_cpu_cfs_periods_total)* 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | 
UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. + # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. 
+ aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. + evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml b/alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml new file mode 100644 index 0000000000..625670c141 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml @@ -0,0 +1,76 @@ +name: Container high cpu utilization +# Description and details +description: | + Alert when the average container cpu utilization (vs. 
Limit) is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select average(container.cpu.utilization) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml b/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml new file mode 100644 index 0000000000..fe3db565c0 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml @@ -0,0 +1,76 @@ +name: Container high memory utilization +# Description and details +description: | + Alert when the average container memory utilization (vs. Limit) is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select max(container_memory_working_set_bytes) / filter(max(kube_pod_container_resource_limits), where resource = 'memory') * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml b/alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml new file mode 100644 index 0000000000..9df6519ebd --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml @@ -0,0 +1,73 @@ +name: Container is Restarting + +# Description and details +description: | + Alert when the container restart count is greater than 0 in a sliding 5 minute window + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select sum(kube_pod_container_status_restarts_total) where metricName = 'kube_pod_container_status_restarts_total' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml b/alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml new file mode 100644 index 0000000000..d47e2bc813 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml @@ -0,0 +1,73 @@ +name: Container is Waiting + +# Description and details +description: | + Alert when a container is Waiting for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select uniqueCount(k8s.pod.name) WHERE container_phase = 'waiting' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml new file mode 100644 index 0000000000..3dd5ae52f3 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml @@ -0,0 +1,73 @@ +name: Daemonset is missing Pods + +# Description and details +description: | + Alert when Daemonset is missing Pods for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_daemonset_status_desired_number_scheduled) - latest(kube_daemonset_status_number_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.daemonset.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml new file mode 100644 index 0000000000..0dc3e4f664 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml @@ -0,0 +1,73 @@ +name: Deployment is missing Pods + +# Description and details +description: | + Alert when Deployment is missing Pods for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_deployment_spec_replicas) - latest(kube_deployment_status_replicas) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.deployment.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml b/alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml new file mode 100644 index 0000000000..2b5c52374e --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml @@ -0,0 +1,77 @@ +name: Job Failed + +# Description and details +description: | + Alert when a Job reports a failed status + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select if(latest(kube_job_failed), uniqueCount(job_name), 0) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet job_name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + # - priority: CRITICAL + # # Operator used to compare against the threshold. + # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 300 + # # How many data points must be in violation for the duration + # thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + - priority: WARNING + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 60 + # How many data points must be in violation for the duration + thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: null + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml new file mode 100644 index 0000000000..ecf61d4fe2 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml @@ -0,0 +1,76 @@ +name: Node allocatable cpu utilization is high +# Description and details +description: | + Alert when the average Node allocatable cpu utilization is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(k8s.node.cpu.utilization) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 900 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml new file mode 100644 index 0000000000..da0e55aa99 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml @@ -0,0 +1,76 @@ +name: Node allocatable memory utilization is high +# Description and details +description: | + Alert when the average Node allocatable memory utilization is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (working set / allocatable memory, scaled by 100 so the result is a percentage comparable to the threshold) +nrql: + query: "from Metric select latest(k8s.node.memory.working_set) / filter(latest(kube_node_status_allocatable), WHERE resource = 'memory') * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold.
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 900 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml new file mode 100644 index 0000000000..a09b71d454 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml @@ -0,0 +1,76 @@ +name: Node root file system capacity utilization is high +# Description and details +description: | + Alert when the average Node root file system capacity utilization is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (filesystem usage / capacity, scaled by 100 so the result is a percentage comparable to the threshold) +nrql: + query: "from Metric select max(k8s.node.filesystem.usage) / max(k8s.node.filesystem.capacity) * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold.
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 900 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml b/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml new file mode 100644 index 0000000000..b46ee313d6 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml @@ -0,0 +1,77 @@ +name: Node is not ready + +# Description and details +description: | + Alert when a Node is not ready for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (expected to be 1 while the node's Ready condition is true and 0 otherwise, so "BELOW 1" means not ready) +nrql: + query: "from Metric select latest(kube_node_status_condition) where condition = 'Ready' and status = 'true' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold.
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 300 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml b/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml new file mode 100644 index 0000000000..09c20296fd --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml @@ -0,0 +1,77 @@ +name: Node Pod count nearing capacity + +# Description and details +description: | + Alert when the Running pod count on a Node is > 90% of the Node's Pod Capacity for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (running non-Job pods on the node / the node's allocatable pod count, as a percentage) +nrql: + query: "FROM Metric select filter(uniqueCount(k8s.pod.name), where phase = 'Running' AND (metricName = 'kube_pod_status_phase' AND kube_pod_status_phase ['latest'] = 1) and created_by_kind != 'Job' ) / filter(latest(kube_node_status_allocatable), WHERE resource = 'pods' ) * 100 as 'Pod Capacity %' where k8s.node.name != '' and k8s.node.name is not null and k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + # - priority: CRITICAL + # # Operator used to compare against the threshold.
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 90 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 300 + # # How many data points must be in violation for the duration + # thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. 
+ fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. + evaluationDelay: 300 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml b/alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml new file mode 100644 index 0000000000..73fd69119f --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml @@ -0,0 +1,73 @@ +name: Persistent Volume has errors + +# Description and details +description: | + Alert when Persistent Volume is in a Failed or Pending state for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select uniqueCount(persistentvolume) where phase in ('Failed','Pending') and k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet persistentvolume, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PodNotReady.yaml b/alert-policies/kubernetes-opentelemetry/PodNotReady.yaml new file mode 100644 index 0000000000..1418e1189e --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PodNotReady.yaml @@ -0,0 +1,77 @@ +name: Pod is not ready + +# Description and details +description: | + Alert when a Pod is not ready for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_pod_status_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml b/alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml new file mode 100644 index 0000000000..38aea1d89d --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml @@ -0,0 +1,76 @@ +name: Pod cannot be scheduled + +# Description and details +description: | + Alert when a Pod cannot be scheduled for more than 5 minutes +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_pod_status_scheduled) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') and metricName = 'kube_pod_status_scheduled' facet k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml b/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml new file mode 100644 index 0000000000..e8926f955c --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml @@ -0,0 +1,73 @@ +name: More than 5 pods failing in namespace + +# Description and details +description: | + Alert when more than 5 pods are failing in a namespace for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select uniqueCount(k8s.pod.name) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') and phase = 'Failed' facet k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation (the condition is named "More than 5 pods failing in namespace") + threshold: 5 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition.
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml new file mode 100644 index 0000000000..7452575bb6 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml @@ -0,0 +1,73 @@ +name: Statefulset is missing Pods + +# Description and details +description: | + Alert when Statefulset is missing Pods for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (desired replicas minus ready replicas; any value above 0 means Pods are missing) +nrql: + query: "from Metric select latest(kube_statefulset_replicas) - latest(kube_statefulset_status_replicas_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.statefulset.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold.
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/quickstarts/kubernetes-opentelemetry/config.yml b/quickstarts/kubernetes-opentelemetry/config.yml new file mode 100644 index 0000000000..0a2b17214f --- /dev/null +++ b/quickstarts/kubernetes-opentelemetry/config.yml @@ -0,0 +1,54 @@ +slug: kubernetes-opentelemetry +title: Kubernetes (OpenTelemetry) +description: | + ## Why monitor Kubernetes? + + Kubernetes is an open-source system for automating deployment, scaling, and management of containerized applications. The New Relic Kubernetes monitoring quickstart gives you visibility into your Kubernetes clusters and workloads in minutes, whether your clusters are hosted on-premises or in the cloud. + + ### Kubernetes quickstart highlights + + The New Relic Kubernetes quickstart uses dashboards to proactively monitor your metrics, like: + + - resources used + - number of K8s objects + - namespaces per cluster + - pods by namespace + - container CPU usage + - container restarts + - missing pods by deployment + - node resource consumption, and more. + + This quickstart is also compatible with on-host integrations like: + + - Cassandra + - MySQL + - Apache, and more. + + ### New Relic + Kubernetes = Optimum performance monitoring + + The [New Relic Kubernetes](https://docs.newrelic.com/docs/integrations/kubernetes-integration/installation/kubernetes-integration-install-configure/) quickstart has multiple components that work together to give you end-to-end observability across your clusters. While you have the flexibility to deploy the components that you prefer, to achieve full observability, you need to install the complete package to monitor all metrics. Use our quickstart to generate a Kubernetes manifest and add Pixie for more fine-grained telemetry data.
You can also do the installation with Pixie for fine-grained telemetry data. + Our quickstart monitors the aggregated core and memory usage across all nodes in your cluster. This allows you to meet resource requirements for optimal application performance. It also empowers you to track resource consumption, find pods that aren't running, monitor disk usage, and troubleshoot container restarts. The New Relic Kubernetes integration has dashboards and a cluster explorer that provide a multi-dimensional representation of a Kubernetes cluster from which you can explore your namespaces, deployments, nodes, pods, containers, and applications. Download the New Relic Kubernetes quickstart today to gain instant visibility into your Kubernetes services, clusters and workloads in minutes. +summary: | + Monitoring Kubernetes is crucial to gain instant visibility into Kubernetes clusters and workloads. Download the New Relic Kubernetes quickstart to proactively monitor Kubernetes cluster health and capacity. +level: New Relic +authors: + - New Relic +documentation: + - name: Kubernetes installation docs + description: | + Kubernetes is an open-source container-orchestration system for automating + computer application deployment, scaling, and management. + url: >- + https://docs.newrelic.com/docs/integrations/host-integrations/host-integrations-list/kubernetes-monitoring-integration +dataSourceIds: + - kubernetes +keywords: + - kubernetes + - containers + - pixie + - k8s + - opentelemetry +dashboards: + - kubernetes +alertPolicies: + - kubernetes-opentelemetry diff --git a/quickstarts/kubernetes-opentelemetry/logo.svg b/quickstarts/kubernetes-opentelemetry/logo.svg new file mode 100644 index 0000000000..1efad8e7a6 --- /dev/null +++ b/quickstarts/kubernetes-opentelemetry/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file