diff --git a/controllers/hyperconverged/hyperconverged_controller.go b/controllers/hyperconverged/hyperconverged_controller.go
index 65fd3058e7..a5c4bf384f 100644
--- a/controllers/hyperconverged/hyperconverged_controller.go
+++ b/controllers/hyperconverged/hyperconverged_controller.go
@@ -1014,8 +1014,6 @@ func (r *ReconcileHyperConverged) updateConditions(req *common.HcoRequest) {
 		req.Instance.Status.SystemHealthStatus = systemHealthStatus
 		req.StatusDirty = true
 	}
-
-	metrics.SetHCOMetricSystemHealthStatus(getNumericalHealthStatus(systemHealthStatus))
 }
 
 func (r *ReconcileHyperConverged) setLabels(req *common.HcoRequest) {
@@ -1071,33 +1069,64 @@ func (r *ReconcileHyperConverged) detectTaintedConfiguration(req *common.HcoRequ
 }
 
+// getSystemHealthStatus classifies the aggregated conditions as error, warning
+// or healthy and, as a side effect, publishes that state together with its
+// reason to the system health status metric.
 func (r *ReconcileHyperConverged) getSystemHealthStatus(conditions common.HcoConditions) string {
-	if isSystemHealthStatusError(conditions) {
+	if isError, reason := isSystemHealthStatusError(conditions); isError {
+		metrics.SetHCOSystemError(reason)
 		return systemHealthStatusError
 	}
 
-	if isSystemHealthStatusWarning(conditions) {
+	if isWarning, reason := isSystemHealthStatusWarning(conditions); isWarning {
+		metrics.SetHCOSystemWarning(reason)
 		return systemHealthStatusWarning
 	}
 
+	metrics.SetHCOSystemHealthy()
 	return systemHealthStatusHealthy
 }
 
-func isSystemHealthStatusError(conditions common.HcoConditions) bool {
-	return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionAvailable) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionDegraded)
-}
+// missingConditionReason is the reason reported when a condition that must be
+// True is absent from the aggregated conditions.
+const missingConditionReason = "ConditionMissing"
+
+// isSystemHealthStatusError reports whether the conditions amount to the error
+// state (Degraded is True, or Available is absent or not True) and the reason
+// to publish for it. Absence counts as an error, which is presumably how the
+// former !IsStatusConditionTrue(Available) check behaved (TODO confirm).
+func isSystemHealthStatusError(conditions common.HcoConditions) (bool, string) {
+	if cond, found := conditions.GetCondition(hcov1beta1.ConditionDegraded); found && cond.Status == metav1.ConditionTrue {
+		return true, cond.Reason
+	}
 
-func isSystemHealthStatusWarning(conditions common.HcoConditions) bool {
-	return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionReconcileComplete) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionProgressing)
+	cond, found := conditions.GetCondition(hcov1beta1.ConditionAvailable)
+	if !found {
+		return true, missingConditionReason
+	}
+	if cond.Status != metav1.ConditionTrue {
+		return true, cond.Reason
+	}
+
+	return false, ""
 }
 
-func getNumericalHealthStatus(status string) float64 {
-	healthStatusCodes := map[string]float64{
-		systemHealthStatusHealthy: metrics.SystemHealthStatusHealthy,
-		systemHealthStatusWarning: metrics.SystemHealthStatusWarning,
-		systemHealthStatusError:   metrics.SystemHealthStatusError,
+// isSystemHealthStatusWarning reports whether the conditions amount to the
+// warning state (Progressing is True, or ReconcileComplete is absent or not
+// True) and the reason to publish for it.
+func isSystemHealthStatusWarning(conditions common.HcoConditions) (bool, string) {
+	if cond, found := conditions.GetCondition(hcov1beta1.ConditionProgressing); found && cond.Status == metav1.ConditionTrue {
+		return true, cond.Reason
+	}
+
+	cond, found := conditions.GetCondition(hcov1beta1.ConditionReconcileComplete)
+	if !found {
+		return true, missingConditionReason
+	}
+	if cond.Status != metav1.ConditionTrue {
+		return true, cond.Reason
 	}
 
-	return healthStatusCodes[status]
+	return false, ""
 }
 
 func getNumOfChangesJSONPatch(jsonPatch string) int {
diff --git a/controllers/hyperconverged/hyperconverged_controller_test.go b/controllers/hyperconverged/hyperconverged_controller_test.go
index d1a32c7dc7..de36cee34a 100644
--- a/controllers/hyperconverged/hyperconverged_controller_test.go
+++ b/controllers/hyperconverged/hyperconverged_controller_test.go
@@ -3888,7 +3888,7 @@ func verifyHyperConvergedCRExistsMetricFalse() {
 func verifySystemHealthStatusHealthy(hco *hcov1beta1.HyperConverged) {
 	ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusHealthy))
 
-	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
+	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus("healthy")
 	ExpectWithOffset(1, err).ToNot(HaveOccurred())
 	ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusHealthy))
 }
@@ -3896,7 +3896,7 @@ func verifySystemHealthStatusHealthy(hco *hcov1beta1.HyperConverged) {
 func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged) {
 	ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusError))
 
-	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
+	systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus(reconcileInit)
 	ExpectWithOffset(1, err).ToNot(HaveOccurred())
 	ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusError))
 }
diff --git a/hack/prom-rule-ci/prom-rules-tests.yaml b/hack/prom-rule-ci/prom-rules-tests.yaml
index 49decf236e..7f306c9e0f 100755
--- a/hack/prom-rule-ci/prom-rules-tests.yaml
+++ b/hack/prom-rule-ci/prom-rules-tests.yaml
@@ -691,7 +691,7 @@ tests:
     - labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
       value: 2
 
-  # Test kubevirt_hco_misconfigured_descheduler
+# Test kubevirt_hco_misconfigured_descheduler
 - interval: 1m
   input_series:
   - series: 'kubevirt_hco_misconfigured_descheduler'
@@ -770,3 +770,49 @@ tests:
         operator_health_impact: "critical"
         kubernetes_operator_part_of: "kubevirt"
         kubernetes_operator_component: "hyperconverged-cluster-operator"
+
+# Test OperatorConditionsUnhealthy
+- interval: 1m
+  input_series:
+  - series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}'
+    values: "stale stale 2 stale"
+
+  - series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}'
+    values: "stale stale stale stale 1 stale"
+
+  alert_rule_test:
+  - eval_time: 1m
+    alertname: OperatorConditionsUnhealthy
+    exp_alerts: [ ]
+
+  - eval_time: 3m
+    alertname: OperatorConditionsUnhealthy
+    exp_alerts: [ ]
+
+  - eval_time: 2m
+    alertname: OperatorConditionsUnhealthy
+    exp_alerts:
+    - exp_annotations:
+        description: "HCO and its secondary resources are in a critical state due to SOME_ERROR."
+        summary: "HCO and its secondary resources are in a critical state."
+        runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
+      exp_labels:
+        severity: "critical"
+        operator_health_impact: "critical"
+        kubernetes_operator_part_of: "kubevirt"
+        kubernetes_operator_component: "hyperconverged-cluster-operator"
+        reason: "SOME_ERROR"
+
+  - eval_time: 4m
+    alertname: OperatorConditionsUnhealthy
+    exp_alerts:
+    - exp_annotations:
+        description: "HCO and its secondary resources are in a warning state due to SOME_WARNING."
+        summary: "HCO and its secondary resources are in a warning state."
+        runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
+      exp_labels:
+        severity: "warning"
+        operator_health_impact: "warning"
+        kubernetes_operator_part_of: "kubevirt"
+        kubernetes_operator_component: "hyperconverged-cluster-operator"
+        reason: "SOME_WARNING"
diff --git a/pkg/monitoring/metrics/operator_metrics.go b/pkg/monitoring/metrics/operator_metrics.go
index f51dc69ec0..cf6463bf7e 100644
--- a/pkg/monitoring/metrics/operator_metrics.go
+++ b/pkg/monitoring/metrics/operator_metrics.go
@@ -52,11 +52,12 @@ var (
 		},
 	)
 
-	systemHealthStatus = operatormetrics.NewGauge(
+	systemHealthStatus = operatormetrics.NewGaugeVec(
 		operatormetrics.MetricOpts{
 			Name: "kubevirt_hco_system_health_status",
 			Help: "Indicates whether the system health status is healthy (0), warning (1), or error (2), by aggregating the conditions of HCO and its secondary resources",
 		},
+		[]string{"reason"},
 	)
 )
 
@@ -117,15 +118,36 @@ func IsHCOMetricHyperConvergedExists() (bool, error) {
 	return value == hyperConvergedExists, nil
 }
 
-// SetHCOMetricSystemHealthStatus sets the gauge to status
-func SetHCOMetricSystemHealthStatus(status float64) {
-	systemHealthStatus.Set(status)
+// SetHCOSystemHealthy sets the system health status gauge to healthy, under the
+// "healthy" reason label.
+func SetHCOSystemHealthy() {
+	setSystemHealthStatus("healthy", SystemHealthStatusHealthy)
+}
+
+// SetHCOSystemWarning sets the system health status gauge to warning, under the
+// given reason label.
+func SetHCOSystemWarning(reason string) {
+	setSystemHealthStatus(reason, SystemHealthStatusWarning)
+}
+
+// SetHCOSystemError sets the system health status gauge to error, under the
+// given reason label.
+func SetHCOSystemError(reason string) {
+	setSystemHealthStatus(reason, SystemHealthStatusError)
+}
+
+// setSystemHealthStatus publishes status under the given reason label. The
+// vector is reset first so that series left over from earlier reasons do not
+// keep their last value and leave the health alerts firing.
+func setSystemHealthStatus(reason string, status float64) {
+	systemHealthStatus.Reset()
+	systemHealthStatus.WithLabelValues(reason).Set(status)
 }
 
 // GetHCOMetricSystemHealthStatus returns current value of gauge. If error is not nil then value is undefined
-func GetHCOMetricSystemHealthStatus() (float64, error) {
+func GetHCOMetricSystemHealthStatus(reason string) (float64, error) {
 	dto := &ioprometheusclient.Metric{}
-	err := systemHealthStatus.Write(dto)
+	err := systemHealthStatus.WithLabelValues(reason).Write(dto)
 	value := dto.Gauge.GetValue()
 
 	if err != nil {
diff --git a/pkg/monitoring/rules/alerts/alerts.go b/pkg/monitoring/rules/alerts/alerts.go
index 347cb0ae45..1c13b0f198 100644
--- a/pkg/monitoring/rules/alerts/alerts.go
+++ b/pkg/monitoring/rules/alerts/alerts.go
@@ -23,6 +23,7 @@ const (
 func Register(operatorRegistry *operatorrules.Registry) error {
 	alerts := [][]promv1.Rule{
 		operatorAlerts(),
+		healthAlerts(),
 	}
 
 	runbookURLTemplate := getRunbookURLTemplate()
diff --git a/pkg/monitoring/rules/alerts/health_alerts.go b/pkg/monitoring/rules/alerts/health_alerts.go
new file mode 100644
index 0000000000..dcc3d27719
--- /dev/null
+++ b/pkg/monitoring/rules/alerts/health_alerts.go
@@ -0,0 +1,38 @@
+package alerts
+
+import (
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+)
+
+// healthAlerts returns the alert rules driven by the
+// kubevirt_hco_system_health_status metric: a critical rule for the error
+// value (2) and a warning rule for the warning value (1).
+func healthAlerts() []promv1.Rule {
+	return []promv1.Rule{
+		{
+			Alert: "OperatorConditionsUnhealthy",
+			Expr:  intstr.FromString("kubevirt_hco_system_health_status == 2"),
+			Annotations: map[string]string{
+				"description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.",
+				"summary":     "HCO and its secondary resources are in a critical state.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "critical",
+				healthImpactAlertLabelKey: "critical",
+			},
+		},
+		{
+			Alert: "OperatorConditionsUnhealthy",
+			Expr:  intstr.FromString("kubevirt_hco_system_health_status == 1"),
+			Annotations: map[string]string{
+				"description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.",
+				"summary":     "HCO and its secondary resources are in a warning state.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "warning",
+				healthImpactAlertLabelKey: "warning",
+			},
+		},
+	}
+}
diff --git a/tests/func-tests/hco_prometheus_route.go b/tests/func-tests/hco_prometheus_route.go
index 430f7789ac..5df5cb3dac 100644
--- a/tests/func-tests/hco_prometheus_route.go
+++ b/tests/func-tests/hco_prometheus_route.go
@@ -96,7 +96,12 @@ func (hcoCli HCOPrometheusClient) GetHCOMetric(ctx context.Context, query string
 	for scanner.Scan() {
 		line := scanner.Text()
 		if strings.HasPrefix(line, query) {
-			res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
+			// The sample value is the second whitespace-separated field of the line.
+			parts := strings.Fields(line)
+			if len(parts) < 2 {
+				return 0, fmt.Errorf("metric line %q does not contain a value", line)
+			}
+			res, err := strconv.ParseFloat(parts[1], 64)
 			if err != nil {
 				return 0, fmt.Errorf("error converting %s to int: %v\n", line, err)
 			}