Skip to content

Commit

Permalink
Add OperatorConditionsUnhealthy alerts (#3144)
Browse files Browse the repository at this point in the history
* Add reason label to kubevirt_hco_system_health_status

Signed-off-by: João Vilaça <[email protected]>

* Add OperatorConditionsUnhealthy alerts

Signed-off-by: João Vilaça <[email protected]>

* Fix GetHCOMetric

Signed-off-by: João Vilaça <[email protected]>

---------

Signed-off-by: João Vilaça <[email protected]>
  • Loading branch information
machadovilaca authored Nov 11, 2024
1 parent 04327e1 commit 726acfe
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 25 deletions.
37 changes: 22 additions & 15 deletions controllers/hyperconverged/hyperconverged_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1014,8 +1014,6 @@ func (r *ReconcileHyperConverged) updateConditions(req *common.HcoRequest) {
req.Instance.Status.SystemHealthStatus = systemHealthStatus
req.StatusDirty = true
}

metrics.SetHCOMetricSystemHealthStatus(getNumericalHealthStatus(systemHealthStatus))
}

func (r *ReconcileHyperConverged) setLabels(req *common.HcoRequest) {
Expand Down Expand Up @@ -1071,33 +1069,42 @@ func (r *ReconcileHyperConverged) detectTaintedConfiguration(req *common.HcoRequ
}

func (r *ReconcileHyperConverged) getSystemHealthStatus(conditions common.HcoConditions) string {
if isSystemHealthStatusError(conditions) {
if isError, reason := isSystemHealthStatusError(conditions); isError {
metrics.SetHCOSystemError(reason)
return systemHealthStatusError
}

if isSystemHealthStatusWarning(conditions) {
if isWarning, reason := isSystemHealthStatusWarning(conditions); isWarning {
metrics.SetHCOSystemWarning(reason)
return systemHealthStatusWarning
}

metrics.SetHCOSystemHealthy()
return systemHealthStatusHealthy
}

func isSystemHealthStatusError(conditions common.HcoConditions) bool {
return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionAvailable) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionDegraded)
}
func isSystemHealthStatusError(conditions common.HcoConditions) (bool, string) {
if cond, found := conditions.GetCondition(hcov1beta1.ConditionDegraded); found && cond.Status == metav1.ConditionTrue {
return true, cond.Reason
}

func isSystemHealthStatusWarning(conditions common.HcoConditions) bool {
return !conditions.IsStatusConditionTrue(hcov1beta1.ConditionReconcileComplete) || conditions.IsStatusConditionTrue(hcov1beta1.ConditionProgressing)
if cond, found := conditions.GetCondition(hcov1beta1.ConditionAvailable); found && cond.Status != metav1.ConditionTrue {
return true, cond.Reason
}

return false, ""
}

func getNumericalHealthStatus(status string) float64 {
healthStatusCodes := map[string]float64{
systemHealthStatusHealthy: metrics.SystemHealthStatusHealthy,
systemHealthStatusWarning: metrics.SystemHealthStatusWarning,
systemHealthStatusError: metrics.SystemHealthStatusError,
func isSystemHealthStatusWarning(conditions common.HcoConditions) (bool, string) {
if cond, found := conditions.GetCondition(hcov1beta1.ConditionProgressing); found && cond.Status == metav1.ConditionTrue {
return true, cond.Reason
}

if cond, found := conditions.GetCondition(hcov1beta1.ConditionReconcileComplete); found && cond.Status != metav1.ConditionTrue {
return true, cond.Reason
}

return healthStatusCodes[status]
return false, ""
}

func getNumOfChangesJSONPatch(jsonPatch string) int {
Expand Down
4 changes: 2 additions & 2 deletions controllers/hyperconverged/hyperconverged_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3888,15 +3888,15 @@ func verifyHyperConvergedCRExistsMetricFalse() {
func verifySystemHealthStatusHealthy(hco *hcov1beta1.HyperConverged) {
ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusHealthy))

systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus("healthy")
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusHealthy))
}

func verifySystemHealthStatusError(hco *hcov1beta1.HyperConverged) {
ExpectWithOffset(1, hco.Status.SystemHealthStatus).To(Equal(systemHealthStatusError))

systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus()
systemHealthStatusMetric, err := metrics.GetHCOMetricSystemHealthStatus(reconcileInit)
ExpectWithOffset(1, err).ToNot(HaveOccurred())
ExpectWithOffset(1, systemHealthStatusMetric).To(Equal(metrics.SystemHealthStatusError))
}
Expand Down
48 changes: 47 additions & 1 deletion hack/prom-rule-ci/prom-rules-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -691,7 +691,7 @@ tests:
- labels: 'kubevirt_hyperconverged_operator_health_status{name="kubevirt-hyperconverged"}'
value: 2

# Test kubevirt_hco_misconfigured_descheduler
# Test kubevirt_hco_misconfigured_descheduler
- interval: 1m
input_series:
- series: 'kubevirt_hco_misconfigured_descheduler'
Expand Down Expand Up @@ -770,3 +770,49 @@ tests:
operator_health_impact: "critical"
kubernetes_operator_part_of: "kubevirt"
kubernetes_operator_component: "hyperconverged-cluster-operator"

# Test OperatorConditionsUnhealthy
- interval: 1m
input_series:
- series: 'kubevirt_hco_system_health_status{reason="SOME_ERROR"}'
values: "stale stale 2 stale"

- series: 'kubevirt_hco_system_health_status{reason="SOME_WARNING"}'
values: "stale stale stale stale 1 stale"

alert_rule_test:
- eval_time: 1m
alertname: OperatorConditionsUnhealthy
exp_alerts: [ ]

- eval_time: 1m
alertname: OperatorConditionsUnhealthy
exp_alerts: [ ]

- eval_time: 2m
alertname: OperatorConditionsUnhealthy
exp_alerts:
- exp_annotations:
description: "HCO and its secondary resources are in a critical state due to SOME_ERROR."
summary: "HCO and its secondary resources are in a critical state."
runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
exp_labels:
severity: "critical"
operator_health_impact: "critical"
kubernetes_operator_part_of: "kubevirt"
kubernetes_operator_component: "hyperconverged-cluster-operator"
reason: "SOME_ERROR"

- eval_time: 4m
alertname: OperatorConditionsUnhealthy
exp_alerts:
- exp_annotations:
description: "HCO and its secondary resources are in a warning state due to SOME_WARNING."
summary: "HCO and its secondary resources are in a warning state."
runbook_url: "https://kubevirt.io/monitoring/runbooks/OperatorConditionsUnhealthy"
exp_labels:
severity: "warning"
operator_health_impact: "warning"
kubernetes_operator_part_of: "kubevirt"
kubernetes_operator_component: "hyperconverged-cluster-operator"
reason: "SOME_WARNING"
20 changes: 14 additions & 6 deletions pkg/monitoring/metrics/operator_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,12 @@ var (
},
)

systemHealthStatus = operatormetrics.NewGauge(
systemHealthStatus = operatormetrics.NewGaugeVec(
operatormetrics.MetricOpts{
Name: "kubevirt_hco_system_health_status",
Help: "Indicates whether the system health status is healthy (0), warning (1), or error (2), by aggregating the conditions of HCO and its secondary resources",
},
[]string{"reason"},
)
)

Expand Down Expand Up @@ -117,15 +118,22 @@ func IsHCOMetricHyperConvergedExists() (bool, error) {
return value == hyperConvergedExists, nil
}

// SetHCOMetricSystemHealthStatus sets the gauge to status
func SetHCOMetricSystemHealthStatus(status float64) {
systemHealthStatus.Set(status)
// SetHCOSystemHealthy reports the healthy state on the
// kubevirt_hco_system_health_status gauge, using the fixed reason label value
// "healthy".
//
// The gauge vector is reset first: series are keyed by reason, so without the
// reset a series written for an earlier warning/error reason would keep being
// exported with its old value and any alert matching that value would never
// resolve.
func SetHCOSystemHealthy() {
	systemHealthStatus.Reset()
	systemHealthStatus.WithLabelValues("healthy").Set(SystemHealthStatusHealthy)
}

// SetHCOSystemWarning reports the warning state on the
// kubevirt_hco_system_health_status gauge, labelled with the given reason.
//
// The gauge vector is reset first so that series written for previously
// reported reasons (for example an earlier error) are dropped instead of
// lingering with their old value next to the current one.
func SetHCOSystemWarning(reason string) {
	systemHealthStatus.Reset()
	systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusWarning)
}

// SetHCOSystemError reports the error state on the
// kubevirt_hco_system_health_status gauge, labelled with the given reason.
//
// The gauge vector is reset first so that only the current reason is
// exported; otherwise series for earlier reasons would stay around with
// their old value after the reason changes.
func SetHCOSystemError(reason string) {
	systemHealthStatus.Reset()
	systemHealthStatus.WithLabelValues(reason).Set(SystemHealthStatusError)
}

// GetHCOMetricSystemHealthStatus returns current value of gauge. If error is not nil then value is undefined
func GetHCOMetricSystemHealthStatus() (float64, error) {
func GetHCOMetricSystemHealthStatus(reason string) (float64, error) {
dto := &ioprometheusclient.Metric{}
err := systemHealthStatus.Write(dto)
err := systemHealthStatus.WithLabelValues(reason).Write(dto)
value := dto.Gauge.GetValue()

if err != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/monitoring/rules/alerts/alerts.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const (
func Register(operatorRegistry *operatorrules.Registry) error {
alerts := [][]promv1.Rule{
operatorAlerts(),
healthAlerts(),
}

runbookURLTemplate := getRunbookURLTemplate()
Expand Down
35 changes: 35 additions & 0 deletions pkg/monitoring/rules/alerts/health_alerts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

// healthAlerts returns the alerting rules derived from the
// kubevirt_hco_system_health_status metric. Both rules share the alert name
// OperatorConditionsUnhealthy and differ only in the metric value they match
// and in the severity / health-impact labels they carry. The rule
// descriptions interpolate the metric's "reason" label.
func healthAlerts() []promv1.Rule {
	const alertName = "OperatorConditionsUnhealthy"

	// Matches the metric value 2 and is labelled critical.
	criticalRule := promv1.Rule{
		Alert: alertName,
		Expr:  intstr.FromString("kubevirt_hco_system_health_status == 2"),
		Annotations: map[string]string{
			"description": "HCO and its secondary resources are in a critical state due to {{ $labels.reason }}.",
			"summary":     "HCO and its secondary resources are in a critical state.",
		},
		Labels: map[string]string{
			severityAlertLabelKey:     "critical",
			healthImpactAlertLabelKey: "critical",
		},
	}

	// Matches the metric value 1 and is labelled warning.
	warningRule := promv1.Rule{
		Alert: alertName,
		Expr:  intstr.FromString("kubevirt_hco_system_health_status == 1"),
		Annotations: map[string]string{
			"description": "HCO and its secondary resources are in a warning state due to {{ $labels.reason }}.",
			"summary":     "HCO and its secondary resources are in a warning state.",
		},
		Labels: map[string]string{
			severityAlertLabelKey:     "warning",
			healthImpactAlertLabelKey: "warning",
		},
	}

	return []promv1.Rule{criticalRule, warningRule}
}
6 changes: 5 additions & 1 deletion tests/func-tests/hco_prometheus_route.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ func (hcoCli HCOPrometheusClient) GetHCOMetric(ctx context.Context, query string
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, query) {
res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
parts := strings.Fields(line)
if len(parts) < 2 {
return 0, fmt.Errorf("metric line does not contain a value")
}
res, err := strconv.ParseFloat(strings.TrimSpace(parts[1]), 64)
if err != nil {
return 0, fmt.Errorf("error converting %s to int: %v\n", line, err)
}
Expand Down

0 comments on commit 726acfe

Please sign in to comment.