Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#6212](https://github.com/thanos-io/thanos/pull/6212) Query-Frontend: Disable scalar for vertical sharding.
- [#6107](https://github.com/thanos-io/thanos/pull/6107) Change default user id in container image from 0(root) to 1001
- [#6228](https://github.com/thanos-io/thanos/pull/6228) Conditionally generate debug messages in ProxyStore to avoid memory bloat.
- [#6244](https://github.com/thanos-io/thanos/pull/6244) mixin(Rule): Add rule evaluation failures to the Rule dashboard.

### Removed

Expand Down
124 changes: 100 additions & 24 deletions examples/dashboards/rule.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, strategy) (rate(prometheus_rule_evaluations_total{job=~\"$job\"}[$__rate_interval]))",
"expr": "sum by (job, rule_group, strategy) (rate(prometheus_rule_evaluations_total{job=~\"$job\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ strategy }}",
"legendFormat": "{{ rule_group }} {{ strategy }}",
"legendLink": null,
"step": 10
}
Expand Down Expand Up @@ -116,23 +116,23 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=~\"$job\"}[$__rate_interval]))",
"expr": "sum by (job, rule_group, strategy) (rate(prometheus_rule_evaluation_failures_total{job=~\"$job\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ strategy }}",
"legendFormat": "{{ rule_group }} {{ strategy }}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Rule Group Evaluations Missed",
"title": "Rule Group Evaluations Failed",
"tooltip": {
"shared": false,
"sort": 0,
Expand Down Expand Up @@ -192,7 +192,83 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, rule_group, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=~\"$job\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ rule_group }} {{ strategy }}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Rule Group Evaluations Missed",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
Expand All @@ -208,7 +284,7 @@
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Rule Group Evlauations Too Slow",
"title": "Rule Group Evaluations Too Slow",
"tooltip": {
"shared": false,
"sort": 0,
Expand Down Expand Up @@ -261,7 +337,7 @@
"datasource": "$datasource",
"description": "Shows rate of dropped alerts.",
"fill": 1,
"id": 4,
"id": 5,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -338,7 +414,7 @@
"datasource": "$datasource",
"description": "Shows rate of alerts that successfully sent to alert manager.",
"fill": 10,
"id": 5,
"id": 6,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -417,7 +493,7 @@
"datasource": "$datasource",
"description": "Shows ratio of errors compared to the total number of sent alerts.",
"fill": 10,
"id": 6,
"id": 7,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -493,7 +569,7 @@
"datasource": "$datasource",
"description": "Shows how long has it taken to send alerts to alert manager.",
"fill": 1,
"id": 7,
"id": 8,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -623,7 +699,7 @@
"datasource": "$datasource",
"description": "Shows rate of queued alerts.",
"fill": 1,
"id": 8,
"id": 9,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -702,7 +778,7 @@
"datasource": "$datasource",
"description": "Shows ratio of dropped alerts compared to the total number of queued alerts.",
"fill": 10,
"id": 9,
"id": 10,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -790,7 +866,7 @@
"datasource": "$datasource",
"description": "Shows rate of handled Unary gRPC requests.",
"fill": 10,
"id": 10,
"id": 11,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -941,7 +1017,7 @@
"datasource": "$datasource",
"description": "Shows ratio of errors compared to the total number of handled requests.",
"fill": 10,
"id": 11,
"id": 12,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1017,7 +1093,7 @@
"datasource": "$datasource",
"description": "Shows how long has it taken to handle requests, in quantiles.",
"fill": 1,
"id": 12,
"id": 13,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1147,7 +1223,7 @@
"datasource": "$datasource",
"description": "Shows rate of handled Streamed gRPC requests.",
"fill": 10,
"id": 13,
"id": 14,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1298,7 +1374,7 @@
"datasource": "$datasource",
"description": "Shows ratio of errors compared to the total number of handled requests.",
"fill": 10,
"id": 14,
"id": 15,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1374,7 +1450,7 @@
"datasource": "$datasource",
"description": "Shows how long has it taken to handle requests, in quantiles",
"fill": 1,
"id": 15,
"id": 16,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1503,7 +1579,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 16,
"id": 17,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1619,7 +1695,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 17,
"id": 18,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1695,7 +1771,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 18,
"id": 19,
"legend": {
"avg": false,
"current": false,
Expand Down
18 changes: 13 additions & 5 deletions mixin/dashboards/rule.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ local utils = import '../lib/utils.libsonnet';
dashboard:: {
selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),
dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
ruleGroupDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'rule_group', 'strategy']),
},
},
grafanaDashboards+:: {
Expand All @@ -22,19 +23,26 @@ local utils = import '../lib/utils.libsonnet';
.addPanel(
g.panel('Rule Group Evaluations') +
g.queryPanel(
'sum by (%s) (rate(prometheus_rule_evaluations_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector],
'{{ strategy }}',
'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
'{{ rule_group }} {{ strategy }}',
)
)
.addPanel(
g.panel('Rule Group Evaluations Failed') +
g.queryPanel(
'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
'{{ rule_group }} {{ strategy }}',
)
)
.addPanel(
g.panel('Rule Group Evaluations Missed') +
g.queryPanel(
'sum by (%s) (increase(prometheus_rule_group_iterations_missed_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector],
'{{ strategy }}',
'sum by (%(ruleGroupDimensions)s) (increase(prometheus_rule_group_iterations_missed_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
'{{ rule_group }} {{ strategy }}',
)
)
.addPanel(
g.panel('Rule Group Evlauations Too Slow') +
g.panel('Rule Group Evaluations Too Slow') +
g.queryPanel(
|||
(
Expand Down