Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## master / unreleased

* [FEATURE] Added "Cortex / Rollout progress" dashboard. #289
* [ENHANCEMENT] Added `newCompactorStatefulSet()` function to create a custom statefulset for the compactor. #287
* [ENHANCEMENT] Added option to configure compactor job name used in dashboards and alerts. #287
* [BUGFIX] Fixed `CortexCompactorRunFailed` false positives. #288
Expand Down
1 change: 1 addition & 0 deletions cortex-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
(import 'dashboards/scaling.libsonnet') +
(import 'dashboards/writes.libsonnet') +
(import 'dashboards/slow-queries.libsonnet') +
(import 'dashboards/rollout-progress.libsonnet') +

(if std.member($._config.storage_engine, 'blocks')
then
Expand Down
66 changes: 66 additions & 0 deletions cortex-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,72 @@ local utils = import 'mixin-utils/utils.libsonnet';
tooltip: { sort: 2 }, // Sort descending.
},

// Builds a Grafana "stat" panel from one or more queries.
//
// The panel starts from queryPanel(queries, legends) and is then reshaped:
//   - unit:       value unit applied to every field (defaults to 'percentunit').
//   - thresholds: list of { color, value } steps, evaluated in 'absolute' mode.
//   - instant:    forwarded to each target's `instant` flag.
//   - novalue:    text rendered when the queries return no data.
newStatPanel(queries, legends='', unit='percentunit', thresholds=[], instant=false, novalue='')::
  // Settings layered on top of every target produced by queryPanel().
  // The three null fields reset defaults that queryPanel() sets.
  local targetOverrides = {
    instant: instant,
    interval: '',
    format: null,
    intervalFactor: null,
    step: null,
  };

  // Field defaults: values are colored by threshold and shown with one decimal.
  local statFieldDefaults = {
    color: { mode: 'thresholds' },
    decimals: 1,
    noValue: novalue,
    thresholds: { mode: 'absolute', steps: thresholds },
    unit: unit,
  };

  super.queryPanel(queries, legends) + {
    type: 'stat',
    targets: std.map(function(t) t + targetOverrides, super.targets),
    fieldConfig: {
      defaults: statFieldDefaults,
      overrides: [],
    },
  },

// Builds a Grafana "bargauge" panel from one or more queries.
//
// The panel starts from queryPanel(queries, legends) and is then reshaped:
//   - thresholds: list of { color, value } steps, evaluated in 'absolute' mode.
//   - unit:       value unit applied to every field (defaults to 'short').
//   - min / max:  optional bounds of the gauge; null leaves them unset.
barGauge(queries, legends='', thresholds=[], unit='short', min=null, max=null)::
  // Reset defaults that queryPanel() sets on each target.
  local clearedQueryDefaults = {
    format: null,
    intervalFactor: null,
    step: null,
  };

  // Field defaults: bars are colored according to the threshold steps.
  local gaugeFieldDefaults = {
    color: { mode: 'thresholds' },
    mappings: [],
    min: min,
    max: max,
    thresholds: { mode: 'absolute', steps: thresholds },
    unit: unit,
  };

  // Horizontal basic bars, each reduced with the 'lastNotNull' calculation.
  local gaugeOptions = {
    displayMode: 'basic',
    orientation: 'horizontal',
    reduceOptions: {
      calcs: ['lastNotNull'],
      fields: '',
      values: false,
    },
  };

  super.queryPanel(queries, legends) + {
    type: 'bargauge',
    targets: [entry + clearedQueryDefaults for entry in super.targets],
    fieldConfig: { defaults: gaugeFieldDefaults },
    options: gaugeOptions,
  },

// Switches a panel from lines (default) to bars.
bars:: {
bars: true,
Expand Down
284 changes: 284 additions & 0 deletions cortex-mixin/dashboards/rollout-progress.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
local utils = import 'mixin-utils/utils.libsonnet';

// "Cortex / Rollout progress" dashboard.
//
// Extends the shared dashboard utilities with a single dashboard whose panels are
// positioned explicitly through gridPos: rollout progress per workload, gateway
// write/read status-code ratios and p99 latency, unhealthy pods, pods count per
// version, and gateway latency compared with 24h earlier.
(import 'dashboard-utils.libsonnet') {
  // Values substituted into the PromQL templates below via `% config`.
  local config = {
    namespace_matcher: $.namespaceMatcher(),
    gateway_job_matcher: $.jobMatcher($._config.job_names.gateway),
    // `route` label values used by the "Writes" panels.
    gateway_write_routes_regex: 'api_(v1|prom)_push',
    // `route` label values used by the "Reads" panels.
    gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+',
    // Names matched against the statefulset / deployment / container labels.
    all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester', 'query-frontend', 'querier', 'compactor', 'store-gateway', 'ruler', 'alertmanager']),
  },

  'cortex-rollout-progress.json':
    ($.dashboard('Cortex / Rollout progress') + { uid: '7544a3a62b1be6ffd919fc990ab8ba8f' })
    .addClusterSelectorTemplates(false) + {
      // This dashboard uses the new grid system to place panels (using gridPos).
      // Because of this we can't use the mixin's addRow() and addPanel(), and the
      // legacy `rows` field is explicitly cleared.
      schemaVersion: 27,
      rows: null,
      panels: [
        //
        // Rollout progress
        //
        // Ratio of updated replicas to desired replicas, one bar per StatefulSet
        // and Deployment. The `and (... > 0)` clause drops workloads that are
        // scaled to zero, which would otherwise divide by zero.
        $.panel('Rollout progress') +
        $.barGauge([
          |||
            (
              kube_statefulset_status_replicas_updated{%(namespace_matcher)s,statefulset=~"%(all_services_regex)s"}
              /
              kube_statefulset_replicas{%(namespace_matcher)s}
            ) and (
              kube_statefulset_replicas{%(namespace_matcher)s}
              > 0
            )
          ||| % config,
          |||
            (
              kube_deployment_status_replicas_updated{%(namespace_matcher)s,deployment=~"%(all_services_regex)s"}
              /
              kube_deployment_spec_replicas{%(namespace_matcher)s}
            ) and (
              kube_deployment_spec_replicas{%(namespace_matcher)s}
              > 0
            )
          ||| % config,
        ], legends=[
          '{{statefulset}}',
          '{{deployment}}',
        ], thresholds=[
          // Yellow until the ratio reaches exactly 1 (fully rolled out), then green.
          { color: 'yellow', value: null },
          { color: 'yellow', value: 0.999 },
          { color: 'green', value: 1 },
        ], unit='percentunit', min=0, max=1) + {
          id: 1,
          gridPos: { h: 8, w: 10, x: 0, y: 0 },
        },

        //
        // Writes
        //
        // Each "Writes - Nxx" panel shows the fraction of gateway write requests
        // whose status code falls in that class.
        $.panel('Writes - 2xx') +
        $.newStatPanel(|||
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) /
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval]))
        ||| % config, thresholds=[
          { color: 'green', value: null },
        ]) + {
          id: 2,
          gridPos: { h: 4, w: 2, x: 10, y: 0 },
        },

        $.panel('Writes - 4xx') +
        $.newStatPanel(|||
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) /
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval]))
        ||| % config, thresholds=[
          { color: 'green', value: null },
          { color: 'orange', value: 0.2 },
          { color: 'red', value: 0.5 },
        ]) + {
          id: 3,
          gridPos: { h: 4, w: 2, x: 12, y: 0 },
        },

        $.panel('Writes - 5xx') +
        $.newStatPanel(|||
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) /
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval]))
        ||| % config, thresholds=[
          { color: 'green', value: null },
          { color: 'red', value: 0.01 },
        ]) + {
          id: 4,
          gridPos: { h: 4, w: 2, x: 14, y: 0 },
        },

        // 99th percentile latency of gateway write requests, in seconds.
        $.panel('Writes 99th Latency') +
        $.newStatPanel(|||
          histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))
        ||| % config, unit='s', thresholds=[
          { color: 'green', value: null },
          { color: 'orange', value: 0.2 },
          { color: 'red', value: 0.5 },
        ]) + {
          id: 5,
          gridPos: { h: 4, w: 8, x: 16, y: 0 },
        },

        //
        // Reads
        //
        // Same layout as the "Writes" row, one grid row lower (y: 4), using the
        // read routes regex.
        $.panel('Reads - 2xx') +
        $.newStatPanel(|||
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) /
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval]))
        ||| % config, thresholds=[
          { color: 'green', value: null },
        ]) + {
          id: 6,
          gridPos: { h: 4, w: 2, x: 10, y: 4 },
        },

        $.panel('Reads - 4xx') +
        $.newStatPanel(|||
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) /
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval]))
        ||| % config, thresholds=[
          { color: 'green', value: null },
          { color: 'orange', value: 0.01 },
          { color: 'red', value: 0.05 },
        ]) + {
          id: 7,
          gridPos: { h: 4, w: 2, x: 12, y: 4 },
        },

        $.panel('Reads - 5xx') +
        $.newStatPanel(|||
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) /
          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval]))
        ||| % config, thresholds=[
          { color: 'green', value: null },
          { color: 'red', value: 0.01 },
        ]) + {
          id: 8,
          gridPos: { h: 4, w: 2, x: 14, y: 4 },
        },

        // 99th percentile latency of gateway read requests, in seconds.
        $.panel('Reads 99th Latency') +
        $.newStatPanel(|||
          histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))
        ||| % config, unit='s', thresholds=[
          { color: 'green', value: null },
          { color: 'orange', value: 1 },
          { color: 'red', value: 2.5 },
        ]) + {
          id: 9,
          gridPos: { h: 4, w: 8, x: 16, y: 4 },
        },

        //
        // Unhealthy pods
        //
        // One entry per workload that currently has at least one unhealthy pod;
        // when no series is returned the panel shows 'All healthy'.
        $.panel('Unhealthy pods') +
        $.newStatPanel([
          |||
            kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"}
            > 0
          ||| % config,
          // Compare the ready count against status.replicas (all pods created by
          // the StatefulSet controller). status.currentReplicas only counts pods
          // still on the *current* revision, which shrinks while a rolling update
          // progresses, so using it would make the difference <= 0 and hide
          // unready pods exactly during a rollout.
          |||
            kube_statefulset_status_replicas{%(namespace_matcher)s, statefulset=~"%(all_services_regex)s"} -
            kube_statefulset_status_replicas_ready {%(namespace_matcher)s, statefulset=~"%(all_services_regex)s"}
            > 0
          ||| % config,
        ], legends=[
          '{{deployment}}',
          '{{statefulset}}',
        ], thresholds=[
          { color: 'green', value: null },
          { color: 'orange', value: 1 },
          { color: 'red', value: 2 },
        ], instant=true, novalue='All healthy') + {
          options: {
            text: {
              // Small font size since we may have many entries during a rollout.
              titleSize: 14,
              valueSize: 14,
            },
          },
          id: 10,
          gridPos: { h: 8, w: 10, x: 0, y: 8 },
        },

        //
        // Versions
        //
        // Table counting containers per (container, version). The version is
        // extracted from the `image` label: the part after the last ':' and
        // before the last '-'.
        {
          title: 'Pods count per Version',
          type: 'table',
          datasource: '$datasource',

          targets: [
            {
              expr: |||
                count by(container, version) (
                  label_replace(
                    kube_pod_container_info{%(namespace_matcher)s,container=~"%(all_services_regex)s"},
                    "version", "$1", "image", ".*:(.+)-.*"
                  )
                )
              ||| % config,
              instant: true,
              legendFormat: '',
              refId: 'A',
            },
          ],

          fieldConfig: {
            overrides: [
              {
                // Center align the version.
                // NOTE(review): the matcher only hits fields whose name starts with
                // 'r'; presumably version tags look like 'r123' - confirm.
                matcher: { id: 'byRegexp', options: 'r.*' },
                properties: [{ id: 'custom.align', value: 'center' }],
              },
            ],
          },

          transformations: [
            {
              // Transform the version label to a field.
              id: 'labelsToFields',
              options: { valueLabel: 'version' },
            },
            {
              // Hide time.
              id: 'organize',
              options: { excludeByName: { Time: true } },
            },
            {
              // Sort by container.
              id: 'sortBy',
              options: { fields: {}, sort: [{ field: 'container' }] },
            },
          ],

          id: 11,
          gridPos: { h: 8, w: 6, x: 10, y: 8 },
        },

        //
        // Performance comparison with 24h ago
        //
        // Plots 1 - (p99 24h ago / p99 now), each averaged over 1h, for writes and
        // reads. Positive values mean latency is higher now than 24h ago.
        $.panel('Latency vs 24h ago') +
        $.queryPanel([|||
          1 - (
            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:])
            /
            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:])
          )
        ||| % config, |||
          1 - (
            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:])
            /
            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:])
          )
        ||| % config], ['writes', 'reads']) + {
          yaxes: $.yaxes({
            format: 'percentunit',
            min: null,  // Can be negative.
          }),

          id: 12,
          gridPos: { h: 8, w: 8, x: 16, y: 8 },
        },
      ],

      templating+: {
        list: [
          // Do not allow selecting all clusters/namespaces, because this dashboard
          // is designed to show one cluster at a time.
          l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {})
          for l in super.list
        ],
      },
    },
}