grafana · pracucci · Apr 6, 2021 · Apr 2, 2021 · Apr 2, 2021
@@ -2,6 +2,7 @@
 
 ## master / unreleased
 
+* [FEATURE] Added "Cortex / Rollout progress" dashboard. #289
 * [ENHANCEMENT] Added `newCompactorStatefulSet()` function to create a custom statefulset for the compactor. #287
 * [ENHANCEMENT] Added option to configure compactor job name used in dashboards and alerts. #287
 * [BUGFIX] Fixed `CortexCompactorRunFailed` false positives. #288

@@ -8,6 +8,7 @@
     (import 'dashboards/scaling.libsonnet') +
     (import 'dashboards/writes.libsonnet') +
     (import 'dashboards/slow-queries.libsonnet') +
+    (import 'dashboards/rollout-progress.libsonnet') +
 
     (if std.member($._config.storage_engine, 'blocks')
      then

@@ -173,6 +173,72 @@ local utils = import 'mixin-utils/utils.libsonnet';
       tooltip: { sort: 2 },  // Sort descending.
     },
 
+  // Builds a Grafana 'stat' panel on top of queryPanel().
+  //
+  // queries / legends: forwarded unchanged to queryPanel().
+  // unit:              Grafana unit applied to the displayed value.
+  // thresholds:        list of threshold steps (objects with `color` and `value`) used to
+  //                    color the value.
+  // instant:           whether each target is run as an instant query.
+  // novalue:           text shown when the queries return no data.
+  newStatPanel(queries, legends='', unit='percentunit', thresholds=[], instant=false, novalue='')::
+    // Fields merged into every target generated by queryPanel().
+    local targetPatch = {
+      instant: instant,
+      interval: '',
+
+      // Reset defaults from queryPanel().
+      format: null,
+      intervalFactor: null,
+      step: null,
+    };
+
+    local fieldDefaults = {
+      color: { mode: 'thresholds' },
+      decimals: 1,
+      noValue: novalue,
+      thresholds: { mode: 'absolute', steps: thresholds },
+      unit: unit,
+    };
+
+    super.queryPanel(queries, legends) + {
+      type: 'stat',
+      targets: [t + targetPatch for t in super.targets],
+      fieldConfig: { defaults: fieldDefaults, overrides: [] },
+    },
+
+  // Builds a Grafana 'bargauge' panel (horizontal bars, last non-null value of each series)
+  // on top of queryPanel().
+  //
+  // queries / legends: forwarded unchanged to queryPanel().
+  // thresholds:        list of threshold steps (objects with `color` and `value`) used to
+  //                    color the bars.
+  // unit:              Grafana unit applied to the displayed value.
+  // min / max:         bounds of the gauge scale; null leaves them unset.
+  barGauge(queries, legends='', thresholds=[], unit='short', min=null, max=null)::
+    // Reset defaults from queryPanel().
+    local targetPatch = { format: null, intervalFactor: null, step: null };
+
+    local fieldDefaults = {
+      color: { mode: 'thresholds' },
+      mappings: [],
+      min: min,
+      max: max,
+      thresholds: { mode: 'absolute', steps: thresholds },
+      unit: unit,
+    };
+
+    local displayOptions = {
+      displayMode: 'basic',
+      orientation: 'horizontal',
+      reduceOptions: { calcs: ['lastNotNull'], fields: '', values: false },
+    };
+
+    super.queryPanel(queries, legends) + {
+      type: 'bargauge',
+      targets: [t + targetPatch for t in super.targets],
+      fieldConfig: { defaults: fieldDefaults },
+      options: displayOptions,
+    },
+
   // Switches a panel from lines (default) to bars.
   bars:: {
     bars: true,

@@ -0,0 +1,284 @@
+local utils = import 'mixin-utils/utils.libsonnet';
+
+(import 'dashboard-utils.libsonnet') {
+  // Values substituted into the PromQL templates of this dashboard via `% config`.
+  local config = {
+    // Names matched against the statefulset / deployment / container labels.
+    local services = [
+      'cortex-gw',
+      'distributor',
+      'ingester',
+      'query-frontend',
+      'querier',
+      'compactor',
+      'store-gateway',
+      'ruler',
+      'alertmanager',
+    ],
+    all_services_regex: std.join('|', services),
+
+    // Label matchers selecting the namespace and the gateway job.
+    namespace_matcher: $.namespaceMatcher(),
+    gateway_job_matcher: $.jobMatcher($._config.job_names.gateway),
+
+    // Regexes matched against the `route` label to tell gateway writes from reads.
+    gateway_write_routes_regex: 'api_(v1|prom)_push',
+    gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+',
+  },
+
+  'cortex-rollout-progress.json':
+    ($.dashboard('Cortex / Rollout progress') + { uid: '7544a3a62b1be6ffd919fc990ab8ba8f' })
+    .addClusterSelectorTemplates(false) + {
+      // This dashboard uses the new grid system in order to place panels (using gridPos).
+      // Because of this we can't use the mixin's addRow() and addPanel().
+      schemaVersion: 27,
+      rows: null,
+      panels: [
+        //
+        // Rollout progress
+        //
+        // Bar gauge with one bar per StatefulSet / Deployment whose name matches all_services_regex.
+        // Each bar is the number of updated replicas divided by the configured replica count
+        // (kube_statefulset_replicas / kube_deployment_spec_replicas), so 1 means fully rolled out.
+        // The `and (... > 0)` clause drops workloads whose replica count is 0, which would
+        // otherwise divide by zero.
+        $.panel('Rollout progress') +
+        $.barGauge([
+          // StatefulSets.
+          |||
+            (
+              kube_statefulset_status_replicas_updated{%(namespace_matcher)s,statefulset=~"%(all_services_regex)s"}
+              /
+              kube_statefulset_replicas{%(namespace_matcher)s}
+            ) and (
+              kube_statefulset_replicas{%(namespace_matcher)s}
+              > 0
+            )
+          ||| % config,
+          // Deployments.
+          |||
+            (
+              kube_deployment_status_replicas_updated{%(namespace_matcher)s,deployment=~"%(all_services_regex)s"}
+              /
+              kube_deployment_spec_replicas{%(namespace_matcher)s}
+            ) and (
+              kube_deployment_spec_replicas{%(namespace_matcher)s}
+              > 0
+            )
+          ||| % config,
+        ], legends=[
+          '{{statefulset}}',
+          '{{deployment}}',
+        ], thresholds=[
+          // Yellow until the ratio reaches 1 (100%), green from then on.
+          { color: 'yellow', value: null },
+          { color: 'yellow', value: 0.999 },
+          { color: 'green', value: 1 },
+        ], unit='percentunit', min=0, max=1) + {
+          id: 1,
+          gridPos: { h: 8, w: 10, x: 0, y: 0 },
+        },
+
+        //
+        // Writes
+        //
+        // Fraction of gateway write requests (routes matching gateway_write_routes_regex) that
+        // returned a 2xx status code. Always rendered green: a single threshold step is set.
+        $.panel('Writes - 2xx') +
+        $.newStatPanel(|||
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) /
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval]))
+        ||| % config, thresholds=[
+          { color: 'green', value: null },
+        ]) + {
+          id: 2,
+          gridPos: { h: 4, w: 2, x: 10, y: 0 },
+        },
+
+        // Fraction of gateway write requests that returned a 4xx status code.
+        // Orange from 20%, red from 50%.
+        $.panel('Writes - 4xx') +
+        $.newStatPanel(|||
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) /
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval]))
+        ||| % config, thresholds=[
+          { color: 'green', value: null },
+          { color: 'orange', value: 0.2 },
+          { color: 'red', value: 0.5 },
+        ]) + {
+          id: 3,
+          gridPos: { h: 4, w: 2, x: 12, y: 0 },
+        },
+
+        // Fraction of gateway write requests that returned a 5xx status code. Red from 1%.
+        $.panel('Writes - 5xx') +
+        $.newStatPanel(|||
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) /
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval]))
+        ||| % config, thresholds=[
+          { color: 'green', value: null },
+          { color: 'red', value: 0.01 },
+        ]) + {
+          id: 4,
+          gridPos: { h: 4, w: 2, x: 14, y: 0 },
+        },
+
+        // 99th percentile of the gateway write request duration, displayed in seconds.
+        // Orange from 0.2s, red from 0.5s.
+        // NOTE(review): the metric name looks like a recording rule defined elsewhere - confirm.
+        $.panel('Writes 99th Latency') +
+        $.newStatPanel(|||
+          histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))
+        ||| % config, unit='s', thresholds=[
+          { color: 'green', value: null },
+          { color: 'orange', value: 0.2 },
+          { color: 'red', value: 0.5 },
+        ]) + {
+          id: 5,
+          gridPos: { h: 4, w: 8, x: 16, y: 0 },
+        },
+
+        //
+        // Reads
+        //
+        // Fraction of gateway read requests (routes matching gateway_read_routes_regex) that
+        // returned a 2xx status code. Always rendered green: a single threshold step is set.
+        $.panel('Reads - 2xx') +
+        $.newStatPanel(|||
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) /
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval]))
+        ||| % config, thresholds=[
+          { color: 'green', value: null },
+        ]) + {
+          id: 6,
+          gridPos: { h: 4, w: 2, x: 10, y: 4 },
+        },
+
+        // Fraction of gateway read requests that returned a 4xx status code.
+        // Orange from 1%, red from 5%.
+        $.panel('Reads - 4xx') +
+        $.newStatPanel(|||
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) /
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval]))
+        ||| % config, thresholds=[
+          { color: 'green', value: null },
+          { color: 'orange', value: 0.01 },
+          { color: 'red', value: 0.05 },
+        ]) + {
+          id: 7,
+          gridPos: { h: 4, w: 2, x: 12, y: 4 },
+        },
+
+        // Fraction of gateway read requests that returned a 5xx status code. Red from 1%.
+        $.panel('Reads - 5xx') +
+        $.newStatPanel(|||
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) /
+          sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval]))
+        ||| % config, thresholds=[
+          { color: 'green', value: null },
+          { color: 'red', value: 0.01 },
+        ]) + {
+          id: 8,
+          gridPos: { h: 4, w: 2, x: 14, y: 4 },
+        },
+
+        // 99th percentile of the gateway read request duration, displayed in seconds.
+        // Orange from 1s, red from 2.5s.
+        // NOTE(review): the metric name looks like a recording rule defined elsewhere - confirm.
+        $.panel('Reads 99th Latency') +
+        $.newStatPanel(|||
+          histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))
+        ||| % config, unit='s', thresholds=[
+          { color: 'green', value: null },
+          { color: 'orange', value: 1 },
+          { color: 'red', value: 2.5 },
+        ]) + {
+          id: 9,
+          gridPos: { h: 4, w: 8, x: 16, y: 4 },
+        },
+
+        //
+        // Unhealthy pods
+        //
+        // Lists every Deployment / StatefulSet (name matching all_services_regex) that has at
+        // least one pod which is not available / not ready, together with the number of such
+        // pods. Shows 'All healthy' when both queries return no series.
+        // Queries are run as instant queries: only the current state matters.
+        $.panel('Unhealthy pods') +
+        $.newStatPanel([
+          // Deployments: unavailable replicas as reported by kube-state-metrics.
+          |||
+            kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"}
+            > 0
+          ||| % config,
+          // StatefulSets: configured replicas minus ready replicas.
+          // The configured count (kube_statefulset_replicas) is used instead of
+          // kube_statefulset_status_replicas_current because the latter only counts pods still
+          // running the current (pre-rollout) revision: it shrinks while a rolling update
+          // progresses, which would make the difference drop to <= 0 and hide unhealthy pods
+          // exactly during a rollout.
+          |||
+            kube_statefulset_replicas{%(namespace_matcher)s, statefulset=~"%(all_services_regex)s"} -
+            kube_statefulset_status_replicas_ready{%(namespace_matcher)s, statefulset=~"%(all_services_regex)s"}
+            > 0
+          ||| % config,
+        ], legends=[
+          '{{deployment}}',
+          '{{statefulset}}',
+        ], thresholds=[
+          // Orange with 1 unhealthy pod, red from 2.
+          { color: 'green', value: null },
+          { color: 'orange', value: 1 },
+          { color: 'red', value: 2 },
+        ], instant=true, novalue='All healthy') + {
+          options: {
+            text: {
+              // Small font size since we may have many entries during a rollout.
+              titleSize: 14,
+              valueSize: 14,
+            },
+          },
+          id: 10,
+          gridPos: { h: 8, w: 10, x: 0, y: 8 },
+        },
+
+        //
+        // Versions
+        //
+        // Table with the number of containers per (container, version), built from an instant
+        // query on kube_pod_container_info restricted to containers matching all_services_regex.
+        // This panel is declared as a raw object instead of going through the $.panel() helpers.
+        {
+          title: 'Pods count per Version',
+          type: 'table',
+          datasource: '$datasource',
+
+          targets: [
+            {
+              // label_replace() derives a `version` label from the `image` label: the regex
+              // captures the text after a ':' and before a following '-'. Images that do not
+              // match the regex (e.g. no '-' after the ':') are left without a `version` label.
+              expr: |||
+                count by(container, version) (
+                  label_replace(
+                    kube_pod_container_info{%(namespace_matcher)s,container=~"%(all_services_regex)s"},
+                    "version", "$1", "image", ".*:(.+)-.*"
+                  )
+                )
+              ||| % config,
+              instant: true,
+              legendFormat: '',
+              refId: 'A',
+            },
+          ],
+
+          fieldConfig: {
+            overrides: [
+              {
+                // Center align the version.
+                // NOTE(review): the matcher selects fields whose name matches 'r.*'; presumably
+                // version values are expected to start with 'r' - confirm.
+                matcher: { id: 'byRegexp', options: 'r.*' },
+                properties: [{ id: 'custom.align', value: 'center' }],
+              },
+            ],
+          },
+
+          transformations: [
+            {
+              // Transform the version label to a field.
+              id: 'labelsToFields',
+              options: { valueLabel: 'version' },
+            },
+            {
+              // Hide time.
+              id: 'organize',
+              options: { excludeByName: { Time: true } },
+            },
+            {
+              // Sort by container.
+              id: 'sortBy',
+              options: { fields: {}, sort: [{ field: 'container' }] },
+            },
+          ],
+
+          id: 11,
+          gridPos: { h: 8, w: 6, x: 10, y: 8 },
+        },
+
+        //
+        // Performance comparison with 24h ago
+        //
+        // Time series comparing the current gateway p99 latency with the one of 24h ago, for
+        // writes and reads. Each series is:
+        //   1 - (1h average of the p99 24h ago / 1h average of the current p99)
+        // so it is positive when the current latency is higher than 24h ago, negative when it is
+        // lower, and expressed relative to the current latency.
+        $.panel('Latency vs 24h ago') +
+        $.queryPanel([|||
+          1 - (
+            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:])
+            /
+            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:])
+          )
+        ||| % config, |||
+          1 - (
+            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:])
+            /
+            avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:])
+          )
+        ||| % config], ['writes', 'reads']) + {
+          // Display the ratio as a percentage and do not pin the Y axis minimum.
+          yaxes: $.yaxes({
+            format: 'percentunit',
+            min: null,  // Can be negative.
+          }),
+
+          id: 12,
+          gridPos: { h: 8, w: 8, x: 16, y: 8 },
+        },
+      ],
+
+      templating+: {
+        // Template variables that must always resolve to a single value: this dashboard is
+        // designed to show 1 cluster at a time, so the "All" option is disabled for them.
+        local singleValueOnly = ['cluster', 'namespace'],
+
+        list: [
+          if std.member(singleValueOnly, tpl.name) then tpl + { includeAll: false } else tpl
+          for tpl in super.list
+        ],
+      },
+    },
+}