diff --git a/assets/control-plane/prometheus-rule.yaml b/assets/control-plane/prometheus-rule.yaml index eab90b437b..03b08b0721 100644 --- a/assets/control-plane/prometheus-rule.yaml +++ b/assets/control-plane/prometheus-rule.yaml @@ -18,7 +18,7 @@ spec: }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. summary: Pod is crash looping. expr: | - rate(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[10m]) * 60 * 5 > 0 + increase(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[10m]) > 0 for: 15m labels: severity: warning @@ -522,11 +522,11 @@ spec: rules: - expr: | sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - expr: | container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, diff --git a/assets/control-plane/service-monitor-kubelet.yaml b/assets/control-plane/service-monitor-kubelet.yaml index ece2f9bf00..994f395feb 100644 --- a/assets/control-plane/service-monitor-kubelet.yaml +++ b/assets/control-plane/service-monitor-kubelet.yaml @@ -41,7 +41,7 @@ spec: sourceLabels: - __name__ - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) sourceLabels: - __name__ port: https-metrics diff --git a/assets/grafana/dashboard-definitions.yaml b/assets/grafana/dashboard-definitions.yaml index 0b630232a8..415f8fb92f 100644 --- a/assets/grafana/dashboard-definitions.yaml +++ b/assets/grafana/dashboard-definitions.yaml @@ -3053,7 +3053,7 @@ items: ], "query": "label_values(etcd_server_has_leader, job)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 2, "tagValuesQuery": "", @@ -3691,7 +3691,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -3982,7 +3982,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4000,7 +4000,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4018,7 +4018,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6192,7 +6192,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -6276,7 +6276,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -6370,7 +6370,7 @@ items: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -6560,7 +6560,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -6811,7 +6811,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6829,7 +6829,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6847,7 +6847,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8611,7 +8611,7 @@ items: "decimals": 2, "link": true, "linkTargetBlank": false, - "linkTooltip": "Drill down to containers", + "linkTooltip": "Drill down to pods", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -8928,7 +8928,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9163,7 +9163,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9181,7 +9181,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9199,7 +9199,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9916,7 +9916,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -10272,7 +10272,7 @@ items: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10290,7 +10290,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10308,7 +10308,7 @@ items: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10456,7 +10456,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -10547,7 +10547,7 @@ items: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ @@ -11661,7 +11661,7 @@ items: "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 } @@ -11747,7 +11747,7 @@ items: "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 } @@ -11966,18 +11966,18 @@ items: "unit": "Bps" }, { - "alias": "Pod", + "alias": "Container", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, - "link": true, + "link": false, "linkTargetBlank": false, - "linkTooltip": "Drill down to pods", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", "thresholds": [ ], @@ -12319,7 +12319,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12554,7 +12554,7 @@ items: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12572,7 +12572,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12590,7 +12590,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14319,7 +14319,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -14617,7 +14617,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14635,7 +14635,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14653,7 +14653,7 @@ items: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -17899,7 +17899,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "expr": "(\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -17914,7 +17914,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18000,7 +18000,7 @@ items: "timeShift": null, "title": "CPU Saturation (load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18098,7 +18098,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18169,7 +18169,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -18184,7 +18184,7 @@ items: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18275,7 +18275,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Receive", @@ -18283,7 +18283,7 @@ items: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Transmit", @@ -18298,7 +18298,7 @@ items: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18377,7 +18377,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Receive", @@ -18385,7 +18385,7 @@ items: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Transmit", @@ -18400,7 +18400,7 @@ items: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18483,7 +18483,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -18498,7 +18498,7 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18569,7 +18569,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -18584,7 +18584,7 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18682,7 +18682,7 @@ items: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18856,7 +18856,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_cpu_utilisation:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -18871,7 +18871,7 @@ items: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18957,7 +18957,7 @@ items: "timeShift": null, "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19055,7 +19055,7 @@ items: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19126,7 +19126,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Major page faults", @@ -19141,7 +19141,7 @@ items: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19232,7 +19232,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive", @@ -19240,7 +19240,7 @@ items: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit", @@ -19255,7 +19255,7 @@ items: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19334,7 +19334,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive drops", @@ -19342,7 +19342,7 @@ items: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit drops", @@ -19357,7 +19357,7 @@ items: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19440,7 +19440,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -19455,7 +19455,7 @@ items: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19526,7 +19526,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -19541,7 +19541,7 @@ items: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19639,7 +19639,7 @@ items: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21204,7 +21204,7 @@ items: "timeShift": null, "title": "Prometheus Stats", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21303,7 +21303,7 @@ items: "timeShift": null, "title": "Target Sync", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21389,7 +21389,7 @@ items: "timeShift": null, "title": "Targets", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21487,7 +21487,7 @@ items: "timeShift": null, "title": "Average Scrape Interval Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21557,6 +21557,14 @@ items: "stack": true, "steppedLine": false, "targets": [ + { + "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "exceeded body size limit: {{job}}", + "legendLink": null, + "step": 10 + }, { "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))", "format": "time_series", @@ -21597,7 +21605,7 @@ items: "timeShift": null, "title": "Scrape failures", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21683,7 +21691,7 @@ items: "timeShift": null, "title": "Appended Samples", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21781,7 +21789,7 @@ items: "timeShift": null, "title": "Head Series", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21867,7 +21875,7 @@ items: "timeShift": null, "title": "Head Chunks", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21965,7 +21973,7 @@ items: "timeShift": null, "title": "Query Rate", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -22051,7 +22059,7 @@ items: "timeShift": null, "title": "Stage Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -22117,7 +22125,7 @@ items: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -22145,7 +22153,7 @@ items: "useTags": false }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", diff --git a/assets/kube-state-metrics/deployment.yaml b/assets/kube-state-metrics/deployment.yaml index f3717a72d0..ca715cf00f 100644 --- a/assets/kube-state-metrics/deployment.yaml +++ b/assets/kube-state-metrics/deployment.yaml @@ -65,7 +65,7 @@ spec: - --upstream=http://127.0.0.1:8081/ - --tls-cert-file=/etc/tls/private/tls.crt - --tls-private-key-file=/etc/tls/private/tls.key - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.10.0 name: kube-rbac-proxy-main ports: - containerPort: 8443 @@ -87,7 +87,7 @@ spec: - --upstream=http://127.0.0.1:8082/ - --tls-cert-file=/etc/tls/private/tls.crt - --tls-private-key-file=/etc/tls/private/tls.key - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.10.0 name: kube-rbac-proxy-self ports: - containerPort: 9443 diff --git a/assets/kube-state-metrics/prometheus-rule.yaml b/assets/kube-state-metrics/prometheus-rule.yaml index 4309cce846..a733ba4476 100644 --- a/assets/kube-state-metrics/prometheus-rule.yaml +++ b/assets/kube-state-metrics/prometheus-rule.yaml @@ -42,3 +42,27 @@ spec: for: 15m labels: severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards + configuration, some Kubernetes objects may be exposed multiple times or + not exposed at all. + summary: kube-state-metrics sharding is misconfigured. + expr: | + stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects + are not being exposed. + summary: kube-state-metrics shards are missing. + expr: | + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical diff --git a/assets/node-exporter/daemonset.yaml b/assets/node-exporter/daemonset.yaml index 8a7c63632e..26894dcc93 100644 --- a/assets/node-exporter/daemonset.yaml +++ b/assets/node-exporter/daemonset.yaml @@ -67,7 +67,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.10.0 name: kube-rbac-proxy ports: - containerPort: 9100 diff --git a/assets/node-exporter/prometheus-rule.yaml b/assets/node-exporter/prometheus-rule.yaml index 974bed5542..ce87609606 100644 --- a/assets/node-exporter/prometheus-rule.yaml +++ b/assets/node-exporter/prometheus-rule.yaml @@ -237,9 +237,9 @@ spec: record: instance:node_num_cpu:sum - expr: | 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) ) - record: instance:node_cpu_utilisation:rate1m + record: instance:node_cpu_utilisation:rate5m - expr: | ( node_load1{job="node-exporter"} @@ -255,31 +255,31 @@ spec: ) record: instance:node_memory_utilisation:ratio - expr: | - rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m - expr: | - rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m - expr: | sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m + record: instance:node_network_receive_bytes_excluding_lo:rate5m - expr: | sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m + record: instance:node_network_transmit_bytes_excluding_lo:rate5m - expr: | sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_drop_excluding_lo:rate1m + record: instance:node_network_receive_drop_excluding_lo:rate5m - expr: | sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m + record: instance:node_network_transmit_drop_excluding_lo:rate5m diff --git a/assets/prometheus-k8s/prometheus-rule.yaml b/assets/prometheus-k8s/prometheus-rule.yaml index bbf6706edd..79a8fd5b49 100644 --- a/assets/prometheus-k8s/prometheus-rule.yaml +++ b/assets/prometheus-k8s/prometheus-rule.yaml @@ -139,12 +139,12 @@ spec: summary: Prometheus fails to send samples to remote storage. expr: | ( - rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) / ( - rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + - rate(prometheus_remote_storage_succeeded_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + (rate(prometheus_remote_storage_succeeded_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) ) ) * 100 @@ -222,6 +222,28 @@ spec: for: 15m labels: severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because some samples exceeded the configured + label_limit, label_name_length_limit or label_value_length_limit. + summary: Prometheus has dropped targets because some scrape configs have exceeded + the labels limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} + have failed to sync because invalid configuration was supplied.' + summary: Prometheus has failed to sync targets. + expr: | + increase(prometheus_target_sync_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[30m]) > 0 + for: 5m + labels: + severity: critical - alert: PrometheusErrorSendingAlertsToAnyAlertmanager annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts @@ -242,31 +264,29 @@ spec: rules: - alert: ThanosSidecarPrometheusDown annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} cannot connect - to Prometheus. + description: Thanos Sidecar {{$labels.instance}} cannot connect to Prometheus. summary: Thanos Sidecar cannot connect to Prometheus expr: | - sum by (job, instance) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} == 0) + thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} == 0 for: 1h labels: severity: warning - alert: ThanosSidecarBucketOperationsFailed annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.instance}} bucket operations - are failing + description: Thanos Sidecar {{$labels.instance}} bucket operations are failing summary: Thanos Sidecar bucket operations are failing expr: | - rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m]) > 0 + sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m])) > 0 for: 1h labels: severity: warning - alert: ThanosSidecarUnhealthy annotations: - description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for - more than {{ $value }} seconds. + description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than + {{$value}} seconds. summary: Thanos Sidecar is unhealthy. expr: | - time() - max(timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"})) by (job,pod) >= 240 + time() - max by (job, instance) (timestamp(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"})) >= 240 for: 1h labels: severity: warning diff --git a/assets/prometheus-k8s/prometheus.yaml b/assets/prometheus-k8s/prometheus.yaml index 16385533fa..e2bd17d891 100644 --- a/assets/prometheus-k8s/prometheus.yaml +++ b/assets/prometheus-k8s/prometheus.yaml @@ -163,6 +163,7 @@ spec: requests: cpu: 1m memory: 10Mi + enableFeatures: [] externalLabels: {} image: quay.io/prometheus/prometheus:v2.26.1 listenLocal: true diff --git a/assets/prometheus-operator-user-workload/deployment.yaml b/assets/prometheus-operator-user-workload/deployment.yaml index 2440329502..04a85e7ba8 100644 --- a/assets/prometheus-operator-user-workload/deployment.yaml +++ b/assets/prometheus-operator-user-workload/deployment.yaml @@ -18,6 +18,7 @@ spec: template: metadata: annotations: + kubectl.kubernetes.io/default-container: prometheus-operator target.workload.openshift.io/management: '{"effect": "PreferredDuringScheduling"}' labels: app.kubernetes.io/component: controller @@ -51,7 +52,7 @@ spec: - --upstream=http://127.0.0.1:8080/ - --tls-cert-file=/etc/tls/private/tls.crt - --tls-private-key-file=/etc/tls/private/tls.key - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.10.0 name: kube-rbac-proxy ports: - containerPort: 8443 diff --git a/assets/prometheus-operator/deployment.yaml b/assets/prometheus-operator/deployment.yaml index efa3d990ef..2baa391f7b 100644 --- a/assets/prometheus-operator/deployment.yaml +++ b/assets/prometheus-operator/deployment.yaml @@ -18,6 +18,7 @@ spec: template: metadata: annotations: + kubectl.kubernetes.io/default-container: prometheus-operator target.workload.openshift.io/management: '{"effect": "PreferredDuringScheduling"}' labels: app.kubernetes.io/component: controller @@ -60,7 +61,7 @@ spec: - --tls-cert-file=/etc/tls/private/tls.crt - --tls-private-key-file=/etc/tls/private/tls.key - --upstream-ca-file=/etc/configmaps/operator-cert-ca-bundle/service-ca.crt - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + image: quay.io/brancz/kube-rbac-proxy:v0.10.0 name: kube-rbac-proxy ports: - containerPort: 8443 diff --git a/assets/prometheus-user-workload/prometheus.yaml b/assets/prometheus-user-workload/prometheus.yaml index aab3296d28..c6029b7394 100644 --- a/assets/prometheus-user-workload/prometheus.yaml +++ b/assets/prometheus-user-workload/prometheus.yaml @@ -108,6 +108,7 @@ spec: requests: cpu: 1m memory: 10Mi + enableFeatures: [] enforcedNamespaceLabel: namespace externalLabels: {} ignoreNamespaceSelectors: true diff --git a/assets/thanos-querier/deployment.yaml b/assets/thanos-querier/deployment.yaml index 2ece97a45c..f3c1cd09a8 100644 --- a/assets/thanos-querier/deployment.yaml +++ b/assets/thanos-querier/deployment.yaml @@ -46,6 +46,7 @@ spec: - --query.replica-label=prometheus_replica - --query.replica-label=thanos_ruler_replica - --store=dnssrv+_grpc._tcp.prometheus-operated.openshift-monitoring.svc.cluster.local + - --query.auto-downsampling - --store.sd-dns-resolver=miekgdns - --grpc-client-tls-secure - --grpc-client-tls-cert=/etc/tls/grpc/client.crt @@ -53,6 +54,11 @@ spec: - --grpc-client-tls-ca=/etc/tls/grpc/ca.crt - --grpc-client-server-name=prometheus-grpc - --rule=dnssrv+_grpc._tcp.prometheus-operated.openshift-monitoring.svc.cluster.local + env: + - name: HOST_IP_ADDRESS + valueFrom: + fieldRef: + fieldPath: status.hostIP image: quay.io/thanos/thanos:v0.20.2 livenessProbe: exec: @@ -181,6 +187,8 @@ spec: name: secret-thanos-querier-tls - mountPath: /etc/kube-rbac-proxy name: secret-thanos-querier-kube-rbac-proxy-rules + nodeSelector: + beta.kubernetes.io/os: linux priorityClassName: system-cluster-critical serviceAccountName: thanos-querier terminationGracePeriodSeconds: 120 diff --git a/assets/thanos-querier/prometheus-rule.yaml b/assets/thanos-querier/prometheus-rule.yaml index 174e787618..fd917fa6ef 100644 --- a/assets/thanos-querier/prometheus-rule.yaml +++ b/assets/thanos-querier/prometheus-rule.yaml @@ -14,36 +14,36 @@ spec: rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | - humanize }}% of "query" requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | + humanize}}% of "query" requests. summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query"}[5m])) / - sum(rate(http_requests_total{job="thanos-querier", handler="query"}[5m])) + sum by (job) (rate(http_requests_total{job="thanos-querier", handler="query"}[5m])) ) * 100 > 5 for: 1h labels: severity: warning - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | - humanize }}% of "query_range" requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | + humanize}}% of "query_range" requests. summary: Thanos Query is failing to handle requests. expr: | ( - sum(rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query_range"}[5m])) / - sum(rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m])) + sum by (job) (rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m])) ) * 100 > 5 for: 1h labels: severity: warning - alert: ThanosQueryGrpcServerErrorRate annotations: - description: Thanos Query {{$labels.job}} is failing to handle {{ $value | - humanize }}% of requests. + description: Thanos Query {{$labels.job}} is failing to handle {{$value | + humanize}}% of requests. summary: Thanos Query is failing to handle requests. expr: | ( @@ -57,8 +57,8 @@ spec: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - description: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize - }}% of requests. + description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% + of requests. summary: Thanos Query is failing to send requests. expr: | ( @@ -71,8 +71,8 @@ spec: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of - failing DNS queries for store endpoints. + description: Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing + DNS queries for store endpoints. summary: Thanos Query is having high number of DNS failures. expr: | ( diff --git a/assets/thanos-ruler/thanos-ruler-prometheus-rule.yaml b/assets/thanos-ruler/thanos-ruler-prometheus-rule.yaml index c33ef5ea6c..36f3cbbfab 100644 --- a/assets/thanos-ruler/thanos-ruler-prometheus-rule.yaml +++ b/assets/thanos-ruler/thanos-ruler-prometheus-rule.yaml @@ -9,31 +9,32 @@ spec: rules: - alert: ThanosRuleQueueIsDroppingAlerts annotations: - description: Thanos Rule {{$labels.job}} is failing to queue alerts. + description: Thanos Rule {{$labels.instance}} is failing to queue alerts. summary: Thanos Rule is failing to queue alerts. expr: | - sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - description: Thanos Rule {{$labels.job}} is failing to send alerts to alertmanager. + description: Thanos Rule {{$labels.instance}} is failing to send alerts to + alertmanager. summary: Thanos Rule is failing to send alerts to alertmanager. expr: | - sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 + sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 for: 5m labels: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - description: Thanos Rule {{$labels.job}} is failing to evaluate rules. + description: Thanos Rule {{$labels.instance}} is failing to evaluate rules. summary: Thanos Rule is failing to evaluate rules. expr: | ( - sum by (job) (rate(prometheus_rule_evaluation_failures_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job="thanos-ruler"}[5m])) / - sum by (job) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) * 100 > 5 ) for: 5m @@ -41,17 +42,18 @@ spec: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - description: Thanos Rule {{$labels.job}} has high number of evaluation warnings. + description: Thanos Rule {{$labels.instance}} has high number of evaluation + warnings. summary: Thanos Rule has high number of evaluation warnings. expr: | - sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) > 0 + sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) > 0 for: 15m labels: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - description: Thanos Rule {{$labels.job}}/{{$labels.instance}} has higher evaluation - latency than interval for {{$labels.rule_group}}. + description: Thanos Rule {{$labels.instance}} has higher evaluation latency + than interval for {{$labels.rule_group}}. summary: Thanos Rule has high rule evaluation latency. expr: | ( @@ -64,14 +66,14 @@ spec: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | - humanize }}% of requests. + description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% + of requests. summary: Thanos Rule is failing to handle grpc requests. expr: | ( - sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job="thanos-ruler"}[5m])) / - sum by (job) (rate(grpc_server_started_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(grpc_server_started_total{job="thanos-ruler"}[5m])) * 100 > 5 ) for: 5m @@ -81,21 +83,21 @@ spec: annotations: description: Thanos Rule {{$labels.job}} has not been able to reload its configuration. summary: Thanos Rule has not been able to reload configuration. - expr: avg(thanos_rule_config_last_reload_successful{job="thanos-ruler"}) by - (job) != 1 + expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job="thanos-ruler"}) + != 1 for: 5m labels: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints. summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job="thanos-ruler"}[5m])) / - sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job="thanos-ruler"}[5m])) * 100 > 1 ) for: 15m @@ -103,14 +105,14 @@ spec: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing - DNS queries for Alertmanager endpoints. + description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of + failing DNS queries for Alertmanager endpoints. summary: Thanos Rule is having high number of DNS failures. expr: | ( - sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job="thanos-ruler"}[5m])) / - sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job="thanos-ruler"}[5m])) + sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job="thanos-ruler"}[5m])) * 100 > 1 ) for: 15m @@ -118,25 +120,25 @@ spec: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule - groups that did not evaluate for at least 10x of their expected interval. + description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% rule groups + that did not evaluate for at least 10x of their expected interval. summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | - time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"}) + time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"}) > - 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"}) + 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"}) for: 5m labels: severity: info - alert: ThanosNoRuleEvaluations annotations: - description: Thanos Rule {{$labels.job}} did not perform any rule evaluations - in the past 2 minutes. + description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations + in the past 10 minutes. summary: Thanos Rule did not perform any rule evaluations. expr: | - sum(rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[2m])) <= 0 + sum by (job, instance) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) <= 0 and - sum(thanos_rule_loaded_rules{job="thanos-ruler"}) > 0 - for: 3m + sum by (job, instance) (thanos_rule_loaded_rules{job="thanos-ruler"}) > 0 + for: 5m labels: severity: critical diff --git a/jsonnet/jsonnetfile.json b/jsonnet/jsonnetfile.json index d802fe0232..28c3ab09b4 100644 --- a/jsonnet/jsonnetfile.json +++ b/jsonnet/jsonnetfile.json @@ -4,11 +4,11 @@ { "source": { "git": { - "remote": "https://github.com/coreos/etcd", + "remote": "https://github.com/etcd-io/etcd", "subdir": "contrib/mixin" } }, - "version": "master" + "version": "main" }, { "source": { @@ -17,7 +17,7 @@ "subdir": "jsonnet/kube-prometheus" } }, - "version": "release-0.8" + "version": "main" }, { "source": { @@ -26,7 +26,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "release-0.47" + "version": "master" }, { "source": { @@ -35,7 +35,7 @@ "subdir": "jsonnet" } }, - "version": "release-4.7", + "version": "master", "name": "openshift-state-metrics" }, { @@ -45,7 +45,7 @@ "subdir": "jsonnet/telemeter" } }, - "version": "release-4.7", + "version": "master", "name": "telemeter-client" }, { @@ -55,7 +55,7 @@ "subdir": "jsonnet/kube-thanos" } }, - "version": "release-0.17" + "version": "main" }, { "source": { @@ -64,7 +64,7 @@ "subdir": "mixin" } }, - "version": "release-0.19" + "version": "main" } ], "legacyImports": true diff --git a/jsonnet/jsonnetfile.lock.json b/jsonnet/jsonnetfile.lock.json index 7f2b9e64f2..f9bce48946 100644 --- a/jsonnet/jsonnetfile.lock.json +++ b/jsonnet/jsonnetfile.lock.json @@ -11,16 +11,6 @@ "version": "8ea4e7bc04b1bf5e9bd99918ca28c6271b42be0e", "sum": "muenICtKXABk6MZZHCZD2wCbmtiE96GwWRMGa1Rg+wA=" }, - { - "source": { - "git": { - "remote": "https://github.com/coreos/etcd.git", - "subdir": "contrib/mixin" - } - }, - "version": "0a02a107eabb97e3a5ebbace155bd6b2ffba247a", - "sum": "EgKKzxcW3ttt7gjPMX//DNTqNcn/0o2VAIaWJ/HSLEc=" - }, { "source": { "git": { @@ -28,8 +18,8 @@ "subdir": "contrib/mixin" } }, - "version": "d06d93d5b14b7f3aafd96da5703d55f15638c16f", - "sum": "EgKKzxcW3ttt7gjPMX//DNTqNcn/0o2VAIaWJ/HSLEc=" + "version": "56678038b5e9b076df7c95020ffcd626c9b0e389", + "sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc=" }, { "source": { @@ -38,8 +28,8 @@ "subdir": "grafonnet" } }, - "version": "356bd73e4792ffe107725776ca8946895969c191", - "sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU=" + "version": "3082bfca110166cd69533fa3c0875fdb1b68c329", + "sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk=" }, { "source": { @@ -48,8 +38,8 @@ "subdir": "grafana-builder" } }, - "version": "1894e32a5d9d9de98ed763ff3061c07877d78d8a", - "sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k=" + "version": "1e36fec2a1b44fb16b86c98edf3f477093e6b5b3", + "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" }, { "source": { @@ -69,8 +59,8 @@ "subdir": "" } }, - "version": "c67c0f19e869f1da34d79b6507c1fa37c23a6e4e", - "sum": "F+RxcI26zeoeI81uot39Jv6IpQ6BOz+xlSHlElJYsz8=" + "version": "8524aa43d49914b170b84816fc182319da04a167", + "sum": "J06UiBvcfpRzLM5VbLRAhP39Zaz+EKguJ5sSTBDeygs=" }, { "source": { @@ -79,7 +69,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "39a9cda705b5201c35105bd1f24c83923fa839ef", + "version": "8524aa43d49914b170b84816fc182319da04a167", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -89,7 +79,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "aafecf305bbb5c13fcec0ba3e749d1ffcb0d93e1", + "version": "95500e51a3144659522df603ed4e7ee7f597bc3b", "sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE=" }, { @@ -99,8 +89,8 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "aafecf305bbb5c13fcec0ba3e749d1ffcb0d93e1", - "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" + "version": "95500e51a3144659522df603ed4e7ee7f597bc3b", + "sum": "u8gaydJoxEjzizQ8jY8xSjYgWooPmxw+wIWdDxifMAk=" }, { "source": { @@ -109,7 +99,7 @@ "subdir": "jsonnet" } }, - "version": "4a64265b8f3f50efb907954b3f28f09362280409", + "version": "101429149266e2c86a41b82baf47a033c9a93b02", "sum": "xoF0ibl74QrQ1DWbmoZ3JY83dEVeU0EQCvom96KGEuA=", "name": "openshift-state-metrics" }, @@ -120,8 +110,8 @@ "subdir": "jsonnet/telemeter" } }, - "version": "92c5ea66751a8b087a6ff85c397bb949ad028fdb", - "sum": "QeDKNbmU+tkV4BtQFlOZZq4xmRl0myC1QQRII5lo01s=", + "version": "d6ceb8a4e94f775510591974b2cdeb19819abda0", + "sum": "lszFfJCYihFfNtBpMNHlM4iFackZQ7yxzo114U2c0gk=", "name": "telemeter-client" }, { @@ -131,8 +121,8 @@ "subdir": "jsonnet/kube-prometheus" } }, - "version": "2c34d3dff64b7c9a4c158149d6554b0ad9a0e144", - "sum": "mG7Jfa33upviONZuXZsQqW3iegVP5CF+U3KFH/HqKUw=" + "version": "11778868b18604ff504c1cfdcec24e7fd7169f7c", + "sum": "3HCifCftCis0vvjJLC76X8Yjx4SYeTqITZsHImqrrgk=" }, { "source": { @@ -141,7 +131,7 @@ "subdir": "jsonnet/mixin" } }, - "version": "64d466d7730165c0d260f187e2e9742bc0295bf2", + "version": "d26fd2d25e85b16b6ff944daf979de2b6d9e09ac", "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=", "name": "prometheus-operator-mixin" }, @@ -152,8 +142,8 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "5555f492df250168657b72bb8cb60bec071de71f", - "sum": "quzK9/gITldAfVGBkFUsLjQ3Y2F4NOJ2GQUjPSD8HHQ=" + "version": "d26fd2d25e85b16b6ff944daf979de2b6d9e09ac", + "sum": "saq6pBcAO6o3nQL4VbeaOddBXvN0ULC+4WfyeCy7yFI=" }, { "source": { @@ -162,8 +152,8 @@ "subdir": "doc/alertmanager-mixin" } }, - "version": "f328dc6f8401d1f9eeb1ae0dce360b8df85e017f", - "sum": "VP1vn/WTGLZaBgGhGMUO81qNTc/fnp5KtzVjcaxad6Q=", + "version": "8598683b2461fb68e1921735c20163c4c784f9b6", + "sum": "YIWuR6x64SRQSCr8tuuGN1cc0TK5HGR0HWvgot3fc6k=", "name": "alertmanager" }, { @@ -173,8 +163,8 @@ "subdir": "docs/node-mixin" } }, - "version": "88ee42742e1a947c91a328dfe47e6ea3c3fbd5da", - "sum": "cZTNXQMUCLB5FGYpMn845dcqGdkcYt58qCqOFIV/BoQ=" + "version": "5b13775dc868e6c30a2346a8885fcf067c7aad57", + "sum": "os3VfjBdFdDaTYzI+A/RahIhQcgQ7KoaLL68s1kiCbA=" }, { "source": { @@ -183,8 +173,8 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "e4487274853c587717006eeda8804e597d120340", - "sum": "6kUzElCBWZ5U/3cxEpHNMmoKKPubG45QxpmLu8PGg08=", + "version": "9c23d1741aa8572a90e22dac09a3f99f168d7437", + "sum": "kMLsvbLtQcCi7q4YHkZlO3aiIayz1ayjrkH3+V8sd58=", "name": "prometheus" }, { @@ -194,8 +184,8 @@ "subdir": "jsonnet/kube-thanos" } }, - "version": "e6c97a0163e941ba76483391170553328774854b", - "sum": "cmvtINsnwAGR+ZFiz5cGg+9TRfQhnkGwiKQEqXmB5dQ=" + "version": "e1a68590f56034ca1a43d59401f03e72fd01ac5f", + "sum": "9g4HwpZ8Vpi950O6x92HNM47Yqa0+Nfv+7YbwwKpt3o=" }, { "source": { @@ -204,8 +194,8 @@ "subdir": "mixin" } }, - "version": "80e0257e25b59ff8113b9d21a155c785f69a4b95", - "sum": "bE5eEPMulYMtz0lJSxDVnCXVTg/lqSSS5aEUUH3V19A=" + "version": "7c6c5051b0bcc219c4e205b97a91634cfd57131d", + "sum": "IS62r3fSx0evbBhH0QqKUW+4TAMOHpzbsW+v9nw/SNM=" } ], "legacyImports": false diff --git a/jsonnet/main.jsonnet b/jsonnet/main.jsonnet index 70c3259f18..38f2a37ee1 100644 --- a/jsonnet/main.jsonnet +++ b/jsonnet/main.jsonnet @@ -64,6 +64,7 @@ local commonConfig = { prometheusOperator: '0.48.1', promLabelProxy: '0.2.0', thanos: '0.20.2', + kubeRbacProxy: '0.10.0', }, // In OSE images are overridden images: { @@ -78,6 +79,7 @@ local commonConfig = { promLabelProxy: 'quay.io/prometheuscommunity/prom-label-proxy:v' + $.versions.thanos, telemeter: '', thanos: 'quay.io/thanos/thanos:v' + $.versions.thanos, + kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy:v' + $.versions.kubeRbacProxy, openshiftOauthProxy: 'quay.io/openshift/oauth-proxy:latest', //kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy:v0.8.0', @@ -204,6 +206,7 @@ local inCluster = namespace: $.values.common.namespace, version: $.values.common.versions.kubeStateMetrics, image: $.values.common.images.kubeStateMetrics, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, commonLabels+: $.values.common.commonLabels, mixin+: { ruleLabels: $.values.common.ruleLabels }, }, @@ -211,6 +214,7 @@ local inCluster = namespace: $.values.common.namespace, version: $.values.common.versions.nodeExporter, image: $.values.common.images.nodeExporter, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, commonLabels+: $.values.common.commonLabels, mixin+: { ruleLabels: $.values.common.ruleLabels, @@ -254,6 +258,7 @@ local inCluster = namespace: $.values.common.namespace, version: $.values.common.versions.prometheusOperator, image: $.values.common.images.prometheusOperator, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, configReloaderImage: $.values.common.images.prometheusOperatorReloader, commonLabels+: $.values.common.commonLabels, mixin+: { @@ -373,6 +378,7 @@ local userWorkload = denyNamespace: inCluster.values.common.namespace, version: $.values.common.versions.prometheusOperator, image: $.values.common.images.prometheusOperator, + kubeRbacProxyImage: $.values.common.images.kubeRbacProxy, configReloaderImage: $.values.common.images.prometheusOperatorReloader, commonLabels+: $.values.common.commonLabels, mixin+: { diff --git a/jsonnet/thanos-querier.libsonnet b/jsonnet/thanos-querier.libsonnet index 01c1cde701..178a28e59e 100644 --- a/jsonnet/thanos-querier.libsonnet +++ b/jsonnet/thanos-querier.libsonnet @@ -5,6 +5,7 @@ function(params) local tq = querier(cfg); tq { mixin:: (import 'github.com/thanos-io/thanos/mixin/alerts/query.libsonnet') { + targetGroups: {}, query+:: { selector: 'job="thanos-querier"', }, @@ -323,6 +324,9 @@ function(params) ], serviceAccountName: 'thanos-querier', priorityClassName: 'system-cluster-critical', + // securityContext needs to be unset to be compatible with current OpenShift constrains for ServiceAccount + // OpenShift will automatically assign unprivileged context + securityContext:: {}, containers: [ super.containers[0] { livenessProbe: { diff --git a/jsonnet/thanos-ruler.libsonnet b/jsonnet/thanos-ruler.libsonnet index 6044eba4b8..19af49b8d4 100644 --- a/jsonnet/thanos-ruler.libsonnet +++ b/jsonnet/thanos-ruler.libsonnet @@ -4,6 +4,7 @@ function(params) { local cfg = params, mixin:: (import 'github.com/thanos-io/thanos/mixin/alerts/rule.libsonnet') { + targetGroups: {}, rule+:: { selector: 'job="thanos-ruler"', }, diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-config-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-config-custom-resource-definition.yaml index d75986ba97..d50e20c074 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-config-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-config-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: AlertmanagerConfig listKind: AlertmanagerConfigList plural: alertmanagerconfigs diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-custom-resource-definition.yaml index e2d8a587ee..76e5919a89 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0alertmanager-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Alertmanager listKind: AlertmanagerList plural: alertmanagers diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0podmonitor-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0podmonitor-custom-resource-definition.yaml index a4664501bc..16e0d08a7c 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0podmonitor-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0podmonitor-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: PodMonitor listKind: PodMonitorList plural: podmonitors @@ -200,8 +202,10 @@ spec: to proxy through this endpoint. type: string relabelings: - description: 'RelabelConfigs to apply to samples before ingestion. - More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + description: 'RelabelConfigs to apply to samples before scraping. + Prometheus Operator automatically adds relabelings for a few + standard Kubernetes fields and replaces original scrape job + name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0probe-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0probe-custom-resource-definition.yaml index 89cb27f414..f3279f3196 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0probe-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0probe-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Probe listKind: ProbeList plural: probes @@ -38,6 +40,68 @@ spec: description: Specification of desired Ingress selection for target discovery by Prometheus. properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over basic + authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint' + properties: + password: + description: The secret in the service monitor namespace that + contains the password for authentication. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor namespace that + contains the username for authentication. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: Secret to mount to read bearer token for scraping targets. + The secret needs to be in the same namespace as the probe and accessible + by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must be a + valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be defined + type: boolean + required: + - key + type: object interval: description: Interval at which targets are probed using the configured prober. If not specified Prometheus' global scrape interval is used. @@ -58,6 +122,9 @@ spec: path: description: Path to collect metrics from. Defaults to `/probe`. type: string + proxyUrl: + description: Optional ProxyURL. + type: string scheme: description: HTTP scheme to use for scraping. Defaults to `http`. type: string @@ -193,6 +260,52 @@ spec: description: Labels assigned to all metrics scraped from the targets. type: object + relabelingConfigs: + description: 'RelabelConfigs to apply to samples before ingestion. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + items: + description: 'RelabelConfig allows dynamic rewriting of + the label set, being applied to samples before ingestion. + It defines ``-section of Prometheus + configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source + label values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. Default is '(.*)' + type: string + replacement: + description: Replacement value against which a regex + replace is performed if the regular expression matches. + Regex capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular + expression for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: object + type: array static: description: Targets is a list of URLs to probe using the configured prober. @@ -201,6 +314,112 @@ spec: type: array type: object type: object + tlsConfig: + description: TLS configuration to use when scraping the endpoint. + properties: + ca: + description: Struct containing the CA cert to use for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file for the targets. + properties: + configMap: + description: ConfigMap containing data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap or its key + must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object type: object required: - spec diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0prometheus-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0prometheus-custom-resource-definition.yaml index ca5993feb0..315bb93362 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0prometheus-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0prometheus-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: Prometheus listKind: PrometheusList plural: prometheuses @@ -2202,10 +2204,23 @@ spec: only clients authorized to perform these actions can do so. For more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' type: boolean + enableFeatures: + description: Enable access to Prometheus disabled features. By default, + no features are enabled. Enabling disabled features is entirely + outside the scope of what the maintainers will support and by doing + so, you accept that this behaviour may break at any time without + notice. For more information see https://prometheus.io/docs/prometheus/latest/disabled_features/ + items: + type: string + type: array enforcedNamespaceLabel: - description: EnforcedNamespaceLabel enforces adding a namespace label - of origin for each alert and metric that is user created. The label - value will always be the namespace of the object that is being created. + description: "EnforcedNamespaceLabel If set, a label will be added + to \n 1. all user-metrics (created by `ServiceMonitor`, `PodMonitor` + and `ProbeConfig` object) and 2. in all `PrometheusRule` objects + (except the ones excluded in `prometheusRulesExcludedFromEnforce`) + to * alerting & recording rules and * the metrics used in + their expressions (`expr`). \n Label name is this field's value. + Label value is the namespace of the created object (mentioned above)." type: string enforcedSampleLimit: description: EnforcedSampleLimit defines global limit on number of @@ -2226,7 +2241,7 @@ spec: format: int64 type: integer evaluationInterval: - description: Interval between consecutive evaluations. + description: 'Interval between consecutive evaluations. Default: `1m`' type: string externalLabels: additionalProperties: @@ -2271,10 +2286,12 @@ spec: into the Prometheus configuration from external sources. Any errors during the execution of an initContainer will lead to a restart of the Pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ - Using initContainers for any use case other then secret fetching - is entirely outside the scope of what the maintainers will support - and by doing so, you accept that this behaviour may break at any - time without notice.' + InitContainers described here modify an operator generated init + containers if they share the same name and modifications are done + via a strategic merge patch. The current init container name is: + `init-config-reloader`. Overriding init containers is entirely outside + the scope of what the maintainers will support and by doing so, + you accept that this behaviour may break at any time without notice.' items: description: A single application container that you want to run within a pod. @@ -3685,7 +3702,7 @@ spec: type: object type: object bearerToken: - description: bearer token for remote read. + description: Bearer token for remote read. type: string bearerTokenFile: description: File to read bearer token for remote read. @@ -3896,11 +3913,32 @@ spec: type: object type: object bearerToken: - description: File to read bearer token for remote write. + description: Bearer token for remote write. type: string bearerTokenFile: description: File to read bearer token for remote write. type: string + headers: + additionalProperties: + type: string + description: Custom HTTP headers to be sent along with each + remote write request. Be aware that headers that are set by + Prometheus itself can't be overwritten. Only valid in Prometheus + versions 2.25.0 and newer. + type: object + metadataConfig: + description: MetadataConfig configures the sending of series + metadata to remote storage. + properties: + send: + description: Whether metric metadata is sent to remote storage + or not. + type: boolean + sendInterval: + description: How frequently metric metadata is sent to remote + storage. + type: string + type: object name: description: The name of the remote write queue, must be unique if specified. The name is used in metrics and logging in order @@ -4298,7 +4336,7 @@ spec: type: object type: object scrapeInterval: - description: Interval between consecutive scrapes. + description: 'Interval between consecutive scrapes. Default: `1m`' type: string scrapeTimeout: description: Number of seconds to wait for target to respond before @@ -6485,6 +6523,150 @@ spec: pageTitle: description: The prometheus web page title type: string + tlsConfig: + description: WebTLSConfig defines the TLS parameters for HTTPS. + properties: + cert: + description: Contains the TLS certificate for the server. + properties: + configMap: + description: ConfigMap containing data to use for the + targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + type: object + cipherSuites: + description: 'List of supported cipher suites for TLS versions + up to TLS 1.2. If empty, Go default cipher suites are used. + Available cipher suites are documented in the go documentation: + https://golang.org/pkg/crypto/tls/#pkg-constants' + items: + type: string + type: array + client_ca: + description: Contains the CA certificate for client certificate + authentication to the server. + properties: + configMap: + description: ConfigMap containing data to use for the + targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + type: object + clientAuthType: + description: 'Server policy for client authentication. Maps + to ClientAuth Policies. For more detail on clientAuth options: + https://golang.org/pkg/crypto/tls/#ClientAuthType' + type: string + curvePreferences: + description: 'Elliptic curves that will be used in an ECDHE + handshake, in preference order. Available curves are documented + in the go documentation: https://golang.org/pkg/crypto/tls/#CurveID' + items: + type: string + type: array + keySecret: + description: Secret containing the TLS key for the server. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + maxVersion: + description: Maximum TLS version that is acceptable. Defaults + to TLS13. + type: string + minVersion: + description: Minimum TLS version that is acceptable. Defaults + to TLS12. + type: string + preferServerCipherSuites: + description: Controls whether the server selects the client's + most preferred cipher suite, or the server's most preferred + cipher suite. If true then the server's preference, as expressed + in the order of elements in cipherSuites, is used. + type: boolean + required: + - cert + - keySecret + type: object type: object type: object status: diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0prometheusrule-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0prometheusrule-custom-resource-definition.yaml index 168e01a5aa..41fbb9a71c 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0prometheusrule-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0prometheusrule-custom-resource-definition.yaml @@ -54,7 +54,10 @@ spec: type: string rules: items: - description: Rule describes an alerting or recording rule. + description: 'Rule describes an alerting or recording rule + See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) + or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) + rule' properties: alert: type: string diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0servicemonitor-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0servicemonitor-custom-resource-definition.yaml index fd4ca5c732..e1350fbc1f 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0servicemonitor-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0servicemonitor-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: ServiceMonitor listKind: ServiceMonitorList plural: servicemonitors @@ -187,7 +189,9 @@ spec: type: string relabelings: description: 'RelabelConfigs to apply to samples before scraping. - More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' + Prometheus Operator automatically adds relabelings for a few + standard Kubernetes fields and replaces original scrape job + name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' items: description: 'RelabelConfig allows dynamic rewriting of the label set, being applied to samples before ingestion. It @@ -375,11 +379,14 @@ spec: type: object type: array jobLabel: - description: The label to use to retrieve the job name from. + description: "Chooses the label of the Kubernetes `Endpoints`. Its + value will be used for the `job`-label's value of the created metrics. + \n Default & fallback value: the name of the respective Kubernetes + `Endpoint`." type: string namespaceSelector: - description: Selector to select which namespaces the Endpoints objects - are discovered from. + description: Selector to select which namespaces the Kubernetes Endpoints + objects are discovered from. properties: any: description: Boolean describing whether all namespaces are selected @@ -392,8 +399,8 @@ spec: type: array type: object podTargetLabels: - description: PodTargetLabels transfers labels on the Kubernetes Pod - onto the target. + description: PodTargetLabels transfers labels on the Kubernetes `Pod` + onto the created metrics. items: type: string type: array @@ -447,8 +454,9 @@ spec: type: object type: object targetLabels: - description: TargetLabels transfers labels on the Kubernetes Service - onto the target. + description: TargetLabels transfers labels from the Kubernetes `Service` + onto the created metrics. All labels set in `selector.matchLabels` + are automatically transferred. items: type: string type: array diff --git a/manifests/0000_50_cluster-monitoring-operator_00_0thanosruler-custom-resource-definition.yaml b/manifests/0000_50_cluster-monitoring-operator_00_0thanosruler-custom-resource-definition.yaml index 24587f0a77..797b9b9179 100644 --- a/manifests/0000_50_cluster-monitoring-operator_00_0thanosruler-custom-resource-definition.yaml +++ b/manifests/0000_50_cluster-monitoring-operator_00_0thanosruler-custom-resource-definition.yaml @@ -11,6 +11,8 @@ metadata: spec: group: monitoring.coreos.com names: + categories: + - prometheus-operator kind: ThanosRuler listKind: ThanosRulerList plural: thanosrulers diff --git a/manifests/0000_90_cluster-monitoring-operator_01-dashboards.yaml b/manifests/0000_90_cluster-monitoring-operator_01-dashboards.yaml index 82ff308e67..7ed7078a6c 100644 --- a/manifests/0000_90_cluster-monitoring-operator_01-dashboards.yaml +++ b/manifests/0000_90_cluster-monitoring-operator_01-dashboards.yaml @@ -3054,7 +3054,7 @@ data: ], "query": "label_values(etcd_server_has_leader, job)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 2, "tagValuesQuery": "", @@ -3694,7 +3694,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -3985,7 +3985,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4003,7 +4003,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4021,7 +4021,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6197,7 +6197,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -6281,7 +6281,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -6375,7 +6375,7 @@ data: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -6565,7 +6565,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -6816,7 +6816,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6834,7 +6834,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -6852,7 +6852,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -8616,7 +8616,7 @@ data: "decimals": 2, "link": true, "linkTargetBlank": false, - "linkTooltip": "Drill down to containers", + "linkTooltip": "Drill down to pods", "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ @@ -8935,7 +8935,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -9170,7 +9170,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9188,7 +9188,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9206,7 +9206,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -9925,7 +9925,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -10281,7 +10281,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10299,7 +10299,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10317,7 +10317,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -10465,7 +10465,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -10556,7 +10556,7 @@ data: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ @@ -11670,7 +11670,7 @@ data: "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 } @@ -11756,7 +11756,7 @@ data: "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 } @@ -11975,18 +11975,18 @@ data: "unit": "Bps" }, { - "alias": "Pod", + "alias": "Container", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, - "link": true, + "link": false, "linkTargetBlank": false, - "linkTooltip": "Drill down to pods", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", "thresholds": [ ], @@ -12330,7 +12330,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -12565,7 +12565,7 @@ data: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12583,7 +12583,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -12601,7 +12601,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14332,7 +14332,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -14630,7 +14630,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14648,7 +14648,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -14666,7 +14666,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -17916,7 +17916,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "expr": "(\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -17931,7 +17931,7 @@ data: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18017,7 +18017,7 @@ data: "timeShift": null, "title": "CPU Saturation (load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18115,7 +18115,7 @@ data: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18186,7 +18186,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -18201,7 +18201,7 @@ data: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18292,7 +18292,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Receive", @@ -18300,7 +18300,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Transmit", @@ -18315,7 +18315,7 @@ data: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18394,7 +18394,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Receive", @@ -18402,7 +18402,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Transmit", @@ -18417,7 +18417,7 @@ data: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18500,7 +18500,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -18515,7 +18515,7 @@ data: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18586,7 +18586,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -18601,7 +18601,7 @@ data: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18699,7 +18699,7 @@ data: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18875,7 +18875,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_cpu_utilisation:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -18890,7 +18890,7 @@ data: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -18976,7 +18976,7 @@ data: "timeShift": null, "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19074,7 +19074,7 @@ data: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19145,7 +19145,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Major page faults", @@ -19160,7 +19160,7 @@ data: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19251,7 +19251,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive", @@ -19259,7 +19259,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit", @@ -19274,7 +19274,7 @@ data: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19353,7 +19353,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive drops", @@ -19361,7 +19361,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit drops", @@ -19376,7 +19376,7 @@ data: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19459,7 +19459,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -19474,7 +19474,7 @@ data: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19545,7 +19545,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -19560,7 +19560,7 @@ data: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -19658,7 +19658,7 @@ data: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21227,7 +21227,7 @@ data: "timeShift": null, "title": "Prometheus Stats", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21326,7 +21326,7 @@ data: "timeShift": null, "title": "Target Sync", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21412,7 +21412,7 @@ data: "timeShift": null, "title": "Targets", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21510,7 +21510,7 @@ data: "timeShift": null, "title": "Average Scrape Interval Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21580,6 +21580,14 @@ data: "stack": true, "steppedLine": false, "targets": [ + { + "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "exceeded body size limit: {{job}}", + "legendLink": null, + "step": 10 + }, { "expr": "sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))", "format": "time_series", @@ -21620,7 +21628,7 @@ data: "timeShift": null, "title": "Scrape failures", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21706,7 +21714,7 @@ data: "timeShift": null, "title": "Appended Samples", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21804,7 +21812,7 @@ data: "timeShift": null, "title": "Head Series", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21890,7 +21898,7 @@ data: "timeShift": null, "title": "Head Chunks", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -21988,7 +21996,7 @@ data: "timeShift": null, "title": "Query Rate", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -22074,7 +22082,7 @@ data: "timeShift": null, "title": "Stage Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -22140,7 +22148,7 @@ data: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -22168,7 +22176,7 @@ data: "useTags": false }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All",