diff --git a/deployment/ds-hostnet-split/02-statsd.yaml b/deployment/ds-hostnet-split/02-statsd.yaml
new file mode 100644
index 00000000000..57762cfa5e3
--- /dev/null
+++ b/deployment/ds-hostnet-split/02-statsd.yaml
@@ -0,0 +1,83 @@
+# statsd_exporter mapping rules used by the "statsd-sink" container.
+# Mounted at /etc/statsd through the "statsd" volume in 03-envoy.yaml.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: envoy-statsd
+  namespace: heptio-contour
+data:
+  statsd.yaml: |2-
+    # Cluster timers: the three captures become the namespace, service and
+    # port labels. The port capture is [^.]+ so it cannot swallow a following
+    # ".internal"/".external"/".canary" segment; the first matching rule is
+    # applied, so a looser generic rule would shadow the scoped rules below.
+    mappings:
+    - match: 'cluster\.(.+?)_(.+?)_([^.]+)\.upstream_cx_connect_ms'
+      match_type: regex
+      name: "envoy_cluster_upstream_cx_connect_time"
+      timer_type: 'histogram'
+      labels:
+        cluster_name: "$1/$2/$3"
+        namespace: "$1"
+        service: "$2"
+        port: "$3"
+    - match: 'cluster\.(.+?)_(.+?)_([^.]+)\.upstream_cx_length_ms'
+      match_type: regex
+      name: "envoy_cluster_upstream_cx_length"
+      timer_type: 'histogram'
+      labels:
+        cluster_name: "$1/$2/$3"
+        namespace: "$1"
+        service: "$2"
+        port: "$3"
+    - match: 'cluster\.(.+?)_(.+?)_([^.]+)\.upstream_rq_time'
+      match_type: regex
+      name: "envoy_cluster_upstream_rq_time"
+      timer_type: 'histogram'
+      labels:
+        cluster_name: "$1/$2/$3"
+        namespace: "$1"
+        service: "$2"
+        port: "$3"
+    - match: 'cluster\.(.+?)_(.+?)_([^.]+)\.internal\.upstream_rq_time'
+      match_type: regex
+      name: "envoy_cluster_internal_upstream_rq_time"
+      timer_type: 'histogram'
+      labels:
+        cluster_name: "$1/$2/$3"
+        namespace: "$1"
+        service: "$2"
+        port: "$3"
+    - match: 'cluster\.(.+?)_(.+?)_([^.]+)\.external\.upstream_rq_time'
+      match_type: regex
+      name: "envoy_cluster_external_upstream_rq_time"
+      timer_type: 'histogram'
+      labels:
+        cluster_name: "$1/$2/$3"
+        namespace: "$1"
+        service: "$2"
+        port: "$3"
+    - match: 'cluster\.(.+?)_(.+?)_([^.]+)\.canary\.upstream_rq_time'
+      match_type: regex
+      name: "envoy_cluster_canary_upstream_rq_time"
+      timer_type: 'histogram'
+      labels:
+        cluster_name: "$1/$2/$3"
+        namespace: "$1"
+        service: "$2"
+        port: "$3"
+    - match: 'envoy.http.*.downstream_cx_length_ms'
+      name: "envoy_http_downstream_cx_length"
+      timer_type: 'histogram'
+      labels:
+        http_conn_manager_prefix: "$1"
+    - match: 'envoy.http.*.downstream_rq_time'
+      name: "envoy_http_downstream_rq_time"
+      timer_type: 'histogram'
+      labels:
+        http_conn_manager_prefix: "$1"
+    # Catch-all: must stay last. Every metric not mapped above is dropped.
+    - match: '.'
+      match_type: 'regex'
+      action: 'drop'
+      name: 'dropped'
diff --git a/deployment/ds-hostnet-split/03-envoy.yaml b/deployment/ds-hostnet-split/03-envoy.yaml
index 340d23ccf4e..09be804148d 100644
--- a/deployment/ds-hostnet-split/03-envoy.yaml
+++ b/deployment/ds-hostnet-split/03-envoy.yaml
@@ -18,8 +18,8 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "8002"
-        prometheus.io/path: "/stats"
-        prometheus.io/format: "prometheus"
+        prometheus.io/statsdport: "9102"
+        prometheus.io/path: "/stats/prometheus"
       labels:
         app: envoy
     spec:
@@ -57,7 +57,22 @@ spec:
         lifecycle:
           preStop:
             exec:
-              command: ["wget", "-qO-", "http://localhost:9001/healthcheck/fail"]
+              command: ["wget", "-qO-", "http://localhost:9001/healthcheck/fail"]
+      # statsd_exporter sidecar configured from the envoy-statsd ConfigMap;
+      # serves the "metrics" port advertised as prometheus.io/statsdport.
+      - name: statsd-sink
+        image: prom/statsd-exporter:v0.6.0
+        command:
+        - "/bin/statsd_exporter"
+        args:
+        - "-statsd.mapping-config=/etc/statsd/statsd.yaml"
+        ports:
+        - containerPort: 9102
+          protocol: TCP
+          name: metrics
+        volumeMounts:
+        - name: statsd
+          mountPath: /etc/statsd
       dnsPolicy: ClusterFirstWithHostNet
       hostNetwork: true
       initContainers:
@@ -68,6 +83,7 @@ spec:
         - $(CONTOUR_SERVICE_HOST)
         - --xds-port
         - $(CONTOUR_SERVICE_PORT)
+        - --statsd-enabled
         command:
         - contour
         image: gcr.io/heptio-images/contour:master
@@ -80,4 +96,7 @@ spec:
       volumes:
       - name: contour-config
         emptyDir: {}
+      - name: statsd
+        configMap:
+          name: envoy-statsd
       restartPolicy: Always
diff --git a/deployment/grafana/01-namespace.yaml b/deployment/grafana/01-namespace.yaml
new file mode 100644
index 00000000000..83b77c139a2
--- /dev/null
+++ b/deployment/grafana/01-namespace.yaml
@@ -0,0 +1,7 @@
+# Namespace holding the Grafana dashboards ConfigMap (grafana-dashs).
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: contour-monitoring
+  labels:
+    app: contour-monitoring
diff --git 
a/deployment/grafana/02-grafana-configmap.yaml b/deployment/grafana/02-grafana-configmap.yaml new file mode 100644 index 00000000000..91c50bec8b1 --- /dev/null +++ b/deployment/grafana/02-grafana-configmap.yaml @@ -0,0 +1,2537 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app: grafana + name: grafana-dashs + namespace: contour-monitoring +data: + contour.json: | + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.0.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1532630524651, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(contour_ingressroute_total{namespace=~\"$Namespace\"}) by (namespace)", + "format": "time_series", + "interval": "", + "intervalFactor": 
1, + "legendFormat": "Namespace: {{ namespace }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total IngressRoutes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(contour_ingressroute_orphaned_total{namespace=~\"$Namespace\"}) by (namespace)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Namespace: {{ namespace }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Orphaned IngressRoutes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(contour_ingressroute_valid_total{namespace=~\"$Namespace\",vhost=~\"$VHost\"}) by (namespace,vhost)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Namespace: {{ namespace }} ({{ vhost }})", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Valid IngressRoutes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": 
"flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(contour_ingressroute_invalid_total{namespace=~\"$Namespace\",vhost=~\"$VHost\"}) by (namespace,vhost)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Namespace: {{ namespace }} ({{ vhost }})", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Invalid IngressRoutes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(contour_ingressroute_root_total{namespace=~\"$Namespace\"}) by (namespace)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Namespace: {{ namespace }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Root IngressRoutes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": 
{ + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "envoy_listener_manager_lds_update_failure", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (failure)", + "refId": "A" + }, + { + "expr": "envoy_listener_manager_lds_update_success", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (success)", + "refId": "B" + }, + { + "expr": "envoy_listener_manager_lds_update_rejected", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (rejected)", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Contour LDS Updates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 8, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "envoy_cluster_manager_cds_update_success", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (Success)", + "refId": "A" + }, + { + "expr": "envoy_cluster_manager_cds_update_failure", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (Failure)", + "refId": "B" + }, + { + "expr": "envoy_cluster_manager_cds_update_rejected", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (Rejected)", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Contour CDS Updates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 11, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + 
"show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "envoy_http_rds_ingress_http_update_success", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (Success)", + "refId": "A" + }, + { + "expr": "envoy_http_rds_ingress_http_update_failure", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (Failure)", + "refId": "B" + }, + { + "expr": "envoy_http_rds_ingress_http_update_rejected", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}} (Rejected)", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Contour RDS Updates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + 
"label": null, + "multi": true, + "name": "VHost", + "options": [], + "query": "label_values(vhost)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Contour", + "uid": "KYcCfvKik", + "version": 1 + } + envoy.json: | + { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.0.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1533835806222, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_http_downstream_rq_total[1m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Downstream RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_http_downstream_cx_total[1m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Downstream CPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(envoy_http_downstream_rq_time_bucket[1m])) by (le, kubernetes_pod_name))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}} 90%", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(envoy_http_downstream_rq_time_bucket[1m])) by (le, kubernetes_pod_name))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}} 50% ", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(envoy_http_downstream_rq_time_bucket[1m])) by (le, kubernetes_pod_name))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}} 99%", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Downstream Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": 
true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(envoy_http_downstream_cx_active) by (kubernetes_pod_name)", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Downstream Total Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Displays the number of Requests per Second being performed against each Upstream.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, 
+ "x": 0, + "y": 8 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_cluster_upstream_rq_total{namespace=~\"$Namespace\",service=~\"$Service\"}[1m])) by (service,namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + 
"targets": [ + { + "expr": "sum(rate(envoy_cluster_upstream_cx_total{namespace=~\"$Namespace\",service=~\"$Service\"}[1m])) by (namespace, service)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream CPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(envoy_cluster_upstream_rq_time_bucket{service=~\"$Service\",namespace=~\"$Namespace\"}[1m])) by (le, service, namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}/{{service}} 99%", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(envoy_cluster_upstream_rq_time_bucket{service=~\"$Service\",namespace=~\"$Namespace\"}[1m])) by (le, service, namespace))", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "{{namespace}}/{{service}} 90%", + "refId": "C" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(envoy_cluster_upstream_rq_time_bucket{service=~\"$Service\",namespace=~\"$Namespace\"}[1m])) by (le, service, namespace))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{namespace}}/{{service}} 50% ", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(envoy_cluster_upstream_cx_active{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, service)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream Total Connections", + "tooltip": { + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_cluster_upstream_rq_xx{namespace=~\"$Namespace\",service=~\"$Service\",envoy_response_code_class=~\"2\"}[1m])) by (namespace,service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream 2xx Responses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + 
"dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_cluster_upstream_rq_xx{namespace=~\"$Namespace\",service=~\"$Service\",envoy_response_code_class=\"3\"}[1m])) by (namespace,service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream 3xx Responses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, 
+ "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_cluster_upstream_rq_xx{namespace=~\"$Namespace\",service=~\"$Service\",envoy_response_code_class=\"4\"}[1m])) by (namespace,service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Upstream 4xx Responses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(envoy_cluster_upstream_rq_xx{namespace=~\"$Namespace\",service=~\"$Service\",envoy_response_code_class=\"5\"}[1m])) by (namespace,service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeShift": null, + "title": "Upstream 5xx Responses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(envoy_cluster_membership_healthy{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, service) / avg(envoy_cluster_membership_total{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Endpoint Percentage Health", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + 
"min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 24 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(envoy_cluster_membership_total{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total Endpoints", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + 
"rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(envoy_cluster_membership_healthy{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Healthy Endpoints", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fill": 1, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(envoy_cluster_membership_total{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, 
service) - avg(envoy_cluster_membership_healthy{namespace=~\"$Namespace\",service=~\"$Service\"}) by (namespace, service)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}/{{service}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Unhealthy Endpoints", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Namespace", + "options": [], + "query": "label_values(envoy_cluster_upstream_rq_time_bucket,namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Service", + "options": [], + "query": "label_values(envoy_cluster_upstream_rq_time_bucket,service)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + 
"title": "Envoy Metrics", + "uid": "khVnG8iiz", + "version": 2 + } + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app: grafana + name: grafana-config + namespace: contour-monitoring +data: + grafana.ini: | + ; instance_name = ${HOSTNAME} + [paths] + data = /var/lib/grafana/data + logs = /var/log/grafana + plugins = /var/lib/grafana/plugins + + [server] + ;protocol = http + ;http_addr = + ;http_port = 3000 + ;domain = localhost + ;enforce_domain = false + ;root_url = %(protocol)s://%(domain)s:%(http_port)s/ + ;router_logging = false + ;static_root_path = public + ;enable_gzip = false + ;cert_file = + ;cert_key = + + [database] + ;type = sqlite3 + ;host = 127.0.0.1:3306 + ;name = grafana + ;user = root + ;password = + ;ssl_mode = disable + ;path = grafana.db + + [session] + ;provider = file + ;provider_config = sessions + ;cookie_name = grafana_sess + ;cookie_secure = false + ;session_life_time = 86400 + + [analytics] + ;reporting_enabled = true + check_for_updates = true + ;google_analytics_ua_id = + + [security] + ;admin_user = admin + ;admin_password = + ;secret_key = + ;login_remember_days = 7 + ;cookie_username = grafana_user + ;cookie_remember_name = grafana_remember + ;disable_gravatar = false + ;data_source_proxy_whitelist = + + [snapshots] + ;external_enabled = true + ;external_snapshot_url = https://snapshots-origin.raintank.io + ;external_snapshot_name = Publish to snapshot.raintank.io + + [users] + ;allow_sign_up = true + ;allow_org_create = true + ;auto_assign_org = true + ;auto_assign_org_role = Viewer + ;login_hint = email or username + ;default_theme = dark + + [auth.anonymous] + ;enabled = false + ;org_name = Main Org. 
+ ;org_role = Viewer + + [auth.proxy] + ;enabled = false + ;header_name = X-WEBAUTH-USER + ;header_property = username + ;auto_sign_up = true + + [auth.basic] + ;enabled = true + + [auth.ldap] + ;enabled = false + ;config_file = /etc/grafana/ldap.toml + + [smtp] + ;enabled = false + ;host = localhost:25 + ;user = + ;password = + ;cert_file = + ;key_file = + ;skip_verify = false + ;from_address = admin@grafana.localhost + + [emails] + ;welcome_email_on_sign_up = false + + [log] + mode = console + level = info + + [log.console] + ;level = + ;format = console + + [event_publisher] + ;enabled = false + ;rabbitmq_url = amqp://localhost/ + ;exchange = grafana_events + + [dashboards.json] + enabled = true + path = /var/lib/grafana/dashboards + + [metrics] + ;enabled = true + ;interval_seconds = 10 + + ; [metrics.graphite] + ; address = localhost:2003 + ; prefix = prod.grafana.%(instance_name)s. + + [grafana_net] + url = https://grafana.net +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app: grafana + name: grafana-dash-provider + namespace: contour-monitoring +data: + providers.yaml: | + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: false + options: + path: /var/lib/grafana/dashboards +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app: grafana + name: grafana-datasources-provider + namespace: contour-monitoring +data: + providers.yaml: | + apiVersion: 1 + datasources: + - name: 'prometheus' + type: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + isDefault: true + editable: false \ No newline at end of file diff --git a/deployment/grafana/03-grafana-deployment.yaml b/deployment/grafana/03-grafana-deployment.yaml new file mode 100644 index 00000000000..1dc41db02cb --- /dev/null +++ b/deployment/grafana/03-grafana-deployment.yaml @@ -0,0 +1,76 @@ +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: grafana + name: grafana 
+ namespace: contour-monitoring +spec: + replicas: 1 + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:5.0.4 + imagePullPolicy: Always + env: + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana + key: grafana-admin-user + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana + key: grafana-admin-password + ports: + - containerPort: 3000 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + timeoutSeconds: 30 + resources: + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - name: config-volume + mountPath: /etc/grafana + - name: dashboards-volume + mountPath: /var/lib/grafana/dashboards + - name: storage-volume + mountPath: /var/lib/grafana + - name: dashboard-provider + mountPath: /etc/grafana/provisioning/dashboards + - name: datasources-provider + mountPath: /etc/grafana/provisioning/datasources + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: grafana-config + - name: dashboards-volume + configMap: + name: grafana-dashs + - name: storage-volume + emptyDir: {} + - name: dashboard-provider + configMap: + name: grafana-dash-provider + items: + - key: providers.yaml + path: providers.yaml + - name: datasources-provider + configMap: + name: grafana-datasources-provider + items: + - key: providers.yaml + path: providers.yaml +--- \ No newline at end of file diff --git a/deployment/grafana/03-grafana-service.yaml b/deployment/grafana/03-grafana-service.yaml new file mode 100644 index 00000000000..34759dbf953 --- /dev/null +++ b/deployment/grafana/03-grafana-service.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: grafana + name: grafana + namespace: contour-monitoring +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 3000 + selector: + app: grafana + type: "ClusterIP" +--- \ No newline at end of file diff --git 
a/deployment/prometheus/01-namespace.yaml b/deployment/prometheus/01-namespace.yaml new file mode 100644 index 00000000000..83b77c139a2 --- /dev/null +++ b/deployment/prometheus/01-namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: contour-monitoring + labels: + app: contour-monitoring \ No newline at end of file diff --git a/deployment/prometheus/02-prometheus-alertmanager-configmap.yaml b/deployment/prometheus/02-prometheus-alertmanager-configmap.yaml new file mode 100644 index 00000000000..5a71542a0b4 --- /dev/null +++ b/deployment/prometheus/02-prometheus-alertmanager-configmap.yaml @@ -0,0 +1,104 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-alertmanager + namespace: contour-monitoring +data: + alertmanager.yml: |- + global: + + ## + # tool to help visualize where your alerts are going + # https://prometheus.io/webtools/alerting/routing-tree-editor/ + ## + + # The directory from which notification templates are read. + templates: + - '/etc/alertmanager/templates/*.tmpl' + + # The root route on which each incoming alert enters. + route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + group_by: ['alertname', 'cluster', 'service'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. 
+ repeat_interval: 3h + + # A default receiver + receiver: default-receiver + + # All the above attributes are inherited by all child routes and can + # be overwritten on each. + + # The child route trees. + routes: + # This route performs a regular expression match on alert labels to + # catch alerts that are related to a list of services. + # - match_re: + # service: ^(foo1|foo2|baz)$ + # receiver: team-X-mails + # # The service has a sub-route for critical alerts, any alerts + # # that do not match, i.e. severity != critical, fall-back to the + # # parent node and are sent to 'team-X-mails' + # routes: + # - match: + # severity: critical + # receiver: team-X-pager + # - match: + # severity: critical + # receiver: critical_alert + + # - match: + # severity: warning + # receiver: slack_prometheus + + # Inhibition rules allow muting a set of alerts given that another alert is + # firing. + # We use this to mute any warning-level notifications if the same alert is + # already critical. + inhibit_rules: + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + # Apply inhibition if the cluster and service labels are the same. + equal: ['cluster', 'service'] + + + receivers: + - name: default-receiver + # # receivers for critical alerts. For example, sends to pager duty _and_ slack + # - name: critical_alert + # pagerduty_configs: + # - send_resolved: true + # service_key: '' + + # slack_configs: + # - send_resolved: true + # api_url: '' + # channel: '#' + # text: '{{ template "slack.default.text" . }}' + # title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" + + # # send to a slack channel.
This is being used by the warning severity + # - name: 'slack_prometheus' + # slack_configs: + # - send_resolved: true + # api_url: '' + # channel: '#' + # title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" + # text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" diff --git a/deployment/prometheus/02-prometheus-alertrules-configmap.yaml b/deployment/prometheus/02-prometheus-alertrules-configmap.yaml new file mode 100644 index 00000000000..8c41f2d1a0e --- /dev/null +++ b/deployment/prometheus/02-prometheus-alertrules-configmap.yaml @@ -0,0 +1,18 @@ + +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-alert-rules + namespace: contour-monitoring +data: + alert.rules: |- + groups: + # This alert can be enabled to test the Alerting pipeline. + - name: allrules + rules: + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: deadman + annotations: + description: This is a Dead Man's Switch alert meant to ensure that the Alerting pipeline is functional. diff --git a/deployment/prometheus/02-prometheus-configmap.yaml b/deployment/prometheus/02-prometheus-configmap.yaml new file mode 100644 index 00000000000..2b1011cf6e3 --- /dev/null +++ b/deployment/prometheus/02-prometheus-configmap.yaml @@ -0,0 +1,357 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus + namespace: contour-monitoring +data: + prometheus.yml: |- + # A scrape configuration for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + # + # If you are using Kubernetes 1.7.2 or earlier, please take note of the comments + # for the kubernetes-cadvisor job; you will need to edit or remove this job. + + rule_files: + - '/etc/prometheus-rules/*.rules' + + # Scrape config for API servers. 
+ # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + scrape_configs: + - job_name: 'kubernetes-apiservers' + scrape_interval: 30s + + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + # insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + # Scrape config for nodes (kubelet).
+ # + # Rather than connecting directly to the node, the scrape is proxied through the + # Kubernetes apiserver. This means it will work if Prometheus is running out of + # cluster, or can't connect to nodes for some other reason (e.g. because of + # firewalling). + - job_name: 'kubernetes-nodes' + scrape_interval: 30s + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + # Scrape config for Kubelet cAdvisor. + # + # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics + # (those whose names begin with 'container_') have been removed from the + # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to + # retrieve those metrics. + # + # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor + # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" + # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with + # the --cadvisor-port=0 Kubelet flag).
+ # + # This job is not necessary and should be removed in Kubernetes 1.6 and + # earlier versions, or it will cause the metrics to be scraped twice. + - job_name: 'kubernetes-cadvisor' + scrape_interval: 30s + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately.
+ - job_name: 'kubernetes-service-endpoints' + scrape_interval: 30s + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + + # Example scrape config for probing services via the Blackbox Exporter. 
+ # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + scrape_interval: 30s + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox-exporter.example.com:9115 + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: kubernetes_name + + # Example scrape config for probing ingresses via the Blackbox Exporter. 
+ # + # The relabeling allows the actual ingress scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe ingresses that have a value of `true` + - job_name: 'kubernetes-ingresses' + scrape_interval: 30s + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: ingress + + relabel_configs: + - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path] + regex: (.+);(.+);(.+) + replacement: ${1}://${2}${3} + target_label: __param_target + - target_label: __address__ + replacement: blackbox-exporter.example.com:9115 + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_ingress_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_ingress_name] + target_label: kubernetes_name + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared).
+ - job_name: 'kubernetes-pods' + scrape_interval: 30s + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + metric_relabel_configs: + - source_labels: [envoy_cluster_name] + regex: '(.+?)_(.+?)_(.*)' + action: replace + target_label: namespace + replacement: '$1' + - source_labels: [envoy_cluster_name] + regex: '(.+?)_(.+?)_(.*)' + action: replace + target_label: service + replacement: '$2' + - source_labels: [envoy_cluster_name] + regex: '(.+?)_(.+?)_(.*)' + action: replace + target_label: port + replacement: '$3' + + - job_name: 'envoy-statsd' + scrape_interval: 30s + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + # Only scrape pods that have the statsdport annotation set to a number + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_statsdport] + action: keep + regex: ^\d+$ + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_statsdport] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - 
source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + alerting: + alertmanagers: + # Discover alert manager instances using K8s service discovery + - kubernetes_sd_configs: + - role: pod + scheme: http + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_namespace] + regex: contour-monitoring + action: keep + - source_labels: [__meta_kubernetes_pod_label_app] + regex: prometheus + action: keep + - source_labels: [__meta_kubernetes_pod_label_component] + regex: alertmanager + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_number] + regex: + action: drop diff --git a/deployment/prometheus/03-prometheus-alertmanager-deployment.yaml b/deployment/prometheus/03-prometheus-alertmanager-deployment.yaml new file mode 100644 index 00000000000..cbf437efbb8 --- /dev/null +++ b/deployment/prometheus/03-prometheus-alertmanager-deployment.yaml @@ -0,0 +1,82 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app: prometheus + component: alertmanager + namespace: contour-monitoring + name: prometheus-alertmanager +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus + component: alertmanager + name: prometheus-alertmanager + namespace: contour-monitoring +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9093 + selector: + app: prometheus + component: alertmanager + type: ClusterIP +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + app: prometheus + component: alertmanager + name: prometheus-alertmanager + namespace: contour-monitoring +spec: + replicas: 1 + template: + metadata: + labels: + app: prometheus + component: alertmanager + spec: + serviceAccountName: prometheus-alertmanager + containers: + - name: prometheus-alertmanager + image: prom/alertmanager:v0.13.0 + 
imagePullPolicy: Always + args: + - --config.file=/etc/config/alertmanager.yml + - --storage.path=/data + - --web.external-url=/ + ports: + - containerPort: 9093 + readinessProbe: + httpGet: + path: /#/status + port: 9093 + initialDelaySeconds: 30 + timeoutSeconds: 30 + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + - name: prometheus-alertmanager-configmap-reload + image: jimmidyson/configmap-reload:v0.1 + imagePullPolicy: Always + args: + - --volume-dir=/etc/config + - --webhook-url=http://localhost:9093/-/reload + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + volumes: + - name: config-volume + configMap: + name: prometheus-alertmanager + - name: storage-volume + emptyDir: {} diff --git a/deployment/prometheus/03-prometheus-deployment.yaml b/deployment/prometheus/03-prometheus-deployment.yaml new file mode 100644 index 00000000000..f26c6c22513 --- /dev/null +++ b/deployment/prometheus/03-prometheus-deployment.yaml @@ -0,0 +1,125 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: contour-monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: contour-monitoring +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + name: prometheus + namespace: contour-monitoring +spec: + ports: + - protocol: TCP + name: 
prometheus + port: 9090 + selector: + app: prometheus + component: server +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: prometheus + namespace: contour-monitoring + labels: + app: prometheus + component: server +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + component: server + template: + metadata: + name: prometheus + labels: + app: prometheus + component: server + spec: + serviceAccountName: prometheus + containers: + - name: prometheus + image: prom/prometheus:v2.2.1 + imagePullPolicy: Always + args: + - '--storage.tsdb.retention=24h' + - '--config.file=/etc/prometheus/prometheus.yml' + ports: + - containerPort: 9090 + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: config-volume-alert-rules + mountPath: /etc/prometheus-rules + - name: cert-file + mountPath: /etc/ssl/etcd.pem + - name: key-file + mountPath: /etc/ssl/etcd-key.pem + + # blackbox exporter + # - name: blackbox + # image: quay.io/prometheus/blackbox-exporter:v0.11.0 + # args: + # - --config.file=/config/blackbox.yml + # ports: + # - containerPort: 9115 + # volumeMounts: + # - name: blackbox-config + # mountPath: /config + + volumes: + - name: config-volume + configMap: + name: prometheus + - name: cert-file + hostPath: + path: /etc/kubernetes/ssl/etcd.pem + - name: key-file + hostPath: + path: /etc/kubernetes/ssl/etcd-key.pem + - name: config-volume-alert-rules + configMap: + name: prometheus-alert-rules + #black box exporter + # - name: blackbox-config + # configMap: + # name: prometheus-blackbox \ No newline at end of file diff --git a/deployment/prometheus/03-prometheus-node-exporter.yaml b/deployment/prometheus/03-prometheus-node-exporter.yaml new file mode 100644 index 00000000000..5459e2d43b3 --- /dev/null +++ b/deployment/prometheus/03-prometheus-node-exporter.yaml @@ -0,0 +1,67 @@ +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + labels: + app: prometheus + component: node-exporter + name: 
prometheus-node-exporter + namespace: contour-monitoring +spec: + updateStrategy: + type: OnDelete + template: + metadata: + labels: + app: prometheus + component: node-exporter + spec: + containers: + - name: prometheus-node-exporter + image: prom/node-exporter:v0.15.2 + imagePullPolicy: Always + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + ports: + - name: metrics + containerPort: 9100 + hostPort: 9100 + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + hostNetwork: true + hostPID: true + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-node-exporter + namespace: contour-monitoring + annotations: + prometheus.io/scrape: "true" + labels: + app: prometheus + component: node-exporter +spec: + type: ClusterIP + clusterIP: None + ports: + - name: metrics + port: 9100 + protocol: TCP + targetPort: 9100 + selector: + app: prometheus + component: node-exporter diff --git a/docs/prometheus.md b/docs/prometheus.md index 5c5a238917e..5efe929e2ce 100644 --- a/docs/prometheus.md +++ b/docs/prometheus.md @@ -12,7 +12,7 @@ stats endpoint and nowhere else. To enable the static listener, set the `--statsd-enabled` flag. By default, Envoy's stats will be exposed over `0.0.0.0:8002` but can be overridden setting the `--stats-address` and `--stats-port` flags in Contour. -Envoy supports Prometheus-compatible `/stats/prometheus` endpoint for metrics. +Envoy supports Prometheus-compatible `/stats/prometheus` endpoint for metrics. 
## Contour Metrics @@ -31,3 +31,65 @@ Contour exposes a Prometheus-compatible `/metrics` endpoint with the following m - namespace - vhost - **contour_ingressroute_dagrebuild_timestamp (gauge):** Timestamp of the last DAG rebuild + +## Sample Deployment + +In the `/deployment` directory there are example deployment files that can be used to spin up an example environment. +The `ds-hostnet-split` deployment is configured to work with the quick start instructions below. + +### Deploy Prometheus + +A sample deployment of Prometheus and Alertmanager is provided that uses temporary storage. This deployment can be used for testing and development, but might not be suitable for all environments. + +#### Stateful Deployment + + A stateful deployment of Prometheus should use persistent storage with [Persistent Volumes and Persistent Volume Claims](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to maintain a correlation between a data volume and the Prometheus Pod. + Persistent volumes can be static or dynamic, depending on the backend storage implementation used in the environment in which the cluster is deployed. For more information, see the [Kubernetes documentation on types of persistent volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#types-of-persistent-volumes). + +#### Quick start + +```sh +# Deploy +$ kubectl apply -f deployment/prometheus +``` + +#### Access the Prometheus web UI + +```sh +$ kubectl -n contour-monitoring port-forward $(kubectl -n contour-monitoring get pods -l app=prometheus -l component=server -o jsonpath='{.items[0].metadata.name}') 9090:9090 +``` + +Then go to [http://localhost:9090](http://localhost:9090) in your browser.
+ +#### Access the Alertmanager web UI + +```sh +$ kubectl -n contour-monitoring port-forward $(kubectl -n contour-monitoring get pods -l app=prometheus -l component=alertmanager -o jsonpath='{.items[0].metadata.name}') 9093:9093 +``` + +Then go to [http://localhost:9093](http://localhost:9093) in your browser. + +### Deploy Grafana + +A sample deployment of Grafana is provided that uses temporary storage. + +#### Quick start + +```sh +# Deploy +$ kubectl apply -f deployment/grafana/ + +# Create secret with grafana credentials +$ kubectl create secret generic grafana -n contour-monitoring \ + --from-literal=grafana-admin-password=admin \ + --from-literal=grafana-admin-user=admin +``` + +#### Access the Grafana UI + +```sh +$ kubectl port-forward $(kubectl get pods -l app=grafana -n contour-monitoring -o jsonpath='{.items[0].metadata.name}') 3000 -n contour-monitoring +``` + +Then go to [http://localhost:3000](http://localhost:3000) in your browser. +The username and password are the ones you set when creating the Grafana secret in the previous step.