From 79493cd2539a0a5998845bb6c686260045ff0ee3 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Wed, 28 Mar 2018 21:45:24 -0700 Subject: [PATCH] Add etcd metrics, Prometheus scrapes, and Grafana dash * Use etcd v3.3 --listen-metrics-urls to expose only metrics data via http://0.0.0.0:2381 on controllers * Add Prometheus config to scrape etcd members. On Typhoon clusters, etcd runs across controllers (i.e. 3 controllers == 3 etcd). * Hold off on allowing workers firewall access. Move Prometheus to a controller node for a while. --- CHANGES.md | 4 ++++ addons/prometheus/config.yaml | 16 +++++++++++++++ addons/prometheus/deployment.yaml | 6 ++++++ .../kubernetes/cl/controller.yaml.tmpl | 1 + aws/container-linux/kubernetes/security.tf | 20 +++++++++++++++++++ .../kubernetes/cl/controller.yaml.tmpl | 1 + .../kubernetes/cl/controller.yaml.tmpl | 1 + .../controllers/cl/controller.yaml.tmpl | 1 + .../container-linux/kubernetes/network.tf | 4 ++-- 9 files changed, 52 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 6888ddd64..500907857 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ Notable changes between versions. ## Latest +* Enable etcd v3.3 endpoint for metrics scraping * Remove unused, unmaintained `pxe-worker` internal module #### Digital Ocean @@ -20,6 +21,9 @@ Notable changes between versions. 
#### Addons +* Add Prometheus discovery for etcd peers on controller nodes + * Scrape etcd v3.3 `--listen-metrics-urls` for metrics + * Enables etcd related alerts and populates the etcd Grafana dashboard * Update Grafana from v4.6.3 to v5.0.4 ([#153](https://github.com/poseidon/typhoon/pull/153), [#174](https://github.com/poseidon/typhoon/pull/174)) * Restrict dashboard organization role to Viewer diff --git a/addons/prometheus/config.yaml b/addons/prometheus/config.yaml index ec04e7727..9832d1fde 100644 --- a/addons/prometheus/config.yaml +++ b/addons/prometheus/config.yaml @@ -112,6 +112,22 @@ data: target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + # Scrape etcd metrics from controllers + - job_name: 'etcd' + kubernetes_sd_configs: + - role: node + scheme: http + relabel_configs: + - source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller] + action: keep + regex: 'true' + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - source_labels: [__meta_kubernetes_node_name] + action: replace + target_label: __address__ + replacement: '${1}:2381' + # Scrape config for service endpoints. 
# # The relabeling allows the actual service scrape endpoint to be configured diff --git a/addons/prometheus/deployment.yaml b/addons/prometheus/deployment.yaml index 82c6981d7..f416de4ae 100644 --- a/addons/prometheus/deployment.yaml +++ b/addons/prometheus/deployment.yaml @@ -15,6 +15,12 @@ spec: name: prometheus phase: prod spec: + nodeSelector: + node-role.kubernetes.io/master: "" + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule serviceAccountName: prometheus containers: - name: prometheus diff --git a/aws/container-linux/kubernetes/cl/controller.yaml.tmpl b/aws/container-linux/kubernetes/cl/controller.yaml.tmpl index e9d6c287f..179a1b9ac 100644 --- a/aws/container-linux/kubernetes/cl/controller.yaml.tmpl +++ b/aws/container-linux/kubernetes/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/aws/container-linux/kubernetes/security.tf b/aws/container-linux/kubernetes/security.tf index 8c71da6b1..79fa1cc72 100644 --- a/aws/container-linux/kubernetes/security.tf +++ b/aws/container-linux/kubernetes/security.tf @@ -81,6 +81,16 @@ resource "aws_security_group_rule" "controller-node-exporter" { source_security_group_id = "${aws_security_group.worker.id}" } +resource "aws_security_group_rule" "controller-node-exporter-self" { + security_group_id = "${aws_security_group.controller.id}" + + type = "ingress" + protocol = "tcp" + from_port = 9100 + to_port = 9100 + self = true +} + resource "aws_security_group_rule" "controller-kubelet-self" { security_group_id = "${aws_security_group.controller.id}" @@ -256,6 +266,16 @@ 
resource "aws_security_group_rule" "worker-flannel-self" { resource "aws_security_group_rule" "worker-node-exporter" { security_group_id = "${aws_security_group.worker.id}" + type = "ingress" + protocol = "tcp" + from_port = 9100 + to_port = 9100 + source_security_group_id = "${aws_security_group.controller.id}" +} + +resource "aws_security_group_rule" "worker-node-exporter-self" { + security_group_id = "${aws_security_group.worker.id}" + type = "ingress" protocol = "tcp" from_port = 9100 diff --git a/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl b/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl index f912f6f70..0051ab3f0 100644 --- a/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl +++ b/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl b/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl index a03e0b031..1cdd38214 100644 --- a/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl +++ b/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git 
a/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl b/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl index d9f5a013e..03f83dc13 100644 --- a/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl +++ b/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl @@ -13,6 +13,7 @@ systemd: Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380" Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379" Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380" + Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381" Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}" Environment="ETCD_STRICT_RECONFIG_CHECK=true" Environment="ETCD_SSL_DIR=/etc/ssl/etcd" diff --git a/google-cloud/container-linux/kubernetes/network.tf b/google-cloud/container-linux/kubernetes/network.tf index 228b2b076..74b07ab34 100644 --- a/google-cloud/container-linux/kubernetes/network.tf +++ b/google-cloud/container-linux/kubernetes/network.tf @@ -93,7 +93,7 @@ resource "google_compute_firewall" "internal-flannel" { target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"] } -# Allow prometheus (workload) to scrape node-exporter daemonset +# Allow Prometheus to scrape node-exporter daemonset resource "google_compute_firewall" "internal-node-exporter" { name = "${var.cluster_name}-internal-node-exporter" network = "${google_compute_network.network.name}" @@ -103,7 +103,7 @@ resource "google_compute_firewall" "internal-node-exporter" { ports = [9100] } - source_tags = ["${var.cluster_name}-worker"] + source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"] target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"] }