Skip to content

Commit 6d7bd60

Browse files
committed
Add etcd metrics, Prometheus scrapes, and Grafana dash
* Use etcd v3.3 --listen-metrics-urls to expose only metrics data via http://0.0.0.0:2381 on controllers
* Hold off on allowing workers firewall access. Move Prometheus to a controller node for a while.
* Add Prometheus config to scrape etcd members. On Typhoon clusters, etcd runs across controllers (i.e. 3 controllers == 3 etcd).
1 parent b1e41dc commit 6d7bd60

File tree

8 files changed

+52
-2
lines changed

8 files changed

+52
-2
lines changed

addons/prometheus/config.yaml

+20
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,26 @@ data:
112112
target_label: __metrics_path__
113113
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
114114

115+
# Scrape etcd metrics from controllers
116+
- job_name: 'etcd'
117+
kubernetes_sd_configs:
118+
- role: node
119+
scheme: http
120+
tls_config:
121+
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
122+
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
123+
124+
relabel_configs:
125+
- source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller]
126+
action: keep
127+
regex: 'true'
128+
- action: labelmap
129+
regex: __meta_kubernetes_node_label_(.+)
130+
- source_labels: [__meta_kubernetes_node_name]
131+
action: replace
132+
target_label: __address__
133+
replacement: '${1}:2381'
134+
115135
# Scrape config for service endpoints.
116136
#
117137
# The relabeling allows the actual service scrape endpoint to be configured

addons/prometheus/deployment.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ spec:
1515
name: prometheus
1616
phase: prod
1717
spec:
18+
nodeSelector:
19+
node-role.kubernetes.io/master: ""
20+
tolerations:
21+
- key: node-role.kubernetes.io/master
22+
operator: Exists
23+
effect: NoSchedule
1824
serviceAccountName: prometheus
1925
containers:
2026
- name: prometheus

aws/container-linux/kubernetes/cl/controller.yaml.tmpl

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ systemd:
1313
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
1414
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
1515
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
16+
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
1617
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
1718
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
1819
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

aws/container-linux/kubernetes/security.tf

+20
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
8181
source_security_group_id = "${aws_security_group.worker.id}"
8282
}
8383

84+
resource "aws_security_group_rule" "controller-node-exporter-self" {
85+
security_group_id = "${aws_security_group.controller.id}"
86+
87+
type = "ingress"
88+
protocol = "tcp"
89+
from_port = 9100
90+
to_port = 9100
91+
self = true
92+
}
93+
8494
resource "aws_security_group_rule" "controller-kubelet-self" {
8595
security_group_id = "${aws_security_group.controller.id}"
8696

@@ -256,6 +266,16 @@ resource "aws_security_group_rule" "worker-flannel-self" {
256266
resource "aws_security_group_rule" "worker-node-exporter" {
257267
security_group_id = "${aws_security_group.worker.id}"
258268

269+
type = "ingress"
270+
protocol = "tcp"
271+
from_port = 9100
272+
to_port = 9100
273+
source_security_group_id = "${aws_security_group.controller.id}"
274+
}
275+
276+
resource "aws_security_group_rule" "worker-node-exporter-self" {
277+
security_group_id = "${aws_security_group.worker.id}"
278+
259279
type = "ingress"
260280
protocol = "tcp"
261281
from_port = 9100

bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ systemd:
1313
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380"
1414
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
1515
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
16+
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
1617
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
1718
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
1819
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ systemd:
1313
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
1414
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
1515
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
16+
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
1617
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
1718
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
1819
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ systemd:
1313
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
1414
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
1515
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
16+
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
1617
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
1718
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
1819
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

google-cloud/container-linux/kubernetes/network.tf

+2-2
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ resource "google_compute_firewall" "internal-flannel" {
9393
target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
9494
}
9595

96-
# Allow prometheus (workload) to scrape node-exporter daemonset
96+
# Allow Prometheus to scrape node-exporter daemonset
9797
resource "google_compute_firewall" "internal-node-exporter" {
9898
name = "${var.cluster_name}-internal-node-exporter"
9999
network = "${google_compute_network.network.name}"
@@ -103,7 +103,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
103103
ports = [9100]
104104
}
105105

106-
source_tags = ["${var.cluster_name}-worker"]
106+
source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
107107
target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
108108
}
109109

0 commit comments

Comments
 (0)