From e15a3e6d7326e39cd5954d421d6259130136b68d Mon Sep 17 00:00:00 2001 From: Kobi Samoray Date: Tue, 31 Dec 2019 17:55:17 +0200 Subject: [PATCH] Antrea Prometheus integration (#236) Integrate with Prometheus monitoring solution. Integration of the Prometheus client into Antrea controller and agent allows the exposure of various metrics to Prometheus server. In addition to Antrea's own set of metrics, Prometheus client will also expose metrics which are defined by various components which are part of the Antrea ecosystem, e.g golang, Prometheus itself etc. --- build/yamls/antrea-ipsec.yml | 35 +++++- build/yamls/antrea.yml | 35 +++++- build/yamls/base/agent.yml | 4 + build/yamls/base/conf/antrea-agent.conf | 9 ++ build/yamls/base/conf/antrea-controller.conf | 8 ++ build/yamls/base/controller.yml | 4 + cmd/antrea-agent/agent.go | 8 ++ cmd/antrea-agent/config.go | 9 ++ cmd/antrea-controller/config.go | 9 ++ cmd/antrea-controller/controller.go | 51 +++++++- go.mod | 1 + pkg/agent/metrics/prometheus.go | 119 +++++++++++++++++++ 12 files changed, 279 insertions(+), 13 deletions(-) create mode 100644 pkg/agent/metrics/prometheus.go diff --git a/build/yamls/antrea-ipsec.yml b/build/yamls/antrea-ipsec.yml index e73c2d8e0e5..e67740b936e 100644 --- a/build/yamls/antrea-ipsec.yml +++ b/build/yamls/antrea-ipsec.yml @@ -239,7 +239,7 @@ subjects: --- apiVersion: v1 data: - antrea-agent.conf: | + antrea-agent.conf: |- # Name of the OpenVSwitch bridge antrea-agent will create and use. # Make sure it doesn't conflict with your existing OpenVSwitch bridges. #ovsBridge: br-int @@ -281,6 +281,15 @@ data: # Underlying network must be capable of supporting Pod traffic across IP subnet. # hybrid: noEncap if worker Nodes on same subnet, otherwise encap. #trafficEncapMode: encap + + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. 
+ #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false antrea-cni.conf: | { "cniVersion":"0.3.0", @@ -290,13 +299,21 @@ data: "type": "host-local" } } - antrea-controller.conf: "" + antrea-controller.conf: |- + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false kind: ConfigMap metadata: annotations: {} labels: app: antrea - name: antrea-config-ghc6hct4mg + name: antrea-config-ffmk8fd5g4 namespace: kube-system --- apiVersion: v1 @@ -311,6 +328,10 @@ type: Opaque apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea name: antrea @@ -398,7 +419,7 @@ spec: key: node-role.kubernetes.io/master volumes: - configMap: - name: antrea-config-ghc6hct4mg + name: antrea-config-ffmk8fd5g4 name: antrea-config --- apiVersion: apiregistration.k8s.io/v1 @@ -432,6 +453,10 @@ spec: component: antrea-agent template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea component: antrea-agent @@ -598,7 +623,7 @@ spec: operator: Exists volumes: - configMap: - name: antrea-config-ghc6hct4mg + name: antrea-config-ffmk8fd5g4 name: antrea-config - hostPath: path: /etc/cni/net.d diff --git a/build/yamls/antrea.yml b/build/yamls/antrea.yml index f038ba52598..019932c9fad 100644 --- a/build/yamls/antrea.yml +++ b/build/yamls/antrea.yml @@ -239,7 +239,7 @@ subjects: --- apiVersion: v1 data: - antrea-agent.conf: | + antrea-agent.conf: |- # Name of the OpenVSwitch bridge antrea-agent will create and use. # Make sure it doesn't conflict with your existing OpenVSwitch bridges. 
#ovsBridge: br-int @@ -281,6 +281,15 @@ data: # Underlying network must be capable of supporting Pod traffic across IP subnet. # hybrid: noEncap if worker Nodes on same subnet, otherwise encap. #trafficEncapMode: encap + + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false antrea-cni.conf: | { "cniVersion":"0.3.0", @@ -290,18 +299,30 @@ data: "type": "host-local" } } - antrea-controller.conf: "" + antrea-controller.conf: |- + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false kind: ConfigMap metadata: annotations: {} labels: app: antrea - name: antrea-config-4gb24b784b + name: antrea-config-k49g4578m4 namespace: kube-system --- apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea name: antrea @@ -389,7 +410,7 @@ spec: key: node-role.kubernetes.io/master volumes: - configMap: - name: antrea-config-4gb24b784b + name: antrea-config-k49g4578m4 name: antrea-config --- apiVersion: apiregistration.k8s.io/v1 @@ -423,6 +444,10 @@ spec: component: antrea-agent template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea component: antrea-agent @@ -557,7 +582,7 @@ spec: operator: Exists volumes: - configMap: - name: antrea-config-4gb24b784b + name: antrea-config-k49g4578m4 name: antrea-config - hostPath: path: /etc/cni/net.d diff --git a/build/yamls/base/agent.yml b/build/yamls/base/agent.yml 
index f1bf396db89..612e35b0169 100644 --- a/build/yamls/base/agent.yml +++ b/build/yamls/base/agent.yml @@ -13,6 +13,10 @@ spec: type: RollingUpdate template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scrape: "true" + prometheus.io/scheme: "https" labels: component: antrea-agent spec: diff --git a/build/yamls/base/conf/antrea-agent.conf b/build/yamls/base/conf/antrea-agent.conf index 3637b9dc0b3..843069af7d3 100644 --- a/build/yamls/base/conf/antrea-agent.conf +++ b/build/yamls/base/conf/antrea-agent.conf @@ -39,3 +39,12 @@ # Underlying network must be capable of supporting Pod traffic across IP subnet. # hybrid: noEncap if worker Nodes on same subnet, otherwise encap. #trafficEncapMode: encap + +# Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener +#enablePrometheusMetrics: false + +# Enable golang metrics exposure via Prometheus. +#enablePrometheusGoMetrics: false + +# Enable process metrics exposure via Prometheus. +#enablePrometheusProcessMetrics: false \ No newline at end of file diff --git a/build/yamls/base/conf/antrea-controller.conf b/build/yamls/base/conf/antrea-controller.conf index e69de29bb2d..454f0975e4c 100644 --- a/build/yamls/base/conf/antrea-controller.conf +++ b/build/yamls/base/conf/antrea-controller.conf @@ -0,0 +1,8 @@ +# Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. +#enablePrometheusMetrics: false + +# Enable golang metrics exposure via Prometheus. +#enablePrometheusGoMetrics: false + +# Enable process metrics exposure via Prometheus. 
+#enablePrometheusProcessMetrics: false \ No newline at end of file diff --git a/build/yamls/base/controller.yml b/build/yamls/base/controller.yml index d2b09612100..f19cc2a64c6 100644 --- a/build/yamls/base/controller.yml +++ b/build/yamls/base/controller.yml @@ -2,6 +2,10 @@ apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scrape: "true" + prometheus.io/scheme: "https" name: antrea spec: ports: diff --git a/cmd/antrea-agent/agent.go b/cmd/antrea-agent/agent.go index 75dc375cc66..7b5cc6dc1b1 100644 --- a/cmd/antrea-agent/agent.go +++ b/cmd/antrea-agent/agent.go @@ -30,6 +30,7 @@ import ( "github.com/vmware-tanzu/antrea/pkg/agent/controller/networkpolicy" "github.com/vmware-tanzu/antrea/pkg/agent/controller/noderoute" "github.com/vmware-tanzu/antrea/pkg/agent/interfacestore" + "github.com/vmware-tanzu/antrea/pkg/agent/metrics" "github.com/vmware-tanzu/antrea/pkg/agent/openflow" "github.com/vmware-tanzu/antrea/pkg/agent/route" "github.com/vmware-tanzu/antrea/pkg/apis/networking/v1beta1" @@ -146,6 +147,13 @@ func run(o *Options) error { go networkPolicyController.Run(stopCh) + if o.config.EnablePrometheusMetrics { + go metrics.StartListener( + o.config.EnablePrometheusGoMetrics, + o.config.EnablePrometheusProcessMetrics, + o.config.OVSBridge, ifaceStore, ofClient) + } + agentMonitor := monitor.NewAgentMonitor( crdClient, o.config.OVSBridge, diff --git a/cmd/antrea-agent/config.go b/cmd/antrea-agent/config.go index fa10d0a72c7..7d157471d59 100644 --- a/cmd/antrea-agent/config.go +++ b/cmd/antrea-agent/config.go @@ -71,4 +71,13 @@ type AgentConfig struct { // Underlying network must be capable of supporting Pod traffic across IP subnet. // Hybrid: noEncap if worker Nodes on same subnet, otherwise encap. TrafficEncapMode string `yaml:"trafficEncapMode,omitempty"` + // Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + // Defaults to false. 
+ EnablePrometheusMetrics bool `yaml:"enablePrometheusMetrics,omitempty"` + // Enable golang metrics exposure via Prometheus + // Defaults to false. + EnablePrometheusGoMetrics bool `yaml:"enablePrometheusGoMetrics,omitempty"` + // Enable process metrics exposure via Prometheus + // Defaults to false. + EnablePrometheusProcessMetrics bool `yaml:"enablePrometheusProcessMetrics,omitempty"` } diff --git a/cmd/antrea-controller/config.go b/cmd/antrea-controller/config.go index aa4b57b5658..9dbe16344c9 100644 --- a/cmd/antrea-controller/config.go +++ b/cmd/antrea-controller/config.go @@ -22,4 +22,13 @@ type ControllerConfig struct { // clientConnection specifies the kubeconfig file and client connection settings for the agent // to communicate with the apiserver. ClientConnection componentbaseconfig.ClientConnectionConfiguration `yaml:"clientConnection"` + // Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + // Defaults to false. + EnablePrometheusMetrics bool `yaml:"enablePrometheusMetrics,omitempty"` + // Enable golang metrics exposure via Prometheus + // Defaults to false. + EnablePrometheusGoMetrics bool `yaml:"enablePrometheusGoMetrics,omitempty"` + // Enable process metrics exposure via Prometheus + // Defaults to false. 
+	EnablePrometheusProcessMetrics bool `yaml:"enablePrometheusProcessMetrics,omitempty"`
 }
diff --git a/cmd/antrea-controller/controller.go b/cmd/antrea-controller/controller.go
index 32d1d1e0eb8..d817910bbdc 100644
--- a/cmd/antrea-controller/controller.go
+++ b/cmd/antrea-controller/controller.go
@@ -17,8 +17,12 @@ package main
 import (
 	"fmt"
 	"net"
+	"net/http"
+	"os"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	genericopenapi "k8s.io/apiserver/pkg/endpoints/openapi"
 	genericapiserver "k8s.io/apiserver/pkg/server"
 	genericoptions "k8s.io/apiserver/pkg/server/options"
@@ -71,7 +75,8 @@ func run(o *Options) error {
 	apiServerConfig, err := createAPIServerConfig(o.config.ClientConnection.Kubeconfig,
 		addressGroupStore,
 		appliedToGroupStore,
-		networkPolicyStore)
+		networkPolicyStore,
+		o.config.EnablePrometheusMetrics)
 	if err != nil {
 		return fmt.Errorf("error creating API server config: %v", err)
 	}
@@ -94,22 +99,68 @@ func run(o *Options) error {
 
 	go apiServer.GenericAPIServer.PrepareRun().Run(stopCh)
 
+	if o.config.EnablePrometheusMetrics {
+		go createPrometheusMetricsListener(
+			o.config.EnablePrometheusGoMetrics,
+			o.config.EnablePrometheusProcessMetrics)
+	}
+
 	<-stopCh
 	klog.Info("Stopping Antrea controller")
 	return nil
 }
 
+// createPrometheusMetricsListener registers the controller host gauge with the
+// default Prometheus registry and installs the Prometheus handler at /metrics
+// on the default HTTP mux. The Go runtime and process collectors are
+// unregistered unless the corresponding flag is true.
+func createPrometheusMetricsListener(
+	enablePrometheusGoMetrics bool,
+	enablePrometheusProcessMetrics bool) {
+	hostname, err := os.Hostname()
+	if err != nil {
+		klog.Errorf("Failed to retrieve controller node name, %v", err)
+	}
+
+	klog.Info("Initializing prometheus")
+	gaugeHost := prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "antrea_controller_host",
+		Help: "Antrea controller hostname (as a label), typically used in grouping/aggregating stats; " +
+			"the label defaults to the hostname of the host but can be overridden by configuration. " +
+			"The value of the gauge is always set to 1.",
+		ConstLabels: prometheus.Labels{"host": hostname},
+	})
+	gaugeHost.Set(1)
+	prometheus.MustRegister(gaugeHost)
+	http.Handle("/metrics", promhttp.Handler())
+
+	// The default registry ships with the Go and process collectors already
+	// registered. Unregister matches collectors by their descriptors, so a
+	// freshly constructed instance removes the pre-registered one.
+	if !enablePrometheusGoMetrics {
+		klog.Info("Golang metrics are disabled")
+		prometheus.Unregister(prometheus.NewGoCollector())
+	}
+	if !enablePrometheusProcessMetrics {
+		klog.Info("Process metrics are disabled")
+		prometheus.Unregister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
+	}
+}
+
 func createAPIServerConfig(kubeconfig string,
 	addressGroupStore storage.Interface,
 	appliedToGroupStore storage.Interface,
-	networkPolicyStore storage.Interface) (*apiserver.Config, error) {
+	networkPolicyStore storage.Interface,
+	enablePrometheusMetrics bool) (*apiserver.Config, error) {
 	// TODO:
 	// 1. Support user-provided certificate.
 	// 2. Support configurable https port.
 	secureServing := genericoptions.NewSecureServingOptions().WithLoopback()
 	authentication := genericoptions.NewDelegatingAuthenticationOptions()
 	authorization := genericoptions.NewDelegatingAuthorizationOptions()
-
+	if enablePrometheusMetrics {
+		authorization.WithAlwaysAllowPaths("/metrics")
+	}
 	// Set the PairName but leave certificate directory blank to generate in-memory by default
 	secureServing.ServerCert.CertDirectory = ""
 	secureServing.ServerCert.PairName = "antrea-apiserver"
diff --git a/go.mod b/go.mod
index 408ca81bf92..977fcc7ca9f 100644
--- a/go.mod
+++ b/go.mod
@@ -25,6 +25,7 @@ require (
 	github.com/imdario/mergo v0.3.7 // indirect
 	github.com/j-keck/arping v1.0.0
 	github.com/kevinburke/ssh_config v0.0.0-20190725054713-01f96b0aa0cd
+	github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829
 	github.com/satori/go.uuid v1.2.0
 	github.com/spf13/cobra v0.0.5
 	github.com/spf13/pflag v1.0.3
diff --git a/pkg/agent/metrics/prometheus.go b/pkg/agent/metrics/prometheus.go
new file mode 100644
index 00000000000..ba17bb818a2
--- /dev/null
+++ 
b/pkg/agent/metrics/prometheus.go
@@ -0,0 +1,142 @@
+// Copyright 2020 Antrea Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package metrics exposes Antrea agent metrics through the Prometheus client.
+package metrics
+
+import (
+	"net/http"
+	"os"
+	"strconv"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"k8s.io/klog"
+
+	"github.com/vmware-tanzu/antrea/pkg/agent/interfacestore"
+	"github.com/vmware-tanzu/antrea/pkg/agent/openflow"
+)
+
+// OVSStatManager is a Prometheus collector that reports the number of flows
+// in each OVS flow table of a bridge.
+type OVSStatManager struct {
+	ofClient     openflow.Client
+	OVSBridge    string
+	OVSTableDesc *prometheus.Desc
+}
+
+// Compile-time check that OVSStatManager implements prometheus.Collector.
+var _ prometheus.Collector = (*OVSStatManager)(nil)
+
+// OVSGetStatistics returns the current flow count of every flow table reported
+// by the OpenFlow client, keyed by the decimal table ID.
+func (c *OVSStatManager) OVSGetStatistics() map[string]float64 {
+	flowTableStatus := c.ofClient.GetFlowTableStatus()
+	ovsFlowsByTable := make(map[string]float64, len(flowTableStatus))
+	for _, tableStatus := range flowTableStatus {
+		ovsFlowsByTable[strconv.Itoa(int(tableStatus.ID))] = float64(tableStatus.FlowCount)
+	}
+	return ovsFlowsByTable
+}
+
+// Describe implements prometheus.Collector.
+func (c *OVSStatManager) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.OVSTableDesc
+}
+
+// Collect implements prometheus.Collector. It emits one gauge sample per flow
+// table, labelled with the table ID.
+func (c *OVSStatManager) Collect(ch chan<- prometheus.Metric) {
+	for tableID, tableFlowCount := range c.OVSGetStatistics() {
+		ch <- prometheus.MustNewConstMetric(
+			c.OVSTableDesc,
+			prometheus.GaugeValue,
+			tableFlowCount,
+			tableID,
+		)
+	}
+}
+
+// NewOVSStatManager returns an OVSStatManager that reads flow table status
+// from ofClient and labels its metric with the given bridge name.
+func NewOVSStatManager(ovsBridge string, ofClient openflow.Client) *OVSStatManager {
+	return &OVSStatManager{
+		ofClient:  ofClient,
+		OVSBridge: ovsBridge,
+		OVSTableDesc: prometheus.NewDesc(
+			"antrea_agent_ovs_flow_table",
+			"OVS flow table flow count.",
+			[]string{"table_id"},
+			prometheus.Labels{"bridge": ovsBridge},
+		),
+	}
+}
+
+// StartListener registers the agent's Prometheus collectors (local Pod count,
+// host gauge, OVS flow table statistics) with the default registry and
+// installs the Prometheus handler at /metrics on the default HTTP mux. The Go
+// runtime and process collectors are unregistered unless the corresponding
+// flag is true.
+//
+// NOTE(review): no HTTP server is started here; confirm that something else
+// serves http.DefaultServeMux on the port advertised to Prometheus.
+func StartListener(
+	enablePrometheusGoMetrics bool,
+	enablePrometheusProcessMetrics bool,
+	ovsBridge string,
+	ifaceStore interfacestore.InterfaceStore,
+	ofClient openflow.Client) {
+	hostname, err := os.Hostname()
+	if err != nil {
+		// Not fatal: continue and export the host gauge with whatever name was returned.
+		klog.Errorf("Failed to retrieve agent node name, %v", err)
+	}
+
+	klog.Info("Binding antrea_agent_local_pod_count")
+	if err := prometheus.Register(prometheus.NewGaugeFunc(
+		prometheus.GaugeOpts{
+			Name: "antrea_agent_local_pod_count",
+			Help: "Number of pods on local node.",
+		},
+		func() float64 { return float64(ifaceStore.GetContainerInterfaceNum()) },
+	)); err != nil {
+		klog.Errorf("Failed to register antrea_agent_local_pod_count with Prometheus: %v", err)
+	}
+
+	gaugeHost := prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "antrea_agent_host",
+		Help: "Antrea agent hostname (as a label), typically used in grouping/aggregating stats; " +
+			"the label defaults to the hostname of the host but can be overridden by configuration. " +
+			"The value of the gauge is always set to 1.",
+		ConstLabels: prometheus.Labels{"host": hostname},
+	})
+	gaugeHost.Set(1)
+	prometheus.MustRegister(gaugeHost)
+	http.Handle("/metrics", promhttp.Handler())
+
+	ovsStats := NewOVSStatManager(ovsBridge, ofClient)
+	prometheus.MustRegister(ovsStats)
+
+	// The default registry ships with the Go and process collectors already
+	// registered. Unregister matches collectors by their descriptors, so a
+	// freshly constructed instance removes the pre-registered one.
+	if !enablePrometheusGoMetrics {
+		klog.Info("Golang metrics are disabled")
+		prometheus.Unregister(prometheus.NewGoCollector())
+	}
+	if !enablePrometheusProcessMetrics {
+		klog.Info("Process metrics are disabled")
+		prometheus.Unregister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
+	}
+}