Skip to content

Commit

Permalink
Roffe/metrics polish (#595)
Browse files Browse the repository at this point in the history
* update metrics docs & dashboard
* renamed `namespace` label to `svc_namespace` for service metrics as it would be overwritten by most Prometheus setups
* Made histograms for all the controller sync times for better visualization
* added `controller_routes_sync_time`, `controller_bgp_advertisements_sent` & `controller_policy_chains_sync_time` metrics
  • Loading branch information
roffe authored Dec 7, 2018
1 parent 0cdaa43 commit e5d599b
Show file tree
Hide file tree
Showing 8 changed files with 1,639 additions and 1,389 deletions.
Binary file modified dashboard/dashboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2,875 changes: 1,533 additions & 1,342 deletions dashboard/kube-router.json

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The default values unless other specified is
By enabling [Kubernetes SD](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config) in the Prometheus configuration & adding the required annotations, Prometheus can automatically discover & scrape kube-router metrics

## Version notes
kube-router v0.2.4 received a metrics overhaul where some metrics were changed into histograms and additional metrics were added. Please make sure you are using the latest dashboard version with versions >= v0.2.4

kube-router 0.1.0-rc2 and upwards supports runtime configuration for controlling where to expose the metrics. If you are using an older version, the metrics path & port are locked to `/metrics` & `8080`

## Supported annotations
Expand Down Expand Up @@ -56,14 +58,20 @@ The following metrics is exposed by kube-router prefixed by `kube_router_`
* controller_bgp_peers
Number of BGP peers of the instance
* controller_bgp_advertisements_received
Number of total BGP advertisements received since kube-router start
Total number of BGP advertisements received since kube-router started
* controller_bgp_advertisements_sent
Total number of BGP advertisements sent since kube-router started
* controller_bgp_internal_peers_sync_time
Time it took for the BGP internal peer sync loop to complete
* controller_routes_sync_time
Time it took for controller to sync routes

### run-firewall=true

* controller_iptables_sync_time
Time it took for the iptables sync loop to complete
* controller_policy_chains_sync_time
Time it took for controller to sync policy chains

### run-service-proxy = true

Expand Down Expand Up @@ -95,7 +103,7 @@ The following metrics is exposed by kube-router prefixed by `kube_router_`
Outgoing bytes per second

To get a grouped list of CPS for each service, a Prometheus query could look like this:
`sum(kube_router_service_cps) by (namespace, service_name)`
`sum(kube_router_service_cps) by (svc_namespace, service_name)`

## Grafana Dashboard

Expand Down
12 changes: 9 additions & 3 deletions pkg/controllers/netpol/network_policy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"net"
"regexp"
"strconv"
"strings"
"sync"
Expand All @@ -26,7 +27,6 @@ import (
"k8s.io/client-go/kubernetes"
listers "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"regexp"
)

const (
Expand Down Expand Up @@ -215,7 +215,7 @@ func (npc *NetworkPolicyController) Sync() error {
defer func() {
endTime := time.Since(start)
if npc.MetricsEnabled {
metrics.ControllerIptablesSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
metrics.ControllerIptablesSyncTime.Observe(endTime.Seconds())
}
glog.V(1).Infof("sync iptables took %v", endTime)
}()
Expand Down Expand Up @@ -258,7 +258,12 @@ func (npc *NetworkPolicyController) Sync() error {
// policyspec is evaluated to set of matching pods, which are grouped in to a
// ipset used for source ip addr matching.
func (npc *NetworkPolicyController) syncNetworkPolicyChains(version string) (map[string]bool, map[string]bool, error) {

start := time.Now()
defer func() {
endTime := time.Since(start)
metrics.ControllerPolicyChainsSyncTime.Observe(endTime.Seconds())
glog.V(2).Infof("Syncing network policy chains took %v", endTime)
}()
activePolicyChains := make(map[string]bool)
activePolicyIpSets := make(map[string]bool)

Expand Down Expand Up @@ -1536,6 +1541,7 @@ func NewNetworkPolicyController(clientset kubernetes.Interface,
if config.MetricsEnabled {
//Register the metrics for this controller
prometheus.MustRegister(metrics.ControllerIptablesSyncTime)
prometheus.MustRegister(metrics.ControllerPolicyChainsSyncTime)
npc.MetricsEnabled = true
}

Expand Down
8 changes: 5 additions & 3 deletions pkg/controllers/proxy/network_services_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,9 @@ func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoM
defer func() {
endTime := time.Since(start)
glog.V(2).Infof("Publishing IPVS metrics took %v", endTime)
metrics.ControllerIpvsMetricsExportTime.WithLabelValues().Set(float64(endTime.Seconds()))
if nsc.MetricsEnabled {
metrics.ControllerIpvsMetricsExportTime.Observe(float64(endTime.Seconds()))
}
}()

ipvsSvcs, err := nsc.ln.ipvsGetServices()
Expand Down Expand Up @@ -429,7 +431,7 @@ func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoM
metrics.ServicePpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSIn))
metrics.ServicePpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSOut))
metrics.ServiceTotalConn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.Connections))
metrics.ControllerIpvsServices.WithLabelValues().Set(float64(len(ipvsSvcs)))
metrics.ControllerIpvsServices.Set(float64(len(ipvsSvcs)))
}
}
}
Expand Down Expand Up @@ -528,7 +530,7 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf
defer func() {
endTime := time.Since(start)
if nsc.MetricsEnabled {
metrics.ControllerIpvsServicesSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
metrics.ControllerIpvsServicesSyncTime.Observe(endTime.Seconds())
}
glog.V(1).Infof("sync ipvs services took %v", endTime)
}()
Expand Down
9 changes: 6 additions & 3 deletions pkg/controllers/routing/bgp_peers.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ func (nrc *NetworkRoutingController) syncInternalPeers() {
start := time.Now()
defer func() {
endTime := time.Since(start)
metrics.ControllerBGPInternalPeersSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
if nrc.MetricsEnabled {
metrics.ControllerBGPInternalPeersSyncTime.Observe(endTime.Seconds())
}
glog.V(2).Infof("Syncing BGP peers for the node took %v", endTime)
}()

Expand All @@ -40,8 +42,9 @@ func (nrc *NetworkRoutingController) syncInternalPeers() {
glog.Errorf("Failed to list nodes from API server due to: %s. Can not perform BGP peer sync", err.Error())
return
}

metrics.ControllerBPGpeers.WithLabelValues().Set(float64(len(nodes.Items)))
if nrc.MetricsEnabled {
metrics.ControllerBPGpeers.Set(float64(len(nodes.Items)))
}
// establish peer and add Pod CIDRs with current set of nodes
currentNodes := make([]string, 0)
for _, node := range nodes.Items {
Expand Down
12 changes: 11 additions & 1 deletion pkg/controllers/routing/network_routes_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() {
case *gobgp.WatchEventBestPath:
glog.V(3).Info("Processing bgp route advertisement from peer")
if nrc.MetricsEnabled {
metrics.ControllerBGPadvertisementsReceived.WithLabelValues().Add(float64(1))
metrics.ControllerBGPadvertisementsReceived.Inc()
}
for _, path := range msg.PathList {
if path.IsLocal() {
Expand All @@ -342,6 +342,9 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() {
}

func (nrc *NetworkRoutingController) advertisePodRoute() error {
if nrc.MetricsEnabled {
metrics.ControllerBGPadvertisementsSent.Inc()
}
cidr, err := utils.GetPodCidrFromNodeSpec(nrc.clientset, nrc.hostnameOverride)
if err != nil {
return err
Expand Down Expand Up @@ -486,6 +489,12 @@ func (nrc *NetworkRoutingController) Cleanup() {
}

func (nrc *NetworkRoutingController) syncNodeIPSets() error {
start := time.Now()
defer func() {
if nrc.MetricsEnabled {
metrics.ControllerRoutesSyncTime.Observe(time.Since(start).Seconds())
}
}()
// Get the current list of the nodes from API server
nodes, err := nrc.clientset.CoreV1().Nodes().List(metav1.ListOptions{})
if err != nil {
Expand Down Expand Up @@ -786,6 +795,7 @@ func NewNetworkRoutingController(clientset kubernetes.Interface,
prometheus.MustRegister(metrics.ControllerBGPadvertisementsReceived)
prometheus.MustRegister(metrics.ControllerBGPInternalPeersSyncTime)
prometheus.MustRegister(metrics.ControllerBPGpeers)
prometheus.MustRegister(metrics.ControllerRoutesSyncTime)
nrc.MetricsEnabled = true
}

Expand Down
100 changes: 65 additions & 35 deletions pkg/metrics/metrics_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,108 +21,138 @@ const (
)

var (
// ServiceTotalConn Total incoming connections made
ServiceTotalConn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_total_connections",
Help: "Total incoming connections made",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePacketsIn Total incoming packets
ServicePacketsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_packets_in",
Help: "Total incoming packets",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePacketsOut Total outgoing packets
ServicePacketsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_packets_out",
Help: "Total outgoing packets",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBytesIn Total incoming bytes
ServiceBytesIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bytes_in",
Help: "Total incoming bytes",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBytesOut Total outgoing bytes
ServiceBytesOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bytes_out",
Help: "Total outgoing bytes",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePpsIn Incoming packets per second
ServicePpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_pps_in",
Help: "Incoming packets per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePpsOut Outgoing packets per second
ServicePpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_pps_out",
Help: "Outgoing packets per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceCPS Service connections per second
ServiceCPS = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_cps",
Help: "Service connections per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBpsIn Incoming bytes per second
ServiceBpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bps_in",
Help: "Incoming bytes per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBpsOut Outgoing bytes per second
ServiceBpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bps_out",
Help: "Outgoing bytes per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
ControllerIpvsServices = prometheus.NewGaugeVec(prometheus.GaugeOpts{
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ControllerIpvsServices Number of ipvs services in the instance
ControllerIpvsServices = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: "controller_ipvs_services",
Help: "Number of ipvs services in the instance",
}, []string{})
ControllerIptablesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerIptablesSyncTime Time it took for controller to sync iptables
ControllerIptablesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_iptables_sync_time",
Help: "Time it took for controller to sync iptables",
}, []string{})
ControllerPublishMetricsTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "controller_publish_metrics_time",
Help: "Time it took to publish metrics",
}, []string{})
ControllerIpvsServicesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerIpvsServicesSyncTime Time it took for controller to sync ipvs services
ControllerIpvsServicesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_ipvs_services_sync_time",
Help: "Time it took for controller to sync ipvs services",
}, []string{})
ControllerBPGpeers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerRoutesSyncTime Time it took for controller to sync routes
ControllerRoutesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_routes_sync_time",
Help: "Time it took for controller to sync routes",
})
// ControllerBPGpeers BGP peers in the runtime configuration
ControllerBPGpeers = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: "controller_bgp_peers",
Help: "BGP peers in the runtime configuration",
}, []string{})
ControllerBGPInternalPeersSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerBGPInternalPeersSyncTime Time it took to sync internal bgp peers
ControllerBGPInternalPeersSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_bgp_internal_peers_sync_time",
Help: "Time it took to sync internal bgp peers",
}, []string{})
ControllerBGPadvertisementsReceived = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerBGPadvertisementsReceived Total number of BGP advertisements received
ControllerBGPadvertisementsReceived = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Name: "controller_bgp_advertisements_received",
Help: "Time it took to sync internal bgp peers",
}, []string{})
ControllerIpvsMetricsExportTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Help: "BGP advertisements received",
})
// ControllerBGPadvertisementsSent Total number of BGP advertisements sent
ControllerBGPadvertisementsSent = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Name: "controller_bgp_advertisements_sent",
Help: "BGP advertisements sent",
})
// ControllerIpvsMetricsExportTime Time it took to export metrics
ControllerIpvsMetricsExportTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_ipvs_metrics_export_time",
Help: "Time it took to export metrics",
}, []string{})
})
// ControllerPolicyChainsSyncTime Time it took for controller to sync policy chains
ControllerPolicyChainsSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_policy_chains_sync_time",
Help: "Time it took for controller to sync policy chains",
})
)

// MetricsController Holds settings for the metrics controller
type MetricsController struct {
// Controller Holds settings for the metrics controller
type Controller struct {
MetricsPath string
MetricsPort uint16
mu sync.Mutex
nodeIP net.IP
}

// Run prometheus metrics controller
func (mc *MetricsController) Run(healthChan chan<- *healthcheck.ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
func (mc *Controller) Run(healthChan chan<- *healthcheck.ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
t := time.NewTicker(3 * time.Second)
defer wg.Done()
glog.Info("Starting metrics controller")
Expand Down Expand Up @@ -157,8 +187,8 @@ func (mc *MetricsController) Run(healthChan chan<- *healthcheck.ControllerHeartb
}

// NewMetricsController returns a new metrics controller object
func NewMetricsController(clientset kubernetes.Interface, config *options.KubeRouterConfig) (*MetricsController, error) {
mc := MetricsController{}
func NewMetricsController(clientset kubernetes.Interface, config *options.KubeRouterConfig) (*Controller, error) {
mc := Controller{}
mc.MetricsPath = config.MetricsPath
mc.MetricsPort = config.MetricsPort
return &mc, nil
Expand Down

0 comments on commit e5d599b

Please sign in to comment.