Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions cmd/machine-config-controller/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,18 @@ var (
}

startOpts struct {
kubeconfig string
templates string

resourceLockNamespace string
kubeconfig string
templates string
promMetricsListenAddress string
resourceLockNamespace string
}
)

// init wires the start command into the root command and registers its
// persistent flags.
func init() {
	rootCmd.AddCommand(startCmd)
	startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
	// Fixed help text: it previously described the --templates flag
	// ("Path to the template files used for creating MachineConfig objects"),
	// which has nothing to do with the leader-election resource lock.
	startCmd.PersistentFlags().StringVar(&startOpts.resourceLockNamespace, "resourcelock-namespace", metav1.NamespaceSystem, "Namespace to use for the leader-election resource lock")
	startCmd.PersistentFlags().StringVar(&startOpts.promMetricsListenAddress, "metrics-listen-address", "127.0.0.1:8797", "Listen address for prometheus metrics listener")
}

func runStartCmd(cmd *cobra.Command, args []string) {
Expand All @@ -56,6 +57,9 @@ func runStartCmd(cmd *cobra.Command, args []string) {
run := func(ctx context.Context) {
ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), componentName)

// Start the metrics handler
go ctrlcommon.StartMetricsListener(startOpts.promMetricsListenAddress, ctrlctx.Stop)

controllers := createControllers(ctrlctx)

// Start the shared factory informers that you need to use in your controller
Expand Down Expand Up @@ -140,6 +144,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
// The node controller consumes data written by the above
node.New(
ctx.InformerFactory.Machineconfiguration().V1().ControllerConfigs(),
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigs(),
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigPools(),
ctx.KubeInformerFactory.Core().V1().Nodes(),
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
Expand Down
21 changes: 21 additions & 0 deletions install/0000_80_machine-config-operator_00_service.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
apiVersion: v1
kind: Service
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-controller
annotations:
include.release.openshift.io/ibm-cloud-managed: "true"
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
service.beta.openshift.io/serving-cert-secret-name: mcc-proxy-tls
spec:
type: ClusterIP
selector:
k8s-app: machine-config-controller
ports:
- name: metrics
port: 9001
protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
name: machine-config-daemon
namespace: openshift-machine-config-operator
Expand Down
36 changes: 36 additions & 0 deletions install/0000_90_machine-config-operator_00_servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-controller
annotations:
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
spec:
endpoints:
- interval: 30s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: metrics
scheme: https
path: /metrics
relabelings:
- action: replace
regex: ;(.*)
replacement: $1
separator: ";"
sourceLabels:
- node
- __meta_kubernetes_pod_node_name
targetLabel: node
tlsConfig:
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
serverName: machine-config-controller.openshift-machine-config-operator.svc
namespaceSelector:
matchNames:
- openshift-machine-config-operator
selector:
matchLabels:
k8s-app: machine-config-controller
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: machine-config-daemon
namespace: openshift-machine-config-operator
Expand Down
36 changes: 36 additions & 0 deletions install/0000_90_machine-config-operator_01_prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-controller
annotations:
include.release.openshift.io/ibm-cloud-managed: "true"
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
spec:
groups:
- name: mcc-paused-pool-kubelet-ca
rules:
- alert: MachineConfigControllerPausedPoolKubeletCA
expr: |
max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m])) > 0
for: 60m
labels:
severity: warning
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Firing warning alerts immediately seems like it's wound a bit too tightly. We can go a while before the lack of certificate rotation becomes a problem (significant fractions of a year?). So setting a large-ish for here seems useful to avoid desensitizing folks by alerting every time they briefly pause. Of course, you could have someone leaving pools paused for the bulk of that time, and then very briefly unpause, have Prom happen to scrape, and then pause again, before the pool had time to rotate out many certs. Which is one reason "are we paused now, and have we been paused for a bit?" only a rough proxy for "are we at risk of certs expiring?". And that makes picking an appropriate for difficult. But if we stick with the paused-pool angle on this, I'd expect at least for: 60m on this alert, based on our consistency guidelines.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I think if the alerting config allows changing severity, then it should change to critical after a certain number of warning alerts have been fired. This is to emphasize that unpausing the pool is necessary to keep the cluster healthy.

annotations:
summary: "Paused machine configuration pool '{{$labels.pool}}' is blocking a necessary certificate rotation and must be unpaused before the current kube-apiserver-to-kubelet-signer certificate expires on {{ $value | humanizeTimestamp }}."
description: "Machine config pools have a 'pause' feature, which allows config to be rendered, but prevents it from being rolled out to the nodes. This alert indicates that a certificate rotation has taken place, and the new kubelet-ca certificate bundle has been rendered into a machine config, but because the pool '{{$labels.pool}}' is paused, the config cannot be rolled out to the nodes in that pool. You will notice almost immediately that for nodes in pool '{{$labels.pool}}', pod logs will not be visible in the console and interactive commands (oc log, oc exec, oc debug, oc attach) will not work. You must unpause machine config pool '{{$labels.pool}}' to let the certificates through before the kube-apiserver-to-kubelet-signer certificate expires on {{ $value | humanizeTimestamp }} or this pool's nodes will cease to function properly."
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPausedPoolKubeletCA.md
- alert: MachineConfigControllerPausedPoolKubeletCA
expr: |
max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m]) - time()) < (86400 * 14) AND max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m])) > 0
for: 60m
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the alert expression already delays the firing, it might not be necessary to add a for clause.

labels:
severity: critical
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ping @openshift/sre-alert-sme since it's adding a critical alert.

annotations:
summary: "Paused machine configuration pool '{{$labels.pool}}' is blocking a necessary certificate rotation and must be unpaused before the current kube-apiserver-to-kubelet-signer certificate expires in {{ $value | humanizeDuration }}."
description: "Machine config pools have a 'pause' feature, which allows config to be rendered, but prevents it from being rolled out to the nodes. This alert indicates that a certificate rotation has taken place, and the new kubelet-ca certificate bundle has been rendered into a machine config, but because the pool '{{$labels.pool}}' is paused, the config cannot be rolled out to the nodes in that pool. You will notice almost immediately that for nodes in pool '{{$labels.pool}}', pod logs will not be visible in the console and interactive commands (oc log, oc exec, oc debug, oc attach) will not work. You must unpause machine config pool '{{$labels.pool}}' to let the certificates through before the kube-apiserver-to-kubelet-signer certificate expires. You have approximately {{ $value | humanizeDuration }} remaining before this happens and nodes in '{{$labels.pool}}' cease to function properly."
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/machine-config-operator/MachineConfigControllerPausedPoolKubeletCA.md
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: machine-config-daemon
namespace: openshift-machine-config-operator
Expand Down
13 changes: 13 additions & 0 deletions manifests/machineconfigcontroller/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,16 @@ rules:
- apiGroups: ["operator.openshift.io"]
resources: ["etcds"]
verbs: ["get", "list", "watch"]
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
- subjectaccessreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
15 changes: 15 additions & 0 deletions manifests/machineconfigcontroller/clusterrolebinding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,18 @@ subjects:
- kind: ServiceAccount
namespace: {{.TargetNamespace}}
name: machine-config-controller
---
# Bind auth-delegator role to the MCC service account
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: machine-config-controller
namespace: {{.TargetNamespace}}
roleRef:
kind: ClusterRole
apiGroup: rbac.authorization.k8s.io
name: system:auth-delegator
subjects:
- kind: ServiceAccount
namespace: {{.TargetNamespace}}
name: machine-config-controller
32 changes: 32 additions & 0 deletions manifests/machineconfigcontroller/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,42 @@ spec:
cpu: 20m
memory: 50Mi
terminationMessagePolicy: FallbackToLogsOnError
- name: oauth-proxy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not using kube-rbac-proxy instead? The advantage is that it can use client TLS certificates instead of bearer tokens which is more gentle on the Kube API (see this OEP).
cc @s-urbaniak

Copy link
Member Author

@jkyros jkyros Dec 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I didn't know about it :)

It sounds like we probably should use it -- I set this up to mirror what the machine-config-daemon does, since we already had precedent, and machine-config-daemon uses oauth-proxy. I probably won't change the machine-config-controller to operate this way today since I don't fully understand all of the ramifications, but I have added a JIRA ticket to investigate this as part of our monitoring improvement epic https://issues.redhat.com/browse/MCO-128

image: {{.Images.OauthProxy}}
ports:
- containerPort: 9001
name: metrics
protocol: TCP
args:
- --https-address=:9001
- --provider=openshift
- --openshift-service-account=machine-config-controller
- --upstream=http://127.0.0.1:8797
- --tls-cert=/etc/tls/private/tls.crt
- --tls-key=/etc/tls/private/tls.key
- --cookie-secret-file=/etc/tls/cookie-secret/cookie-secret
- '--openshift-sar={"resource": "namespaces", "verb": "get"}'
- '--openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get"}}'
resources:
requests:
cpu: 20m
memory: 50Mi
volumeMounts:
- mountPath: /etc/tls/private
name: proxy-tls
- mountPath: /etc/tls/cookie-secret
name: cookie-secret
serviceAccountName: machine-config-controller
nodeSelector:
node-role.kubernetes.io/master: ""
priorityClassName: "system-cluster-critical"
volumes:
- name: proxy-tls
secret:
secretName: mcc-proxy-tls
- name: cookie-secret
secret:
secretName: cookie-secret
restartPolicy: Always
tolerations:
- key: node-role.kubernetes.io/master
Expand Down
50 changes: 50 additions & 0 deletions pkg/controller/common/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@ package common

import (
"context"
"crypto/x509"
"encoding/pem"
"fmt"
"io/ioutil"
"net/url"
"os"
"reflect"
"sort"
"strings"

"github.com/clarketm/json"
fcctbase "github.com/coreos/fcct/base/v0_1"
Expand Down Expand Up @@ -740,3 +743,50 @@ func GetIgnitionFileDataByPath(config *ign3types.Config, path string) ([]byte, e
}
return nil, nil
}

// GetCertificatesFromPEMBundle breaks a pem-encoded bundle out into its component certificates
func GetCertificatesFromPEMBundle(pemBytes []byte) ([]*x509.Certificate, error) {
var certs []*x509.Certificate
// There can be multiple certificates in the file
for {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait sorry - do we really want an infinite loop here? could we make this a little more explicit?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future John will come back to this later to add clarifying comments which is fine with me.

// Decode a block to parse
block, rest := pem.Decode(pemBytes)
// Once we get no more blocks, we've read all the certs
if block == nil {
break
}
// Right now we just care about certificates, not keys
if block.Type == "CERTIFICATE" {
cert, err := x509.ParseCertificate(block.Bytes)
if err != nil {
// This isn't fatal, *this* cert could just be junk, next one could be okay
glog.Warningf("Failed to parse certificate: %v", err.Error())
} else {
certs = append(certs, cert)
}
}
// Keep reading from where we left off
pemBytes = rest
}
return certs, nil
}

// GetLongestValidCertificate returns the latest-expiring certificate from a given list of certificates
// whose Subject.CommonName also matches any of the given common-name prefixes
func GetLongestValidCertificate(certificateList []*x509.Certificate, subjectPrefixes []string) *x509.Certificate {
// Sort is smallest-to-largest, so we're putting the cert with the latest expiry date at the top
sort.Slice(certificateList, func(i, j int) bool {
return certificateList[i].NotAfter.After(certificateList[j].NotAfter)
})
// For each certificate in our list
for _, certificate := range certificateList {
// Check it against our prefixes
for _, prefix := range subjectPrefixes {
// If it matches, this is the latest-expiring one since it's closest to the "top"
if strings.HasPrefix(certificate.Subject.CommonName, prefix) {
return certificate
}
}
}
return nil
}
68 changes: 68 additions & 0 deletions pkg/controller/common/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package common

import (
"context"
"net/http"

"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

const (
// DefaultBindAddress is the port for the metrics listener
DefaultBindAddress = ":8797"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const instead of var?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, we don't write to it, but in my defense I stole that verbatim from the daemon. 😃

This is probably in line with Zack's comments about the boilerplate: #2802 whatever we fix here, we should fix across all our metrics listeners so there is "one true metrics listener" setup function.

)

var (
// MachineConfigControllerPausedPoolKubeletCA logs when a certificate rotation is being held up by pause
MachineConfigControllerPausedPoolKubeletCA = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "machine_config_controller_paused_pool_kubelet_ca",
Help: "Set to the unix timestamp in utc of the current certificate expiry date if a certificate rotation is pending in specified paused pool",
}, []string{"pool"})

metricsList = []prometheus.Collector{
MachineConfigControllerPausedPoolKubeletCA,
}
)

// RegisterMCCMetrics registers every machine-config-controller collector in
// metricsList with the default Prometheus registry. It stops and returns the
// first registration error encountered, or nil if all collectors registered.
func RegisterMCCMetrics() error {
	for _, collector := range metricsList {
		if err := prometheus.Register(collector); err != nil {
			return err
		}
	}
	return nil
}

// StartMetricsListener is metrics listener via http on localhost
func StartMetricsListener(addr string, stopCh <-chan struct{}) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks very similar to daemon.StartMetricsListener. I wonder if there's an opportunity for consolidating the boilerplate of starting the listener, e.g.:

func StartMetricsListener(addr string, stopCh <-chan struct{}, register func() error) {
  // Boilerplate
  if err := register(); err != nil {
    // Handle error
  }
  // More boilerplate
}

Then the controller and daemon can call this and pass in their registration funcs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've refactored StartMetricsListener to take a function argument as suggested, and adjusted daemon to call it from controller/common.

if addr == "" {
addr = DefaultBindAddress
}

glog.Info("Registering Prometheus metrics")
if err := RegisterMCCMetrics(); err != nil {
glog.Errorf("unable to register metrics: %v", err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logs that an error occurred during metrics registration and then continues starting the metrics listener. Does it make sense to continue trying to start the metrics listener if registration fails? Or would it make more sense to return early?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I adjusted it to return if it fails to register, but I didn't have it return the error all the way up, I assumed we didn't want the failure of metrics registration to be fatal/impact anything else aside from the metrics handler.

If I'm wrong and we do want it to be fatal for some reason, let me know :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In terms of error-handling, I think this is fine for now; we can revisit bubbling the error further later. A further improvement would be adding an additional error line (or tweaking the existing one) to say something like, "could not start metrics listener", just so it's clear that the metrics listener is not running.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In practice, a failure during metric registration is a bug in your code (you registered the same metric twice for instance) and should be treated as a hard failure. Which is why prometheus.MustRegister() exists so you don't have to worry about the potential error.

// No sense in continuing starting the listener if this fails
return
}

glog.Infof("Starting metrics listener on %s", addr)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.Handler())
s := http.Server{Addr: addr, Handler: mux}

go func() {
if err := s.ListenAndServe(); err != nil && err != http.ErrServerClosed {
glog.Errorf("metrics listener exited with error: %v", err)
}
}()
<-stopCh
if err := s.Shutdown(context.Background()); err != http.ErrServerClosed {
glog.Errorf("error stopping metrics listener: %v", err)
}
}
Loading