Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions cmd/machine-config-controller/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,18 @@ var (
}

startOpts struct {
kubeconfig string
templates string

resourceLockNamespace string
kubeconfig string
templates string
promMetricsListenAddress string
resourceLockNamespace string
}
)

func init() {
rootCmd.AddCommand(startCmd)
startCmd.PersistentFlags().StringVar(&startOpts.kubeconfig, "kubeconfig", "", "Kubeconfig file to access a remote cluster (testing only)")
startCmd.PersistentFlags().StringVar(&startOpts.resourceLockNamespace, "resourcelock-namespace", metav1.NamespaceSystem, "Path to the template files used for creating MachineConfig objects")
startCmd.PersistentFlags().StringVar(&startOpts.promMetricsListenAddress, "metrics-listen-address", "127.0.0.1:8797", "Listen address for prometheus metrics listener")
}

func runStartCmd(cmd *cobra.Command, args []string) {
Expand All @@ -56,6 +57,9 @@ func runStartCmd(cmd *cobra.Command, args []string) {
run := func(ctx context.Context) {
ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), componentName)

// Start the metrics handler
go ctrlcommon.StartMetricsListener(startOpts.promMetricsListenAddress, ctrlctx.Stop)

controllers := createControllers(ctrlctx)

// Start the shared factory informers that you need to use in your controller
Expand Down Expand Up @@ -140,6 +144,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
// The node controller consumes data written by the above
node.New(
ctx.InformerFactory.Machineconfiguration().V1().ControllerConfigs(),
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigs(),
ctx.InformerFactory.Machineconfiguration().V1().MachineConfigPools(),
ctx.KubeInformerFactory.Core().V1().Nodes(),
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
Expand Down
21 changes: 21 additions & 0 deletions install/0000_80_machine-config-operator_00_service.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
apiVersion: v1
kind: Service
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-controller
annotations:
include.release.openshift.io/ibm-cloud-managed: "true"
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
service.beta.openshift.io/serving-cert-secret-name: mcc-proxy-tls
spec:
type: ClusterIP
selector:
k8s-app: machine-config-controller
ports:
- name: metrics
port: 9001
protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
name: machine-config-daemon
namespace: openshift-machine-config-operator
Expand Down
36 changes: 36 additions & 0 deletions install/0000_90_machine-config-operator_00_servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-controller
annotations:
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
spec:
endpoints:
- interval: 30s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: metrics
scheme: https
path: /metrics
relabelings:
- action: replace
regex: ;(.*)
replacement: $1
separator: ";"
sourceLabels:
- node
- __meta_kubernetes_pod_node_name
targetLabel: node
tlsConfig:
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
serverName: machine-config-controller.openshift-machine-config-operator.svc
namespaceSelector:
matchNames:
- openshift-machine-config-operator
selector:
matchLabels:
k8s-app: machine-config-controller
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: machine-config-daemon
namespace: openshift-machine-config-operator
Expand Down
36 changes: 36 additions & 0 deletions install/0000_90_machine-config-operator_01_prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: machine-config-controller
namespace: openshift-machine-config-operator
labels:
k8s-app: machine-config-controller
annotations:
include.release.openshift.io/ibm-cloud-managed: "true"
include.release.openshift.io/self-managed-high-availability: "true"
include.release.openshift.io/single-node-developer: "true"
spec:
groups:
- name: mcc-paused-pool-kubelet-ca
rules:
- alert: MachineConfigControllerPausedPoolKubeletCA
expr: |
max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m])) > 0
for: 60m
labels:
severity: warning
annotations:
summary: "Paused machine configuration pool '{{$labels.pool}}' is blocking a necessary certificate rotation and must be unpaused before the current kube-apiserver-to-kubelet-signer certificate expires on {{ $value | humanizeTimestamp }}."
description: "Machine config pools have a 'pause' feature, which allows config to be rendered, but prevents it from being rolled out to the nodes. This alert indicates that a certificate rotation has taken place, and the new kubelet-ca certificate bundle has been rendered into a machine config, but because the pool '{{$labels.pool}}' is paused, the config cannot be rolled out to the nodes in that pool. You will notice almost immediately that for nodes in pool '{{$labels.pool}}', pod logs will not be visible in the console and interactive commands (oc log, oc exec, oc debug, oc attach) will not work. You must unpause machine config pool '{{$labels.pool}}' to let the certificates through before the kube-apiserver-to-kubelet-signer certificate expires on {{ $value | humanizeTimestamp }} or this pool's nodes will cease to function properly."
runbook_url: https://github.com/openshift/blob/master/alerts/machine-config-operator/MachineConfigControllerPausedPoolKubeletCA.md
- alert: MachineConfigControllerPausedPoolKubeletCA
expr: |
max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m]) - time()) < (86400 * 14) AND max by (namespace,pool) (last_over_time(machine_config_controller_paused_pool_kubelet_ca[5m])) > 0
for: 60m
labels:
severity: critical
annotations:
summary: "Paused machine configuration pool '{{$labels.pool}}' is blocking a necessary certificate rotation and must be unpaused before the current kube-apiserver-to-kubelet-signer certificate expires in {{ $value | humanizeDuration }}."
description: "Machine config pools have a 'pause' feature, which allows config to be rendered, but prevents it from being rolled out to the nodes. This alert indicates that a certificate rotation has taken place, and the new kubelet-ca certificate bundle has been rendered into a machine config, but because the pool '{{$labels.pool}}' is paused, the config cannot be rolled out to the nodes in that pool. You will notice almost immediately that for nodes in pool '{{$labels.pool}}', pod logs will not be visible in the console and interactive commands (oc log, oc exec, oc debug, oc attach) will not work. You must unpause machine config pool '{{$labels.pool}}' to let the certificates through before the kube-apiserver-to-kubelet-signer certificate expires. You have approximately {{ $value | humanizeDuration }} remaining before this happens and nodes in '{{$labels.pool}}' cease to function properly."
runbook_url: https://github.com/openshift/blob/master/alerts/machine-config-operator/MachineConfigControllerPausedPoolKubeletCA.md
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: machine-config-daemon
namespace: openshift-machine-config-operator
Expand Down
16 changes: 16 additions & 0 deletions lib/resourcemerge/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ func ensureContainer(modified *bool, existing *corev1.Container, required corev1
setStringIfSet(modified, &existing.Name, required.Name)
setStringIfSet(modified, &existing.Image, required.Image)

// This previously didn't properly sync the cpu and memory request fields, which caused a payload rejection
// https://github.com/openshift/machine-config-operator/pull/3027
// cpu and memory are unfortunately not explicit fields, they are functions that reference keys in a map
// (a map that it really looks like we're not supposed to modify directly), so we just overwrite the whole map
setResourceListIfSet(modified, &existing.Resources.Requests, required.Resources.Requests)

// if you want modify the launch, you need to modify it in the config, not in the launch args
setStringSliceIfSet(modified, &existing.Command, required.Command)
setStringSliceIfSet(modified, &existing.Args, required.Args)
Expand Down Expand Up @@ -306,6 +312,16 @@ func setStringSliceIfSet(modified *bool, existing *[]string, required []string)
}
}

func setResourceListIfSet(modified *bool, existing *corev1.ResourceList, required corev1.ResourceList) {
if required == nil {
return
}
if !equality.Semantic.DeepEqual(required, *existing) {
*existing = required
*modified = true
}
}

func mergeStringSlice(modified *bool, existing *[]string, required []string) {
for _, required := range required {
found := false
Expand Down
13 changes: 13 additions & 0 deletions manifests/machineconfigcontroller/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,16 @@ rules:
- apiGroups: ["operator.openshift.io"]
resources: ["etcds"]
verbs: ["get", "list", "watch"]
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
- subjectaccessreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
15 changes: 15 additions & 0 deletions manifests/machineconfigcontroller/clusterrolebinding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,18 @@ subjects:
- kind: ServiceAccount
namespace: {{.TargetNamespace}}
name: machine-config-controller
---
# Bind auth-delegator role to the MCC service account
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: machine-config-controller
namespace: {{.TargetNamespace}}
roleRef:
kind: ClusterRole
apiGroup: rbac.authorization.k8s.io
name: system:auth-delegator
subjects:
- kind: ServiceAccount
namespace: {{.TargetNamespace}}
name: machine-config-controller
32 changes: 32 additions & 0 deletions manifests/machineconfigcontroller/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,42 @@ spec:
cpu: 20m
memory: 50Mi
terminationMessagePolicy: FallbackToLogsOnError
- name: oauth-proxy
image: {{.Images.OauthProxy}}
ports:
- containerPort: 9001
name: metrics
protocol: TCP
args:
- --https-address=:9001
- --provider=openshift
- --openshift-service-account=machine-config-controller
- --upstream=http://127.0.0.1:8797
- --tls-cert=/etc/tls/private/tls.crt
- --tls-key=/etc/tls/private/tls.key
- --cookie-secret-file=/etc/tls/cookie-secret/cookie-secret
- '--openshift-sar={"resource": "namespaces", "verb": "get"}'
- '--openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get"}}'
resources:
requests:
cpu: 20m
memory: 50Mi
volumeMounts:
- mountPath: /etc/tls/private
name: proxy-tls
- mountPath: /etc/tls/cookie-secret
name: cookie-secret
serviceAccountName: machine-config-controller
nodeSelector:
node-role.kubernetes.io/master: ""
priorityClassName: "system-cluster-critical"
volumes:
- name: proxy-tls
secret:
secretName: mcc-proxy-tls
- name: cookie-secret
secret:
secretName: cookie-secret
restartPolicy: Always
tolerations:
- key: node-role.kubernetes.io/master
Expand Down
50 changes: 50 additions & 0 deletions pkg/controller/common/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@ import (
"bytes"
"compress/gzip"
"context"
"crypto/x509"
"encoding/pem"
"fmt"
"io"
"io/ioutil"
"net/url"
"os"
"reflect"
"sort"
"strings"

"github.com/clarketm/json"
fcctbase "github.com/coreos/fcct/base/v0_1"
Expand Down Expand Up @@ -779,3 +782,50 @@ func GetIgnitionFileDataByPath(config *ign3types.Config, path string) ([]byte, e
}
return nil, nil
}

// GetNewestCertificatesFromPEMBundle breaks a pem-encoded bundle out into its component certificates
func GetCertificatesFromPEMBundle(pemBytes []byte) ([]*x509.Certificate, error) {
var certs []*x509.Certificate
// There can be multiple certificates in the file
for {
// Decode a block to parse
block, rest := pem.Decode(pemBytes)
// Once we get no more blocks, we've read all the certs
if block == nil {
break
}
// Right now we just care about certificates, not keys
if block.Type == "CERTIFICATE" {
cert, err := x509.ParseCertificate(block.Bytes)
if err != nil {
// This isn't fatal, *this* cert could just be junk, next one could be okay
glog.Warningf("Failed to parse certificate: %v", err.Error())
} else {
certs = append(certs, cert)
}
}
// Keep reading from where we left off
pemBytes = rest
}
return certs, nil
}

// GetLongestValidCertificate returns the latest-expiring certificate from a given list of certificates
// whose Subject.CommonName also matches any of the given common-name prefixes
func GetLongestValidCertificate(certificateList []*x509.Certificate, subjectPrefixes []string) *x509.Certificate {
// Sort is smallest-to-largest, so we're putting the cert with the latest expiry date at the top
sort.Slice(certificateList, func(i, j int) bool {
return certificateList[i].NotAfter.After(certificateList[j].NotAfter)
})
// For each certificate in our list
for _, certificate := range certificateList {
// Check it against our prefixes
for _, prefix := range subjectPrefixes {
// If it matches, this is the latest-expiring one since it's closest to the "top"
if strings.HasPrefix(certificate.Subject.CommonName, prefix) {
return certificate
}
}
}
return nil
}
68 changes: 68 additions & 0 deletions pkg/controller/common/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package common

import (
"context"
"net/http"

"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

const (
// DefaultBindAddress is the port for the metrics listener
DefaultBindAddress = ":8797"
)

var (
// MachineConfigControllerPausedPoolKubeletCA logs when a certificate rotation is being held up by pause
MachineConfigControllerPausedPoolKubeletCA = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "machine_config_controller_paused_pool_kubelet_ca",
Help: "Set to the unix timestamp in utc of the current certificate expiry date if a certificate rotation is pending in specified paused pool",
}, []string{"pool"})

metricsList = []prometheus.Collector{
MachineConfigControllerPausedPoolKubeletCA,
}
)

func RegisterMCCMetrics() error {
for _, metric := range metricsList {
err := prometheus.Register(metric)
if err != nil {
return err
}
}

return nil
}

// StartMetricsListener is metrics listener via http on localhost
func StartMetricsListener(addr string, stopCh <-chan struct{}) {
if addr == "" {
addr = DefaultBindAddress
}

glog.Info("Registering Prometheus metrics")
if err := RegisterMCCMetrics(); err != nil {
glog.Errorf("unable to register metrics: %v", err)
// No sense in continuing starting the listener if this fails
return
}

glog.Infof("Starting metrics listener on %s", addr)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.Handler())
s := http.Server{Addr: addr, Handler: mux}

go func() {
if err := s.ListenAndServe(); err != nil && err != http.ErrServerClosed {
glog.Errorf("metrics listener exited with error: %v", err)
}
}()
<-stopCh
if err := s.Shutdown(context.Background()); err != http.ErrServerClosed {
glog.Errorf("error stopping metrics listener: %v", err)
}
}
Loading