From 4e133e7148701566f6699cabe807c5fb581625df Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Thu, 30 Jul 2020 12:45:39 +0000
Subject: [PATCH] controller: Emit events

A while ago I'd invested some time in tweaking the node controller to
have useful logs around what it's doing; my first "point of contact"
when looking at upgrades was its pod logs.  But...we lose most of those
on upgrade since the pod gets killed.

Add events to the node controller too.  Currently the MCD emits useful
events which can be queried afterwards (in our CI runs we dump
`events.json`).  With this we can create a "journal/history" for
upgrade/update events just by querying the event stream.
---
 .../events-clusterrole.yaml                        |   9 ++
 .../events-rolebinding-default.yaml                |  12 ++
 .../events-rolebinding-target.yaml                 |  12 ++
 pkg/controller/node/node_controller.go             |  48 ++++++--
 pkg/controller/node/status.go                      |   6 +
 pkg/operator/assets/bindata.go                     | 103 +++++++++++++++++-
 pkg/operator/sync.go                               |  34 ++++--
 7 files changed, 200 insertions(+), 24 deletions(-)
 create mode 100644 manifests/machineconfigcontroller/events-clusterrole.yaml
 create mode 100644 manifests/machineconfigcontroller/events-rolebinding-default.yaml
 create mode 100644 manifests/machineconfigcontroller/events-rolebinding-target.yaml

diff --git a/manifests/machineconfigcontroller/events-clusterrole.yaml b/manifests/machineconfigcontroller/events-clusterrole.yaml
new file mode 100644
index 0000000000..ead6524561
--- /dev/null
+++ b/manifests/machineconfigcontroller/events-clusterrole.yaml
@@ -0,0 +1,9 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: machine-config-controller-events
+  namespace: {{.TargetNamespace}}
+rules:
+- apiGroups: [""]
+  resources: ["events"]
+  verbs: ["create", "patch"]
diff --git a/manifests/machineconfigcontroller/events-rolebinding-default.yaml b/manifests/machineconfigcontroller/events-rolebinding-default.yaml
new file mode 100644
index 0000000000..91f6cfeddc
--- /dev/null
+++ 
b/manifests/machineconfigcontroller/events-rolebinding-default.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: machine-config-controller-events + namespace: default +roleRef: + kind: ClusterRole + name: machine-config-controller-events +subjects: +- kind: ServiceAccount + namespace: {{.TargetNamespace}} + name: machine-config-controller diff --git a/manifests/machineconfigcontroller/events-rolebinding-target.yaml b/manifests/machineconfigcontroller/events-rolebinding-target.yaml new file mode 100644 index 0000000000..c951c64a8d --- /dev/null +++ b/manifests/machineconfigcontroller/events-rolebinding-target.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: machine-config-controller-events + namespace: {{.TargetNamespace}} +roleRef: + kind: ClusterRole + name: machine-config-controller-events +subjects: +- kind: ServiceAccount + namespace: {{.TargetNamespace}} + name: machine-config-controller diff --git a/pkg/controller/node/node_controller.go b/pkg/controller/node/node_controller.go index 97cc7aeb1a..b2b26ca1ef 100644 --- a/pkg/controller/node/node_controller.go +++ b/pkg/controller/node/node_controller.go @@ -409,6 +409,16 @@ func (ctrl *Controller) addNode(obj interface{}) { } } +func (ctrl *Controller) logPool(pool *mcfgv1.MachineConfigPool, format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) + glog.Infof("Pool %s: %s", pool.Name, msg) +} + +func (ctrl *Controller) logPoolNode(pool *mcfgv1.MachineConfigPool, node *corev1.Node, format string, args ...interface{}) { + msg := fmt.Sprintf(format, args...) 
+ glog.Infof("Pool %s: node %s: %s", pool.Name, node.Name, msg) +} + func (ctrl *Controller) updateNode(old, cur interface{}) { oldNode := old.(*corev1.Node) curNode := cur.(*corev1.Node) @@ -441,16 +451,16 @@ func (ctrl *Controller) updateNode(old, cur interface{}) { if oldReady != newReady { changed = true if newReadyErr != nil { - glog.Infof("Pool %s: node %s is now reporting unready: %v", pool.Name, curNode.Name, newReadyErr) + ctrl.logPoolNode(pool, curNode, "Reporting unready: %v", newReadyErr) } else { - glog.Infof("Pool %s: node %s is now reporting ready", pool.Name, curNode.Name) + ctrl.logPoolNode(pool, curNode, "Reporting ready") } } // Specifically log when a node has completed an update so the MCC logs are a useful central aggregate of state changes if oldNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] != oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] && isNodeDone(curNode) { - glog.Infof("Pool %s: node %s has completed update to %s", pool.Name, curNode.Name, curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey]) + ctrl.logPoolNode(pool, curNode, "Completed update to %s", curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey]) changed = true } else { annos := []string{ @@ -459,13 +469,18 @@ func (ctrl *Controller) updateNode(old, cur interface{}) { daemonconsts.MachineConfigDaemonStateAnnotationKey, } for _, anno := range annos { - if oldNode.Annotations[anno] != curNode.Annotations[anno] { - glog.Infof("Pool %s: node %s changed %s = %s", pool.Name, curNode.Name, anno, curNode.Annotations[anno]) + newValue := curNode.Annotations[anno] + if oldNode.Annotations[anno] != newValue { + ctrl.logPoolNode(pool, curNode, "changed annotation %s = %s", anno, newValue) changed = true + // For the control plane, emit events for these since they're important + if pool.Name == masterPoolName { + ctrl.eventRecorder.Eventf(pool, corev1.EventTypeNormal, "AnnotationChange", "Node %s now has %s=%s", 
curNode.Name, anno, newValue) + } } } if !reflect.DeepEqual(oldNode.Labels, curNode.Labels) { - glog.Infof("Pool %s: node %s changed labels", pool.Name, curNode.Name) + ctrl.logPoolNode(pool, curNode, "changed labels") changed = true } } @@ -742,7 +757,7 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error { candidates, capacity := getAllCandidateMachines(pool, nodes, maxunavail) if len(candidates) > 0 { - glog.Infof("Pool %s: %d candidate nodes for update, capacity: %d", pool.Name, len(candidates), capacity) + ctrl.logPool(pool, "%d candidate nodes for update, capacity: %d", len(candidates), capacity) if err := ctrl.updateCandidateMachines(pool, candidates, capacity); err != nil { if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil { return goerrs.Wrapf(err, "error setting desired machine config annotation for pool %q, sync error: %v", pool.Name, syncErr) @@ -783,7 +798,6 @@ func (ctrl *Controller) getNodesForPool(pool *mcfgv1.MachineConfigPool) ([]*core } func (ctrl *Controller) setDesiredMachineConfigAnnotation(nodeName, currentConfig string) error { - glog.Infof("Setting node %s to desired config %s", nodeName, currentConfig) return clientretry.RetryOnConflict(nodeUpdateBackoff, func() error { oldNode, err := ctrl.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) if err != nil { @@ -869,7 +883,7 @@ func (ctrl *Controller) getCurrentEtcdLeader(candidates []*corev1.Node) (*corev1 // filterControlPlaneCandidateNodes adjusts the candidates and capacity specifically // for the control plane, e.g. based on which node is the etcd leader at the time. 
// nolint:unparam -func (ctrl *Controller) filterControlPlaneCandidateNodes(candidates []*corev1.Node, capacity uint) ([]*corev1.Node, uint, error) { +func (ctrl *Controller) filterControlPlaneCandidateNodes(pool *mcfgv1.MachineConfigPool, candidates []*corev1.Node, capacity uint) ([]*corev1.Node, uint, error) { if len(candidates) <= 1 { return candidates, capacity, nil } @@ -880,6 +894,8 @@ func (ctrl *Controller) filterControlPlaneCandidateNodes(candidates []*corev1.No var newCandidates []*corev1.Node for _, node := range candidates { if node == etcdLeader { + // For now make this an event so we know it's working, even though it's more of a non-event + ctrl.eventRecorder.Eventf(pool, corev1.EventTypeNormal, "DeferringEtcdLeaderUpdate", "Deferring update of etcd leader %s", node.Name) glog.Infof("Deferring update of etcd leader: %s", node.Name) continue } @@ -892,23 +908,31 @@ func (ctrl *Controller) filterControlPlaneCandidateNodes(candidates []*corev1.No func (ctrl *Controller) updateCandidateMachines(pool *mcfgv1.MachineConfigPool, candidates []*corev1.Node, capacity uint) error { if pool.Name == masterPoolName { var err error - candidates, capacity, err = ctrl.filterControlPlaneCandidateNodes(candidates, capacity) + candidates, capacity, err = ctrl.filterControlPlaneCandidateNodes(pool, candidates, capacity) if err != nil { return err } // In practice right now these counts will be 1 but let's stay general to support 5 etcd nodes in the future - glog.Infof("Pool %s: filtered to %d candidate nodes for update, capacity: %d", pool.Name, len(candidates), capacity) + ctrl.logPool(pool, "filtered to %d candidate nodes for update, capacity: %d", len(candidates), capacity) } if capacity < uint(len(candidates)) { // Arbitrarily pick the first N candidates; no attempt at sorting. // Perhaps later we allow admins to weight somehow, or do something more intelligent. 
 		candidates = candidates[:capacity]
 	}
+	targetConfig := pool.Spec.Configuration.Name
 	for _, node := range candidates {
-		if err := ctrl.setDesiredMachineConfigAnnotation(node.Name, pool.Spec.Configuration.Name); err != nil {
+		ctrl.logPool(pool, "Setting node %s target to %s", node.Name, targetConfig)
+		if err := ctrl.setDesiredMachineConfigAnnotation(node.Name, targetConfig); err != nil {
 			return goerrs.Wrapf(err, "setting desired config for node %s", node.Name)
 		}
 	}
+	if len(candidates) == 1 {
+		candidate := candidates[0]
+		ctrl.eventRecorder.Eventf(pool, corev1.EventTypeNormal, "SetDesiredConfig", "Targeted node %s to config %s", candidate.Name, targetConfig)
+	} else {
+		ctrl.eventRecorder.Eventf(pool, corev1.EventTypeNormal, "SetDesiredConfig", "Set target for %d nodes to config %s", len(candidates), targetConfig)
+	}
 	return nil
 }
diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index 0ecc245712..32fb62697d 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -27,6 +27,18 @@ func (ctrl *Controller) syncStatusOnly(pool *mcfgv1.MachineConfigPool) error {
 	newPool := pool
+	// newPool is a pointer copy of pool, so comparing pool vs newPool fields
+	// after the Status assignment below would always be equal (and the events
+	// would never fire).  Capture the pre-update status config name first.
+	oldStatusConfig := pool.Status.Configuration.Name
 	newPool.Status = newStatus
 	_, err = ctrl.client.MachineconfigurationV1().MachineConfigPools().UpdateStatus(context.TODO(), newPool, metav1.UpdateOptions{})
+	if pool.Spec.Configuration.Name != newStatus.Configuration.Name {
+		// Pool is still converging on its spec target; duplicate events are
+		// aggregated by the event recorder, so this stays low-noise.
+		ctrl.eventRecorder.Eventf(pool, corev1.EventTypeNormal, "Updating", "Pool %s now targeting %s", pool.Name, pool.Spec.Configuration.Name)
+	}
+	if oldStatusConfig != newStatus.Configuration.Name {
+		ctrl.eventRecorder.Eventf(pool, corev1.EventTypeNormal, "Completed", "Pool %s has completed update to %s", pool.Name, newStatus.Configuration.Name)
+	}
 	return err
 }
diff --git a/pkg/operator/assets/bindata.go b/pkg/operator/assets/bindata.go
index cfbefdc802..d00218c163 100644
--- a/pkg/operator/assets/bindata.go
+++ b/pkg/operator/assets/bindata.go
@@ -10,6 +10,9 @@
//
manifests/machineconfigcontroller/clusterrolebinding.yaml // manifests/machineconfigcontroller/controllerconfig.yaml // manifests/machineconfigcontroller/deployment.yaml +// manifests/machineconfigcontroller/events-clusterrole.yaml +// manifests/machineconfigcontroller/events-rolebinding-default.yaml +// manifests/machineconfigcontroller/events-rolebinding-target.yaml // manifests/machineconfigcontroller/sa.yaml // manifests/machineconfigdaemon/clusterrole.yaml // manifests/machineconfigdaemon/clusterrolebinding.yaml @@ -1172,6 +1175,90 @@ func manifestsMachineconfigcontrollerDeploymentYaml() (*asset, error) { return a, nil } +var _manifestsMachineconfigcontrollerEventsClusterroleYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: machine-config-controller-events + namespace: {{.TargetNamespace}} +rules: +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] +`) + +func manifestsMachineconfigcontrollerEventsClusterroleYamlBytes() ([]byte, error) { + return _manifestsMachineconfigcontrollerEventsClusterroleYaml, nil +} + +func manifestsMachineconfigcontrollerEventsClusterroleYaml() (*asset, error) { + bytes, err := manifestsMachineconfigcontrollerEventsClusterroleYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "manifests/machineconfigcontroller/events-clusterrole.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _manifestsMachineconfigcontrollerEventsRolebindingDefaultYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: machine-config-controller-events + namespace: default +roleRef: + kind: ClusterRole + name: machine-config-controller-events +subjects: +- kind: ServiceAccount + namespace: {{.TargetNamespace}} + name: machine-config-controller +`) + +func manifestsMachineconfigcontrollerEventsRolebindingDefaultYamlBytes() ([]byte, error) { + return 
_manifestsMachineconfigcontrollerEventsRolebindingDefaultYaml, nil +} + +func manifestsMachineconfigcontrollerEventsRolebindingDefaultYaml() (*asset, error) { + bytes, err := manifestsMachineconfigcontrollerEventsRolebindingDefaultYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "manifests/machineconfigcontroller/events-rolebinding-default.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +var _manifestsMachineconfigcontrollerEventsRolebindingTargetYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: machine-config-controller-events + namespace: {{.TargetNamespace}} +roleRef: + kind: ClusterRole + name: machine-config-controller-events +subjects: +- kind: ServiceAccount + namespace: {{.TargetNamespace}} + name: machine-config-controller +`) + +func manifestsMachineconfigcontrollerEventsRolebindingTargetYamlBytes() ([]byte, error) { + return _manifestsMachineconfigcontrollerEventsRolebindingTargetYaml, nil +} + +func manifestsMachineconfigcontrollerEventsRolebindingTargetYaml() (*asset, error) { + bytes, err := manifestsMachineconfigcontrollerEventsRolebindingTargetYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "manifests/machineconfigcontroller/events-rolebinding-target.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + var _manifestsMachineconfigcontrollerSaYaml = []byte(`apiVersion: v1 kind: ServiceAccount metadata: @@ -2772,6 +2859,9 @@ var _bindata = map[string]func() (*asset, error){ "manifests/machineconfigcontroller/clusterrolebinding.yaml": manifestsMachineconfigcontrollerClusterrolebindingYaml, "manifests/machineconfigcontroller/controllerconfig.yaml": manifestsMachineconfigcontrollerControllerconfigYaml, "manifests/machineconfigcontroller/deployment.yaml": 
manifestsMachineconfigcontrollerDeploymentYaml, + "manifests/machineconfigcontroller/events-clusterrole.yaml": manifestsMachineconfigcontrollerEventsClusterroleYaml, + "manifests/machineconfigcontroller/events-rolebinding-default.yaml": manifestsMachineconfigcontrollerEventsRolebindingDefaultYaml, + "manifests/machineconfigcontroller/events-rolebinding-target.yaml": manifestsMachineconfigcontrollerEventsRolebindingTargetYaml, "manifests/machineconfigcontroller/sa.yaml": manifestsMachineconfigcontrollerSaYaml, "manifests/machineconfigdaemon/clusterrole.yaml": manifestsMachineconfigdaemonClusterroleYaml, "manifests/machineconfigdaemon/clusterrolebinding.yaml": manifestsMachineconfigdaemonClusterrolebindingYaml, @@ -2858,11 +2948,14 @@ var _bintree = &bintree{nil, map[string]*bintree{ "bootstrap-pod-v2.yaml": &bintree{manifestsBootstrapPodV2Yaml, map[string]*bintree{}}, "controllerconfig.crd.yaml": &bintree{manifestsControllerconfigCrdYaml, map[string]*bintree{}}, "machineconfigcontroller": &bintree{nil, map[string]*bintree{ - "clusterrole.yaml": &bintree{manifestsMachineconfigcontrollerClusterroleYaml, map[string]*bintree{}}, - "clusterrolebinding.yaml": &bintree{manifestsMachineconfigcontrollerClusterrolebindingYaml, map[string]*bintree{}}, - "controllerconfig.yaml": &bintree{manifestsMachineconfigcontrollerControllerconfigYaml, map[string]*bintree{}}, - "deployment.yaml": &bintree{manifestsMachineconfigcontrollerDeploymentYaml, map[string]*bintree{}}, - "sa.yaml": &bintree{manifestsMachineconfigcontrollerSaYaml, map[string]*bintree{}}, + "clusterrole.yaml": &bintree{manifestsMachineconfigcontrollerClusterroleYaml, map[string]*bintree{}}, + "clusterrolebinding.yaml": &bintree{manifestsMachineconfigcontrollerClusterrolebindingYaml, map[string]*bintree{}}, + "controllerconfig.yaml": &bintree{manifestsMachineconfigcontrollerControllerconfigYaml, map[string]*bintree{}}, + "deployment.yaml": &bintree{manifestsMachineconfigcontrollerDeploymentYaml, map[string]*bintree{}}, 
+ "events-clusterrole.yaml": &bintree{manifestsMachineconfigcontrollerEventsClusterroleYaml, map[string]*bintree{}}, + "events-rolebinding-default.yaml": &bintree{manifestsMachineconfigcontrollerEventsRolebindingDefaultYaml, map[string]*bintree{}}, + "events-rolebinding-target.yaml": &bintree{manifestsMachineconfigcontrollerEventsRolebindingTargetYaml, map[string]*bintree{}}, + "sa.yaml": &bintree{manifestsMachineconfigcontrollerSaYaml, map[string]*bintree{}}, }}, "machineconfigdaemon": &bintree{nil, map[string]*bintree{ "clusterrole.yaml": &bintree{manifestsMachineconfigdaemonClusterroleYaml, map[string]*bintree{}}, diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 9181170391..db1ddf71e2 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -415,14 +415,34 @@ func (optr *Operator) syncMachineConfigPools(config *renderConfig) error { } func (optr *Operator) syncMachineConfigController(config *renderConfig) error { - crBytes, err := renderAsset(config, "manifests/machineconfigcontroller/clusterrole.yaml") - if err != nil { - return err + for _, path := range []string{ + "manifests/machineconfigcontroller/clusterrole.yaml", + "manifests/machineconfigcontroller/events-clusterrole.yaml", + } { + crBytes, err := renderAsset(config, path) + if err != nil { + return err + } + cr := resourceread.ReadClusterRoleV1OrDie(crBytes) + _, _, err = resourceapply.ApplyClusterRole(optr.kubeClient.RbacV1(), cr) + if err != nil { + return err + } } - cr := resourceread.ReadClusterRoleV1OrDie(crBytes) - _, _, err = resourceapply.ApplyClusterRole(optr.kubeClient.RbacV1(), cr) - if err != nil { - return err + + for _, path := range []string{ + "manifests/machineconfigcontroller/events-rolebinding-default.yaml", + "manifests/machineconfigcontroller/events-rolebinding-target.yaml", + } { + crbBytes, err := renderAsset(config, path) + if err != nil { + return err + } + crb := resourceread.ReadRoleBindingV1OrDie(crbBytes) + _, _, err = 
resourceapply.ApplyRoleBinding(optr.kubeClient.RbacV1(), crb) + if err != nil { + return err + } } crbBytes, err := renderAsset(config, "manifests/machineconfigcontroller/clusterrolebinding.yaml")