Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
coreinformersv1 "k8s.io/client-go/informers/core/v1"
Expand Down Expand Up @@ -164,6 +165,13 @@ const (
// in the face of errors.
maxUpdateBackoff = 60 * time.Second

// excludeFromLoadBalancerLabel is used by the cloud service controller in order to know
// which nodes to exclude from backend pools of cloud load balancers.
excludeFromLoadBalancerLabel = "node.kubernetes.io/exclude-from-external-load-balancers"
// excludeFromLoadBalancerLabelValue is a custom value to recognize that
// the label was added by MCO.
excludeFromLoadBalancerLabelValue = "added-by-mco"

// used for Hypershift daemon
mcsServedConfigPath = "/etc/mcs-machine-config-content.json"
hypershiftCurrentConfigPath = "/etc/mcd-currentconfig.json"
Expand Down Expand Up @@ -456,6 +464,11 @@ func (dn *Daemon) initializeNode() error {
if dn.nodeInitialized {
return nil
}

if err := dn.addOrRemoveExcludeFromLoadBalancerLabel(false); err != nil {
glog.Warningf("Unable to remove label %s from node : %s", excludeFromLoadBalancerLabel, err)
}

// Some parts of the MCO dispatch on whether or not we're managing a control plane node
if _, isControlPlane := dn.node.Labels[ctrlcommon.MasterLabel]; isControlPlane {
glog.Infof("Node %s is part of the control plane", dn.node.Name)
Expand Down Expand Up @@ -2102,3 +2115,107 @@ func forceFileExists() bool {

return false
}

// patchLabelsOnNode takes a map of key/value string pairs and patches them as labels on the node
func (dn *Daemon) patchLabelsOnNode(labels map[string]interface{}) error {
var err error
var patchData []byte
patch := struct {
Metadata map[string]interface{} `json:"metadata"`
}{
Metadata: map[string]interface{}{
"labels": labels,
},
}
dn.logSystem("Patching labels %v", labels)
patchData, err = json.Marshal(&patch)
if err != nil {
dn.logSystem("Error in patching labels: %v", err)
return err
}
_, err = dn.kubeClient.CoreV1().Nodes().Patch(
context.TODO(), dn.name, types.MergePatchType, patchData, metav1.PatchOptions{})
if err != nil {
dn.logSystem("Error patching labels: %v", err)
return err
}
return nil
}

// Add or remove excludeFromLoadBalancerLabel to the node. If addLabel=true, it
// adds the label if the label doesn't already exist and sets it to a custom value
// so as to know that it was added by MCO. If addLabel=false, it removes the label,
// only if its value is the known custom value, otherwise it leaves it as is.
func (dn *Daemon) addOrRemoveExcludeFromLoadBalancerLabel(addLabel bool) error {
var err error
var labelValue interface{}
if dn.node == nil {
return nil
}
// Add the label only if not present; remove the label only if it has our custom value
labelValue, labelExists := dn.node.Labels[excludeFromLoadBalancerLabel]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ricky-rav , we are having trouble testing this PR with cluster-bot and it looks like the reason is because the cluster doesn't properly install. this job was able to collect some logs.

I downloaded the ipi-install-install tar file and found an interesting trace:

Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: panic: runtime error: invalid memory address or nil pointer dereference
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: [signal SIGSEGV: segmentation violation code=0x1 addr=0xb0 pc=0x192693c]
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: goroutine 1 [running]:
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/openshift/machine-config-operator/pkg/daemon.(*Daemon).addOrRemoveExcludeFromLoadBalan
cerLabel(0xc00011a000, 0x1)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/pkg/daemon/daemon.go:1679 +0x3c
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/openshift/machine-config-operator/pkg/daemon.(*Daemon).finalizeBeforeReboot(0xc00011a0
00, 0xc0002091e0)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/pkg/daemon/update.go:174 +0x65
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/openshift/machine-config-operator/pkg/daemon.(*Daemon).update(0xc00011a000, 0xc0009b83
4b, 0xc0002091e0)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/pkg/daemon/update.go:617 +0xee5
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/openshift/machine-config-operator/pkg/daemon.(*Daemon).RunFirstbootCompleteMachineconfig(0xc00011a000)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/pkg/daemon/daemon.go:616 +0xf0
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: main.runFirstBootCompleteMachineConfig(0xc0001440a0, {0xc0007dfcb8, 0xc0007dfcf8, 0x1949efa})
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/cmd/machine-config-daemon/firstboot_complete_machineconfig.go:39 +0x11d
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: main.executeFirstbootCompleteMachineConfig(0x314d960, {0x319bd10, 0x0, 0x0})
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/cmd/machine-config-daemon/firstboot_complete_machineconfig.go:43 +0x1e
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/spf13/cobra.(*Command).execute(0x314d960, {0x319bd10, 0x0, 0x0})
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/vendor/github.com/spf13/cobra/command.go:860 +0x5f8
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/spf13/cobra.(*Command).ExecuteC(0x314dbe0)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/vendor/github.com/spf13/cobra/command.go:974 +0x3bc
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: github.com/spf13/cobra.(*Command).Execute(...)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/vendor/github.com/spf13/cobra/command.go:902
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: k8s.io/component-base/cli.Run(0x314dbe0)
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/vendor/k8s.io/component-base/cli/run.go:105 +0x389
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]: main.main()
Feb 22 18:52:28 ci-ln-dlrg7rk-1d09d-9fmsd-master-0 machine-config-daemon[2165]:         /go/src/github.com/openshift/machine-config-operator/cmd/machine-config-daemon/main.go:28 +0x25

That came from the file @ log-bundle-20220222192117/control-plane/10.0.0.8/unit-status/machine-config-daemon-firstboot.service.log

does this line need some extra error handling?

Copy link
Contributor

@kikisdeliveryservice kikisdeliveryservice Feb 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeppp see:

if dn.node != nil {

you need to add a if dn.node{} around the label getting and conditionals so it can pass over during firstboot where there is no node object.

if addLabel {
if labelExists {
dn.logSystem("node %s already has label %s=%s (addLabel=%t), nothing to add",
dn.name, excludeFromLoadBalancerLabel, labelValue, addLabel)
return nil
}
// Add addOrRemoveExcludeFromLoadBalancerLabel to the node, as in:
// labels:
// beta.kubernetes.io/arch: amd64
// beta.kubernetes.io/instance-type: Standard_D4s_v3
// beta.kubernetes.io/os: linux
// failure-domain.beta.kubernetes.io/region: centralus
// failure-domain.beta.kubernetes.io/zone: centralus-1
// kubernetes.io/arch: amd64
// kubernetes.io/hostname: ci-ln-0h2ssy2-1d09d-vpvjc-worker-centralus1-l2thg
// kubernetes.io/os: linux
// node-role.kubernetes.io/worker: ""
// node.kubernetes.io/exclude-from-external-load-balancers: added-by-mco
// node.kubernetes.io/instance-type: Standard_D4s_v3
// node.openshift.io/os_id: rhcos
// topology.disk.csi.azure.com/zone: centralus-1
// topology.kubernetes.io/region: centralus
// topology.kubernetes.io/zone: centralus-1
labelValue = excludeFromLoadBalancerLabelValue
} else {
if labelExists {
if labelValue != excludeFromLoadBalancerLabelValue {
dn.logSystem("label %s=%s was not added by MCO, not removing it",
excludeFromLoadBalancerLabel, labelValue)
return nil
}
} else {
dn.logSystem("label %s not present on node %s, nothing to remove",
excludeFromLoadBalancerLabel, dn.name)
return nil
}

// Removing the label by setting its value to nil.
// Continuing from the example above:
// labels:
// beta.kubernetes.io/arch: amd64
// beta.kubernetes.io/instance-type: Standard_D4s_v3
// beta.kubernetes.io/os: linux
// failure-domain.beta.kubernetes.io/region: centralus
// failure-domain.beta.kubernetes.io/zone: centralus-1
// kubernetes.io/arch: amd64
// kubernetes.io/hostname: ci-ln-0h2ssy2-1d09d-vpvjc-worker-centralus1-l2thg
// kubernetes.io/os: linux
// node-role.kubernetes.io/worker: ""
// node.kubernetes.io/instance-type: Standard_D4s_v3
// node.openshift.io/os_id: rhcos
// topology.disk.csi.azure.com/zone: centralus-1
// topology.kubernetes.io/region: centralus
// topology.kubernetes.io/zone: centralus-1

labelValue = nil
}
err = dn.patchLabelsOnNode(map[string]interface{}{excludeFromLoadBalancerLabel: labelValue})
if err != nil {
dn.logSystem("Error patching node %s with label %s=%s: %v",
dn.name, excludeFromLoadBalancerLabel, labelValue, err)
return err
}
return nil
}
144 changes: 144 additions & 0 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package daemon

import (
"context"
"encoding/json"
"os"
"reflect"
Expand All @@ -13,6 +14,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/vincent-petithory/dataurl"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -318,6 +320,14 @@ func newNode(annotations map[string]string) *corev1.Node {
}
}

func newNodeWithLabels(labels map[string]string) *corev1.Node {
return &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
},
}
}

func TestPrepUpdateFromClusterOnDiskDrift(t *testing.T) {
tmpCurrentConfig, err := os.CreateTemp("", "currentconfig")
require.Nil(t, err)
Expand Down Expand Up @@ -395,3 +405,137 @@ func TestPrepUpdateFromClusterOnDiskDrift(t *testing.T) {
require.Equal(t, onDiskMC.GetName(), current.GetName())
require.Equal(t, desired.GetName(), "test2")
}

func TestAddOrRemoveExcludeFromLoadBalancerLabel(t *testing.T) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯 💯 💯

otherExcludeValue := "not-set-by-mco"
otherLabel := "myLabel"
otherLabelValue := "myValue"
labelsWithNoExclude := map[string]string{
otherLabel: otherLabelValue,
}
labelsWithEmptyExclude := map[string]string{
otherLabel: otherLabelValue,
excludeFromLoadBalancerLabel: "",
}
labelsWithMCOExclude := map[string]string{
otherLabel: otherLabelValue,
excludeFromLoadBalancerLabel: excludeFromLoadBalancerLabelValue,
}
labelsWithOtherExclude := map[string]string{
otherLabel: otherLabelValue,
excludeFromLoadBalancerLabel: otherExcludeValue,
}

initializeEnvironment := func(labels map[string]string) *Daemon {
f := newFixture(t)
node := newNodeWithLabels(labels)
node.Name = "node_name_test"
f.nodeLister = append(f.nodeLister, node)
f.kubeobjects = append(f.kubeobjects, node)
dn := f.newController()
dn.node = node
return dn
}

readNode := func(dn *Daemon) *v1.Node {
newNode, err := dn.kubeClient.CoreV1().Nodes().Get(
context.TODO(), dn.node.Name, metav1.GetOptions{})
if err != nil {
t.Errorf("error reading node: %v", err)
}
return newNode
}

checkForOtherLabel := func(node *v1.Node) {
otherLabelValueRead, otherLabelExists := node.Labels[otherLabel]
assert.True(t, otherLabelExists)
assert.EqualValues(t, otherLabelValue, otherLabelValueRead)
}

// Add "exclude" label to a node that doesn't have it yet
dn := initializeEnvironment(labelsWithNoExclude)
err := dn.addOrRemoveExcludeFromLoadBalancerLabel(true)
assert.Nil(t, err)
newNode := readNode(dn)
labelValue, labelExists := newNode.Labels[excludeFromLoadBalancerLabel]
assert.True(t, labelExists)
assert.EqualValues(t, excludeFromLoadBalancerLabelValue, labelValue)
checkForOtherLabel(newNode)

// Try to remove the "exclude" label from a node that doesn't have it yet
dn = initializeEnvironment(labelsWithNoExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(false)
assert.Nil(t, err)
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.False(t, labelExists)
checkForOtherLabel(newNode)

// Try to add the "exclude" label to a node that has it already, but whose value is empty.
// It should leave the node as is.
dn = initializeEnvironment(labelsWithEmptyExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(true)
assert.Nil(t, err)
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.True(t, labelExists)
assert.EqualValues(t, "", labelValue)
checkForOtherLabel(newNode)

// Try to remove the "exclude" label from a node that has it already,
// but whose value is empty. Check that the label does NOT get removed.
dn = initializeEnvironment(labelsWithEmptyExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(false)
assert.Nil(t, err)
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.True(t, labelExists)
assert.EqualValues(t, "", labelValue)
checkForOtherLabel(newNode)

// Try to add the "exclude" label to a node that has it already, and whose value is
// the one that MCO adds. Nothing should change.
dn = initializeEnvironment(labelsWithMCOExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(true)
assert.Nil(t, err)
// check that the label is still there
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.True(t, labelExists)
assert.EqualValues(t, excludeFromLoadBalancerLabelValue, labelValue)
checkForOtherLabel(newNode)

// Remove the "exclude" label from a node that has it already, and whose value is
// the one MCO adds. The label should be removed
dn = initializeEnvironment(labelsWithMCOExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(false)
assert.Nil(t, err)
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.False(t, labelExists)
checkForOtherLabel(newNode)

// Try to add the "exclude" label to a node that has it already, but whose value
// is arbitrary.
// Both "add" and "remove" should leave the node as is.
dn = initializeEnvironment(labelsWithOtherExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(true)
assert.Nil(t, err)
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.True(t, labelExists)
assert.EqualValues(t, otherExcludeValue, labelValue)
checkForOtherLabel(newNode)

// Try to remove the "exclude" label from a node that has it already, but whose value
// is arbitrary. Check that the label does NOT get removed
dn = initializeEnvironment(labelsWithOtherExclude)
err = dn.addOrRemoveExcludeFromLoadBalancerLabel(false)
assert.Nil(t, err)
newNode = readNode(dn)
labelValue, labelExists = newNode.Labels[excludeFromLoadBalancerLabel]
assert.True(t, labelExists)
assert.EqualValues(t, otherExcludeValue, labelValue)
checkForOtherLabel(newNode)

}
4 changes: 4 additions & 0 deletions pkg/daemon/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string
// finalizeBeforeReboot is the last step in an update() and then we take appropriate postConfigChangeAction.
// It can also be called as a special case for the "bootstrap pivot".
func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr error) {
if err := dn.addOrRemoveExcludeFromLoadBalancerLabel(true); err != nil {
glog.Warningf("Unable to add label %s to node : %s", excludeFromLoadBalancerLabel, err)
}

if out, err := dn.storePendingState(newConfig, 1); err != nil {
return fmt.Errorf("failed to log pending config: %s: %w", string(out), err)
}
Expand Down