diff --git a/pkg/monitortests/cli/adm_upgrade/status/controlplane.go b/pkg/monitortests/cli/adm_upgrade/status/controlplane.go new file mode 100644 index 000000000000..bb44bbafceef --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/controlplane.go @@ -0,0 +1,133 @@ +package admupgradestatus + +import ( + "fmt" + "regexp" + "strings" + "time" + + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var ( + operatorLinePattern = regexp.MustCompile(`^\S+\s+\S+\s+\S+\s+.*$`) +) + +func (w *monitor) controlPlane() *junitapi.JUnitTestCase { + controlPlane := &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + } + + failureOutputBuilder := strings.Builder{} + + for _, observed := range w.ocAdmUpgradeStatusOutputModels { + if observed.output == nil { + // Failing to parse the output is handled in expectedLayout, so we can skip here + continue + } + // We saw at least one successful execution of oc adm upgrade status, so we have data to process + controlPlane.SkipMessage = nil + + wroteOnce := false + fail := func(message string) { + if !wroteOnce { + wroteOnce = true + failureOutputBuilder.WriteString(fmt.Sprintf("\n===== %s\n", observed.when.Format(time.RFC3339))) + failureOutputBuilder.WriteString(observed.output.rawOutput) + failureOutputBuilder.WriteString(fmt.Sprintf("\n\n=> %s\n", message)) + } + } + + if !observed.output.updating { + // If the cluster is not updating, control plane should not be updating + if observed.output.controlPlane != nil { + fail("Cluster is not updating but control plane section is present") + } + continue + } + + cp := observed.output.controlPlane + if cp == nil { + fail("Cluster is updating but control plane section is not present") + continue + } + + if cp.Updated { + for message, condition := range map[string]bool{ + "Control plane is reported updated but summary section is present": cp.Summary != nil, + "Control plane is reported updated but operators section is present": cp.Operators != nil, + "Control plane is reported updated but nodes section is present": cp.Nodes != nil, + "Control plane is reported updated but nodes are not updated": !cp.NodesUpdated, + } { + if condition { + fail(message) + } + } + continue + } + + if cp.Summary == nil { + fail("Control plane is not updated but summary section is not present") + } + + for _, key := range []string{"Assessment", "Target Version", "Completion", "Duration", "Operator Health"} { + value, ok := cp.Summary[key] + if !ok { + fail(fmt.Sprintf("Control plane summary does not contain %s", key)) + } else if value == "" { + fail(fmt.Sprintf("%s is empty", key)) + } + } + + updatingOperators, ok := cp.Summary["Updating"] + if !ok { + if cp.Operators != nil { + fail("Control plane summary does not contain Updating key but operators section is present") + continue + } + } else { + if updatingOperators == "" { + fail("Control plane summary contains Updating key but it is empty") + continue + } + + if cp.Operators == nil { + fail("Control plane summary contains Updating key but operators section is not present") + continue + } + + items := len(strings.Split(updatingOperators, ",")) + // TODO: These should actually exactly match, but `oc adm upgrade status` emits operators with linebreaks in + // messages in a crappy way which we will need to fix + if len(cp.Operators) < items { + 
fail(fmt.Sprintf("Control plane summary contains Updating key with %d operators but operators section has %d items", items, len(cp.Operators))) + continue + } + } + + // TODO: `oc adm upgrade status` emits operators with linebreaks in messages in a crappy way which we will need to fix + // for _, operator := range cp.Operators { + // if !operatorLinePattern.MatchString(operator) { + // fail(fmt.Sprintf("Bad line in operators: %s", operator)) + // } + // } + + for _, node := range cp.Nodes { + if !nodeLinePattern.MatchString(node) { + fail(fmt.Sprintf("Bad line in nodes: %s", node)) + } + } + } + + if failureOutputBuilder.Len() > 0 { + controlPlane.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("observed unexpected outputs in oc adm upgrade status control plane section"), + Output: failureOutputBuilder.String(), + } + } + + return controlPlane +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/controlplane_test.go b/pkg/monitortests/cli/adm_upgrade/status/controlplane_test.go new file mode 100644 index 000000000000..db65313077ac --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/controlplane_test.go @@ -0,0 +1,270 @@ +package admupgradestatus + +import ( + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var cpExampleOutput = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Updating: kube-apiserver +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +kube-apiserver 7m27s NodeInstaller NodeInstallerProgressing: 1 node is at revision 7; 2 nodes are at revision 8 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-53-218.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-99-189.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-100-255.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var cpBadOutput = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. 
Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + +SOMETHING UNEXPECTED HERE + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var missingItemInSummary = `= Control Plane = +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var operatorsSectionWithoutSummaryLine = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +kube-apiserver 7m27s NodeInstaller NodeInstallerProgressing: 1 node is at revision 7; 2 nodes are at revision 8 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +// TODO: This is a bug in `oc adm upgrade status` that we will fix but for now we need to tolerate +// Eventually we will fail on output like this and we will also need to add a testcase for the fixed output +var operatorsWithLinebreaksInMessages = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest (from 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial) +Updating: image-registry, monitoring, openshift-controller-manager +Completion: 50% (17 operators updated, 3 updating, 14 waiting) +Duration: 24m (Est. Time Remaining: 45m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +image-registry 6s DeploymentNotCompleted::NodeCADaemonUnavailable NodeCADaemonProgressing: The daemon set node-ca is deploying node pods +Progressing: The deployment has not completed +monitoring 4s RollOutInProgress Rolling out the stack. 
+openshift-controller-manager 11s RouteControllerManager_DesiredStateNotYetAchieved::_DesiredStateNotYetAchieved Progressing: deployment/controller-manager: observed generation is 10, desired generation is 11 +Progressing: deployment/controller-manager: updated replicas is 1, desired replicas is 3 +RouteControllerManagerProgressing: deployment/route-controller-manager: observed generation is 7, desired generation is 8 +RouteControllerManagerProgressing: deployment/route-controller-manager: updated replicas is 1, desired replicas is 3 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-10-232.ec2.internal Outdated Pending 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial ? +ip-10-0-8-129.ec2.internal Outdated Pending 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial ? +ip-10-0-88-44.ec2.internal Outdated Pending 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-47-75.ec2.internal Outdated Pending 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial ? +ip-10-0-57-235.ec2.internal Outdated Pending 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial ? +ip-10-0-64-121.ec2.internal Outdated Pending 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +24m12s Info None Update is proceeding well` + +var controlPlaneCompleted = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest successfully completed at 2025-08-13T20:33:32Z (duration: 59m) + +All control plane nodes successfully updated to 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Completed 100% (3/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-47-75.ec2.internal Completed Updated 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest - +ip-10-0-57-235.ec2.internal Completed Updated 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest - +ip-10-0-64-121.ec2.internal Completed Updated 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest - + += Update Health = +SINCE LEVEL IMPACT MESSAGE +59m22s Info None Update is proceeding well` + +func TestMonitor_ControlPlane(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + snapshots []snapshot + expected *junitapi.JUnitTestCase + }{ + { + name: "no snapshots -> test skipped", + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + SkipMessage: &junitapi.SkipMessage{Message: "Test skipped because no oc adm upgrade status output was successfully collected"}, + }, + }, + { + name: "good snapshots", + snapshots: []snapshot{ + {when: time.Now(), out: cpExampleOutput}, + {when: time.Now(), out: cpExampleOutput}, + {when: time.Now(), out: cpExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + }, + }, + { + name: "errored snapshots are skipped", + snapshots: []snapshot{ + {when: time.Now(), out: cpExampleOutput}, + {when: 
time.Now(), out: cpBadOutput, err: errors.New("some error")}, + {when: time.Now(), out: cpExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + }, + }, + { + name: "unparseable snapshots are skipped", + snapshots: []snapshot{ + {when: time.Now(), out: cpExampleOutput}, + {when: time.Now(), out: "unparseable output"}, + {when: time.Now(), out: cpExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + }, + }, + { + name: "missing items in summary", + snapshots: []snapshot{ + {when: time.Now(), out: missingItemInSummary}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status control plane section", + }, + }, + }, + { + name: "operators section without summary line", + snapshots: []snapshot{ + {when: time.Now(), out: operatorsSectionWithoutSummaryLine}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status control plane section", + }, + }, + }, + { + name: "operators section with line breaks in messages", + snapshots: []snapshot{ + {when: time.Now(), out: operatorsWithLinebreaksInMessages}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + }, + }, + { + name: "control plane completed", + snapshots: []snapshot{ + {when: time.Now(), out: controlPlaneCompleted}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status control plane section is consistent", + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + m := NewOcAdmUpgradeStatusChecker().(*monitor) + m.ocAdmUpgradeStatus = append(m.ocAdmUpgradeStatus, tc.snapshots...) 
+ + ignoreOutput := cmpopts.IgnoreFields(junitapi.FailureOutput{}, "Output") + + // Process snapshots into models for the controlPlane check to work with + _ = m.expectedLayout() + + result := m.controlPlane() + if diff := cmp.Diff(tc.expected, result, ignoreOutput); diff != "" { + t.Errorf("unexpected result (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/health.go b/pkg/monitortests/cli/adm_upgrade/status/health.go new file mode 100644 index 000000000000..23e18f29f2ca --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/health.go @@ -0,0 +1,90 @@ +package admupgradestatus + +import ( + "fmt" + "regexp" + "strings" + "time" + + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var ( + healthLinePattern = regexp.MustCompile(`^\S+\s+\S+\S+\s+\S+.*$`) + healthMessageFields = map[string]*regexp.Regexp{ + "Message": regexp.MustCompile(`(?m)^Message: +\S+.*$`), + "Since": regexp.MustCompile(`(?m)^ {2}Since: +\S+.*$`), + "Level": regexp.MustCompile(`(?m)^ {2}Level: +\S+.*$`), + "Impact": regexp.MustCompile(`(?m)^ {2}Impact: +\S+.*$`), + "Reference": regexp.MustCompile(`(?m)^ {2}Reference: +\S+.*$`), + "Resources": regexp.MustCompile(`(?m)^ {2}Resources:$`), + "resource reference": regexp.MustCompile(`(?m)^ {4}[a-z0-9_.-]+: +\S+$`), + "Description": regexp.MustCompile(`(?m)^ {2}Description: +\S+.*$`), + } +) + +func (w *monitor) health() *junitapi.JUnitTestCase { + health := &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + } + + failureOutputBuilder := strings.Builder{} + + for _, observed := range w.ocAdmUpgradeStatusOutputModels { + if observed.output == nil { + // Failing to parse the output is handled in expectedLayout, so we can skip here + continue + } + // We saw at least one successful execution of oc adm upgrade status, so we have data to process + health.SkipMessage = nil + + wroteOnce := false + fail := func(message string) { + if !wroteOnce { + wroteOnce = true + failureOutputBuilder.WriteString(fmt.Sprintf("\n===== %s\n", observed.when.Format(time.RFC3339))) + failureOutputBuilder.WriteString(observed.output.rawOutput) + failureOutputBuilder.WriteString(fmt.Sprintf("\n\n=> %s\n", message)) + } + } + + if !observed.output.updating { + if observed.output.health != nil { + fail("Cluster is not updating but health section is present") + } + continue + } + + h := observed.output.health + if h == nil { + fail("Cluster is updating but health section is not present") + continue + } + + for _, item := range h.Messages { + if h.Detailed { + for field, pattern := range healthMessageFields { + if pattern.FindString(item) == "" { + fail(fmt.Sprintf("Health message does not contain field %s", field)) + } + } + } else { + if !healthLinePattern.MatchString(item) { + fail(fmt.Sprintf("Health message does not match expected pattern:\n%s", item)) + } + } + } + } + + if failureOutputBuilder.Len() > 0 { + health.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("observed unexpected outputs in oc adm upgrade status health section"), + Output: failureOutputBuilder.String(), + } + } + + return health +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/health_test.go b/pkg/monitortests/cli/adm_upgrade/status/health_test.go new file mode 100644 index 000000000000..165070c14947 --- /dev/null +++ 
b/pkg/monitortests/cli/adm_upgrade/status/health_test.go @@ -0,0 +1,269 @@ +package admupgradestatus + +import ( + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var healthExampleOutput = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Updating: kube-apiserver +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +kube-apiserver 7m27s NodeInstaller NodeInstallerProgressing: 1 node is at revision 7; 2 nodes are at revision 8 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-53-218.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-99-189.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-100-255.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var healthBadOutput = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? 
+ +SOMETHING UNEXPECTED HERE + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var healthTableOutput = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m) + +All control plane nodes successfully updated to 4.16.0-ec.3 + += Update Health = +SINCE LEVEL IMPACT MESSAGE +58m18s Error API Availability Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) +now Warning Update Stalled Cluster Version version is failing to proceed with the update (ClusterOperatorsDegraded)` + +var healthDetailedOutputSingle = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m) + +All control plane nodes successfully updated to 4.16.0-ec.3 + += Update Health = +Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) + Since: 58m18s + Level: Error + Impact: API Availability + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusteroperators.config.openshift.io: kube-apiserver + Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)` + +var healthDetailedOutputMultiple = ` +Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m) + +All control plane nodes successfully updated to 4.16.0-ec.3 + += Update Health = +Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) + Since: 58m18s + Level: Error + Impact: API Availability + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusteroperators.config.openshift.io: kube-apiserver + Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?) 
+ +Message: Cluster Version version is failing to proceed with the update (ClusterOperatorsDegraded) + Since: now + Level: Warning + Impact: Update Stalled + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusterversions.config.openshift.io: version + Description: Cluster operators etcd, kube-apiserver are degraded` + +var healthMissingField = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m) + +All control plane nodes successfully updated to 4.16.0-ec.3 + += Update Health = +Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) + Since: 58m18s + Impact: API Availability + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusteroperators.config.openshift.io: kube-apiserver + Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)` + +var healthEmptyField = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m) + +All control plane nodes successfully updated to 4.16.0-ec.3 + += Update Health = +Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) + Since: + Level: Warning + Impact: API Availability + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusteroperators.config.openshift.io: kube-apiserver + Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. 
Has your network provider started?)` + +func TestMonitor_Health(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + snapshots []snapshot + expected *junitapi.JUnitTestCase + }{ + { + name: "no snapshots -> test skipped", + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + }, + }, + { + name: "good snapshots", + snapshots: []snapshot{ + {when: time.Now(), out: healthExampleOutput}, + {when: time.Now(), out: healthExampleOutput}, + {when: time.Now(), out: healthExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + }, + }, + { + name: "errored snapshots are skipped", + snapshots: []snapshot{ + {when: time.Now(), out: healthExampleOutput}, + {when: time.Now(), out: healthBadOutput, err: errors.New("some error")}, + {when: time.Now(), out: healthExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + }, + }, + { + name: "unparseable snapshots are skipped", + snapshots: []snapshot{ + {when: time.Now(), out: healthExampleOutput}, + {when: time.Now(), out: "unparseable output"}, + {when: time.Now(), out: healthExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + }, + }, + { + name: "multiple table lines", + snapshots: []snapshot{ + {when: time.Now(), out: healthTableOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + }, + }, + { + name: "detailed output single item", + snapshots: []snapshot{ + {when: time.Now(), out: healthDetailedOutputSingle}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + }, + }, + { + name: "detailed output multiple items", + snapshots: []snapshot{ + {when: time.Now(), out: healthDetailedOutputMultiple}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + }, + }, + { + name: "missing item from detailed output", + snapshots: []snapshot{ + {when: time.Now(), out: healthMissingField}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status health section", + }, + }, + }, + { + name: "empty item from detailed output", + snapshots: []snapshot{ + {when: time.Now(), out: healthEmptyField}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status health section is consistent", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status health section", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + m := NewOcAdmUpgradeStatusChecker().(*monitor) + m.ocAdmUpgradeStatus = append(m.ocAdmUpgradeStatus, tc.snapshots...) 
+ + ignoreOutput := cmpopts.IgnoreFields(junitapi.FailureOutput{}, "Output") + + // Process snapshots into models for the health check to work with + _ = m.expectedLayout() + + result := m.health() + if diff := cmp.Diff(tc.expected, result, ignoreOutput); diff != "" { + t.Errorf("unexpected result (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/monitortest.go b/pkg/monitortests/cli/adm_upgrade/status/monitortest.go index 6461d580d864..1cfec0967a45 100644 --- a/pkg/monitortests/cli/adm_upgrade/status/monitortest.go +++ b/pkg/monitortests/cli/adm_upgrade/status/monitortest.go @@ -6,12 +6,15 @@ import ( "os" "path" "path/filepath" + "sort" "strings" "time" + configv1 "github.com/openshift/api/config/v1" clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned" "github.com/openshift/origin/pkg/monitortestframework" exutil "github.com/openshift/origin/test/extended/util" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" @@ -26,17 +29,28 @@ type snapshot struct { out string err error } + +type outputModel struct { + when time.Time + output *upgradeStatusOutput +} type monitor struct { - collectionDone chan struct{} - ocAdmUpgradeStatus map[time.Time]*snapshot + collectionDone chan struct{} + + ocAdmUpgradeStatus []snapshot + ocAdmUpgradeStatusOutputModels []outputModel + notSupportedReason error isSNO bool + + configv1client *clientconfigv1.Clientset + initialClusterVersion *configv1.ClusterVersion } func NewOcAdmUpgradeStatusChecker() monitortestframework.MonitorTest { return &monitor{ collectionDone: make(chan struct{}), - ocAdmUpgradeStatus: map[time.Time]*snapshot{}, + ocAdmUpgradeStatus: make([]snapshot, 0, 60), // expect at least 60 snaphots in a job, one per minute } } @@ -53,23 +67,32 @@ func (w *monitor) PrepareCollection(ctx context.Context, adminRESTConfig *rest.C w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform MicroShift not supported"} return w.notSupportedReason } - clientconfigv1client, err := clientconfigv1.NewForConfig(adminRESTConfig) + configClient, err := clientconfigv1.NewForConfig(adminRESTConfig) if err != nil { return err } + w.configv1client = configClient - if ok, err := exutil.IsHypershift(ctx, clientconfigv1client); err != nil { + if ok, err := exutil.IsHypershift(ctx, w.configv1client); err != nil { return fmt.Errorf("unable to determine if cluster is Hypershift: %v", err) } else if ok { w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform Hypershift not supported"} return w.notSupportedReason } - if ok, err := exutil.IsSingleNode(ctx, clientconfigv1client); err != nil { + if ok, err := exutil.IsSingleNode(ctx, w.configv1client); err != nil { return fmt.Errorf("unable to determine if cluster is single node: %v", err) } else { w.isSNO = ok } + + cv, err := w.configv1client.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("unable to get cluster version: %w", err) + } + + w.initialClusterVersion = cv + return nil } @@ -83,7 +106,7 @@ func snapshotOcAdmUpgradeStatus(ch chan *snapshot) { var err error // retry on brief apiserver unavailability if errWait := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, true, func(context.Context) (bool, error) { - cmd := oc.Run("adm", "upgrade", "status").EnvVar("OC_ENABLE_CMD_UPGRADE_STATUS", "true") + cmd := 
oc.Run("adm", "upgrade", "status", "--details=all").EnvVar("OC_ENABLE_CMD_UPGRADE_STATUS", "true") out, err = cmd.Output() if err != nil { return false, nil @@ -106,12 +129,12 @@ func (w *monitor) StartCollection(ctx context.Context, adminRESTConfig *rest.Con go func() { for snap := range snapshots { // TODO: Maybe also collect some cluster resources (CV? COs?) through recorder? - w.ocAdmUpgradeStatus[snap.when] = snap + w.ocAdmUpgradeStatus = append(w.ocAdmUpgradeStatus, *snap) } w.collectionDone <- struct{}{} }() // TODO: Configurable interval? - // TODO: Collect multiple invocations (--details)? Would need more another producer/consumer pair and likely + // TODO: Collect multiple invocations (without --details)? Would need more another producer/consumer pair and likely // collectionDone would need to be a WaitGroup wait.UntilWithContext(ctx, func(ctx context.Context) { snapshotOcAdmUpgradeStatus(snapshots) }, time.Minute) @@ -133,31 +156,28 @@ func (w *monitor) CollectData(ctx context.Context, storageDir string, beginning, // the collection goroutines spawned in StartedCollection to finish <-w.collectionDone - noFailures := &junitapi.JUnitTestCase{ - Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", - } + sort.Slice(w.ocAdmUpgradeStatus, func(i, j int) bool { + return w.ocAdmUpgradeStatus[i].when.Before(w.ocAdmUpgradeStatus[j].when) + }) - var failures []string - var total int - for when, observed := range w.ocAdmUpgradeStatus { - total++ - if observed.err != nil { - failures = append(failures, fmt.Sprintf("- %s: %v", when.Format(time.RFC3339), observed.err)) - } - } + // TODO: Maybe utilize Intervals somehow and do tests in ComputeComputedIntervals and EvaluateTestsFromConstructedIntervals - // Zero failures is too strict for at least SNO clusters - p := (len(failures) / total) * 100 - if (!w.isSNO && p > 0) || (w.isSNO && p > 10) { - noFailures.FailureOutput = &junitapi.FailureOutput{ - Message: fmt.Sprintf("oc adm upgrade status failed %d times (of %d)", len(failures), len(w.ocAdmUpgradeStatus)), - Output: strings.Join(failures, "\n"), + wasUpdated := func() (bool, error) { + cv, err := w.configv1client.ConfigV1().ClusterVersions().Get(ctx, "version", metav1.GetOptions{}) + if err != nil { + return false, fmt.Errorf("failed to get cluster version: %w", err) } + return len(cv.Status.History) > len(w.initialClusterVersion.Status.History), nil } - // TODO: Maybe utilize Intervals somehow and do tests in ComputeComputedIntervals and EvaluateTestsFromConstructedIntervals - - return nil, []*junitapi.JUnitTestCase{noFailures}, nil + return nil, []*junitapi.JUnitTestCase{ + w.noFailures(), + w.expectedLayout(), + w.controlPlane(), + w.workers(), + w.health(), + w.updateLifecycle(wasUpdated), + }, nil } func (w *monitor) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) { @@ -178,10 +198,10 @@ func (w *monitor) WriteContentToStorage(ctx context.Context, storageDir, timeSuf } var errs []error - for when, observed := range w.ocAdmUpgradeStatus { - outputFilename := fmt.Sprintf("adm-upgrade-status-%s_%s.txt", when, timeSuffix) + for _, snap := range w.ocAdmUpgradeStatus { + outputFilename := fmt.Sprintf("adm-upgrade-status-%s_%s.txt", snap.when, timeSuffix) outputFile := filepath.Join(folderPath, outputFilename) - if err := os.WriteFile(outputFile, []byte(observed.out), 0644); err != nil { + if err := os.WriteFile(outputFile, 
[]byte(snap.out), 0644); err != nil { errs = append(errs, fmt.Errorf("failed to write %s: %w", outputFile, err)) } } @@ -191,3 +211,87 @@ func (w *monitor) WriteContentToStorage(ctx context.Context, storageDir, timeSuf func (*monitor) Cleanup(ctx context.Context) error { return nil } + +func (w *monitor) noFailures() *junitapi.JUnitTestCase { + noFailures := &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", + } + + var failures []string + var total int + for _, snap := range w.ocAdmUpgradeStatus { + total++ + if snap.err != nil { + failures = append(failures, fmt.Sprintf("- %s: %v", snap.when.Format(time.RFC3339), snap.err)) + } + } + + if total == 0 { + noFailures.SkipMessage = &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was collected", + } + return noFailures + } + + // Zero failures is too strict for at least SNO clusters + p := (float32(len(failures)) / float32(total)) * 100 + if (!w.isSNO && p > 0) || (w.isSNO && p > 10) { + noFailures.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("oc adm upgrade status failed %d times (of %d)", len(failures), len(w.ocAdmUpgradeStatus)), + Output: strings.Join(failures, "\n"), + } + } + return noFailures +} + +func (w *monitor) expectedLayout() *junitapi.JUnitTestCase { + expectedLayout := &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + } + + w.ocAdmUpgradeStatusOutputModels = make([]outputModel, len(w.ocAdmUpgradeStatus)) + + failureOutputBuilder := strings.Builder{} + + for i, observed := range w.ocAdmUpgradeStatus { + w.ocAdmUpgradeStatusOutputModels[i] = outputModel{ + when: observed.when, + } + + if observed.err != nil { + // Failures are handled in noFailures, so we can skip them here + continue + } + + // We saw at least one successful execution of oc adm upgrade status, so we have data to process + // and we do not need to skip + expectedLayout.SkipMessage = nil + + if observed.out == "" { + failureOutputBuilder.WriteString(fmt.Sprintf("- %s: unexpected empty output", observed.when.Format(time.RFC3339))) + continue + } + + model, err := newUpgradeStatusOutput(observed.out) + if err != nil { + failureOutputBuilder.WriteString(fmt.Sprintf("\n===== %s\n", observed.when.Format(time.RFC3339))) + failureOutputBuilder.WriteString(observed.out) + failureOutputBuilder.WriteString(fmt.Sprintf("\n\n=> Failed to parse output above: %v\n", err)) + continue + } + + w.ocAdmUpgradeStatusOutputModels[i].output = model + } + + if failureOutputBuilder.Len() > 0 { + expectedLayout.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("observed unexpected outputs in oc adm upgrade status"), + Output: failureOutputBuilder.String(), + } + } + + return expectedLayout +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/monitortest_test.go b/pkg/monitortests/cli/adm_upgrade/status/monitortest_test.go new file mode 100644 index 000000000000..60f49e4bb886 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/monitortest_test.go @@ -0,0 +1,230 @@ +package admupgradestatus + +import ( + "fmt" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var exampleOutput = `Unable to fetch alerts, ignoring alerts in 'Update 
Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Updating: kube-apiserver +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +kube-apiserver 7m27s NodeInstaller NodeInstallerProgressing: 1 node is at revision 7; 2 nodes are at revision 8 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-53-218.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-99-189.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-100-255.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var badOutput = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? 
+ +SOMETHING UNEXPECTED HERE + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +func TestMonitor_NoFailures(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + snapshots []snapshot + expected *junitapi.JUnitTestCase + }{ + { + name: "no snapshots", + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", + SkipMessage: &junitapi.SkipMessage{Message: "Test skipped because no oc adm upgrade status output was collected"}, + }, + }, + { + name: "no successful snapshots", + snapshots: []snapshot{ + {when: time.Now(), err: fmt.Errorf("some error")}, + {when: time.Now(), err: fmt.Errorf("some error")}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", + FailureOutput: &junitapi.FailureOutput{ + Message: "oc adm upgrade status failed 2 times (of 2)", + }, + }, + }, + { + name: "two successful snapshots", + snapshots: []snapshot{ + {when: time.Now(), err: nil, out: exampleOutput}, + {when: time.Now(), err: nil, out: exampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", + }, + }, + { + name: "mixed snapshots", + snapshots: []snapshot{ + {when: time.Now(), err: nil, out: exampleOutput}, + {when: time.Now(), err: fmt.Errorf("some error")}, + {when: time.Now(), err: nil, out: ""}, + {when: time.Now(), err: nil, out: exampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", + FailureOutput: &junitapi.FailureOutput{ + Message: "oc adm upgrade status failed 1 times (of 4)", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + m := NewOcAdmUpgradeStatusChecker().(*monitor) + m.ocAdmUpgradeStatus = append(m.ocAdmUpgradeStatus, tc.snapshots...) 
+ + ignoreOutput := cmpopts.IgnoreFields(junitapi.FailureOutput{}, "Output") + + result := m.noFailures() + if diff := cmp.Diff(tc.expected, result, ignoreOutput); diff != "" { + t.Errorf("unexpected result (-want +got):\n%s", diff) + } + }) + } +} + +func TestMonitor_ExpectedLayout(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + snapshots []snapshot + expected *junitapi.JUnitTestCase + }{ + { + name: "no snapshots", + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + SkipMessage: &junitapi.SkipMessage{Message: "Test skipped because no oc adm upgrade status output was successfully collected"}, + }, + }, + { + name: "two unsuccessful snapshots", + snapshots: []snapshot{ + {when: time.Now(), err: fmt.Errorf("some error")}, + {when: time.Now(), err: fmt.Errorf("another error")}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + }, + }, + { + name: "two successful snapshots", + snapshots: []snapshot{ + {when: time.Now(), err: nil, out: exampleOutput}, + {when: time.Now(), err: nil, out: exampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + }, + }, + { + name: "errored snapshots do not count", + snapshots: []snapshot{ + {when: time.Now(), err: nil, out: exampleOutput}, + {when: time.Now(), err: fmt.Errorf("some error")}, + {when: time.Now(), err: nil, out: exampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + }, + }, + { + name: "no error but empty output fails the check", + snapshots: []snapshot{ + {when: time.Now(), err: nil, out: exampleOutput}, + {when: time.Now(), err: nil, out: ""}, + {when: time.Now(), err: nil, out: exampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status", + }, + }, + }, + { + name: "nonconforming output fails the check", + snapshots: []snapshot{ + {when: time.Now(), err: nil, out: exampleOutput}, + {when: time.Now(), err: nil, out: badOutput}, + {when: time.Now(), err: nil, out: exampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status output has expected layout", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + m := NewOcAdmUpgradeStatusChecker().(*monitor) + m.ocAdmUpgradeStatus = append(m.ocAdmUpgradeStatus, tc.snapshots...) 
+ + ignoreOutput := cmpopts.IgnoreFields(junitapi.FailureOutput{}, "Output") + + result := m.expectedLayout() + if diff := cmp.Diff(tc.expected, result, ignoreOutput); diff != "" { + t.Errorf("unexpected result (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/outputmodel.go b/pkg/monitortests/cli/adm_upgrade/status/outputmodel.go new file mode 100644 index 000000000000..e4550ca5eb70 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/outputmodel.go @@ -0,0 +1,505 @@ +package admupgradestatus + +import ( + "errors" + "fmt" + "regexp" + "strings" +) + +type ControlPlaneStatus struct { + Updated bool + Summary map[string]string + Operators []string + NodesUpdated bool + Nodes []string +} + +type WorkersStatus struct { + Pools []string + Nodes map[string][]string +} + +type Health struct { + Detailed bool + Messages []string +} + +type upgradeStatusOutput struct { + rawOutput string + updating bool + controlPlane *ControlPlaneStatus + workers *WorkersStatus + health *Health +} + +var unableToFetchAlerts = regexp.MustCompile(`^Unable to fetch alerts.*`) + +func newUpgradeStatusOutput(output string) (*upgradeStatusOutput, error) { + output = strings.TrimSpace(output) + + if output == "The cluster is not updating." { + return &upgradeStatusOutput{ + rawOutput: output, + updating: false, + controlPlane: nil, + workers: nil, + }, nil + } + + lines := strings.Split(output, "\n") + parser := &parser{lines: lines, pos: 0} + + if parser.tryRegex(unableToFetchAlerts) { + _ = parser.eatRegex(unableToFetchAlerts) + } + + controlPlane, err := parser.parseControlPlaneSection() + if err != nil { + return nil, err + } + + workers, err := parser.parseWorkerUpgradeSection() + if err != nil { + return nil, err + } + + health, err := parser.parseHealthSection() + if err != nil { + return nil, err + } + + return &upgradeStatusOutput{ + rawOutput: output, + updating: true, + controlPlane: controlPlane, + workers: workers, + health: health, + }, nil +} + +type parser struct { + lines []string + pos int +} + +var ( + updatingOperatorsHeaderPattern = regexp.MustCompile(`^NAME\s+SINCE\s+REASON\s+MESSAGE$`) + nodesHeaderPattern = regexp.MustCompile(`^NAME\s+ASSESSMENT\s+PHASE\s+VERSION\s+EST\s+MESSAGE$`) + workerPoolsHeaderPattern = regexp.MustCompile(`^WORKER POOL\s+ASSESSMENT\s+COMPLETION\s+STATUS$`) + healthHeaderPattern = regexp.MustCompile(`^SINCE\s+LEVEL\s+IMPACT\s+MESSAGE$`) + + workerUpgradeHeaderPattern = regexp.MustCompile(`^= Worker Upgrade =$`) + controlPlaneUpdatedPattern = regexp.MustCompile(`^Update to .* successfully completed at .*$`) + controlPlaneNodesUpdatedPattern = regexp.MustCompile(`^All control plane nodes successfully updated to .*`) +) + +type nextOption int + +const ( + preserveLeadingWhitespace nextOption = iota +) + +func (p *parser) next(opts ...nextOption) (string, bool) { + if p.pos >= len(p.lines) { + return "", true + } + + line := p.lines[p.pos] + p.pos++ + + // Check if we should preserve leading whitespace + preserveLeading := false + for _, opt := range opts { + if opt == preserveLeadingWhitespace { + preserveLeading = true + break + } + } + + if preserveLeading { + return strings.TrimRight(line, " \t\r\n"), false + } else { + return strings.TrimSpace(line), false + } +} + +func (p *parser) eatEmptyLines() { + for { + line, done := p.next() + if done { + return + } + if line != "" { + p.pos-- + return + } + } +} + +func (p *parser) tryRegex(what *regexp.Regexp) bool { + line, done := p.next() + p.pos-- + + return !done && 
what.MatchString(line) +} + +func (p *parser) eat(what string) error { + line, done := p.next() + if done { + return fmt.Errorf("expected '%s' but reached end of input", what) + } + + if line != what { + return fmt.Errorf("expected '%s' but got '%s'", what, line) + } + + return nil +} + +func (p *parser) eatRegex(what *regexp.Regexp) error { + line, done := p.next() + if done { + return fmt.Errorf("expected '%s' but reached end of input", what) + } + + if !what.MatchString(line) { + return fmt.Errorf("expected '%s' but got '%s'", what, line) + } + + return nil +} + +func (p *parser) parseControlPlaneSection() (*ControlPlaneStatus, error) { + if err := p.eat("= Control Plane ="); err != nil { + return nil, err + } + + var status ControlPlaneStatus + + if p.tryRegex(controlPlaneUpdatedPattern) { + _ = p.eatRegex(controlPlaneUpdatedPattern) + status.Updated = true + p.eatEmptyLines() + if err := p.eatRegex(controlPlaneNodesUpdatedPattern); err != nil { + return nil, fmt.Errorf("expected 'All control plane nodes successfully updated to' message, got: %w", err) + } + status.NodesUpdated = true + + return &status, nil + } + + summary, err := p.parseControlPlaneSummary() + if err != nil { + return nil, err + } + status.Summary = summary + + operators, err := p.parseControlPlaneOperators() + if err != nil { + return nil, err + } + status.Operators = operators + + p.eatEmptyLines() + + if p.tryRegex(controlPlaneNodesUpdatedPattern) { + _ = p.eatRegex(controlPlaneNodesUpdatedPattern) + status.NodesUpdated = true + } else { + nodes, err := p.parseControlPlaneNodes() + if err != nil { + return nil, err + } + status.Nodes = nodes + } + + return &status, nil +} + +func (p *parser) parseControlPlaneSummary() (map[string]string, error) { + p.eatEmptyLines() + + summary := map[string]string{} + for { + line, done := p.next() + if done || line == "" { + break + } + + parts := strings.SplitN(line, ":", 2) + if len(parts) != 2 { + return nil, fmt.Errorf("expected 'Key: Value' format, got: %s", line) + } + + key := strings.TrimSpace(parts[0]) + value := strings.TrimSpace(parts[1]) + summary[key] = value + } + + if len(summary) == 0 { + return nil, errors.New("found no entries in control plane summary section") + } + + return summary, nil +} + +func (p *parser) parseControlPlaneOperators() ([]string, error) { + p.eatEmptyLines() + + if line, _ := p.next(); line != "Updating Cluster Operators" { + // section is optional, put back the line and return nil + p.pos-- + return nil, nil + } + + if err := p.eatRegex(updatingOperatorsHeaderPattern); err != nil { + return nil, fmt.Errorf("expected Updating Cluster Operators table header, got: %w", err) + } + + var operators []string + + for { + line, done := p.next() + if done || line == "" { + break + } + + operators = append(operators, line) + } + + if len(operators) == 0 { + return nil, errors.New("found no entries in Updating Cluster Operators section") + } + + return operators, nil +} + +func (p *parser) parseControlPlaneNodes() ([]string, error) { + p.eatEmptyLines() + + if p.eat("Control Plane Nodes") != nil { + return nil, errors.New("expected 'Control Plane Nodes' section") + } + + if err := p.eatRegex(nodesHeaderPattern); err != nil { + return nil, fmt.Errorf("expected Control Plane Nodes table header: %w", err) + } + + var nodes []string + for { + line, done := p.next() + if done || line == "" { + break + } + + nodes = append(nodes, line) + } + + if len(nodes) == 0 { + return nil, errors.New("no nodes found in Control Plane Nodes section") + } + + return 
nodes, nil +} + +func (p *parser) parseWorkerUpgradeSection() (*WorkersStatus, error) { + p.eatEmptyLines() + + if !p.tryRegex(workerUpgradeHeaderPattern) { + return nil, nil + } + + if err := p.eat("= Worker Upgrade ="); err != nil { + return nil, err + } + + pools, err := p.parseWorkerPools() + if err != nil { + return nil, err + } + + nodes, err := p.parseWorkerPoolNodes() + if err != nil { + return nil, err + } + + return &WorkersStatus{ + Pools: pools, + Nodes: nodes, + }, nil +} + +func (p *parser) parseWorkerPools() ([]string, error) { + p.eatEmptyLines() + + if err := p.eatRegex(workerPoolsHeaderPattern); err != nil { + return nil, fmt.Errorf("expected Worker Upgrade table header: %w", err) + } + + var pools []string + for { + line, done := p.next() + if done || line == "" { + break + } + + pools = append(pools, line) + } + + if len(pools) == 0 { + return nil, errors.New("no worker pools found in Worker Upgrade section") + } + + return pools, nil +} + +func (p *parser) parseWorkerPoolNodes() (map[string][]string, error) { + nodes := make(map[string][]string) + + for { + p.eatEmptyLines() + + name, entries, err := p.tryParseWorkerNodeTable() + if err != nil { + return nil, err + } + + if name == "" { + break + } + + nodes[name] = entries + } + + if len(nodes) == 0 { + return nil, errors.New("no worker pool nodes found in Worker Upgrade section") + } + + return nodes, nil +} + +func (p *parser) tryParseWorkerNodeTable() (string, []string, error) { + p.eatEmptyLines() + + line, done := p.next() + if done { + return "", nil, errors.New("expected 'Worker Pool Nodes:' section but reached end of input") + } + if !strings.HasPrefix(line, "Worker Pool Nodes:") { + p.pos-- // put it back + return "", nil, nil + } + + name := strings.TrimPrefix(line, "Worker Pool Nodes: ") + + if err := p.eatRegex(nodesHeaderPattern); err != nil { + return "", nil, fmt.Errorf("expected worker pool nodes table header for pool '%s': %w", name, err) + } + + // Read node entries + var nodeEntries []string + for { + line, done := p.next() + if done || line == "" { + break + } + + nodeEntries = append(nodeEntries, line) + } + + if len(nodeEntries) == 0 { + return "", nil, fmt.Errorf("no nodes found for worker pool '%s'", name) + } + + return name, nodeEntries, nil +} + +func (p *parser) parseHealthSection() (*Health, error) { + p.eatEmptyLines() + + if err := p.eat("= Update Health ="); err != nil { + return nil, err + } + + var health Health + + line, done := p.next() + if done { + return nil, errors.New("expected 'Update Health' section but reached end of input") + } + + var getMessage func() (string, error) + if strings.HasPrefix(line, "Message: ") { + getMessage = p.parseHealthMessage + health.Detailed = true + p.pos-- + } else if healthHeaderPattern.MatchString(line) { + getMessage = p.parseHealthMessageLine + } else { + return nil, fmt.Errorf("expected 'Update Health' to start with either a table header or a 'Message: ' line, got %s", line) + } + + for { + message, err := getMessage() + if err != nil { + return nil, err + } + + if message == "" { + // No more messages + break + } + + health.Messages = append(health.Messages, message) + } + + if len(health.Messages) == 0 { + return nil, errors.New("no health messages found in Update Health section") + } + + return &health, nil +} + +func (p *parser) parseHealthMessageLine() (string, error) { + line, _ := p.next() + return line, nil +} + +func (p *parser) parseHealthMessage() (string, error) { + var messageBuilder strings.Builder + + line, done := p.next() + if 
done { + return "", nil // No more input + } + + if !strings.HasPrefix(line, "Message: ") { + return "", fmt.Errorf("expected health message to start with 'Message: ', got: %s", line) + } + + messageBuilder.WriteString(line) + + // Read continuation lines until we hit the next "Message: " or end of input + for { + line, done := p.next(preserveLeadingWhitespace) + if done { + break + } + + if line == "" { + peek, done := p.next() + if done { + break + } + p.pos-- + if strings.HasPrefix(peek, "Message: ") { + break + } + } + + messageBuilder.WriteString("\n" + line) + } + + return strings.TrimSpace(messageBuilder.String()), nil +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/outputmodel_test.go b/pkg/monitortests/cli/adm_upgrade/status/outputmodel_test.go new file mode 100644 index 000000000000..f979101c3352 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/outputmodel_test.go @@ -0,0 +1,630 @@ +package admupgradestatus + +import ( + "strings" + "testing" + + "github.com/google/go-cmp/cmp" +) + +var ( + emptyLine = "" + clusterNotUpdating = `The cluster is not updating.` + + noTokenNoAlerts = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session` + + controlPlaneHeader = `= Control Plane =` + + genericControlPlane = `Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m) + +All control plane nodes successfully updated to 4.16.0-ec.3` + + controlPlaneSummary = `Assessment: Stalled +Target Version: 4.14.1 (from 4.14.0-rc.3) +Completion: 97% (32 operators updated, 1 updating, 0 waiting) +Duration: 1h59m (Est. Time Remaining: N/A; estimate duration was 1h24m) +Operator Health: 28 Healthy, 1 Unavailable, 4 Available but degraded` + + controlPlaneSummaryWithUpdating = `Assessment: Stalled +Target Version: 4.14.1 (from 4.14.0-rc.3) +Updating: machine-config +Completion: 97% (32 operators updated, 1 updating, 0 waiting) +Duration: 1h59m (Est. Time Remaining: N/A; estimate duration was 1h24m) +Operator Health: 28 Healthy, 1 Unavailable, 4 Available but degraded` + + controlPlaneSummaryInconsistentOperators = `Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest (from 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial) +Updating: image-registry, monitoring, openshift-controller-manager +Completion: 50% (17 operators updated, 3 updating, 14 waiting) +Duration: 24m (Est. Time Remaining: 45m) +Operator Health: 34 Healthy` + + controlPlaneUpdated = `Update to 4.16.0-ec.3 successfully completed at 2024-02-27T15:42:58Z (duration: 3h31m)` + + expectedControlPlaneSummaries = map[string]map[string]string{ + controlPlaneSummary: { + "Assessment": "Stalled", + "Target Version": "4.14.1 (from 4.14.0-rc.3)", + "Completion": "97% (32 operators updated, 1 updating, 0 waiting)", + "Duration": "1h59m (Est. Time Remaining: N/A; estimate duration was 1h24m)", + "Operator Health": "28 Healthy, 1 Unavailable, 4 Available but degraded", + }, + controlPlaneSummaryWithUpdating: { + "Assessment": "Stalled", + "Target Version": "4.14.1 (from 4.14.0-rc.3)", + "Updating": "machine-config", + "Completion": "97% (32 operators updated, 1 updating, 0 waiting)", + "Duration": "1h59m (Est. 
Time Remaining: N/A; estimate duration was 1h24m)", + "Operator Health": "28 Healthy, 1 Unavailable, 4 Available but degraded", + }, + controlPlaneSummaryInconsistentOperators: { + "Assessment": "Progressing", + "Target Version": "4.20.0-0.ci-2025-08-13-182454-test-ci-op-5wilvz46-latest (from 4.20.0-0.ci-2025-08-13-174821-test-ci-op-5wilvz46-initial)", + "Updating": "image-registry, monitoring, openshift-controller-manager", + "Completion": "50% (17 operators updated, 3 updating, 14 waiting)", + "Duration": "24m (Est. Time Remaining: 45m)", + "Operator Health": "34 Healthy", + }, + controlPlaneUpdated: nil, // No summary for updated control plane + } + + controlPlaneOperators = `Updating Cluster Operators +NAME SINCE REASON MESSAGE +machine-config 1h4m41s - Working towards 4.14.1` + + // TODO: This is actually a bug we should fix in the output, we will fix this + controlPlaneInconsistentOperators = `Updating Cluster Operators +NAME SINCE REASON MESSAGE +image-registry 6s DeploymentNotCompleted::NodeCADaemonUnavailable NodeCADaemonProgressing: The daemon set node-ca is deploying node pods +Progressing: The deployment has not completed +monitoring 4s RollOutInProgress Rolling out the stack. +openshift-controller-manager 11s RouteControllerManager_DesiredStateNotYetAchieved::_DesiredStateNotYetAchieved Progressing: deployment/controller-manager: observed generation is 10, desired generation is 11 +Progressing: deployment/controller-manager: updated replicas is 1, desired replicas is 3 +RouteControllerManagerProgressing: deployment/route-controller-manager: observed generation is 7, desired generation is 8 +RouteControllerManagerProgressing: deployment/route-controller-manager: updated replicas is 1, desired replicas is 3` + + expectedControlPlaneOperators = map[string][]string{ + controlPlaneOperators: {"machine-config 1h4m41s - Working towards 4.14.1"}, + controlPlaneInconsistentOperators: { + "image-registry 6s DeploymentNotCompleted::NodeCADaemonUnavailable NodeCADaemonProgressing: The daemon set node-ca is deploying node pods", + "Progressing: The deployment has not completed", + "monitoring 4s RollOutInProgress Rolling out the stack.", + "openshift-controller-manager 11s RouteControllerManager_DesiredStateNotYetAchieved::_DesiredStateNotYetAchieved Progressing: deployment/controller-manager: observed generation is 10, desired generation is 11", + "Progressing: deployment/controller-manager: updated replicas is 1, desired replicas is 3", + "RouteControllerManagerProgressing: deployment/route-controller-manager: observed generation is 7, desired generation is 8", + "RouteControllerManagerProgressing: deployment/route-controller-manager: updated replicas is 1, desired replicas is 3", + }, + } + + controlPlaneThreeNodes = `Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-30-217.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ? +ip-10-0-53-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ? +ip-10-0-92-180.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ? 
` + + controlPlaneNodesUpdated = `All control plane nodes successfully updated to 4.16.0-ec.3` + + expectedControlPlaneNodes = map[string][]string{ + controlPlaneThreeNodes: { + "ip-10-0-30-217.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + "ip-10-0-53-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + "ip-10-0-92-180.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + }, + controlPlaneNodesUpdated: nil, + } + + workerSectionHeader = `= Worker Upgrade =` + + genericWorkerPool = oneWorkerPool + genericWorkerNodes = oneWorkerPoolNodes + + oneWorkerPool = `WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining` + + twoPools = `WORKER POOL ASSESSMENT COMPLETION STATUS +worker Progressing 0% (0/2) 1 Available, 1 Progressing, 1 Draining +infra Progressing 0% (0/2) 1 Available, 1 Progressing, 1 Draining` + + twoPoolsOneEmpty = `WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining +zbeast Empty 0 Total` + + expectedPools = map[string][]string{ + oneWorkerPool: {"worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining"}, + twoPools: { + "worker Progressing 0% (0/2) 1 Available, 1 Progressing, 1 Draining", + "infra Progressing 0% (0/2) 1 Available, 1 Progressing, 1 Draining", + }, + twoPoolsOneEmpty: { + "worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining", + "zbeast Empty 0 Total", + }, + } + + oneWorkerPoolNodes = `Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-20-162.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ? +ip-10-0-4-159.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ? +ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ? ` + + twoPoolsWorkerNodes = `Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-4-159.us-east-2.compute.internal Progressing Draining 4.14.0 +10m +ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0 ? ` + + twoPoolsInfraNodes = `Worker Pool Nodes: infra +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-4-159-infra.us-east-2.compute.internal Progressing Draining 4.14.0 +10m +ip-10-0-20-162.us-east-2.compute.internal Outdated Pending 4.14.0 ? 
` + + expectedPoolNodes = map[string]map[string][]string{ + oneWorkerPool: { + "worker": { + "ip-10-0-20-162.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + "ip-10-0-4-159.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + "ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + }, + }, + twoPools: { + "worker": { + "ip-10-0-4-159.us-east-2.compute.internal Progressing Draining 4.14.0 +10m", + "ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0 ?", + }, + "infra": { + "ip-10-0-4-159-infra.us-east-2.compute.internal Progressing Draining 4.14.0 +10m", + "ip-10-0-20-162.us-east-2.compute.internal Outdated Pending 4.14.0 ?", + }, + }, + twoPoolsOneEmpty: { + "worker": { + "ip-10-0-20-162.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + "ip-10-0-4-159.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + "ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?", + }, + }, + } + + healthSectionHeader = `= Update Health = ` + + genericHealthSection = healthProceedingWell + + healthProceedingWell = `SINCE LEVEL IMPACT MESSAGE +52m56s Info None Update is proceeding well` + + healthMultipleTable = `SINCE LEVEL IMPACT MESSAGE +58m18s Error API Availability Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) +now Warning Update Stalled Cluster Version version is failing to proceed with the update (ClusterOperatorsDegraded)` + + healthDetailed = `Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) + Since: 58m18s + Level: Error + Impact: API Availability + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusteroperators.config.openshift.io: kube-apiserver + Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?) 
+ +Message: Cluster Version version is failing to proceed with the update (ClusterOperatorsDegraded) + Since: now + Level: Warning + Impact: Update Stalled + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusterversions.config.openshift.io: version + Description: Cluster operators etcd, kube-apiserver are degraded` + + expectedHealthMessages = map[string][]string{ + healthProceedingWell: {"52m56s Info None Update is proceeding well"}, + healthMultipleTable: { + "58m18s Error API Availability Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady)", + "now Warning Update Stalled Cluster Version version is failing to proceed with the update (ClusterOperatorsDegraded)", + }, + healthDetailed: { + `Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady) + Since: 58m18s + Level: Error + Impact: API Availability + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusteroperators.config.openshift.io: kube-apiserver + Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)`, + `Message: Cluster Version version is failing to proceed with the update (ClusterOperatorsDegraded) + Since: now + Level: Warning + Impact: Update Stalled + Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + Resources: + clusterversions.config.openshift.io: version + Description: Cluster operators etcd, kube-apiserver are degraded`, + }, + } +) + +func TestUpgradeStatusOutput_Updating(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + segments []string + expected bool + }{ + { + name: "cluster not updating", + segments: []string{clusterNotUpdating}, + expected: false, + }, + { + name: "cluster updating", + segments: []string{ + controlPlaneHeader, + genericControlPlane, + emptyLine, + workerSectionHeader, + genericWorkerPool, + emptyLine, + genericWorkerNodes, + emptyLine, + healthSectionHeader, + healthProceedingWell, + }, + expected: true, + }, + { + name: "cluster updating | no token no alerts", + segments: []string{ + noTokenNoAlerts, + controlPlaneHeader, + genericControlPlane, + emptyLine, + workerSectionHeader, + genericWorkerPool, + emptyLine, + genericWorkerNodes, + emptyLine, + healthSectionHeader, + healthProceedingWell, + }, + expected: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + builder := strings.Builder{} + for _, input := range tc.segments { + builder.WriteString(input) + builder.WriteString("\n") + } + + output, err := newUpgradeStatusOutput(builder.String()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if output.updating != tc.expected { + t.Errorf("Expected IsUpdating() to return %v, got %v", tc.expected, output.updating) + } + }) + } +} + +func TestUpgradeStatusOutput_ControlPlane(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + segments []string + expected *ControlPlaneStatus + expectError string + }{ + { + name: "cluster not updating", + 
segments: []string{clusterNotUpdating}, + expected: nil, + }, + { + name: "control plane without updating operators line", + segments: []string{ + controlPlaneHeader, + controlPlaneSummary, + emptyLine, + controlPlaneThreeNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &ControlPlaneStatus{ + Updated: false, + Summary: expectedControlPlaneSummaries[controlPlaneSummary], + Operators: nil, // No operators line present + NodesUpdated: false, + Nodes: expectedControlPlaneNodes[controlPlaneThreeNodes], + }, + }, + { + name: "control plane without updating operators line, no token warning", + segments: []string{ + noTokenNoAlerts, + controlPlaneHeader, + controlPlaneSummary, + emptyLine, + controlPlaneThreeNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &ControlPlaneStatus{ + Updated: false, + Summary: expectedControlPlaneSummaries[controlPlaneSummary], + Operators: nil, // No operators line present + NodesUpdated: false, + Nodes: expectedControlPlaneNodes[controlPlaneThreeNodes], + }, + }, + { + name: "control plane with updating", + segments: []string{ + controlPlaneHeader, + controlPlaneSummaryWithUpdating, + emptyLine, + controlPlaneOperators, + emptyLine, + controlPlaneThreeNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &ControlPlaneStatus{ + Updated: false, + Summary: expectedControlPlaneSummaries[controlPlaneSummaryWithUpdating], + Operators: expectedControlPlaneOperators[controlPlaneOperators], + NodesUpdated: false, + Nodes: expectedControlPlaneNodes[controlPlaneThreeNodes], + }, + }, + { + name: "control plane with updated nodes", + segments: []string{ + controlPlaneHeader, + controlPlaneSummary, + emptyLine, + controlPlaneNodesUpdated, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &ControlPlaneStatus{ + Updated: false, + Summary: expectedControlPlaneSummaries[controlPlaneSummary], + Operators: nil, + NodesUpdated: true, + Nodes: nil, + }, + }, + { + name: "updated control plane", + segments: []string{ + controlPlaneHeader, + controlPlaneUpdated, + emptyLine, + controlPlaneNodesUpdated, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &ControlPlaneStatus{ + Updated: true, + Summary: nil, + Operators: nil, + NodesUpdated: true, + Nodes: nil, + }, + }, + { + name: "control plane with inconsistent operators (bug that will be fixed)", + segments: []string{ + controlPlaneHeader, + controlPlaneSummaryInconsistentOperators, + emptyLine, + controlPlaneInconsistentOperators, + emptyLine, + controlPlaneThreeNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &ControlPlaneStatus{ + Updated: false, + Summary: expectedControlPlaneSummaries[controlPlaneSummaryInconsistentOperators], + Operators: expectedControlPlaneOperators[controlPlaneInconsistentOperators], + NodesUpdated: false, + Nodes: expectedControlPlaneNodes[controlPlaneThreeNodes], + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + builder := strings.Builder{} + for _, input := range tc.segments { + builder.WriteString(input) + builder.WriteString("\n") + } + + output, err := newUpgradeStatusOutput(builder.String()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if diff := cmp.Diff(tc.expected, output.controlPlane); diff != "" { + t.Errorf("ControlPlane mismatch (-expected +actual):\n%s", diff) + } + }) + } +} + +func TestUpgradeStatusOutput_Workers(t *testing.T) { + 
t.Parallel() + + testCases := []struct { + name string + segments []string + expected *WorkersStatus + }{ + { + name: "cluster not updating", + segments: []string{clusterNotUpdating}, + expected: nil, + }, + { + name: "worker section is optional (SNO & compact)", + segments: []string{ + controlPlaneHeader, + genericControlPlane, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: nil, + }, + { + name: "one pool with three nodes", + segments: []string{ + controlPlaneHeader, + genericControlPlane, + emptyLine, + workerSectionHeader, + emptyLine, + oneWorkerPool, + emptyLine, + oneWorkerPoolNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &WorkersStatus{ + Pools: expectedPools[oneWorkerPool], + Nodes: expectedPoolNodes[oneWorkerPool], + }, + }, + { + name: "two pools with two nodes each", + segments: []string{ + controlPlaneHeader, + genericControlPlane, + emptyLine, + workerSectionHeader, + emptyLine, + twoPools, + emptyLine, + twoPoolsWorkerNodes, + emptyLine, + twoPoolsInfraNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &WorkersStatus{ + Pools: expectedPools[twoPools], + Nodes: expectedPoolNodes[twoPools], + }, + }, + { + name: "two pools, one of them empty", + segments: []string{ + controlPlaneHeader, + genericControlPlane, + emptyLine, + workerSectionHeader, + emptyLine, + twoPoolsOneEmpty, + emptyLine, + oneWorkerPoolNodes, + emptyLine, + healthSectionHeader, + genericHealthSection, + }, + expected: &WorkersStatus{ + Pools: expectedPools[twoPoolsOneEmpty], + Nodes: expectedPoolNodes[twoPoolsOneEmpty], + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + builder := strings.Builder{} + for _, input := range tc.segments { + builder.WriteString(input) + builder.WriteString("\n") + } + + output, err := newUpgradeStatusOutput(builder.String()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if diff := cmp.Diff(tc.expected, output.workers); diff != "" { + t.Errorf("Workers mismatch (-expected +actual):\n%s", diff) + } + }) + } +} + +func TestUpgradeStatusOutput_Health(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + segments []string + expected *Health + }{ + { + name: "cluster not updating", + segments: []string{clusterNotUpdating}, + expected: nil, + }, + { + name: "Update is proceeding well", + segments: []string{ + controlPlaneHeader, + genericControlPlane, + emptyLine, + workerSectionHeader, + genericWorkerPool, + emptyLine, + genericWorkerNodes, + emptyLine, + healthSectionHeader, + healthProceedingWell, + }, + expected: &Health{ + Detailed: false, + Messages: expectedHealthMessages[healthProceedingWell], + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + builder := strings.Builder{} + for _, input := range tc.segments { + builder.WriteString(input) + builder.WriteString("\n") + } + + output, err := newUpgradeStatusOutput(builder.String()) + if err != nil { + t.Fatalf("Expected no error, got: %v", err) + } + + if diff := cmp.Diff(tc.expected, output.health); diff != "" { + t.Errorf("Health mismatch (-expected +actual):\n%s", diff) + } + }) + } +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/updatelifecycle.go b/pkg/monitortests/cli/adm_upgrade/status/updatelifecycle.go new file mode 100644 index 000000000000..a0e8e51020b6 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/updatelifecycle.go @@ -0,0 +1,135 @@ +package 
admupgradestatus + +import ( + "fmt" + "strings" + "time" + + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +type wasUpdatedFn func() (bool, error) + +func (w *monitor) updateLifecycle(wasUpdated wasUpdatedFn) *junitapi.JUnitTestCase { + health := &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + } + + clusterUpdated, err := wasUpdated() + if err != nil { + health.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("failed to get cluster version: %v", err), + } + return health + } + + health.SkipMessage = &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + } + + type state string + const ( + beforeUpdate state = "before update" + controlPlaneUpdating state = "control plane updating" + controlPlaneNodesUpdated state = "control plane nodes updated" + controlPlaneUpdated state = "control plane updated" + afterUpdate state = "after update" + ) + + type observation string + const ( + notUpdating observation = "not updating" + controlPlaneObservedUpdating observation = "control plane updating" + controlPlaneObservedNodesUpdated observation = "control plane nodes updated" + controlPlaneObservedUpdated observation = "control plane updated" + ) + + stateTransitions := map[state]map[observation]state{ + beforeUpdate: { + notUpdating: beforeUpdate, + controlPlaneObservedUpdating: controlPlaneUpdating, + controlPlaneObservedNodesUpdated: controlPlaneNodesUpdated, + controlPlaneObservedUpdated: controlPlaneUpdated, + }, + controlPlaneUpdating: { + notUpdating: afterUpdate, + controlPlaneObservedUpdating: controlPlaneUpdating, + controlPlaneObservedNodesUpdated: controlPlaneNodesUpdated, + controlPlaneObservedUpdated: controlPlaneUpdated, + }, + controlPlaneNodesUpdated: { + notUpdating: afterUpdate, + controlPlaneObservedNodesUpdated: controlPlaneNodesUpdated, + controlPlaneObservedUpdated: controlPlaneUpdated, + }, + controlPlaneUpdated: { + notUpdating: afterUpdate, + controlPlaneObservedUpdated: controlPlaneUpdated, + }, + afterUpdate: { + notUpdating: afterUpdate, + // TODO: MCO churn sometimes briefly tricks our code into thinking the cluster is updating, we'll tolerate for + // now but we should try fixing this + controlPlaneObservedNodesUpdated: controlPlaneUpdated, + }, + } + + current := beforeUpdate + failureOutputBuilder := strings.Builder{} + + for _, observed := range w.ocAdmUpgradeStatusOutputModels { + if observed.output == nil { + // Failing to parse the output is handled in expectedLayout, so we can skip here + continue + } + // We saw at least one successful execution of oc adm upgrade status, so we have data to process + health.SkipMessage = nil + + wroteOnce := false + fail := func(message string) { + if !wroteOnce { + wroteOnce = true + failureOutputBuilder.WriteString(fmt.Sprintf("\n===== %s\n", observed.when.Format(time.RFC3339))) + failureOutputBuilder.WriteString(observed.output.rawOutput) + failureOutputBuilder.WriteString(fmt.Sprintf("\n\n=> %s\n", message)) + } + } + + if !clusterUpdated { + // TODO: MCO churn sometimes briefly tricks our code into thinking the cluster is updating, we'll tolerate for + // now but we should try fixing this + // if observed.output.updating || observed.output.controlPlane != nil || observed.output.workers != nil || observed.output.health != nil { + // fail("Cluster did not update but oc adm upgrade status reported that it is updating") + // } + continue + } 
+ + controlPlane := observed.output.controlPlane + + o := notUpdating + switch { + case controlPlane != nil && controlPlane.Updated: + o = controlPlaneObservedUpdated + case controlPlane != nil && controlPlane.NodesUpdated: + o = controlPlaneObservedNodesUpdated + case observed.output.updating: + o = controlPlaneObservedUpdating + } + + fromCurrent := stateTransitions[current] + if next, ok := fromCurrent[o]; !ok { + fail(fmt.Sprintf("Unexpected observation '%s' in state '%s'", o, current)) + } else { + current = next + } + } + + if failureOutputBuilder.Len() > 0 { + health.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("observed unexpected update lifecycle transition in oc adm upgrade status"), + Output: failureOutputBuilder.String(), + } + } + + return health +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/updatelifecycle_test.go b/pkg/monitortests/cli/adm_upgrade/status/updatelifecycle_test.go new file mode 100644 index 000000000000..8ab9ddc59cf5 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/updatelifecycle_test.go @@ -0,0 +1,236 @@ +package admupgradestatus + +import ( + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var ( + lifecycle01before = `The cluster is not updating.` + + lifecycle02updating = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Updating: etcd, kube-apiserver +Completion: 3% (1 operators updated, 2 updating, 31 waiting) +Duration: 3m51s (Est. Time Remaining: 1h7m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +etcd 2m53s NodeInstaller NodeInstallerProgressing: 2 nodes are at revision 8; 1 node is at revision 9 +kube-apiserver 2m21s NodeInstaller NodeInstallerProgressing: 3 nodes are at revision 7; 0 nodes have achieved new revision 8 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-53-218.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-99-189.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-100-255.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? 
+ += Update Health = +SINCE LEVEL IMPACT MESSAGE +3m51s Info None Update is proceeding well` + + lifecycle03controlPlaneNodesUpdated = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing - Slow +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Updating: machine-config +Completion: 97% (33 operators updated, 1 updating, 0 waiting) +Duration: 1h1m (Est. Time Remaining: <10m) +Operator Health: 30 Healthy, 4 Available but degraded + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +machine-config 22m21s - Working towards 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest + +All control plane nodes successfully updated to 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Completed 100% (3/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Completed Updated 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest - +ip-10-0-100-255.us-west-1.compute.internal Completed Updated 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest - +ip-10-0-106-212.us-west-1.compute.internal Completed Updated 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest - + += Update Health = +SINCE LEVEL IMPACT MESSAGE +1h1m36s Info None Update is proceeding well` + + lifecycle04controlPlaneUpdated = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Update to 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest successfully completed at 2025-08-13T14:15:18Z (duration: 1h2m) + +All control plane nodes successfully updated to 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Completed 100% (3/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Completed Updated 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest - +ip-10-0-100-255.us-west-1.compute.internal Completed Updated 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest - +ip-10-0-106-212.us-west-1.compute.internal Completed Updated 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest - + += Update Health = +SINCE LEVEL IMPACT MESSAGE +1h1m36s Info None Update is proceeding well` + + lifecycle05after = `The cluster is not updating.` +) + +func TestMonitor_UpdateLifecycle(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + snapshots []snapshot + wasUpdated bool + expected *junitapi.JUnitTestCase + }{ + { + name: "no snapshots -> test skipped", + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + }, + }, + { + name: "model update", + snapshots: []snapshot{ + {when: time.Now(), out: lifecycle01before}, + {when: time.Now(), out: lifecycle02updating}, + {when: time.Now(), out: lifecycle03controlPlaneNodesUpdated}, + {when: time.Now(), out: lifecycle04controlPlaneUpdated}, + {when: 
time.Now(), out: lifecycle05after}, + }, + wasUpdated: true, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + }, + }, + { + name: "sometimes we miss the control plane updated state, this is okay", + snapshots: []snapshot{ + {when: time.Now(), out: lifecycle01before}, + {when: time.Now(), out: lifecycle02updating}, + {when: time.Now(), out: lifecycle05after}, + }, + wasUpdated: true, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + }, + }, + { + name: "completed control plane nodes went back to updating", + snapshots: []snapshot{ + {when: time.Now(), out: lifecycle01before}, + {when: time.Now(), out: lifecycle02updating}, + {when: time.Now(), out: lifecycle03controlPlaneNodesUpdated}, + {when: time.Now(), out: lifecycle02updating}, + {when: time.Now(), out: lifecycle05after}, + }, + wasUpdated: true, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected update lifecycle transition in oc adm upgrade status", + }, + }, + }, + { + name: "no update observed when cluster was not updated", + snapshots: []snapshot{ + {when: time.Now(), out: lifecycle01before}, + {when: time.Now(), out: lifecycle01before}, + {when: time.Now(), out: lifecycle01before}, + }, + wasUpdated: false, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + }, + }, + { + name: "update observed when cluster was not updated", + snapshots: []snapshot{ + {when: time.Now(), out: lifecycle01before}, + {when: time.Now(), out: lifecycle02updating}, + {when: time.Now(), out: lifecycle01before}, + }, + wasUpdated: false, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + // TODO: MCO churn sometimes briefly tricks our code into thinking the cluster is updating, we'll tolerate for + // now but we should try fixing this + // FailureOutput: &junitapi.FailureOutput{ + // Message: "observed unexpected update lifecycle transition in oc adm upgrade status", + // }, + }, + }, + { + + name: "completed update goes back to updating", + snapshots: []snapshot{ + {when: time.Now(), out: lifecycle03controlPlaneNodesUpdated}, + {when: time.Now(), out: lifecycle05after}, + {when: time.Now(), out: lifecycle03controlPlaneNodesUpdated}, + }, + wasUpdated: true, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status snapshots reflect the cluster upgrade lifecycle", + // TODO: MCO churn sometimes briefly tricks our code into thinking the cluster is updating, we'll tolerate for + // now but we should try fixing this + // FailureOutput: &junitapi.FailureOutput{ + // Message: "observed unexpected update lifecycle transition in oc adm upgrade status", + // }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + m := NewOcAdmUpgradeStatusChecker().(*monitor) + m.ocAdmUpgradeStatus = append(m.ocAdmUpgradeStatus, tc.snapshots...) 
+ + ignoreOutput := cmpopts.IgnoreFields(junitapi.FailureOutput{}, "Output") + + // Process snapshots into models for the health check to work with + _ = m.expectedLayout() + + wasUpdated := func() (bool, error) { + return tc.wasUpdated, nil + } + + result := m.updateLifecycle(wasUpdated) + if diff := cmp.Diff(tc.expected, result, ignoreOutput); diff != "" { + t.Errorf("unexpected result (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/workers.go b/pkg/monitortests/cli/adm_upgrade/status/workers.go new file mode 100644 index 000000000000..fc3b1f7ac2f2 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/workers.go @@ -0,0 +1,102 @@ +package admupgradestatus + +import ( + "fmt" + "regexp" + "strings" + "time" + + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var ( + nodeLinePattern = regexp.MustCompile(`^\S+\s+\S+\s+\S+\s+\S+\s+\S+.*$`) + + emptyPoolLinePattern = regexp.MustCompile(`^\S+\s+Empty\s+0 Total$`) + poolLinePattern = regexp.MustCompile(`^\S+\s+\S+\s+\d+% \(\d+/\d+\)\s+.*$`) +) + +func (w *monitor) workers() *junitapi.JUnitTestCase { + workers := &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + } + + failureOutputBuilder := strings.Builder{} + + for _, observed := range w.ocAdmUpgradeStatusOutputModels { + if observed.output == nil { + // Failing to parse the output is handled in expectedLayout, so we can skip here + continue + } + // We saw at least one successful execution of oc adm upgrade status, so we have data to process + workers.SkipMessage = nil + + wroteOnce := false + fail := func(message string) { + if !wroteOnce { + wroteOnce = true + failureOutputBuilder.WriteString(fmt.Sprintf("\n===== %s\n", observed.when.Format(time.RFC3339))) + failureOutputBuilder.WriteString(observed.output.rawOutput) + failureOutputBuilder.WriteString(fmt.Sprintf("\n\n=> %s\n", message)) + } + } + + if !observed.output.updating { + // If the cluster is not updating, workers should not be updating + if observed.output.workers != nil { + fail("Cluster is not updating but workers section is present") + } + continue + } + + ws := observed.output.workers + if ws == nil { + // We do not show workers in SNO / compact clusters + // TODO: Crosscheck with topology + continue + } + + for _, pool := range ws.Pools { + if emptyPoolLinePattern.MatchString(pool) { + name := strings.Split(pool, " ")[0] + _, ok := ws.Nodes[name] + if ok { + fail(fmt.Sprintf("Nodes table should not be shown for an empty pool %s", name)) + } + continue + } + if !poolLinePattern.MatchString(pool) { + fail(fmt.Sprintf("Bad line in Worker Pool table: %s", pool)) + } + } + + if len(ws.Nodes) > len(ws.Pools) { + fail("Showing more Worker Pool Nodes tables than lines in Worker Pool table") + } + + for name, nodes := range ws.Nodes { + if len(nodes) == 0 { + fail(fmt.Sprintf("Worker Pool Nodes table for %s is empty", name)) + continue + } + + for _, node := range nodes { + if !nodeLinePattern.MatchString(node) { + fail(fmt.Sprintf("Bad line in Worker Pool Nodes table for %s: %s", name, node)) + } + } + } + } + + if failureOutputBuilder.Len() > 0 { + workers.FailureOutput = &junitapi.FailureOutput{ + Message: fmt.Sprintf("observed unexpected outputs in oc adm upgrade status workers section"), + Output: failureOutputBuilder.String(), + } + } + + return 
workers +} diff --git a/pkg/monitortests/cli/adm_upgrade/status/workers_test.go b/pkg/monitortests/cli/adm_upgrade/status/workers_test.go new file mode 100644 index 000000000000..287567a7e914 --- /dev/null +++ b/pkg/monitortests/cli/adm_upgrade/status/workers_test.go @@ -0,0 +1,212 @@ +package admupgradestatus + +import ( + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" +) + +var workersExampleOutput = `Unable to fetch alerts, ignoring alerts in 'Update Health': failed to get alerts from Thanos: no token is currently in use for this session += Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Updating: kube-apiserver +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Updating Cluster Operators +NAME SINCE REASON MESSAGE +kube-apiserver 7m27s NodeInstaller NodeInstallerProgressing: 1 node is at revision 7; 2 nodes are at revision 8 + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-53-218.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-99-189.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-100-255.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var workersBadOutput = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + +SOMETHING UNEXPECTED HERE + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var workersInEmptyPool = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? 
+ += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +zbeast Empty 0 Total + +Worker Pool Nodes: zbeast +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +var moreNodeTables = `= Control Plane = +Assessment: Progressing +Target Version: 4.20.0-0.ci-2025-08-13-121604-test-ci-op-njttt0ww-latest (from 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial) +Completion: 6% (2 operators updated, 1 updating, 31 waiting) +Duration: 8m57s (Est. Time Remaining: 1h9m) +Operator Health: 34 Healthy + +Control Plane Nodes +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-111-19.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Worker Upgrade = + +WORKER POOL ASSESSMENT COMPLETION STATUS +worker Pending 0% (0/3) 3 Available, 0 Progressing, 0 Draining + +Worker Pool Nodes: worker +NAME ASSESSMENT PHASE VERSION EST MESSAGE +ip-10-0-0-72.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-100-255.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? +ip-10-0-106-212.us-west-1.compute.internal Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + +Worker Pool Nodes: what-is-this +NAME ASSESSMENT PHASE VERSION EST MESSAGE +nope-nope-nope Outdated Pending 4.20.0-0.ci-2025-08-13-114210-test-ci-op-njttt0ww-initial ? + += Update Health = +SINCE LEVEL IMPACT MESSAGE +8m57s Info None Update is proceeding well` + +func TestMonitor_Workers(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + snapshots []snapshot + expected *junitapi.JUnitTestCase + }{ + { + name: "no snapshots -> test skipped", + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + SkipMessage: &junitapi.SkipMessage{ + Message: "Test skipped because no oc adm upgrade status output was successfully collected", + }, + }, + }, + { + name: "good snapshots", + snapshots: []snapshot{ + {when: time.Now(), out: workersExampleOutput}, + {when: time.Now(), out: workersExampleOutput}, + {when: time.Now(), out: workersExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + }, + }, + { + name: "errored snapshots are skipped", + snapshots: []snapshot{ + {when: time.Now(), out: workersExampleOutput}, + {when: time.Now(), out: workersBadOutput, err: errors.New("some error")}, + {when: time.Now(), out: workersExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + }, + }, + { + name: "unparseable snapshots are skipped", + snapshots: []snapshot{ + {when: time.Now(), out: workersExampleOutput}, + {when: time.Now(), out: "unparseable output"}, + {when: time.Now(), out: workersExampleOutput}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + }, + }, + { + name: "empty table shown for empty pool", + snapshots: []snapshot{ + {when: time.Now(), out: workersInEmptyPool}, + }, + expected: &junitapi.JUnitTestCase{ + Name: 
"[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status workers section", + }, + }, + }, + { + name: "pool node table shown for pool not present in pool table", + snapshots: []snapshot{ + {when: time.Now(), out: moreNodeTables}, + }, + expected: &junitapi.JUnitTestCase{ + Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status workers section is consistent", + FailureOutput: &junitapi.FailureOutput{ + Message: "observed unexpected outputs in oc adm upgrade status workers section", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + m := NewOcAdmUpgradeStatusChecker().(*monitor) + m.ocAdmUpgradeStatus = append(m.ocAdmUpgradeStatus, tc.snapshots...) + + ignoreOutput := cmpopts.IgnoreFields(junitapi.FailureOutput{}, "Output") + + // Process snapshots into models for the workers check to work with + _ = m.expectedLayout() + + result := m.workers() + if diff := cmp.Diff(tc.expected, result, ignoreOutput); diff != "" { + t.Errorf("unexpected result (-want +got):\n%s", diff) + } + }) + } +}