4 changes: 4 additions & 0 deletions pkg/cmd/openshift-tests/monitor/run/run_monitor_command.go
@@ -13,6 +13,7 @@ import (

"github.com/openshift/origin/pkg/clioptions/imagesetup"
"github.com/openshift/origin/pkg/monitortestframework"
exutil "github.com/openshift/origin/test/extended/util"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/test/extended/util/image"
@@ -92,6 +93,9 @@ func (f *RunMonitorFlags) BindFlags(flags *pflag.FlagSet) {
}

func (f *RunMonitorFlags) ToOptions() (*RunMonitorOptions, error) {
// Set testsStarted = true to avoid the "May only be called from within a test case" panic
exutil.WithCleanup(func() {})

var displayFilterFn monitorapi.EventIntervalMatchesFunc
if f.DisplayFromNow {
now := time.Now()
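The WithCleanup call above works around a guard in the extended-test utilities that panics when CLI helpers are used outside a test case (the "May only be called from within a test case" panic discussed in the review comments below). Here is a minimal sketch of how such a guard plausibly works; the names testsStarted and requiresTestStart are inferred from the panic message and stack trace, not copied from the actual origin source:

```go
package util

// testsStarted records whether a test entry point has run; hypothetical sketch.
var testsStarted bool

// WithCleanup marks the test framework as started before running f, so that
// helpers guarded by requiresTestStart no longer panic. (The real
// exutil.WithCleanup presumably also registers cleanup handlers, elided here.)
func WithCleanup(f func()) {
	testsStarted = true
	f()
}

// requiresTestStart is the guard visible in the panic stack trace: CLI helpers
// that need a running test call it first.
func requiresTestStart() {
	if !testsStarted {
		panic("May only be called from within a test case")
	}
}
```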
3 changes: 3 additions & 0 deletions pkg/defaultmonitortests/types.go
@@ -6,6 +6,7 @@ import (
"github.com/openshift/origin/pkg/monitortestframework"
Unclear if this is related to these changes or not, but the e2e-gcp-ovn-upgrade run's monitor logs contain:

  E0731 11:46:20.951174    2139 runtime.go:140] "Observed a panic" panic="May only be called from within a test case" stacktrace=<
  	goroutine 498 [running]:
  	k8s.io/apimachinery/pkg/util/runtime.logPanic({0x9005b58, 0xc00075e000}, {0x73297c0, 0x8f7f930})
  		k8s.io/[email protected]/pkg/util/runtime/runtime.go:132 +0xbc
  	k8s.io/apimachinery/pkg/util/runtime.handleCrash({0x9005b58, 0xc00075e000}, {0x73297c0, 0x8f7f930}, {0x0, 0x0, 0x2951560?})
  		k8s.io/[email protected]/pkg/util/runtime/runtime.go:107 +0x116
  	k8s.io/apimachinery/pkg/util/runtime.HandleCrashWithContext({0x9005b58, 0xc00075e000}, {0x0, 0x0, 0x0})
  		k8s.io/[email protected]/pkg/util/runtime/runtime.go:78 +0x5a
  	panic({0x73297c0?, 0x8f7f930?})
  		runtime/panic.go:792 +0x132
  	github.com/openshift/origin/test/extended/util.requiresTestStart(...)

(the trace was interleaved with concurrent monitor startup logs:)

  time="2025-07-31T11:46:20Z" level=info msg="  Starting legacy-kube-apiserver-invariants for kube-apiserver"
  time="2025-07-31T11:46:20Z" level=info msg="  Starting legacy-networking-invariants for Networking / cluster-network-operator"
  time="2025-07-31T11:46:20Z" level=info msg="  Starting pod-lifecycle for Node / Kubelet"
  time="2025-07-31T11:46:20Z" level=info msg="  Starting legacy-storage-invariants for Storage"
  time="2025-07-31T11:46:20Z" level=info msg="  Starting legacy-test-framework-invariants for Test Framework"
  time="2025-07-31T11:46:20Z" level=info msg="  Starting staicpod-install-monitor for kube-apiserver"

While I am figuring out the cause, let's see if it recurs.

/test e2e-gcp-ovn-upgrade

@hongkailiu Aug 4, 2025

Let's see if a957e64 fixes it.

I cannot figure out why the status command stopped panicking after a while: the status command seems to be working, and its output is stored in https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/pr-logs/pull/30031/pull-ci-openshift-origin-main-e2e-gcp-ovn-upgrade/1950870499584118784/artifacts/e2e-gcp-ovn-upgrade/openshift-e2e-test/artifacts/junit/adm-upgrade-status/

Does that mean testsStarted was handled already, just a bit late?

"github.com/openshift/origin/pkg/monitortests/authentication/legacyauthenticationmonitortests"
"github.com/openshift/origin/pkg/monitortests/authentication/requiredsccmonitortests"
admupgradestatus "github.com/openshift/origin/pkg/monitortests/cli/adm_upgrade/status"
azuremetrics "github.com/openshift/origin/pkg/monitortests/cloud/azure/metrics"
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/legacycvomonitortests"
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
@@ -207,5 +208,7 @@ func newUniversalMonitorTests(info monitortestframework.MonitorTestInitializatio
monitorTestRegistry.AddMonitorTestOrDie("watch-namespaces", "Test Framework", watchnamespaces.NewNamespaceWatcher())
monitorTestRegistry.AddMonitorTestOrDie("high-cpu-test-analyzer", "Test Framework", highcputestanalyzer.NewHighCPUTestAnalyzer())

monitorTestRegistry.AddMonitorTestOrDie("oc-adm-upgrade-status", "oc / update", admupgradestatus.NewOcAdmUpgradeStatusChecker())

return monitorTestRegistry
}
6 changes: 6 additions & 0 deletions pkg/monitortests/cli/adm_upgrade/OWNERS
@@ -0,0 +1,6 @@
# See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md

approvers:
- cluster-version-operator-test-case-approvers
reviewers:
- cluster-version-operator-test-case-reviewers
193 changes: 193 additions & 0 deletions pkg/monitortests/cli/adm_upgrade/status/monitortest.go
@@ -0,0 +1,193 @@
package admupgradestatus

import (
"context"
"fmt"
"os"
"path"
"path/filepath"
"strings"
"time"

clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/origin/pkg/monitortestframework"
exutil "github.com/openshift/origin/test/extended/util"
"k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
)

type snapshot struct {
when time.Time
out string
err error
}

type monitor struct {
collectionDone chan struct{}
ocAdmUpgradeStatus map[time.Time]*snapshot
notSupportedReason error
isSNO bool
}

func NewOcAdmUpgradeStatusChecker() monitortestframework.MonitorTest {
return &monitor{
collectionDone: make(chan struct{}),
ocAdmUpgradeStatus: map[time.Time]*snapshot{},
}
}

func (w *monitor) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
if err != nil {
return err
}
isMicroShift, err := exutil.IsMicroShiftCluster(kubeClient)
if err != nil {
return fmt.Errorf("unable to determine if cluster is MicroShift: %v", err)
}
if isMicroShift {
w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform MicroShift not supported"}
return w.notSupportedReason
}
clientconfigv1client, err := clientconfigv1.NewForConfig(adminRESTConfig)
if err != nil {
return err
}

if ok, err := exutil.IsHypershift(ctx, clientconfigv1client); err != nil {
return fmt.Errorf("unable to determine if cluster is Hypershift: %v", err)
} else if ok {
w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform Hypershift not supported"}
return w.notSupportedReason
}

if ok, err := exutil.IsSingleNode(ctx, clientconfigv1client); err != nil {
return fmt.Errorf("unable to determine if cluster is single node: %v", err)
} else {
w.isSNO = ok
}
return nil
}

func snapshotOcAdmUpgradeStatus(ch chan *snapshot) {
// TODO: I _think_ this should somehow use the adminRESTConfig given to StartCollection, but I don't know
// how to pass that to exutil.NewCLI* or if it is even possible. It seems to work this way though.
oc := exutil.NewCLIWithoutNamespace("adm-upgrade-status").AsAdmin()
now := time.Now()

var out string
var err error
// retry on brief apiserver unavailability
if errWait := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, true, func(context.Context) (bool, error) {
cmd := oc.Run("adm", "upgrade", "status").EnvVar("OC_ENABLE_CMD_UPGRADE_STATUS", "true")
out, err = cmd.Output()
if err != nil {
return false, nil
}
return true, nil
}); errWait != nil {
out = ""
err = errWait
}
ch <- &snapshot{when: now, out: out, err: err}
}

func (w *monitor) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
if w.notSupportedReason != nil {
return w.notSupportedReason
}
// TODO: The double goroutine spawn should probably be placed under some abstraction
go func(ctx context.Context) {
snapshots := make(chan *snapshot)
go func() {
for snap := range snapshots {
// TODO: Maybe also collect some cluster resources (CV? COs?) through recorder?
w.ocAdmUpgradeStatus[snap.when] = snap
}
w.collectionDone <- struct{}{}
}()
// TODO: Configurable interval?
// TODO: Collect multiple invocations (--details)? Would need another producer/consumer pair, and likely
// collectionDone would need to be a WaitGroup

wait.UntilWithContext(ctx, func(ctx context.Context) { snapshotOcAdmUpgradeStatus(snapshots) }, time.Minute)
// UntilWithContext blocks until the framework cancels the context when it wants tests to stop. When we
// get here, we know the last snapshotOcAdmUpgradeStatus producer has written to the snapshots channel, so
// we can close it, which in turn allows the consumer to finish and signal collectionDone.
close(snapshots)
}(ctx)

return nil
}

func (w *monitor) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
if w.notSupportedReason != nil {
return nil, nil, w.notSupportedReason
}

// The framework cancels the context it gave StartCollection before it calls CollectData, but we need to
// wait for the collection goroutines spawned in StartCollection to finish
<-w.collectionDone

noFailures := &junitapi.JUnitTestCase{
Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc adm upgrade status never fails",
}

var failures []string
var total int
for when, observed := range w.ocAdmUpgradeStatus {
total++
if observed.err != nil {
failures = append(failures, fmt.Sprintf("- %s: %v", when.Format(time.RFC3339), observed.err))
}
}

// Zero failures is too strict for at least SNO clusters: non-SNO clusters tolerate no failures, SNO
// tolerates up to 10% failed invocations. Compute the percentage as failures*100/total (the other order
// would always yield 0 under integer division) and guard against an empty snapshot map.
failedPct := 0
if total > 0 {
failedPct = len(failures) * 100 / total
}
if (!w.isSNO && len(failures) > 0) || (w.isSNO && failedPct > 10) {
noFailures.FailureOutput = &junitapi.FailureOutput{
Message: fmt.Sprintf("oc adm upgrade status failed %d times (of %d)", len(failures), total),
Output: strings.Join(failures, "\n"),
}
}

// TODO: Maybe utilize Intervals somehow and do tests in ComputeComputedIntervals and EvaluateTestsFromConstructedIntervals

return nil, []*junitapi.JUnitTestCase{noFailures}, nil
}

func (w *monitor) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
return nil, w.notSupportedReason
}

func (w *monitor) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
if w.notSupportedReason != nil {
return nil, w.notSupportedReason
}
return nil, nil
}

func (w *monitor) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
folderPath := path.Join(storageDir, "adm-upgrade-status")
if err := os.MkdirAll(folderPath, os.ModePerm); err != nil {
return fmt.Errorf("unable to create directory %s: %w", folderPath, err)
}

var errs []error
for when, observed := range w.ocAdmUpgradeStatus {
// format the snapshot time so filenames contain no spaces or colons
outputFilename := fmt.Sprintf("adm-upgrade-status-%s_%s.txt", when.Format("20060102-150405"), timeSuffix)
outputFile := filepath.Join(folderPath, outputFilename)
if err := os.WriteFile(outputFile, []byte(observed.out), 0644); err != nil {
errs = append(errs, fmt.Errorf("failed to write %s: %w", outputFile, err))
}
}
return errors.NewAggregate(errs)
}

func (*monitor) Cleanup(ctx context.Context) error {
return nil
}