-
Notifications
You must be signed in to change notification settings - Fork 4.8k
OTA-1580: Monitortest framework test for oc adm upgrade status #30031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
1bd2e97
OTA-1580: Monitortest to `oc adm upgrade status`
petr-muller e2a4755
OTA-1580: Simple "never fails" test
petr-muller 9514c30
Skip the test on MicroShift
hongkailiu 57b45b4
Retry on brief apiserver unavailability
hongkailiu 3764b28
Lower the bar of success for SNO
hongkailiu e710b3c
Use sub-directory to save files
hongkailiu 6fcacab
platform HyperShift not supported
hongkailiu ff47d29
Set testsStarted=true for monitortests
hongkailiu de89f9e
Return early in StartCollection for unsupported tests
hongkailiu File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md | ||
|
|
||
| approvers: | ||
| - cluster-version-operator-test-case-approvers | ||
| reviewers: | ||
| - cluster-version-operator-test-case-reviewers |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,193 @@ | ||
| package admupgradestatus | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "os" | ||
| "path" | ||
| "path/filepath" | ||
| "strings" | ||
| "time" | ||
|
|
||
| clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned" | ||
| "github.com/openshift/origin/pkg/monitortestframework" | ||
| exutil "github.com/openshift/origin/test/extended/util" | ||
| "k8s.io/apimachinery/pkg/util/errors" | ||
| "k8s.io/apimachinery/pkg/util/wait" | ||
| "k8s.io/client-go/kubernetes" | ||
| "k8s.io/client-go/rest" | ||
|
|
||
| "github.com/openshift/origin/pkg/monitor/monitorapi" | ||
| "github.com/openshift/origin/pkg/test/ginkgo/junitapi" | ||
| ) | ||
|
|
||
| type snapshot struct { | ||
| when time.Time | ||
| out string | ||
| err error | ||
| } | ||
| type monitor struct { | ||
| collectionDone chan struct{} | ||
| ocAdmUpgradeStatus map[time.Time]*snapshot | ||
| notSupportedReason error | ||
| isSNO bool | ||
| } | ||
|
|
||
| func NewOcAdmUpgradeStatusChecker() monitortestframework.MonitorTest { | ||
| return &monitor{ | ||
| collectionDone: make(chan struct{}), | ||
| ocAdmUpgradeStatus: map[time.Time]*snapshot{}, | ||
| } | ||
| } | ||
|
|
||
| func (w *monitor) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error { | ||
| kubeClient, err := kubernetes.NewForConfig(adminRESTConfig) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| isMicroShift, err := exutil.IsMicroShiftCluster(kubeClient) | ||
| if err != nil { | ||
| return fmt.Errorf("unable to determine if cluster is MicroShift: %v", err) | ||
| } | ||
| if isMicroShift { | ||
| w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform MicroShift not supported"} | ||
| return w.notSupportedReason | ||
| } | ||
| clientconfigv1client, err := clientconfigv1.NewForConfig(adminRESTConfig) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if ok, err := exutil.IsHypershift(ctx, clientconfigv1client); err != nil { | ||
| return fmt.Errorf("unable to determine if cluster is Hypershift: %v", err) | ||
| } else if ok { | ||
| w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform Hypershift not supported"} | ||
| return w.notSupportedReason | ||
| } | ||
|
|
||
| if ok, err := exutil.IsSingleNode(ctx, clientconfigv1client); err != nil { | ||
| return fmt.Errorf("unable to determine if cluster is single node: %v", err) | ||
| } else { | ||
| w.isSNO = ok | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func snapshotOcAdmUpgradeStatus(ch chan *snapshot) { | ||
| // TODO: I _think_ this should somehow use the adminRESTConfig given to StartCollection but I don't know how to | ||
| // how to do pass that to exutil.NewCLI* or if it is even possible. It seems to work this way though. | ||
| oc := exutil.NewCLIWithoutNamespace("adm-upgrade-status").AsAdmin() | ||
| now := time.Now() | ||
|
|
||
| var out string | ||
| var err error | ||
| // retry on brief apiserver unavailability | ||
| if errWait := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, true, func(context.Context) (bool, error) { | ||
| cmd := oc.Run("adm", "upgrade", "status").EnvVar("OC_ENABLE_CMD_UPGRADE_STATUS", "true") | ||
| out, err = cmd.Output() | ||
| if err != nil { | ||
| return false, nil | ||
| } | ||
| return true, nil | ||
| }); errWait != nil { | ||
| out = "" | ||
| err = errWait | ||
| } | ||
| ch <- &snapshot{when: now, out: out, err: err} | ||
| } | ||
|
|
||
| func (w *monitor) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error { | ||
| if w.notSupportedReason != nil { | ||
| return w.notSupportedReason | ||
| } | ||
| // TODO: The double goroutine spawn should probably be placed under some abstraction | ||
hongkailiu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| go func(ctx context.Context) { | ||
| snapshots := make(chan *snapshot) | ||
| go func() { | ||
| for snap := range snapshots { | ||
| // TODO: Maybe also collect some cluster resources (CV? COs?) through recorder? | ||
| w.ocAdmUpgradeStatus[snap.when] = snap | ||
| } | ||
| w.collectionDone <- struct{}{} | ||
| }() | ||
| // TODO: Configurable interval? | ||
| // TODO: Collect multiple invocations (--details)? Would need more another producer/consumer pair and likely | ||
| // collectionDone would need to be a WaitGroup | ||
|
|
||
| wait.UntilWithContext(ctx, func(ctx context.Context) { snapshotOcAdmUpgradeStatus(snapshots) }, time.Minute) | ||
| // The UntilWithContext blocks until the framework cancels the context when it wants tests to stop -> when we | ||
| // get here, we know last snapshotOcAdmUpgradeStatus producer wrote to the snapshots channel, we can close it | ||
| // which in turn will allow the consumer to finish and signal collectionDone. | ||
| close(snapshots) | ||
| }(ctx) | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func (w *monitor) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) { | ||
| if w.notSupportedReason != nil { | ||
| return nil, nil, w.notSupportedReason | ||
| } | ||
|
|
||
| // The framework cancels the context it gave StartCollection before it calls CollectData, but we need to wait for | ||
| // the collection goroutines spawned in StartedCollection to finish | ||
| <-w.collectionDone | ||
|
|
||
| noFailures := &junitapi.JUnitTestCase{ | ||
| Name: "[sig-cli][OCPFeatureGate:UpgradeStatus] oc amd upgrade status never fails", | ||
| } | ||
|
|
||
| var failures []string | ||
| var total int | ||
| for when, observed := range w.ocAdmUpgradeStatus { | ||
| total++ | ||
| if observed.err != nil { | ||
| failures = append(failures, fmt.Sprintf("- %s: %v", when.Format(time.RFC3339), observed.err)) | ||
| } | ||
| } | ||
|
|
||
| // Zero failures is too strict for at least SNO clusters | ||
| p := (len(failures) / total) * 100 | ||
| if (!w.isSNO && p > 0) || (w.isSNO && p > 10) { | ||
hongkailiu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| noFailures.FailureOutput = &junitapi.FailureOutput{ | ||
| Message: fmt.Sprintf("oc adm upgrade status failed %d times (of %d)", len(failures), len(w.ocAdmUpgradeStatus)), | ||
| Output: strings.Join(failures, "\n"), | ||
| } | ||
| } | ||
|
|
||
| // TODO: Maybe utilize Intervals somehow and do tests in ComputeComputedIntervals and EvaluateTestsFromConstructedIntervals | ||
|
|
||
| return nil, []*junitapi.JUnitTestCase{noFailures}, nil | ||
| } | ||
|
|
||
| func (w *monitor) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) { | ||
| return nil, w.notSupportedReason | ||
| } | ||
|
|
||
| func (w *monitor) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) { | ||
| if w.notSupportedReason != nil { | ||
| return nil, w.notSupportedReason | ||
| } | ||
| return nil, nil | ||
| } | ||
|
|
||
| func (w *monitor) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error { | ||
| folderPath := path.Join(storageDir, "adm-upgrade-status") | ||
| if err := os.MkdirAll(folderPath, os.ModePerm); err != nil { | ||
| return fmt.Errorf("unable to create directory %s: %w", folderPath, err) | ||
| } | ||
|
|
||
| var errs []error | ||
| for when, observed := range w.ocAdmUpgradeStatus { | ||
| outputFilename := fmt.Sprintf("adm-upgrade-status-%s_%s.txt", when, timeSuffix) | ||
| outputFile := filepath.Join(folderPath, outputFilename) | ||
| if err := os.WriteFile(outputFile, []byte(observed.out), 0644); err != nil { | ||
| errs = append(errs, fmt.Errorf("failed to write %s: %w", outputFile, err)) | ||
| } | ||
| } | ||
| return errors.NewAggregate(errs) | ||
| } | ||
|
|
||
| func (*monitor) Cleanup(ctx context.Context) error { | ||
| return nil | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unclear if this is related to these changes or not, but the
e2e-gcp-ovn-upgraderun's monitor logs contain:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While I am figuring out the cause, let see if it is reoccurring.
/test e2e-gcp-ovn-upgrade
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found in other tests too. For example, https://prow.ci.openshift.org/view/gs/test-platform-results/pr-logs/pull/30031/pull-ci-openshift-origin-main-e2e-aws-ovn-upgrade/1950870479451459584
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let us see if a957e64 fixes it.
I cannot figure out why the status command stopped panicking after a while because the status cmd seem working and stored in https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/pr-logs/pull/30031/pull-ci-openshift-origin-main-e2e-gcp-ovn-upgrade/1950870499584118784/artifacts/e2e-gcp-ovn-upgrade/openshift-e2e-test/artifacts/junit/adm-upgrade-status/
That means
testStartedwas handled already, and just a bit late?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new run seems fixed.
https://prow.ci.openshift.org/view/gs/test-platform-results/pr-logs/pull/30031/pull-ci-openshift-origin-main-e2e-gcp-ovn-upgrade/1952380047100743680
https://prow.ci.openshift.org/view/gs/test-platform-results/pr-logs/pull/30031/pull-ci-openshift-origin-main-e2e-aws-ovn-upgrade/1952380026976473088
The latter was failed for other reasons.
/retest-required