-
Notifications
You must be signed in to change notification settings - Fork 4.8k
NE-2561: Add Gateway API OLM to NO-OLM migration upgrade test #30897
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,306 @@ | ||
| package router | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strings" | ||
| "time" | ||
|
|
||
| g "github.com/onsi/ginkgo/v2" | ||
| o "github.com/onsi/gomega" | ||
|
|
||
| exutil "github.com/openshift/origin/test/extended/util" | ||
|
|
||
| apierrors "k8s.io/apimachinery/pkg/api/errors" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/util/wait" | ||
| e2e "k8s.io/kubernetes/test/e2e/framework" | ||
| "k8s.io/kubernetes/test/e2e/upgrades" | ||
| gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" | ||
| ) | ||
|
|
||
| // GatewayAPIUpgradeTest verifies that Gateway API resources work during upgrade, | ||
| // whether using OLM-based or CIO-based (NO-OLM) provisioning | ||
| type GatewayAPIUpgradeTest struct { | ||
| oc *exutil.CLI | ||
| namespace string | ||
| gatewayName string | ||
| routeName string | ||
| hostname string | ||
| startedWithNoOLM bool // tracks if GatewayAPIWithoutOLM was enabled at start | ||
| loadBalancerSupported bool | ||
| managedDNS bool | ||
| } | ||
|
|
||
| func (t *GatewayAPIUpgradeTest) Name() string { | ||
| return "gateway-api-upgrade" | ||
| } | ||
|
|
||
| func (t *GatewayAPIUpgradeTest) DisplayName() string { | ||
| return "[sig-network-edge][Feature:Router][apigroup:gateway.networking.k8s.io] Verify Gateway API functionality during upgrade" | ||
| } | ||
|
|
||
| // Setup creates Gateway and HTTPRoute resources and tests connectivity | ||
| func (t *GatewayAPIUpgradeTest) Setup(ctx context.Context, f *e2e.Framework) { | ||
| g.By("Setting up Gateway API upgrade test") | ||
|
|
||
| t.oc = exutil.NewCLIWithFramework(f).AsAdmin() | ||
| t.namespace = f.Namespace.Name | ||
| t.gatewayName = "upgrade-test-gateway" | ||
| t.routeName = "test-httproute" | ||
|
|
||
| // Check platform support and get capabilities (LoadBalancer, DNS) | ||
| t.loadBalancerSupported, t.managedDNS = checkPlatformSupportAndGetCapabilities(t.oc) | ||
|
|
||
| g.By("Checking if GatewayAPIWithoutOLM feature gate is enabled before upgrade") | ||
| t.startedWithNoOLM = isNoOLMFeatureGateEnabled(t.oc) | ||
|
|
||
| // Skip on clusters missing OLM/Marketplace capabilities if starting with OLM mode | ||
| if !t.startedWithNoOLM { | ||
| exutil.SkipIfMissingCapabilities(t.oc, olmCapabilities...) | ||
| } | ||
|
|
||
| if t.startedWithNoOLM { | ||
| e2e.Logf("Starting with GatewayAPIWithoutOLM enabled (NO-OLM mode)") | ||
| } else { | ||
| e2e.Logf("Starting with OLM-based Gateway API provisioning") | ||
| } | ||
|
|
||
| g.By("Creating default GatewayClass to trigger Gateway API installation") | ||
| gatewayClassControllerName := "openshift.io/gateway-controller/v1" | ||
| gatewayClass := buildGatewayClass(gatewayClassName, gatewayClassControllerName) | ||
| _, err := t.oc.AdminGatewayApiClient().GatewayV1().GatewayClasses().Create(ctx, gatewayClass, metav1.CreateOptions{}) | ||
| if err != nil && !apierrors.IsAlreadyExists(err) { | ||
| e2e.Failf("Failed to create GatewayClass %q: %v", gatewayClassName, err) | ||
| } | ||
|
|
||
| g.By("Waiting for GatewayClass to be accepted") | ||
| err = checkGatewayClassCondition(t.oc, gatewayClassName, string(gatewayv1.GatewayClassConditionStatusAccepted), metav1.ConditionTrue) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "GatewayClass %q was not accepted", gatewayClassName) | ||
|
|
||
| g.By("Getting the default domain") | ||
| defaultIngressDomain, err := getDefaultIngressClusterDomainName(t.oc, time.Minute) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to find default domain name") | ||
| customDomain := strings.Replace(defaultIngressDomain, "apps.", "gw-upgrade.", 1) | ||
| t.hostname = "test-upgrade." + customDomain | ||
|
|
||
| g.By("Creating Gateway") | ||
| _, err = createAndCheckGateway(t.oc, t.gatewayName, gatewayClassName, customDomain, t.loadBalancerSupported) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Failed to create Gateway") | ||
|
|
||
| g.By("Verify the gateway's LoadBalancer service and DNSRecords") | ||
| if t.loadBalancerSupported { | ||
| assertGatewayLoadbalancerReady(t.oc, t.gatewayName, t.gatewayName+"-openshift-default") | ||
| } | ||
|
|
||
| if t.managedDNS { | ||
| assertDNSRecordStatus(t.oc, t.gatewayName) | ||
| } | ||
|
|
||
| if !t.startedWithNoOLM { | ||
| g.By("Validating OLM-based provisioning before upgrade") | ||
| validateOLMBasedOSSM(t.oc, 20*time.Minute) | ||
| e2e.Logf("GatewayAPI resources successfully created with OLM-based provisioning") | ||
| } else { | ||
| g.By("Validating CIO-based (NO-OLM) provisioning before upgrade") | ||
| t.validateCIOProvisioning(ctx, false) // false = no migration occurred | ||
| e2e.Logf("GatewayAPI resources successfully created with CIO-based (NO-OLM) provisioning") | ||
| } | ||
|
|
||
| g.By("Creating HTTPRoute with backend") | ||
| backendName := "echo-backend-" + t.gatewayName | ||
| createHttpRoute(t.oc, t.gatewayName, t.routeName, t.hostname, backendName) | ||
|
|
||
| g.By("Waiting for HTTPRoute to be accepted") | ||
| _, err = assertHttpRouteSuccessful(t.oc, t.gatewayName, t.routeName) | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
|
|
||
| if t.loadBalancerSupported && t.managedDNS { | ||
| g.By("Verifying HTTP connectivity before upgrade") | ||
| assertHttpRouteConnection(t.hostname) | ||
| e2e.Logf("HTTPRoute connectivity verified before upgrade") | ||
| } | ||
| } | ||
|
|
||
| // Test validates that resources continue working during upgrade and validates provisioning method | ||
| func (t *GatewayAPIUpgradeTest) Test(ctx context.Context, f *e2e.Framework, done <-chan struct{}, _ upgrades.UpgradeType) { | ||
| g.By("Validating Gateway API resources remain functional after upgrade") | ||
|
|
||
| // Block until upgrade completes | ||
| g.By("Waiting for upgrade to complete") | ||
| <-done | ||
|
|
||
| g.By("Verifying Gateway still exists and is programmed") | ||
| _, err := checkGatewayStatus(t.oc, t.gatewayName, ingressNamespace, t.loadBalancerSupported) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Gateway should remain programmed") | ||
|
|
||
| g.By("Checking if GatewayAPIWithoutOLM feature gate is enabled after upgrade") | ||
| endsWithNoOLM := isNoOLMFeatureGateEnabled(t.oc) | ||
|
|
||
| // Determine if migration happened: started with OLM, ended with NO-OLM | ||
| migrationOccurred := !t.startedWithNoOLM && endsWithNoOLM | ||
| if migrationOccurred { | ||
| e2e.Logf("Migration detected: started with OLM, ended with NO-OLM") | ||
| } else { | ||
| e2e.Logf("No migration occurred (started with NO-OLM=%v, ended with NO-OLM=%v)", t.startedWithNoOLM, endsWithNoOLM) | ||
| } | ||
|
|
||
| if endsWithNoOLM { | ||
| g.By("GatewayAPIWithoutOLM is enabled - validating CIO-based (NO-OLM) provisioning") | ||
| t.validateCIOProvisioning(ctx, migrationOccurred) | ||
| } else { | ||
| g.By("GatewayAPIWithoutOLM is disabled - validating OLM-based provisioning") | ||
| // A shorter timeout here is because the resources should already exist post-upgrade state. | ||
| validateOLMBasedOSSM(t.oc, 2*time.Minute) | ||
gcs278 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| g.By("Verifying HTTPRoute still exists and is accepted after upgrade") | ||
| _, err = assertHttpRouteSuccessful(t.oc, t.gatewayName, t.routeName) | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
|
|
||
| if t.loadBalancerSupported && t.managedDNS { | ||
| g.By("Verifying HTTP connectivity after upgrade") | ||
| assertHttpRouteConnection(t.hostname) | ||
| } | ||
|
|
||
| if migrationOccurred { | ||
| e2e.Logf("Gateway API successfully migrated from OLM to CIO (NO-OLM) during upgrade") | ||
| } else if endsWithNoOLM { | ||
| e2e.Logf("Gateway API using CIO-based (NO-OLM) provisioning - no migration occurred") | ||
| } else { | ||
| e2e.Logf("Gateway API remains on OLM-based provisioning after upgrade") | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a bit of a nit, but is this really required since in both OLM enabled and disabled checks we have a log saying the resources were successfully checked?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea this is a bit verbose. It summarizes that the migration was successful and what type of migration happened. I think that's still helpful for clarity in the logs, it helps "signal" that we are at the end of the test function. |
||
| } | ||
|
|
||
| // validateCIOProvisioning validates that Gateway API is using CIO-based (NO-OLM) provisioning | ||
| // If migrationOccurred is true, validates the migration from OLM to CIO Sail Library | ||
| func (t *GatewayAPIUpgradeTest) validateCIOProvisioning(ctx context.Context, migrationOccurred bool) { | ||
| g.By("Verifying Istiod control plane is running") | ||
| err := checkIstiodRunning(t.oc, 2*time.Minute) | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
|
|
||
| g.By("Verifying CIO has taken ownership via GatewayClass") | ||
| // Check GatewayClass has CIO sail library finalizer | ||
| err = checkGatewayClassFinalizer(t.oc, gatewayClassName, "openshift.io/ingress-operator-sail-finalizer") | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "GatewayClass should have CIO sail library finalizer") | ||
|
|
||
| // Check GatewayClass has required CIO conditions | ||
| err = checkGatewayClassCondition(t.oc, gatewayClassName, gatewayClassControllerInstalledConditionType, metav1.ConditionTrue) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "GatewayClass should have ControllerInstalled condition") | ||
|
|
||
| err = checkGatewayClassCondition(t.oc, gatewayClassName, gatewayClassCRDsReadyConditionType, metav1.ConditionTrue) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "GatewayClass should have CRDsReady condition") | ||
|
|
||
| g.By("Verifying istiod deployment is managed by sail library") | ||
| err = checkIstiodManagedBySailLibrary(t.oc) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Istiod should be managed by sail library") | ||
|
|
||
| if migrationOccurred { | ||
| g.By("Verifying Istio CRDs remain managed by OLM after migration") | ||
| // When migrating from OLM, CRDs were installed by OLM and should remain OLM-managed | ||
| err = assertIstioCRDsOwnedByOLM(t.oc) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Istio CRDs should remain OLM-managed after migration") | ||
|
|
||
| g.By("Verifying OLM subscription still exists after migration") | ||
| // The OLM Subscription for Sail Operator should still exist (it's not removed during migration) | ||
| _, err = t.oc.Run("get").Args("subscription", "-n", expectedSubscriptionNamespace, expectedSubscriptionName, "-o", "name").Output() | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Sail Operator subscription should still exist after migration") | ||
|
|
||
| g.By("Verifying Istio CR was removed during migration") | ||
| out, err := t.oc.Run("get").Args("--ignore-not-found=true", "istio", istioName, "-o", "name").Output() | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
| o.Expect(strings.TrimSpace(out)).To(o.BeEmpty(), "Istio CR %q should not exist", istioName) | ||
|
|
||
| e2e.Logf("Successfully validated OLM to NO-OLM migration") | ||
| } else { | ||
| g.By("Verifying Istio CRDs are managed by CIO") | ||
| // When using CIO from the start, CRDs are CIO-managed | ||
| err := assertIstioCRDsOwnedByCIO(t.oc) | ||
| o.Expect(err).NotTo(o.HaveOccurred(), "Istio CRDs should be CIO-managed when using NO-OLM from the start") | ||
|
|
||
| g.By("Verifying CIO-managed resources") | ||
| // When using CIO from the start, Istio CR should not exist (CIO uses Sail Library directly) | ||
| out, err := t.oc.Run("get").Args("--ignore-not-found=true", "istio", istioName, "-o", "name").Output() | ||
| o.Expect(err).NotTo(o.HaveOccurred()) | ||
| o.Expect(strings.TrimSpace(out)).To(o.BeEmpty(), "Istio CR %q should not exist when using CIO-based provisioning", istioName) | ||
| e2e.Logf("Successfully validated CIO-based (NO-OLM) provisioning") | ||
| } | ||
| } | ||
|
|
||
| // Teardown cleans up Gateway API resources, Istio CR, and OSSM subscription | ||
| // This runs even if the test fails, ensuring complete cleanup | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering if need to clean up the resources, maybe keep them and allow must-gather to collect.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The issue is that the normal E2E test suite runs after the upgrade suite runs, and we have baked in assumptions that we are running on a fresh cluster (no OSSM), like in [OCPFeatureGate:GatewayAPIWithoutOLM] Ensure Sail operator resources are not installed . So if we left OSSM installed, we break our current tests. The other option is to rewrite our tests to not assume that it's a vanilla cluster (aka OSSM can pre-exist), or at least, skip certain checks when an upgrade was performed. However, I was cautious against having "dynamic assumptions" and more conditional skips in our tests. We'd probably have to pass some sort of "upgrade marker" into the standard E2E test suite. It felt easier to just clean up. The other part of this, is following the So, I'm still leaning towards a full-cleanup, mainly for the sake of simplicity, but let me know if you have any thoughts on a different way to do this. The nice thing about the new noOLM, is that we don't need cluster scoped resources like this 😄 |
||
| func (t *GatewayAPIUpgradeTest) Teardown(ctx context.Context, f *e2e.Framework) { | ||
| g.By("Deleting the Gateway") | ||
| err := t.oc.AdminGatewayApiClient().GatewayV1().Gateways(ingressNamespace).Delete(ctx, t.gatewayName, metav1.DeleteOptions{}) | ||
| if err != nil && !apierrors.IsNotFound(err) { | ||
| e2e.Logf("Failed to delete Gateway %q: %v", t.gatewayName, err) | ||
| } | ||
|
|
||
| // Wait for Gateway to be fully deleted before removing GatewayClass | ||
| // This prevents orphaned resources if the controller (defined by GatewayClass) is removed | ||
| // before Istiod completes cleanup | ||
| g.By("Waiting for Gateway to be fully deleted") | ||
| err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, false, func(ctx context.Context) (bool, error) { | ||
| _, err := t.oc.AdminGatewayApiClient().GatewayV1().Gateways(ingressNamespace).Get(ctx, t.gatewayName, metav1.GetOptions{}) | ||
| if apierrors.IsNotFound(err) { | ||
| e2e.Logf("Gateway %q successfully deleted", t.gatewayName) | ||
| return true, nil | ||
| } | ||
| return false, nil | ||
| }) | ||
| if err != nil { | ||
| e2e.Logf("Gateway %q still exists after 2 minutes, continuing cleanup anyway", t.gatewayName) | ||
| } | ||
|
|
||
| g.By("Deleting the GatewayClass") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we wait until the gateway is gone ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point - there is a race here that Istiod control plane could disappear before the Gateway is deleted. I'll add a wait. |
||
| err = t.oc.AdminGatewayApiClient().GatewayV1().GatewayClasses().Delete(ctx, gatewayClassName, metav1.DeleteOptions{}) | ||
| if err != nil && !apierrors.IsNotFound(err) { | ||
| e2e.Logf("Failed to delete GatewayClass %q: %v", gatewayClassName, err) | ||
| } | ||
|
|
||
| g.By("Deleting the Istio CR if it exists") | ||
| // This should get cleaned up by the CIO, but this is here just in case of failure | ||
| err = t.oc.Run("delete").Args("--ignore-not-found=true", "istio", istioName).Execute() | ||
| if err != nil { | ||
| e2e.Logf("Failed to delete Istio CR %q: %v", istioName, err) | ||
| } | ||
|
|
||
| g.By("Waiting for istiod pods to be deleted") | ||
| waitForIstiodPodDeletion(t.oc) | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| g.By("Deleting the Sail Operator subscription") | ||
| // This doesn't get deleted by the CIO, so must manually clean up | ||
| err = t.oc.Run("delete").Args("--ignore-not-found=true", "subscription", "-n", expectedSubscriptionNamespace, expectedSubscriptionName).Execute() | ||
| if err != nil { | ||
| e2e.Logf("Failed to delete Subscription %q: %v", expectedSubscriptionName, err) | ||
| } | ||
|
|
||
| g.By("Deleting Sail Operator CSV by label selector") | ||
| // Delete CSV using label selector to handle any version (e.g., servicemeshoperator3.v3.2.0) | ||
| labelSelector := fmt.Sprintf("operators.coreos.com/%s", serviceMeshOperatorName) | ||
| err = t.oc.Run("delete").Args("csv", "-n", expectedSubscriptionNamespace, "-l", labelSelector, "--ignore-not-found=true").Execute() | ||
| if err != nil { | ||
| e2e.Logf("Failed to delete CSV with label %q: %v", labelSelector, err) | ||
| } | ||
|
|
||
| g.By("Deleting OLM-managed Istio CRDs to clean up migration state") | ||
| // Delete Istio CRDs so subsequent NO-OLM tests don't see OLM-managed CRDs | ||
| // Use LabelSelector to only delete OLM-managed CRDs and suffix check as additional safety | ||
| crdList, err := t.oc.AdminApiextensionsClient().ApiextensionsV1().CustomResourceDefinitions().List(ctx, metav1.ListOptions{ | ||
| LabelSelector: "olm.managed=true", | ||
| }) | ||
| if err != nil { | ||
| e2e.Logf("Failed to list OLM-managed CRDs: %v", err) | ||
| } else { | ||
| for _, crd := range crdList.Items { | ||
| if strings.HasSuffix(crd.Name, "istio.io") { | ||
| err := t.oc.Run("delete").Args("--ignore-not-found=true", "crd", crd.Name).Execute() | ||
| if err != nil { | ||
| e2e.Logf("Failed to delete CRD %q: %v", crd.Name, err) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| e2e.Logf("Gateway API resources, Istio CR, OSSM subscription, CSV, and Istio CRDs successfully cleaned up") | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.