Skip to content

Commit

Permalink
fix: add check for all platform offering on components (opendatahub-i…
Browse files Browse the repository at this point in the history
…o#1205)

* fix: add check for all platform offering on components

- wait for the component's deployment to be ready before returning to the caller
- reduce logging
- prolong timeout for workbench, trustyai, dsp, kserve

Signed-off-by: Wen Zhou <[email protected]>

* fix: wrong log

Signed-off-by: Wen Zhou <[email protected]>

* test: increase timeout and move a bit of order

Signed-off-by: Wen Zhou <[email protected]>

---------

Signed-off-by: Wen Zhou <[email protected]>
Co-authored-by: ajaypratap003 <[email protected]>
  • Loading branch information
zdtsw and ajaypratap003 authored Aug 28, 2024
1 parent 742b09b commit ab7e04e
Show file tree
Hide file tree
Showing 15 changed files with 102 additions and 99 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec

# E2E tests additional flags
E2E_TEST_FLAGS = "--skip-deletion=false" -timeout 20m # See README.md, default go test timeout 10m
E2E_TEST_FLAGS = "--skip-deletion=false" -timeout 25m # See README.md, default go test timeout 10m

# Default image-build is to not use local odh-manifests folder
# set to "true" to use local instead
Expand Down
15 changes: 7 additions & 8 deletions components/codeflare/codeflare.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,16 +106,15 @@ func (c *CodeFlare) ReconcileComponent(ctx context.Context,
return err
}
l.Info("apply manifests done")
// CloudServiceMonitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus won't fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")

if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudServiceMonitoring handling
if platform == cluster.ManagedRhods {
// inject prometheus codeflare*.rules in to /opt/manifests/monitoring/prometheus/prometheus-configs.yaml
if err := c.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
Expand Down
20 changes: 12 additions & 8 deletions components/dashboard/dashboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,16 +141,14 @@ func (d *Dashboard) ReconcileComponent(ctx context.Context,
}
l.Info("apply manifests done")

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus won't fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentNameDownstream, dscispec.ApplicationsNamespace, 20, 3); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentNameDownstream, err)
}
l.Info("deployment is done, updating monitoring rules")
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentNameDownstream, dscispec.ApplicationsNamespace, 20, 3); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentNameDownstream, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if err := d.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentNameDownstream); err != nil {
return err
}
Expand All @@ -170,6 +168,12 @@ func (d *Dashboard) ReconcileComponent(ctx context.Context,
return err
}
l.Info("apply manifests done")
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentNameUpstream, dscispec.ApplicationsNamespace, 20, 3); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentNameUpstream, err)
}
}

return nil
}
}
Expand Down
16 changes: 7 additions & 9 deletions components/datasciencepipelines/datasciencepipelines.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,15 @@ func (d *DataSciencePipelines) ReconcileComponent(ctx context.Context,
}
l.Info("apply manifests done")

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus won't fire alerts when it is just startup
// only 1 replica should be very quick
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
// Wait for deployment available
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if err := d.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
15 changes: 8 additions & 7 deletions components/kserve/kserve.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,16 @@ func (k *Kserve) ReconcileComponent(ctx context.Context, cli client.Client,
}
}
l.WithValues("Path", Path).Info("apply manifests done for odh-model-controller")

// Wait for deployment available
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 3); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus won't fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoing rules")
}
// kesrve rules
if err := k.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
Expand Down
14 changes: 7 additions & 7 deletions components/kueue/kueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ func (k *Kueue) ReconcileComponent(ctx context.Context, cli client.Client, logge
return fmt.Errorf("failed to apply manifetss %s: %w", Path, err)
}
l.Info("apply manifests done")

if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus won't fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
}
if err := k.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
14 changes: 7 additions & 7 deletions components/modelmeshserving/modelmeshserving.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ func (m *ModelMeshServing) ReconcileComponent(ctx context.Context,
}

l.WithValues("Path", DependentPath).Info("apply manifests done for odh-model-controller")

if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if service is up, so prometheus won't fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
}
// first model-mesh rules
if err := m.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
Expand Down
12 changes: 6 additions & 6 deletions components/modelregistry/modelregistry.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,14 @@ func (m *ModelRegistry) ReconcileComponent(ctx context.Context, cli client.Clien
}
l.Info("apply extra manifests done")

if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, m.GetComponentName(), dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
}
if err := m.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
14 changes: 7 additions & 7 deletions components/ray/ray.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,15 @@ func (r *Ray) ReconcileComponent(ctx context.Context, cli client.Client, logger
return fmt.Errorf("failed to apply manifets from %s : %w", RayPath, err)
}
l.Info("apply manifests done")

if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus won't fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
}
if err := r.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
15 changes: 7 additions & 8 deletions components/trainingoperator/trainingoperator.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,15 @@ func (r *TrainingOperator) ReconcileComponent(ctx context.Context, cli client.Cl
return err
}
l.Info("apply manifests done")

if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus wont fire alerts when it is just startup
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
logger.Info("deployment for " + ComponentName + " is done, updating monitoring rules")
}
l.Info("deployment is done, updating monitoring rules")
if err := r.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
13 changes: 7 additions & 6 deletions components/trustyai/trustyai.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,15 @@ func (t *TrustyAI) ReconcileComponent(ctx context.Context, cli client.Client, lo
}
l.Info("apply manifests done")

// Wait for deployment available
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
}
if err := t.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
16 changes: 8 additions & 8 deletions components/workbenches/workbenches.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,16 +172,16 @@ func (w *Workbenches) ReconcileComponent(ctx context.Context, cli client.Client,
return err
}
l.WithValues("Path", manifestsPath).Info("apply manifests done notebook image")

// Wait for deployment available
if enabled {
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 2); err != nil {
return fmt.Errorf("deployments for %s are not ready to server: %w", ComponentName, err)
}
}

// CloudService Monitoring handling
if platform == cluster.ManagedRhods {
if enabled {
// first check if the service is up, so prometheus wont fire alerts when it is just startup
// only 1 replica set timeout to 1min
if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployments for %s are not ready to server: %w", ComponentName, err)
}
l.Info("deployment is done, updating monitoring rules")
}
if err := w.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cluster/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ func WaitForDeploymentAvailable(ctx context.Context, c client.Client, componentN
return false, fmt.Errorf("error fetching list of deployments: %w", err)
}

ctrl.Log.Info("waiting for " + strconv.Itoa(len(componentDeploymentList.Items)) + " deployment to be ready for %s" + componentName)
ctrl.Log.Info("waiting for " + strconv.Itoa(len(componentDeploymentList.Items)) + " deployment to be ready for " + componentName)
for _, deployment := range componentDeploymentList.Items {
if deployment.Status.ReadyReplicas != deployment.Status.Replicas {
return false, nil
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/controller_setup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func NewTestContext() (*testContext, error) {
customClient: custClient,
operatorNamespace: opNamespace,
applicationsNamespace: testDSCI.Spec.ApplicationsNamespace,
resourceCreationTimeout: time.Minute * 2,
resourceCreationTimeout: time.Minute * 5, // since we introduce check for all deployment, we need prolong time here too and match what we set in the component
resourceRetryInterval: time.Second * 10,
ctx: context.TODO(),
testDsc: testDSC,
Expand Down
31 changes: 16 additions & 15 deletions tests/e2e/dsc_creation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,10 @@ func creationTestSuite(t *testing.T) {
t.Run("Creation of more than one of DataScienceCluster instance", func(t *testing.T) {
testCtx.testDSCDuplication(t)
})
t.Run("Validate all deployed components", func(t *testing.T) {
err = testCtx.testAllApplicationCreation(t)
require.NoError(t, err, "error testing deployments for DataScienceCluster: "+testCtx.testDsc.Name)
})
t.Run("Validate DSCInitialization instance", func(t *testing.T) {
err = testCtx.validateDSCI()
require.NoError(t, err, "error validating DSCInitialization instance")
})
t.Run("Validate DataScienceCluster instance", func(t *testing.T) {
err = testCtx.validateDSC()
require.NoError(t, err, "error validating DataScienceCluster instance")
})
t.Run("Validate Ownerrefrences exist", func(t *testing.T) {
err = testCtx.testOwnerrefrences()
require.NoError(t, err, "error getting all DataScienceCluster's Ownerrefrences")
Expand All @@ -78,6 +70,15 @@ func creationTestSuite(t *testing.T) {
err = testCtx.testDefaultCertsAvailable()
require.NoError(t, err, "error getting default cert secrets for Kserve")
})
t.Run("Validate Knative resoruce", func(t *testing.T) {
err = testCtx.validateDSC()
require.NoError(t, err, "error getting Knatvie resrouce as part of DataScienceCluster validation")
})
t.Run("Validate all deployed components", func(t *testing.T) {
// this will take about 5-6mins to complete
err = testCtx.testAllApplicationCreation(t)
require.NoError(t, err, "error testing deployments for DataScienceCluster: "+testCtx.testDsc.Name)
})
t.Run("Validate default model registry cert available", func(t *testing.T) {
err = testCtx.testDefaultModelRegistryCertAvailable()
require.NoError(t, err, "error getting default cert secret for ModelRegistry")
Expand Down Expand Up @@ -261,11 +262,6 @@ func (tc *testContext) testAllApplicationCreation(t *testing.T) error { //nolint
}
tc.testDsc = createdDSC

// Verify DSC instance is in Ready phase
if tc.testDsc.Status.Phase != "Ready" {
return fmt.Errorf("DSC instance is not in Ready phase. Current phase: %v", tc.testDsc.Status.Phase)
}

components, err := tc.testDsc.GetComponents()
if err != nil {
return err
Expand All @@ -287,6 +283,10 @@ func (tc *testContext) testAllApplicationCreation(t *testing.T) error { //nolint
}
})
}
// Verify DSC instance is in Ready phase in the end when all components are up and running
if tc.testDsc.Status.Phase != "Ready" {
return fmt.Errorf("DSC instance is not in Ready phase. Current phase: %v", tc.testDsc.Status.Phase)
}

return nil
}
Expand Down Expand Up @@ -354,6 +354,7 @@ func (tc *testContext) validateDSCI() error {
return nil
}

// test if knative resource has been created.
func (tc *testContext) validateDSC() error {
expServingSpec := infrav1.ServingSpec{
ManagementState: operatorv1.Managed,
Expand Down Expand Up @@ -586,8 +587,8 @@ func (tc *testContext) testUpdateDSCComponentEnabled() error {
return fmt.Errorf("error after retry %w", err)
}

// Sleep for 40 seconds to allow the operator to reconcile
time.Sleep(4 * tc.resourceRetryInterval)
// Sleep for 80 seconds to allow the operator to reconcile
time.Sleep(8 * tc.resourceRetryInterval)
_, err = tc.kubeClient.AppsV1().Deployments(tc.applicationsNamespace).Get(tc.ctx, dashboardDeploymentName, metav1.GetOptions{})
if err != nil {
if k8serr.IsNotFound(err) {
Expand Down

0 comments on commit ab7e04e

Please sign in to comment.