diff --git a/components/README.md b/components/README.md index 47f812a45f9..a4f6ccf1c47 100644 --- a/components/README.md +++ b/components/README.md @@ -36,6 +36,8 @@ can be found [here](https://github.com/opendatahub-io/opendatahub-operator/tree/ GetComponentName() string GetManagementState() operatorv1.ManagementState SetImageParamsMap(imageMap map[string]string) map[string]string + UpdatePrometheusConfig(cli client.Client, enable bool, component string) error + WaitForDeploymentAvailable(ctx context.Context, r *rest.Config, c string, n string, i int, t int) error } ``` ### Add reconcile and Events diff --git a/components/codeflare/codeflare.go b/components/codeflare/codeflare.go index 487f30a0cfa..ee6783f83a4 100644 --- a/components/codeflare/codeflare.go +++ b/components/codeflare/codeflare.go @@ -3,16 +3,19 @@ package codeflare import ( + "context" "fmt" "path/filepath" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring" ) var ( @@ -52,7 +55,7 @@ func (c *CodeFlare) GetComponentName() string { return ComponentName } -func (c *CodeFlare) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { +func (c *CodeFlare) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { var imageParamMap = map[string]string{ "odh-codeflare-operator-controller-image": "RELATED_IMAGE_ODH_CODEFLARE_OPERATOR_IMAGE", // no need mcad, embedded in cfo "namespace": dscispec.ApplicationsNamespace, @@ -101,6 +104,14 @@ func (c *CodeFlare) 
ReconcileComponent(cli client.Client, owner metav1.Object, d // CloudServiceMonitoring handling if platform == deploy.ManagedRhods { + if enabled { + // first check if the service is up, so prometheus won't fire alerts when it is just starting up + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil { + return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err) + } + fmt.Printf("deployment for %s is done, updating monitoring rules", ComponentName) + } + // inject prometheus codeflare*.rules into /opt/manifests/monitoring/prometheus/prometheus-configs.yaml if err = c.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil { return err diff --git a/components/component.go b/components/component.go index 20761572fa3..e88eacd7f47 100644 --- a/components/component.go +++ b/components/component.go @@ -1,6 +1,7 @@ package components import ( + "context" "fmt" "os" "path/filepath" @@ -9,6 +10,7 @@ import ( operatorv1 "github.com/openshift/api/operator/v1" "gopkg.in/yaml.v2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" @@ -78,13 +80,14 @@ type ManifestsConfig struct { } type ComponentInterface interface { - ReconcileComponent(cli client.Client, owner metav1.Object, DSCISpec *dsciv1.DSCInitializationSpec, currentComponentStatus bool) error + ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, DSCISpec *dsciv1.DSCInitializationSpec, currentComponentStatus bool) error Cleanup(cli client.Client, DSCISpec *dsciv1.DSCInitializationSpec) error GetComponentName() string GetManagementState() operatorv1.ManagementState SetImageParamsMap(imageMap map[string]string) map[string]string OverrideManifests(platform string) error UpdatePrometheusConfig(cli 
client.Client, enable bool, component string) error + // WaitForDeploymentAvailable(ctx context.Context, r *rest.Config, c string, n string, i int, t int) error } // UpdatePrometheusConfig update prometheus-configs.yaml to include/exclude .rules @@ -181,3 +184,34 @@ func (c *Component) UpdatePrometheusConfig(cli client.Client, enable bool, compo } return nil } + +// WaitForDeploymentAvailable to check if component deployment from 'namepsace' is ready within 'timeout' before apply prometheus rules for the component +// func (c *Component) WaitForDeploymentAvailable(ctx context.Context, restConfig *rest.Config, componentName string, namespace string, interval int, timeout int) error { +// resourceInterval := time.Duration(interval) * time.Second +// resourceTimeout := time.Duration(timeout) * time.Minute +// return wait.PollUntilContextTimeout(context.TODO(), resourceInterval, resourceTimeout, true, func(ctx context.Context) (bool, error) { +// clientset, err := kubernetes.NewForConfig(restConfig) +// if err != nil { +// return false, fmt.Errorf("error getting client %w", err) +// } +// componentDeploymentList, err := clientset.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{ +// LabelSelector: "app.opendatahub.io/" + componentName, +// }) +// if err != nil { +// if errors.IsNotFound(err) { +// return false, nil +// } +// } +// isReady := false +// if len(componentDeploymentList.Items) != 0 { +// for _, deployment := range componentDeploymentList.Items { +// if deployment.Status.ReadyReplicas == deployment.Status.Replicas { +// isReady = true +// } else { +// isReady = false +// } +// } +// } +// return isReady, nil +// }) +// } diff --git a/components/dashboard/dashboard.go b/components/dashboard/dashboard.go index f879e67bf2d..e850829084b 100644 --- a/components/dashboard/dashboard.go +++ b/components/dashboard/dashboard.go @@ -13,6 +13,7 @@ import ( v1 "k8s.io/api/core/v1" apierrs "k8s.io/apimachinery/pkg/api/errors" metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" @@ -20,6 +21,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/common" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring" ) var ( @@ -77,7 +79,13 @@ func (d *Dashboard) GetComponentName() string { } //nolint:gocyclo -func (d *Dashboard) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, currentComponentStatus bool) error { +func (d *Dashboard) ReconcileComponent(ctx context.Context, + cli client.Client, + resConf *rest.Config, + owner metav1.Object, + dscispec *dsciv1.DSCInitializationSpec, + currentComponentStatus bool, +) error { var imageParamMap = map[string]string{ "odh-dashboard-image": "RELATED_IMAGE_ODH_DASHBOARD_IMAGE", } @@ -161,6 +169,14 @@ func (d *Dashboard) ReconcileComponent(cli client.Client, owner metav1.Object, d } // CloudService Monitoring handling if platform == deploy.ManagedRhods { + if enabled { + // first check if the service is up, so prometheus won't fire alerts when it is just starting up + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentNameSupported, dscispec.ApplicationsNamespace, 20, 3); err != nil { + return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentNameSupported, err) + } + fmt.Printf("deployment for %s is done, updating monitoring rules", ComponentNameSupported) + } + if err := d.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentNameSupported); err != nil { return err } diff --git a/components/datasciencepipelines/datasciencepipelines.go b/components/datasciencepipelines/datasciencepipelines.go index 8bc37de2cc3..639e90fb328 100644 ---
a/components/datasciencepipelines/datasciencepipelines.go +++ b/components/datasciencepipelines/datasciencepipelines.go @@ -3,15 +3,19 @@ package datasciencepipelines import ( + "context" + "fmt" "path/filepath" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring" ) var ( @@ -49,7 +53,13 @@ func (d *DataSciencePipelines) GetComponentName() string { return ComponentName } -func (d *DataSciencePipelines) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { +func (d *DataSciencePipelines) ReconcileComponent(ctx context.Context, + cli client.Client, + resConf *rest.Config, + owner metav1.Object, + dscispec *dsciv1.DSCInitializationSpec, + _ bool, +) error { var imageParamMap = map[string]string{ "IMAGES_APISERVER": "RELATED_IMAGE_ODH_ML_PIPELINES_API_SERVER_IMAGE", "IMAGES_ARTIFACT": "RELATED_IMAGE_ODH_ML_PIPELINES_ARTIFACT_MANAGER_IMAGE", @@ -87,6 +97,15 @@ func (d *DataSciencePipelines) ReconcileComponent(cli client.Client, owner metav } // CloudService Monitoring handling if platform == deploy.ManagedRhods { + if enabled { + // first check if the service is up, so prometheus won't fire alerts when it is just starting up + // only 1 replica, should be very quick + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil { + return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err) + } + fmt.Printf("deployment for %s is done, updating monitoring rules", ComponentName) + } + if err := d.UpdatePrometheusConfig(cli, enabled && 
monitoringEnabled, ComponentName); err != nil { return err } diff --git a/components/kserve/kserve.go b/components/kserve/kserve.go index 1d954cfa434..c69e4a47876 100644 --- a/components/kserve/kserve.go +++ b/components/kserve/kserve.go @@ -2,6 +2,7 @@ package kserve import ( + "context" "fmt" "path/filepath" "strings" @@ -9,6 +10,7 @@ import ( "github.com/hashicorp/go-multierror" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" @@ -79,7 +81,7 @@ func (k *Kserve) GetComponentName() string { return ComponentName } -func (k *Kserve) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { +func (k *Kserve) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { // paramMap for Kserve to use. 
var imageParamMap = map[string]string{} diff --git a/components/modelmeshserving/modelmeshserving.go b/components/modelmeshserving/modelmeshserving.go index c2c3e451274..22c65a9a31b 100644 --- a/components/modelmeshserving/modelmeshserving.go +++ b/components/modelmeshserving/modelmeshserving.go @@ -2,17 +2,21 @@ package modelmeshserving import ( + "context" + "fmt" "path/filepath" "strings" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring" ) var ( @@ -67,7 +71,13 @@ func (m *ModelMeshServing) GetComponentName() string { return ComponentName } -func (m *ModelMeshServing) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { +func (m *ModelMeshServing) ReconcileComponent(ctx context.Context, + cli client.Client, + resConf *rest.Config, + owner metav1.Object, + dscispec *dsciv1.DSCInitializationSpec, + _ bool, +) error { var imageParamMap = map[string]string{ "odh-mm-rest-proxy": "RELATED_IMAGE_ODH_MM_REST_PROXY_IMAGE", "odh-modelmesh-runtime-adapter": "RELATED_IMAGE_ODH_MODELMESH_RUNTIME_ADAPTER_IMAGE", @@ -137,12 +147,25 @@ func (m *ModelMeshServing) ReconcileComponent(cli client.Client, owner metav1.Ob } // CloudService Monitoring handling - // TODO: cleanup logic in rhods-2.5 if platform == deploy.ManagedRhods { + if enabled { + // first check if the 1st service is up, so prometheus wont fire alerts when it is just startup + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil { 
+ return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err) + } + fmt.Printf("deployment for %s is done, updating monitoring rules", ComponentName) + } // first model-mesh rules if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil { return err } + if enabled { + // then check if the 2nd service is up, so prometheus won't fire alerts when it is just starting up + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, DependentComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil { + return fmt.Errorf("deployment %s is not ready to serve: %w", DependentComponentName, err) + } + fmt.Printf("deployment for %s is done, updating monitoring rules", DependentComponentName) + } // then odh-model-controller rules if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, DependentComponentName); err != nil { return err diff --git a/components/ray/ray.go b/components/ray/ray.go index 0520e4a260f..0dc5f8d082f 100644 --- a/components/ray/ray.go +++ b/components/ray/ray.go @@ -3,15 +3,19 @@ package ray import ( + "context" + "fmt" "path/filepath" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring" ) var ( @@ -49,7 +53,7 @@ func (r *Ray) GetComponentName() string { return ComponentName } -func (r *Ray) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { +func (r *Ray) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { var 
imageParamMap = map[string]string{ "odh-kuberay-operator-controller-image": "RELATED_IMAGE_ODH_KUBERAY_OPERATOR_CONTROLLER_IMAGE", "namespace": dscispec.ApplicationsNamespace, @@ -80,6 +84,13 @@ func (r *Ray) ReconcileComponent(cli client.Client, owner metav1.Object, dscispe } // CloudService Monitoring handling if platform == deploy.ManagedRhods { + if enabled { + // first check if the service is up, so prometheus won't fire alerts when it is just starting up + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil { + return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err) + } + fmt.Printf("deployment for %s is done, updating monitoring rules", ComponentName) + } if err := r.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil { return err } diff --git a/components/trustyai/trustyai.go b/components/trustyai/trustyai.go index 5dba88f14a9..1bdd7dbc734 100644 --- a/components/trustyai/trustyai.go +++ b/components/trustyai/trustyai.go @@ -2,10 +2,12 @@ package trustyai import ( + "context" "path/filepath" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" @@ -48,7 +50,7 @@ func (t *TrustyAI) GetComponentName() string { return ComponentName } -func (t *TrustyAI) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { +func (t *TrustyAI) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error { var imageParamMap = map[string]string{ "trustyaiServiceImage": "RELATED_IMAGE_ODH_TRUSTYAI_SERVICE_IMAGE", "trustyaiOperatorImage": "RELATED_IMAGE_ODH_TRUSTYAI_SERVICE_OPERATOR_IMAGE", diff --git 
a/components/workbenches/workbenches.go b/components/workbenches/workbenches.go index 8e7fbdf9775..8c89ef20b49 100644 --- a/components/workbenches/workbenches.go +++ b/components/workbenches/workbenches.go @@ -2,17 +2,21 @@ package workbenches import ( + "context" + "fmt" "path/filepath" "strings" operatorv1 "github.com/openshift/api/operator/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" dsci "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring" ) var ( @@ -93,7 +97,7 @@ func (w *Workbenches) GetComponentName() string { return ComponentName } -func (w *Workbenches) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsci.DSCInitializationSpec, _ bool) error { +func (w *Workbenches) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsci.DSCInitializationSpec, _ bool) error { var imageParamMap = map[string]string{ "odh-notebook-controller-image": "RELATED_IMAGE_ODH_NOTEBOOK_CONTROLLER_IMAGE", "odh-kf-notebook-controller-image": "RELATED_IMAGE_ODH_KF_NOTEBOOK_CONTROLLER_IMAGE", @@ -169,6 +173,14 @@ func (w *Workbenches) ReconcileComponent(cli client.Client, owner metav1.Object, } // CloudService Monitoring handling if platform == deploy.ManagedRhods { + if enabled { + // first check if the service is up, so prometheus won't fire alerts when it is just starting up + // only 1 replica, set timeout to 1min + if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil { + return fmt.Errorf("deployments for %s are not ready to serve: %w", ComponentName, err) + } + 
fmt.Printf("deployments for %s are done, updating monitoring rules", ComponentName) + } if err := w.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil { return err } diff --git a/controllers/datasciencecluster/datasciencecluster_controller.go b/controllers/datasciencecluster/datasciencecluster_controller.go index 584a3d81be9..0e71b7688ab 100644 --- a/controllers/datasciencecluster/datasciencecluster_controller.go +++ b/controllers/datasciencecluster/datasciencecluster_controller.go @@ -265,7 +265,7 @@ func (r *DataScienceClusterReconciler) reconcileSubComponent(ctx context.Context } // Reconcile component - err = component.ReconcileComponent(r.Client, instance, r.DataScienceCluster.DSCISpec, instance.Status.InstalledComponents[componentName]) + err = component.ReconcileComponent(ctx, r.Client, r.RestConfig, instance, r.DataScienceCluster.DSCISpec, instance.Status.InstalledComponents[componentName]) if err != nil { // reconciliation failed: log errors, raise event and update status accordingly diff --git a/pkg/monitoring/monitoring.go b/pkg/monitoring/monitoring.go new file mode 100644 index 00000000000..2b0424dff86 --- /dev/null +++ b/pkg/monitoring/monitoring.go @@ -0,0 +1,45 @@ +package monitoring + +import ( + "context" + "fmt" + "time" + + errors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// WaitForDeploymentAvailable checks if the component deployment in 'namespace' is ready within 'timeout' before applying prometheus rules for the component +func WaitForDeploymentAvailable(ctx context.Context, restConfig *rest.Config, componentName string, namespace string, interval int, timeout int) error { + resourceInterval := time.Duration(interval) * time.Second + resourceTimeout := time.Duration(timeout) * time.Minute + return wait.PollUntilContextTimeout(ctx, resourceInterval, 
resourceTimeout, true, func(ctx context.Context) (bool, error) { + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return false, fmt.Errorf("error getting client: %w", err) + } + componentDeploymentList, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app.opendatahub.io/" + componentName, + }) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, err + } + isReady := false + fmt.Printf("we are waiting for %d deployment(s) to be ready for component %s\n", len(componentDeploymentList.Items), componentName) + if len(componentDeploymentList.Items) != 0 { + for _, deployment := range componentDeploymentList.Items { + if deployment.Status.ReadyReplicas != deployment.Status.Replicas { + isReady = false + break + } + isReady = true + } + } + return isReady, nil + }) +}