Skip to content

Commit

Permalink
fix(mm-monitoring): revert the code logic but set to disable as delete (
Browse files Browse the repository at this point in the history
red-hat-data-services#153)

Signed-off-by: Wen Zhou <[email protected]>

fix(dsc): stop watching validationwebhook for non-create/delete events (red-hat-data-services#150)

* fix(dsc): stop watching validationwebhook for non-create/delete events
* update: remove CRD in the DSC watch and cleanup debug
* fix: add more ignore on the label changes
---------

Signed-off-by: Wen Zhou <[email protected]>

Revert "Remove modelmesh monitoring"

This reverts commit 91dd78f.

fix(modelmesh): remove wrong check on the deployment of modelmesh (red-hat-data-services#148)

Signed-off-by: Wen Zhou <[email protected]>

Retain existing DSCI values

Explicilty add Servicemesh in default dsci

Update defaults for modelmesh

(cherry picked from commit 6eb6d4a)
  • Loading branch information
zdtsw authored Dec 6, 2023
1 parent 71d8ddf commit a4788f3
Show file tree
Hide file tree
Showing 13 changed files with 113 additions and 51 deletions.
10 changes: 9 additions & 1 deletion bundle/manifests/rhods-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:
"managementState": "Managed"
},
"modelmeshserving": {
"managementState": "Removed"
"managementState": "Managed"
},
"ray": {
"managementState": "Removed"
Expand Down Expand Up @@ -64,6 +64,14 @@ metadata:
"monitoring": {
"managementState": "Managed",
"namespace": "redhat-ods-monitoring"
},
"serviceMesh": {
"controlPlane": {
"metricsCollection": "Istio",
"name": "data-science-smcp",
"namespace": "istio-system"
},
"managementState": "Managed"
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion components/codeflare/codeflare.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func (c *CodeFlare) ReconcileComponent(ctx context.Context, cli client.Client, r
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
fmt.Printf("deployment for %s is done, updating monitoring rules\n", ComponentName)
}

// inject prometheus codeflare*.rules in to /opt/manifests/monitoring/prometheus/prometheus-configs.yaml
Expand Down
2 changes: 1 addition & 1 deletion components/dashboard/dashboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ func (d *Dashboard) ReconcileComponent(ctx context.Context,
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentNameSupported, dscispec.ApplicationsNamespace, 20, 3); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentNameSupported)
fmt.Printf("deployment for %s is done, updating monitoring rules\n", ComponentNameSupported)
}

if err := d.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentNameSupported); err != nil {
Expand Down
2 changes: 1 addition & 1 deletion components/datasciencepipelines/datasciencepipelines.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func (d *DataSciencePipelines) ReconcileComponent(ctx context.Context,
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
fmt.Printf("deployment for %s is done, updating monitoring rules\n", ComponentName)
}

if err := d.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
Expand Down
25 changes: 16 additions & 9 deletions components/modelmeshserving/modelmeshserving.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
var (
ComponentName = "model-mesh"
Path = deploy.DefaultManifestPath + "/" + ComponentName + "/overlays/odh"
monitoringPath = deploy.DefaultManifestPath + "/" + "modelmesh-monitoring/base"
DependentComponentName = "odh-model-controller"
DependentPath = deploy.DefaultManifestPath + "/" + DependentComponentName + "/base"
)
Expand Down Expand Up @@ -146,26 +147,32 @@ func (m *ModelMeshServing) ReconcileComponent(ctx context.Context,
}
}

// Get monitoring namespace
var monitoringNamespace string
if dscispec.Monitoring.Namespace != "" {
monitoringNamespace = dscispec.Monitoring.Namespace
} else {
monitoringNamespace = dscispec.ApplicationsNamespace
}

// Ensure we do not deploy modelmesh-monitoring if it has been there from previous releases
if err = deploy.DeployManifestsFromPath(cli, owner, monitoringPath, monitoringNamespace, ComponentName, false); err != nil {
return err
}

// CloudService Monitoring handling
if platform == deploy.ManagedRhods {
if enabled {
// first check if the 1st service is up, so prometheus wont fire alerts when it is just startup
// first check if service is up, so prometheus wont fire alerts when it is just startup
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
fmt.Printf("deployment for %s is done, updating monitoring rules\n", ComponentName)
}
// first model-mesh rules
if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
if enabled {
// then check if the 2nd service is up, so prometheus wont fire alerts when it is just startup
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, DependentComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment %s is not ready to server: %w", DependentComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", DependentComponentName)
}
// then odh-model-controller rules
if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, DependentComponentName); err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion components/ray/ray.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ func (r *Ray) ReconcileComponent(ctx context.Context, cli client.Client, resConf
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
fmt.Printf("deployment for %s is done, updating monitoring rules\n", ComponentName)
}
if err := r.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion components/workbenches/workbenches.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ func (w *Workbenches) ReconcileComponent(ctx context.Context, cli client.Client,
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployments for %s are not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployments for %s are done, updating monitoing rules", ComponentName)
fmt.Printf("deployments for %s are done, updating monitoring rules\n", ComponentName)
}
if err := w.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ spec:
kserve:
managementState: "Managed"
modelmeshserving:
managementState: "Removed"
managementState: "Managed"
ray:
managementState: "Removed"
workbenches:
Expand Down
6 changes: 6 additions & 0 deletions config/samples/dscinitialization_v1_dscinitialization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,10 @@ spec:
managementState: "Managed"
namespace: 'redhat-ods-monitoring'
applicationsNamespace: 'redhat-ods-applications'
serviceMesh:
controlPlane:
metricsCollection: Istio
name: data-science-smcp
namespace: istio-system
managementState: Managed

69 changes: 60 additions & 9 deletions controllers/datasciencecluster/datasciencecluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"errors"
"fmt"
"reflect"
"strings"
"time"

"github.com/go-logr/logr"
Expand All @@ -35,7 +36,6 @@ import (
corev1 "k8s.io/api/core/v1"
netv1 "k8s.io/api/networking/v1"
authv1 "k8s.io/api/rbac/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -324,6 +324,58 @@ var configMapPredicates = predicate.Funcs{
},
}

// a workaround for 2.5 due to odh-model-controller serivceaccount keeps updates with label
var saPredicates = predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
if e.ObjectNew.GetName() == "odh-model-controller" && e.ObjectNew.GetNamespace() == "redhat-ods-applications" {
return false
}
return true
},
}

// a workaround for 2.5 due to modelmesh-servingruntime.serving.kserve.io keeps updates
var modelMeshwebhookPredicates = predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
return e.ObjectNew.GetName() != "modelmesh-servingruntime.serving.kserve.io"
},
}

var modelMeshRolePredicates = predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
notAllowedNames := []string{"leader-election-role", "proxy-role", "metrics-reader", "kserve-prometheus-k8s", "odh-model-controller-role"}
for _, notallowedName := range notAllowedNames {
if e.ObjectNew.GetName() == notallowedName {
return false
}
}
return true
},
}

var modelMeshRBPredicates = predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
notAllowedNames := []string{"leader-election-rolebinding", "proxy-rolebinding", "odh-model-controller-rolebinding-opendatahub"}
for _, notallowedName := range notAllowedNames {
if e.ObjectNew.GetName() == notallowedName {
return false
}
}
return true
},
}

// ignore label updates if it is from application namespace
var modelMeshGeneralPredicates = predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
if strings.Contains(e.ObjectNew.GetName(), "odh-model-controller") || strings.Contains(e.ObjectNew.GetName(), "kserve") {
return false
} else {
return true
}
},
}

// SetupWithManager sets up the controller with the Manager.
func (r *DataScienceClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
Expand All @@ -332,26 +384,25 @@ func (r *DataScienceClusterReconciler) SetupWithManager(mgr ctrl.Manager) error
Owns(&corev1.Secret{}).
Owns(&corev1.ConfigMap{}, builder.WithPredicates(configMapPredicates)).
Owns(&netv1.NetworkPolicy{}).
Owns(&authv1.Role{}).
Owns(&authv1.RoleBinding{}).
Owns(&authv1.ClusterRole{}).
Owns(&authv1.ClusterRoleBinding{}).
Owns(&authv1.Role{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, modelMeshRolePredicates))).
Owns(&authv1.RoleBinding{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, modelMeshRBPredicates))).
Owns(&authv1.ClusterRole{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, modelMeshRolePredicates))).
Owns(&authv1.ClusterRoleBinding{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, modelMeshRBPredicates))).
Owns(&appsv1.Deployment{}).
Owns(&appsv1.ReplicaSet{}).
Owns(&corev1.Pod{}).
Owns(&corev1.PersistentVolumeClaim{}).
Owns(&corev1.Service{}).
Owns(&corev1.Service{}, builder.WithPredicates(predicate.Or(predicate.GenerationChangedPredicate{}, modelMeshGeneralPredicates))).
Owns(&appsv1.DaemonSet{}).
Owns(&appsv1.StatefulSet{}).
Owns(&ocappsv1.DeploymentConfig{}).
Owns(&ocimgv1.ImageStream{}).
Owns(&ocbuildv1.BuildConfig{}).
Owns(&apiextensionsv1.CustomResourceDefinition{}).
Owns(&apiregistrationv1.APIService{}).
Owns(&netv1.Ingress{}).
Owns(&admv1.MutatingWebhookConfiguration{}).
Owns(&admv1.ValidatingWebhookConfiguration{}).
Owns(&corev1.ServiceAccount{}).
Owns(&admv1.ValidatingWebhookConfiguration{}, builder.WithPredicates(modelMeshwebhookPredicates)).
Owns(&corev1.ServiceAccount{}, builder.WithPredicates(saPredicates)).
Watches(&source.Kind{Type: &dsci.DSCInitialization{}}, handler.EnqueueRequestsFromMapFunc(r.watchDataScienceClusterResources)).
Watches(&source.Kind{Type: &corev1.ConfigMap{}}, handler.EnqueueRequestsFromMapFunc(r.watchDataScienceClusterResources), builder.WithPredicates(configMapPredicates)).
// this predicates prevents meaningless reconciliations from being triggered
Expand Down
2 changes: 2 additions & 0 deletions get_all_manifests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ rm -fr ./odh-manifests/* ./.odh-manifests-tmp/
mkdir -p ./.odh-manifests-tmp/ ./odh-manifests/
wget -q -c ${MANIFESTS_TARBALL_URL} -O - | tar -zxv -C ./.odh-manifests-tmp/ --strip-components 1 > /dev/null

# mm-monitroing
cp -r ./.odh-manifests-tmp/modelmesh-monitoring/ ./odh-manifests

# Dashboard
cp -r ./.odh-manifests-tmp/odh-dashboard/ ./odh-manifests/dashboard
Expand Down
2 changes: 1 addition & 1 deletion pkg/monitoring/monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func WaitForDeploymentAvailable(ctx context.Context, restConfig *rest.Config, co
}
}
isReady := false
fmt.Printf("we are waiting for %d deployment ready for component %s\n", len(componentDeploymentList.Items), componentName)
fmt.Printf("waiting for %d deployment to be ready for %s\n", len(componentDeploymentList.Items), componentName)
if len(componentDeploymentList.Items) != 0 {
for _, deployment := range componentDeploymentList.Items {
if deployment.Status.ReadyReplicas == deployment.Status.Replicas {
Expand Down
38 changes: 13 additions & 25 deletions pkg/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package upgrade

import (
"context"
"encoding/json"
"fmt"
"os"
// "reflect"
Expand All @@ -18,7 +17,6 @@ import (
apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apierrs "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -35,6 +33,7 @@ import (
"github.com/opendatahub-io/opendatahub-operator/v2/components/ray"
"github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai"
"github.com/opendatahub-io/opendatahub-operator/v2/components/workbenches"
infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/infrastructure/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
)
Expand Down Expand Up @@ -163,7 +162,7 @@ func CreateDefaultDSC(cli client.Client, platform deploy.Platform) error {
Component: components.Component{ManagementState: operatorv1.Managed},
},
ModelMeshServing: modelmeshserving.ModelMeshServing{
Component: components.Component{ManagementState: operatorv1.Removed},
Component: components.Component{ManagementState: operatorv1.Managed},
},
DataSciencePipelines: datasciencepipelines.DataSciencePipelines{
Component: components.Component{ManagementState: operatorv1.Managed},
Expand Down Expand Up @@ -207,6 +206,14 @@ func CreateDefaultDSCI(cli client.Client, platform deploy.Platform, appNamespace
ManagementState: operatorv1.Managed,
Namespace: monNamespace,
},
ServiceMesh: infrav1.ServiceMeshSpec{
ManagementState: "Managed",
ControlPlane: infrav1.ControlPlaneSpec{
Name: "data-science-smcp",
Namespace: "istio-system",
MetricsCollection: "Istio",
},
},
}

defaultDsci := &dsci.DSCInitialization{
Expand All @@ -220,14 +227,6 @@ func CreateDefaultDSCI(cli client.Client, platform deploy.Platform, appNamespace
Spec: *defaultDsciSpec,
}

patchedDSCI := &dsci.DSCInitialization{
TypeMeta: metav1.TypeMeta{
Kind: "DSCInitialization",
APIVersion: "dscinitialization.opendatahub.io/v1",
},
Spec: *defaultDsciSpec,
}

instances := &dsci.DSCInitializationList{}
if err := cli.List(context.TODO(), instances); err != nil {
return err
Expand All @@ -238,20 +237,9 @@ func CreateDefaultDSCI(cli client.Client, platform deploy.Platform, appNamespace
fmt.Printf("only one instance of DSCInitialization object is allowed. Please delete other instances ")
return nil
case len(instances.Items) == 1:
if platform == deploy.ManagedRhods || platform == deploy.SelfManagedRhods {
data, err := json.Marshal(patchedDSCI)
if err != nil {
return err
}
existingDSCI := &instances.Items[0]
err = cli.Patch(context.TODO(), existingDSCI, client.RawPatch(types.ApplyPatchType, data),
client.ForceOwnership, client.FieldOwner("rhods-operator"))
if err != nil {
return err
}
} else {
return nil
}
// Do not patch/update if DSCI already exists.
fmt.Printf("DSCInitialization resource already exists. It will not be updated with default DSCI.")
return nil
case len(instances.Items) == 0:
err := cli.Create(context.TODO(), defaultDsci)
if err != nil {
Expand Down

0 comments on commit a4788f3

Please sign in to comment.