Skip to content

Commit

Permalink
fix(monitoring): do not add component rules till service is up (opend…
Browse files Browse the repository at this point in the history
…atahub-io#137)

Signed-off-by: Wen Zhou <[email protected]>
  • Loading branch information
zdtsw authored Nov 24, 2023
1 parent f7c2713 commit d85efc5
Show file tree
Hide file tree
Showing 12 changed files with 188 additions and 11 deletions.
2 changes: 2 additions & 0 deletions components/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ can be found [here](https://github.com/opendatahub-io/opendatahub-operator/tree/
GetComponentName() string
GetManagementState() operatorv1.ManagementState
SetImageParamsMap(imageMap map[string]string) map[string]string
UpdatePrometheusConfig(cli client.Client, enable bool, component string) error
WaitForDeploymentAvailable(ctx context.Context, r *rest.Config, c string, n string, i int, t int) error
}
```
### Add reconcile and Events
Expand Down
13 changes: 12 additions & 1 deletion components/codeflare/codeflare.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
package codeflare

import (
"context"
"fmt"
"path/filepath"

operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/components"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring"
)

var (
Expand Down Expand Up @@ -52,7 +55,7 @@ func (c *CodeFlare) GetComponentName() string {
return ComponentName
}

func (c *CodeFlare) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
func (c *CodeFlare) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
var imageParamMap = map[string]string{
"odh-codeflare-operator-controller-image": "RELATED_IMAGE_ODH_CODEFLARE_OPERATOR_IMAGE", // no need mcad, embedded in cfo
"namespace": dscispec.ApplicationsNamespace,
Expand Down Expand Up @@ -101,6 +104,14 @@ func (c *CodeFlare) ReconcileComponent(cli client.Client, owner metav1.Object, d

// CloudServiceMonitoring handling
if platform == deploy.ManagedRhods {
if enabled {
// first check that the service is up, so Prometheus won't fire alerts while it is still starting up
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
}

// inject prometheus codeflare*.rules into /opt/manifests/monitoring/prometheus/prometheus-configs.yaml
if err = c.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
Expand Down
36 changes: 35 additions & 1 deletion components/component.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package components

import (
"context"
"fmt"
"os"
"path/filepath"
Expand All @@ -9,6 +10,7 @@ import (
operatorv1 "github.com/openshift/api/operator/v1"
"gopkg.in/yaml.v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
Expand Down Expand Up @@ -78,13 +80,14 @@ type ManifestsConfig struct {
}

type ComponentInterface interface {
ReconcileComponent(cli client.Client, owner metav1.Object, DSCISpec *dsciv1.DSCInitializationSpec, currentComponentStatus bool) error
ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, DSCISpec *dsciv1.DSCInitializationSpec, currentComponentStatus bool) error
Cleanup(cli client.Client, DSCISpec *dsciv1.DSCInitializationSpec) error
GetComponentName() string
GetManagementState() operatorv1.ManagementState
SetImageParamsMap(imageMap map[string]string) map[string]string
OverrideManifests(platform string) error
UpdatePrometheusConfig(cli client.Client, enable bool, component string) error
// WaitForDeploymentAvailable(ctx context.Context, r *rest.Config, c string, n string, i int, t int) error
}

// UpdatePrometheusConfig updates prometheus-configs.yaml to include/exclude <component>.rules
Expand Down Expand Up @@ -181,3 +184,34 @@ func (c *Component) UpdatePrometheusConfig(cli client.Client, enable bool, compo
}
return nil
}

// WaitForDeploymentAvailable checks whether the component deployment in 'namespace' becomes ready within 'timeout', before the prometheus rules for the component are applied
// func (c *Component) WaitForDeploymentAvailable(ctx context.Context, restConfig *rest.Config, componentName string, namespace string, interval int, timeout int) error {
// resourceInterval := time.Duration(interval) * time.Second
// resourceTimeout := time.Duration(timeout) * time.Minute
// return wait.PollUntilContextTimeout(context.TODO(), resourceInterval, resourceTimeout, true, func(ctx context.Context) (bool, error) {
// clientset, err := kubernetes.NewForConfig(restConfig)
// if err != nil {
// return false, fmt.Errorf("error getting client %w", err)
// }
// componentDeploymentList, err := clientset.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{
// LabelSelector: "app.opendatahub.io/" + componentName,
// })
// if err != nil {
// if errors.IsNotFound(err) {
// return false, nil
// }
// }
// isReady := false
// if len(componentDeploymentList.Items) != 0 {
// for _, deployment := range componentDeploymentList.Items {
// if deployment.Status.ReadyReplicas == deployment.Status.Replicas {
// isReady = true
// } else {
// isReady = false
// }
// }
// }
// return isReady, nil
// })
// }
18 changes: 17 additions & 1 deletion components/dashboard/dashboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ import (
v1 "k8s.io/api/core/v1"
apierrs "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/components"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/common"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring"
)

var (
Expand Down Expand Up @@ -77,7 +79,13 @@ func (d *Dashboard) GetComponentName() string {
}

//nolint:gocyclo
func (d *Dashboard) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, currentComponentStatus bool) error {
func (d *Dashboard) ReconcileComponent(ctx context.Context,
cli client.Client,
resConf *rest.Config,
owner metav1.Object,
dscispec *dsciv1.DSCInitializationSpec,
currentComponentStatus bool,
) error {
var imageParamMap = map[string]string{
"odh-dashboard-image": "RELATED_IMAGE_ODH_DASHBOARD_IMAGE",
}
Expand Down Expand Up @@ -161,6 +169,14 @@ func (d *Dashboard) ReconcileComponent(cli client.Client, owner metav1.Object, d
}
// CloudService Monitoring handling
if platform == deploy.ManagedRhods {
if enabled {
// first check that the service is up, so Prometheus won't fire alerts while it is still starting up
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentNameSupported, dscispec.ApplicationsNamespace, 20, 3); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentNameSupported)
}

if err := d.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentNameSupported); err != nil {
return err
}
Expand Down
21 changes: 20 additions & 1 deletion components/datasciencepipelines/datasciencepipelines.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
package datasciencepipelines

import (
"context"
"fmt"
"path/filepath"

operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/components"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring"
)

var (
Expand Down Expand Up @@ -49,7 +53,13 @@ func (d *DataSciencePipelines) GetComponentName() string {
return ComponentName
}

func (d *DataSciencePipelines) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
func (d *DataSciencePipelines) ReconcileComponent(ctx context.Context,
cli client.Client,
resConf *rest.Config,
owner metav1.Object,
dscispec *dsciv1.DSCInitializationSpec,
_ bool,
) error {
var imageParamMap = map[string]string{
"IMAGES_APISERVER": "RELATED_IMAGE_ODH_ML_PIPELINES_API_SERVER_IMAGE",
"IMAGES_ARTIFACT": "RELATED_IMAGE_ODH_ML_PIPELINES_ARTIFACT_MANAGER_IMAGE",
Expand Down Expand Up @@ -87,6 +97,15 @@ func (d *DataSciencePipelines) ReconcileComponent(cli client.Client, owner metav
}
// CloudService Monitoring handling
if platform == deploy.ManagedRhods {
if enabled {
// first check that the service is up, so Prometheus won't fire alerts while it is still starting up
// with only 1 replica this should be very quick
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
}

if err := d.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
4 changes: 3 additions & 1 deletion components/kserve/kserve.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
package kserve

import (
"context"
"fmt"
"path/filepath"
"strings"

"github.com/hashicorp/go-multierror"
operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
Expand Down Expand Up @@ -79,7 +81,7 @@ func (k *Kserve) GetComponentName() string {
return ComponentName
}

func (k *Kserve) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
func (k *Kserve) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
// paramMap for Kserve to use.
var imageParamMap = map[string]string{}

Expand Down
27 changes: 25 additions & 2 deletions components/modelmeshserving/modelmeshserving.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
package modelmeshserving

import (
"context"
"fmt"
"path/filepath"
"strings"

operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/components"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring"
)

var (
Expand Down Expand Up @@ -67,7 +71,13 @@ func (m *ModelMeshServing) GetComponentName() string {
return ComponentName
}

func (m *ModelMeshServing) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
func (m *ModelMeshServing) ReconcileComponent(ctx context.Context,
cli client.Client,
resConf *rest.Config,
owner metav1.Object,
dscispec *dsciv1.DSCInitializationSpec,
_ bool,
) error {
var imageParamMap = map[string]string{
"odh-mm-rest-proxy": "RELATED_IMAGE_ODH_MM_REST_PROXY_IMAGE",
"odh-modelmesh-runtime-adapter": "RELATED_IMAGE_ODH_MODELMESH_RUNTIME_ADAPTER_IMAGE",
Expand Down Expand Up @@ -137,12 +147,25 @@ func (m *ModelMeshServing) ReconcileComponent(cli client.Client, owner metav1.Ob
}

// CloudService Monitoring handling
// TODO: cleanup logic in rhods-2.5
if platform == deploy.ManagedRhods {
if enabled {
// first check that the 1st service is up, so Prometheus won't fire alerts while it is still starting up
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
}
// first model-mesh rules
if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
if enabled {
// then check that the 2nd service is up, so Prometheus won't fire alerts while it is still starting up
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, DependentComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment %s is not ready to server: %w", DependentComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", DependentComponentName)
}
// then odh-model-controller rules
if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, DependentComponentName); err != nil {
return err
Expand Down
13 changes: 12 additions & 1 deletion components/ray/ray.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
package ray

import (
"context"
"fmt"
"path/filepath"

operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
"github.com/opendatahub-io/opendatahub-operator/v2/components"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring"
)

var (
Expand Down Expand Up @@ -49,7 +53,7 @@ func (r *Ray) GetComponentName() string {
return ComponentName
}

func (r *Ray) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
func (r *Ray) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
var imageParamMap = map[string]string{
"odh-kuberay-operator-controller-image": "RELATED_IMAGE_ODH_KUBERAY_OPERATOR_CONTROLLER_IMAGE",
"namespace": dscispec.ApplicationsNamespace,
Expand Down Expand Up @@ -80,6 +84,13 @@ func (r *Ray) ReconcileComponent(cli client.Client, owner metav1.Object, dscispe
}
// CloudService Monitoring handling
if platform == deploy.ManagedRhods {
if enabled {
// first check that the service is up, so Prometheus won't fire alerts while it is still starting up
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
}
if err := r.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
Expand Down
4 changes: 3 additions & 1 deletion components/trustyai/trustyai.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
package trustyai

import (
"context"
"path/filepath"

operatorv1 "github.com/openshift/api/operator/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
Expand Down Expand Up @@ -48,7 +50,7 @@ func (t *TrustyAI) GetComponentName() string {
return ComponentName
}

func (t *TrustyAI) ReconcileComponent(cli client.Client, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
func (t *TrustyAI) ReconcileComponent(ctx context.Context, cli client.Client, resConf *rest.Config, owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, _ bool) error {
var imageParamMap = map[string]string{
"trustyaiServiceImage": "RELATED_IMAGE_ODH_TRUSTYAI_SERVICE_IMAGE",
"trustyaiOperatorImage": "RELATED_IMAGE_ODH_TRUSTYAI_SERVICE_OPERATOR_IMAGE",
Expand Down
Loading

0 comments on commit d85efc5

Please sign in to comment.