Skip to content

Commit

Permalink
update(kserve): add monitoring logic (opendatahub-io#782)
Browse files Browse the repository at this point in the history
Signed-off-by: Wen Zhou <[email protected]>
  • Loading branch information
zdtsw committed Jan 15, 2024
1 parent 294063e commit 96f5566
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 33 deletions.
34 changes: 2 additions & 32 deletions components/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ type ComponentInterface interface {
SetImageParamsMap(imageMap map[string]string) map[string]string
OverrideManifests(platform string) error
UpdatePrometheusConfig(cli client.Client, enable bool, component string) error
// WaitForDeploymentAvailable(ctx context.Context, r *rest.Config, c string, n string, i int, t int) error
}

// UpdatePrometheusConfig updates prometheus-configs.yaml to include or exclude <component>.rules
Expand Down Expand Up @@ -120,6 +119,8 @@ func (c *Component) UpdatePrometheusConfig(cli client.Client, enable bool, compo
RayARules string `yaml:"ray-alerting.rules"`
WorkbenchesRRules string `yaml:"workbenches-recording.rules"`
WorkbenchesARules string `yaml:"workbenches-alerting.rules"`
KserveRRules string `yaml:"kserve-recording.rules"`
KserveARules string `yaml:"kserve-alerting.rules"`
} `yaml:"data"`
}
var configMap ConfigMap
Expand Down Expand Up @@ -184,34 +185,3 @@ func (c *Component) UpdatePrometheusConfig(cli client.Client, enable bool, compo
}
return nil
}

// WaitForDeploymentAvailable checks whether the component's deployments in 'namespace' are ready within 'timeout' before the prometheus rules for the component are applied
// func (c *Component) WaitForDeploymentAvailable(ctx context.Context, restConfig *rest.Config, componentName string, namespace string, interval int, timeout int) error {
// resourceInterval := time.Duration(interval) * time.Second
// resourceTimeout := time.Duration(timeout) * time.Minute
// return wait.PollUntilContextTimeout(context.TODO(), resourceInterval, resourceTimeout, true, func(ctx context.Context) (bool, error) {
// clientset, err := kubernetes.NewForConfig(restConfig)
// if err != nil {
// return false, fmt.Errorf("error getting client %w", err)
// }
// componentDeploymentList, err := clientset.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{
// LabelSelector: "app.opendatahub.io/" + componentName,
// })
// if err != nil {
// if errors.IsNotFound(err) {
// return false, nil
// }
// }
// isReady := false
// if len(componentDeploymentList.Items) != 0 {
// for _, deployment := range componentDeploymentList.Items {
// if deployment.Status.ReadyReplicas == deployment.Status.Replicas {
// isReady = true
// } else {
// isReady = false
// }
// }
// }
// return isReady, nil
// })
// }
17 changes: 16 additions & 1 deletion components/kserve/kserve.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/feature"
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/monitoring"
)

var (
Expand Down Expand Up @@ -91,6 +92,7 @@ func (k *Kserve) ReconcileComponent(ctx context.Context, cli client.Client, resC
}

enabled := k.GetManagementState() == operatorv1.Managed
monitoringEnabled := dscispec.Monitoring.ManagementState == operatorv1.Managed
platform, err := deploy.GetPlatform(cli)
if err != nil {
return err
Expand Down Expand Up @@ -148,7 +150,20 @@ func (k *Kserve) ReconcileComponent(ctx context.Context, cli client.Client, resC
return err
}
}

// CloudService Monitoring handling
if platform == deploy.ManagedRhods {
if enabled {
// first check that the service is up, so prometheus won't fire alerts while it is still starting up
if err := monitoring.WaitForDeploymentAvailable(ctx, resConf, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
}
fmt.Printf("deployment for %s is done, updating monitoing rules", ComponentName)
}
// kserve rules
if err := k.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil {
return err
}
}
return nil
}

Expand Down

0 comments on commit 96f5566

Please sign in to comment.