diff --git a/pkg/controller.v1alpha3/experiment/experiment_controller.go b/pkg/controller.v1alpha3/experiment/experiment_controller.go index c1841f9496c..508510fbb72 100644 --- a/pkg/controller.v1alpha3/experiment/experiment_controller.go +++ b/pkg/controller.v1alpha3/experiment/experiment_controller.go @@ -176,6 +176,10 @@ func (r *ReconcileExperiment) Reconcile(request reconcile.Request) (reconcile.Re } instance := original.DeepCopy() + if needUpdate, finalizers := needUpdateFinalizers(instance); needUpdate { + return r.updateFinalizers(instance, finalizers) + } + if instance.IsCompleted() && !instance.HasRunningTrials() { return reconcile.Result{}, nil diff --git a/pkg/controller.v1alpha3/experiment/experiment_util.go b/pkg/controller.v1alpha3/experiment/experiment_util.go index f631f2463a8..5f5d1105723 100644 --- a/pkg/controller.v1alpha3/experiment/experiment_util.go +++ b/pkg/controller.v1alpha3/experiment/experiment_util.go @@ -9,13 +9,19 @@ import ( "k8s.io/apimachinery/pkg/types" k8syaml "k8s.io/apimachinery/pkg/util/yaml" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/reconcile" experimentsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1alpha3" suggestionsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1alpha3" trialsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1alpha3" + utilv1alpha3 "github.com/kubeflow/katib/pkg/controller.v1alpha3/experiment/util" "github.com/kubeflow/katib/pkg/controller.v1alpha3/util" ) +const ( + updatePrometheusMetrics = "update-prometheus-metrics" +) + func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1alpha3.Experiment, trialAssignment *suggestionsv1alpha3.TrialAssignment) error { BUFSIZE := 1024 logger := log.WithValues("Experiment", types.NamespacedName{Name: expInstance.GetName(), Namespace: expInstance.GetNamespace()}) @@ -62,3 +68,45 @@ func (r *ReconcileExperiment) 
createTrialInstance(expInstance *experimentsv1alph return nil }
+
+// needUpdateFinalizers reports whether the finalizer list of exp has to be
+// rewritten and, when it does, returns the list that should be stored.
+//
+//   - exp is not being deleted (zero DeletionTimestamp) and does not carry
+//     updatePrometheusMetrics yet: returns true and the current finalizers
+//     with updatePrometheusMetrics appended.
+//   - exp is being deleted and still carries updatePrometheusMetrics: returns
+//     true and the current finalizers with every occurrence of it removed.
+//   - otherwise: returns false and an empty list.
+//
+// The returned slice never shares its backing array with the slice held by exp.
+func needUpdateFinalizers(exp *experimentsv1alpha3.Experiment) (bool, []string) {
+	deleted := !exp.ObjectMeta.DeletionTimestamp.IsZero()
+	pendingFinalizers := exp.GetFinalizers()
+	contained := false
+	for _, elem := range pendingFinalizers {
+		if elem == updatePrometheusMetrics {
+			contained = true
+			break
+		}
+	}
+
+	switch {
+	case !deleted && !contained:
+		// Copy first: appending straight onto pendingFinalizers could write
+		// into spare capacity of the slice owned by exp.
+		finalizers := make([]string, 0, len(pendingFinalizers)+1)
+		finalizers = append(finalizers, pendingFinalizers...)
+		return true, append(finalizers, updatePrometheusMetrics)
+	case deleted && contained:
+		finalizers := make([]string, 0, len(pendingFinalizers))
+		for _, pendingFinalizer := range pendingFinalizers {
+			if pendingFinalizer != updatePrometheusMetrics {
+				finalizers = append(finalizers, pendingFinalizer)
+			}
+		}
+		return true, finalizers
+	default:
+		return false, []string{}
+	}
+}
+
+// updateFinalizers stores finalizers on instance and persists the object with
+// r.Update.
+//
+// If the update fails, the error is returned and no counter is touched. After
+// a successful update the experiment "deleted" counter is increased when
+// instance has a non-zero DeletionTimestamp, otherwise the "created" counter
+// is increased, and a requeue is requested.
+func (r *ReconcileExperiment) updateFinalizers(instance *experimentsv1alpha3.Experiment, finalizers []string) (reconcile.Result, error) {
+	instance.SetFinalizers(finalizers)
+	if err := r.Update(context.TODO(), instance); err != nil {
+		return reconcile.Result{}, err
+	}
+
+	if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
+		utilv1alpha3.IncreaseExperimentsDeletedCount()
+	} else {
+		utilv1alpha3.IncreaseExperimentsCreatedCount()
+	}
+	// Need to requeue because finalizer update does not change metadata.generation
+	return reconcile.Result{Requeue: true}, nil
+}
diff --git a/pkg/controller.v1alpha3/experiment/util/prometheus_metrics.go b/pkg/controller.v1alpha3/experiment/util/prometheus_metrics.go
new file mode 100644
index 00000000000..f266ab4cdbe
--- /dev/null
+++ b/pkg/controller.v1alpha3/experiment/util/prometheus_metrics.go
@@ -0,0 +1,66 @@
+/*
+Copyright 2019 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package util
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// newExperimentCounter builds an unregistered Prometheus counter with the
+// given metric name and help text.
+func newExperimentCounter(name, help string) prometheus.Counter {
+	opts := prometheus.CounterOpts{Name: name, Help: help}
+	return prometheus.NewCounter(opts)
+}
+
+// Counters for Experiment events; init registers all of them.
+var (
+	experimentsDeletedCount   = newExperimentCounter("katib_experiment_deleted", "Counts number of Experiment deleted")
+	experimentsCreatedCount   = newExperimentCounter("katib_experiment_created", "Counts number of Experiment created")
+	experimentsSucceededCount = newExperimentCounter("katib_experiment_succeeded", "Counts number of Experiment succeeded")
+	experimentsFailedCount    = newExperimentCounter("katib_experiment_failed", "Counts number of Experiment failed")
+)
+
+// init registers the Experiment counters with metrics.Registry. MustRegister
+// panics if a collector cannot be registered.
+func init() {
+	collectors := []prometheus.Collector{
+		experimentsDeletedCount,
+		experimentsCreatedCount,
+		experimentsSucceededCount,
+		experimentsFailedCount,
+	}
+	metrics.Registry.MustRegister(collectors...)
+}
+
+// IncreaseExperimentsDeletedCount adds one to the katib_experiment_deleted counter.
+func IncreaseExperimentsDeletedCount() {
+	experimentsDeletedCount.Inc()
+}
+
+// IncreaseExperimentsCreatedCount adds one to the katib_experiment_created counter.
+func IncreaseExperimentsCreatedCount() {
+	experimentsCreatedCount.Inc()
+}
+
+// IncreaseExperimentsSucceededCount adds one to the katib_experiment_succeeded counter.
+func IncreaseExperimentsSucceededCount() {
+	experimentsSucceededCount.Inc()
+}
+
+// IncreaseExperimentsFailedCount adds one to the katib_experiment_failed counter.
+func IncreaseExperimentsFailedCount() {
+	experimentsFailedCount.Inc()
+}
diff --git a/pkg/controller.v1alpha3/experiment/util/status_util.go b/pkg/controller.v1alpha3/experiment/util/status_util.go
index acf02407062..b071d1a8130 100644
--- a/pkg/controller.v1alpha3/experiment/util/status_util.go
+++ b/pkg/controller.v1alpha3/experiment/util/status_util.go
@@ -145,6 +145,7 @@ func
UpdateExperimentStatusCondition(instance *experimentsv1alpha3.Experiment, i msg := "Experiment has succeeded because Objective goal has reached" instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg) instance.Status.CompletionTime = &now + IncreaseExperimentsSucceededCount() return } @@ -152,6 +153,7 @@ func UpdateExperimentStatusCondition(instance *experimentsv1alpha3.Experiment, i msg := "Experiment has succeeded because max trial count has reached" instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg) instance.Status.CompletionTime = &now + IncreaseExperimentsSucceededCount() return } @@ -159,6 +161,7 @@ func UpdateExperimentStatusCondition(instance *experimentsv1alpha3.Experiment, i msg := "Experiment has succeeded because suggestion service has reached the end" instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg) instance.Status.CompletionTime = &now + IncreaseExperimentsSucceededCount() return } @@ -166,6 +169,7 @@ func UpdateExperimentStatusCondition(instance *experimentsv1alpha3.Experiment, i msg := "Experiment has failed because max failed count has reached" instance.MarkExperimentStatusFailed(ExperimentFailedReason, msg) instance.Status.CompletionTime = &now + IncreaseExperimentsFailedCount() return } diff --git a/pkg/controller.v1alpha3/trial/prometheus_metrics.go b/pkg/controller.v1alpha3/trial/prometheus_metrics.go new file mode 100644 index 00000000000..8f05a98a2d9 --- /dev/null +++ b/pkg/controller.v1alpha3/trial/prometheus_metrics.go @@ -0,0 +1,66 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package trial
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// newTrialCounter builds an unregistered Prometheus counter with the given
+// metric name and help text.
+func newTrialCounter(name, help string) prometheus.Counter {
+	opts := prometheus.CounterOpts{Name: name, Help: help}
+	return prometheus.NewCounter(opts)
+}
+
+// Counters for Trial events; init registers all of them.
+var (
+	trialsDeletedCount   = newTrialCounter("katib_trial_deleted", "Counts number of Trial deleted")
+	trialsCreatedCount   = newTrialCounter("katib_trial_created", "Counts number of Trial created")
+	trialsSucceededCount = newTrialCounter("katib_trial_succeeded", "Counts number of Trial succeeded")
+	trialsFailedCount    = newTrialCounter("katib_trial_failed", "Counts number of Trial failed")
+)
+
+// init registers the Trial counters with metrics.Registry. MustRegister
+// panics if a collector cannot be registered.
+func init() {
+	collectors := []prometheus.Collector{
+		trialsDeletedCount,
+		trialsCreatedCount,
+		trialsSucceededCount,
+		trialsFailedCount,
+	}
+	metrics.Registry.MustRegister(collectors...)
+}
+
+// IncreaseTrialsDeletedCount adds one to the katib_trial_deleted counter.
+func IncreaseTrialsDeletedCount() {
+	trialsDeletedCount.Inc()
+}
+
+// IncreaseTrialsCreatedCount adds one to the katib_trial_created counter.
+func IncreaseTrialsCreatedCount() {
+	trialsCreatedCount.Inc()
+}
+
+// IncreaseTrialsSucceededCount adds one to the katib_trial_succeeded counter.
+func IncreaseTrialsSucceededCount() {
+	trialsSucceededCount.Inc()
+}
+
+// IncreaseTrialsFailedCount adds one to the katib_trial_failed counter.
+func IncreaseTrialsFailedCount() {
+	trialsFailedCount.Inc()
+}
diff --git a/pkg/controller.v1alpha3/trial/trial_controller_util.go b/pkg/controller.v1alpha3/trial/trial_controller_util.go
index f260942fa2c..c2a6b386c14 100644
--- a/pkg/controller.v1alpha3/trial/trial_controller_util.go
+++ b/pkg/controller.v1alpha3/trial/trial_controller_util.go
@@ -98,6 +98,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri eventMsg := fmt.Sprintf("Job %s has succeeded",
deployedJob.GetName()) r.recorder.Eventf(instance, corev1.EventTypeNormal, JobSucceededReason, eventMsg) + IncreaseTrialsSucceededCount() } else { msg := "Metrics are not available" instance.MarkTrialStatusSucceeded(corev1.ConditionFalse, TrialMetricsUnavailableReason, msg) @@ -113,6 +114,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri jobConditionMessage := (*jobCondition).Message eventMsg := fmt.Sprintf("Job %s has failed: %s", deployedJob.GetName(), jobConditionMessage) r.recorder.Eventf(instance, corev1.EventTypeNormal, JobFailedReason, eventMsg) + IncreaseTrialsFailedCount() } //else nothing to do return @@ -145,15 +147,23 @@ func (r *ReconcileTrial) UpdateTrialStatusObservation(instance *trialsv1alpha3.T } func (r *ReconcileTrial) updateFinalizers(instance *trialsv1alpha3.Trial, finalizers []string) (reconcile.Result, error) { + isDelete := true if !instance.ObjectMeta.DeletionTimestamp.IsZero() { if _, err := r.DeleteTrialObservationLog(instance); err != nil { return reconcile.Result{}, err } + } else { + isDelete = false } instance.SetFinalizers(finalizers) if err := r.Update(context.TODO(), instance); err != nil { return reconcile.Result{}, err } else { + if isDelete { + IncreaseTrialsDeletedCount() + } else { + IncreaseTrialsCreatedCount() + } // Need to requeue because finalizer update does not change metadata.generation return reconcile.Result{Requeue: true}, err }