Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prometheus metrics for experiment and trial #870

Merged
merged 1 commit into from
Oct 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/controller.v1alpha3/experiment/experiment_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ func (r *ReconcileExperiment) Reconcile(request reconcile.Request) (reconcile.Re
}
instance := original.DeepCopy()

if needUpdate, finalizers := needUpdateFinalizers(instance); needUpdate {
return r.updateFinalizers(instance, finalizers)
}

if instance.IsCompleted() && !instance.HasRunningTrials() {

return reconcile.Result{}, nil
Expand Down
48 changes: 48 additions & 0 deletions pkg/controller.v1alpha3/experiment/experiment_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,19 @@ import (
"k8s.io/apimachinery/pkg/types"
k8syaml "k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

experimentsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1alpha3"
suggestionsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1alpha3"
trialsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1alpha3"
utilv1alpha3 "github.com/kubeflow/katib/pkg/controller.v1alpha3/experiment/util"
"github.com/kubeflow/katib/pkg/controller.v1alpha3/util"
)

const (
updatePrometheusMetrics = "update-prometheus-metrics"
)

func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1alpha3.Experiment, trialAssignment *suggestionsv1alpha3.TrialAssignment) error {
BUFSIZE := 1024
logger := log.WithValues("Experiment", types.NamespacedName{Name: expInstance.GetName(), Namespace: expInstance.GetNamespace()})
Expand Down Expand Up @@ -62,3 +68,45 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1alph
return nil

}

func needUpdateFinalizers(exp *experimentsv1alpha3.Experiment) (bool, []string) {
deleted := !exp.ObjectMeta.DeletionTimestamp.IsZero()
pendingFinalizers := exp.GetFinalizers()
contained := false
for _, elem := range pendingFinalizers {
if elem == updatePrometheusMetrics {
contained = true
break
}
}

if !deleted && !contained {
finalizers := append(pendingFinalizers, updatePrometheusMetrics)
return true, finalizers
}
if deleted && contained {
finalizers := []string{}
for _, pendingFinalizer := range pendingFinalizers {
if pendingFinalizer != updatePrometheusMetrics {
finalizers = append(finalizers, pendingFinalizer)
}
}
return true, finalizers
}
return false, []string{}
}

func (r *ReconcileExperiment) updateFinalizers(instance *experimentsv1alpha3.Experiment, finalizers []string) (reconcile.Result, error) {
instance.SetFinalizers(finalizers)
if err := r.Update(context.TODO(), instance); err != nil {
return reconcile.Result{}, err
} else {
if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
utilv1alpha3.IncreaseExperimentsDeletedCount()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of Increase, increment seems more appropriate?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you mean rename IncreaseExperimentsDeletedCount to Increment ExperimentsDeletedCount? If so, I think Increase is better, since it is a verb

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hougangliu when we specify counters, we typically specify the term Increment. See Inc definition in prometheus https://github.com/kubeflow/katib/blob/master/vendor/github.com/prometheus/client_golang/prometheus/counter.go#L37

Or, should we directly call experimentsDeletedCount.Inc() ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I find it. In fact https://github.com/kubeflow/katib/blob/master/vendor/github.com/prometheus/client_golang/prometheus/counter.go#L37 should use increase too, it should be a typo error.
The reason why I didn't use experimentsDeletedCount.Inc() is that other package "pkg/controller.v1alpha3/experiment/util" also call it, but experimentsDeletedCount cannot be access by other packages

} else {
utilv1alpha3.IncreaseExperimentsCreatedCount()
}
// Need to requeue because finalizer update does not change metadata.generation
return reconcile.Result{Requeue: true}, err
}
}
66 changes: 66 additions & 0 deletions pkg/controller.v1alpha3/experiment/util/prometheus_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package util

import (
"github.com/prometheus/client_golang/prometheus"

"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
experimentsDeletedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_deleted",
Copy link
Contributor

@yeya24 yeya24 Oct 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to rename the metrics?

Help: "Counts number of Experiment deleted",
})
experimentsCreatedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_created",
Help: "Counts number of Experiment created",
})
experimentsSucceededCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_succeeded",
Help: "Counts number of Experiment succeeded",
})
experimentsFailedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_failed",
Help: "Counts number of Experiment failed",
})
)

func init() {
metrics.Registry.MustRegister(
experimentsDeletedCount,
experimentsCreatedCount,
experimentsSucceededCount,
experimentsFailedCount)
}

func IncreaseExperimentsDeletedCount() {
experimentsDeletedCount.Inc()
}

func IncreaseExperimentsCreatedCount() {
experimentsCreatedCount.Inc()
}

func IncreaseExperimentsSucceededCount() {
experimentsSucceededCount.Inc()
}

func IncreaseExperimentsFailedCount() {
experimentsFailedCount.Inc()
}
4 changes: 4 additions & 0 deletions pkg/controller.v1alpha3/experiment/util/status_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,27 +145,31 @@ func UpdateExperimentStatusCondition(instance *experimentsv1alpha3.Experiment, i
msg := "Experiment has succeeded because Objective goal has reached"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsSucceededCount()
return
}

if (instance.Spec.MaxTrialCount != nil) && (completedTrialsCount >= *instance.Spec.MaxTrialCount) {
msg := "Experiment has succeeded because max trial count has reached"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsSucceededCount()
return
}

if getSuggestionDone && (instance.Status.TrialsPending+instance.Status.TrialsRunning) == 0 {
msg := "Experiment has succeeded because suggestion service has reached the end"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsSucceededCount()
return
}

if (instance.Spec.MaxFailedTrialCount != nil) && (failedTrialsCount >= *instance.Spec.MaxFailedTrialCount) {
msg := "Experiment has failed because max failed count has reached"
instance.MarkExperimentStatusFailed(ExperimentFailedReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsFailedCount()
return
}

Expand Down
66 changes: 66 additions & 0 deletions pkg/controller.v1alpha3/trial/prometheus_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package trial

import (
"github.com/prometheus/client_golang/prometheus"

"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
trialsDeletedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_deleted",
Help: "Counts number of Trial deleted",
})
trialsCreatedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_created",
Help: "Counts number of Trial created",
})
trialsSucceededCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_succeeded",
Help: "Counts number of Trial succeeded",
})
trialsFailedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_failed",
Help: "Counts number of Trial failed",
})
)

func init() {
metrics.Registry.MustRegister(
trialsDeletedCount,
trialsCreatedCount,
trialsSucceededCount,
trialsFailedCount)
}

func IncreaseTrialsDeletedCount() {
trialsDeletedCount.Inc()
}

func IncreaseTrialsCreatedCount() {
trialsCreatedCount.Inc()
}

func IncreaseTrialsSucceededCount() {
trialsSucceededCount.Inc()
}

func IncreaseTrialsFailedCount() {
trialsFailedCount.Inc()
}
10 changes: 10 additions & 0 deletions pkg/controller.v1alpha3/trial/trial_controller_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri

eventMsg := fmt.Sprintf("Job %s has succeeded", deployedJob.GetName())
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobSucceededReason, eventMsg)
IncreaseTrialsSucceededCount()
} else {
msg := "Metrics are not available"
instance.MarkTrialStatusSucceeded(corev1.ConditionFalse, TrialMetricsUnavailableReason, msg)
Expand All @@ -113,6 +114,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri
jobConditionMessage := (*jobCondition).Message
eventMsg := fmt.Sprintf("Job %s has failed: %s", deployedJob.GetName(), jobConditionMessage)
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobFailedReason, eventMsg)
IncreaseTrialsFailedCount()
}
//else nothing to do
return
Expand Down Expand Up @@ -145,15 +147,23 @@ func (r *ReconcileTrial) UpdateTrialStatusObservation(instance *trialsv1alpha3.T
}

func (r *ReconcileTrial) updateFinalizers(instance *trialsv1alpha3.Trial, finalizers []string) (reconcile.Result, error) {
isDelete := true
if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
if _, err := r.DeleteTrialObservationLog(instance); err != nil {
return reconcile.Result{}, err
}
} else {
isDelete = false
}
instance.SetFinalizers(finalizers)
if err := r.Update(context.TODO(), instance); err != nil {
return reconcile.Result{}, err
} else {
if isDelete {
IncreaseTrialsDeletedCount()
} else {
IncreaseTrialsCreatedCount()
}
// Need to requeue because finalizer update does not change metadata.generation
return reconcile.Result{Requeue: true}, err
}
Expand Down