Skip to content
This repository was archived by the owner on Jan 28, 2022. It is now read-only.

Commit 0fe919d

Browse files
Dave Storey authored and stuartleeks committed
PR feedback, revised metric naming etc
1 parent bdde164 commit 0fe919d

File tree

8 files changed, with 111 additions and 174 deletions.

controllers/dcluster_controller_databricks.go

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@ import (
2020
"context"
2121
"fmt"
2222
"reflect"
23-
"time"
2423

2524
databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
25+
"github.com/prometheus/client_golang/prometheus"
2626
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
2727
)
2828

@@ -81,26 +81,30 @@ func (r *DclusterReconciler) delete(instance *databricksv1alpha1.Dcluster) error
8181
}
8282

8383
return trackExecutionTime(dclusterDeleteDuration, func() error {
84-
return r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
84+
err := r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
85+
trackSuccessFailure(err, dclusterCounterVec, "delete")
86+
return err
8587
})
8688
}
8789

8890
func (r *DclusterReconciler) getCluster(clusterID string) (cluster dbmodels.ClusterInfo, err error) {
89-
defer trackMillisecondsTaken(time.Now(), dclusterGetDuration)
91+
timer := prometheus.NewTimer(dclusterGetDuration)
92+
defer timer.ObserveDuration()
9093

9194
cluster, err = r.APIClient.Clusters().Get(clusterID)
9295

93-
trackSuccessFailure(err, dclusterGetSuccess, dclusterGetFailure)
96+
trackSuccessFailure(err, dclusterCounterVec, "get")
9497

9598
return cluster, err
9699
}
97100

98101
func (r *DclusterReconciler) createCluster(instance *databricksv1alpha1.Dcluster) (cluster dbmodels.ClusterInfo, err error) {
99-
defer trackMillisecondsTaken(time.Now(), dclusterCreateDuration)
102+
timer := prometheus.NewTimer(dclusterCreateDuration)
103+
defer timer.ObserveDuration()
100104

101105
cluster, err = r.APIClient.Clusters().Create(*instance.Spec)
102106

103-
trackSuccessFailure(err, dclusterCreateSuccess, dclusterCreateFailure)
107+
trackSuccessFailure(err, dclusterCounterVec, "create")
104108

105109
return cluster, err
106110
}

controllers/dcluster_metrics.go

Lines changed: 11 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -22,53 +22,32 @@ import (
2222
)
2323

2424
var (
25-
dclusterCreateSuccess = prometheus.NewCounter(
25+
dclusterCounterVec = prometheus.NewCounterVec(
2626
prometheus.CounterOpts{
27-
Name: "dcluster_create_success_total",
28-
Help: "Number of create dcluster success",
29-
},
30-
)
31-
dclusterCreateFailure = prometheus.NewCounter(
32-
prometheus.CounterOpts{
33-
Name: "dcluster_create_failures_total",
34-
Help: "Number of create dcluster failures",
35-
},
36-
)
37-
38-
dclusterGetSuccess = prometheus.NewCounter(
39-
prometheus.CounterOpts{
40-
Name: "dcluster_get_success_total",
41-
Help: "Number of create dcluster success",
42-
},
43-
)
44-
dclusterGetFailure = prometheus.NewCounter(
45-
prometheus.CounterOpts{
46-
Name: "dcluster_get_failures_total",
47-
Help: "Number of create dcluster failures",
27+
Name: metricPrefix + "dcluster_total",
28+
Help: "Counter related to the dCluster CRD partitioned by status and method invoked. Status = success/fail and method indicates REST endpoint",
4829
},
30+
[]string{"status", "method"},
4931
)
5032

5133
dclusterCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
52-
Name: "dcluster_creation_duration",
53-
Help: "Duration of DB api dcluster create calls.",
54-
Buckets: prometheus.LinearBuckets(100, 10, 20),
34+
Name: metricPrefix + "dcluster_creation_request_duration_seconds",
35+
Help: "Duration of DB api dcluster create calls.",
5536
})
5637

5738
dclusterGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
58-
Name: "dcluster_get_duration",
59-
Help: "Duration of DB api dcluster get calls.",
60-
Buckets: prometheus.LinearBuckets(100, 10, 20),
39+
Name: metricPrefix + "dcluster_get_request_duration_seconds",
40+
Help: "Duration of DB api dcluster get calls.",
6141
})
6242

6343
dclusterDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
64-
Name: "dcluster_delete_duration",
65-
Help: "Duration of DB api dcluster delete calls.",
66-
Buckets: prometheus.LinearBuckets(100, 10, 20),
44+
Name: metricPrefix + "dcluster_delete_request_duration_seconds",
45+
Help: "Duration of DB api dcluster delete calls.",
6746
})
6847
)
6948

7049
func init() {
7150
// Register custom metrics with the global prometheus registry
72-
metrics.Registry.MustRegister(dclusterCreateSuccess, dclusterCreateFailure, dclusterGetSuccess, dclusterGetFailure,
51+
metrics.Registry.MustRegister(dclusterCounterVec,
7352
dclusterCreateDuration, dclusterGetDuration, dclusterDeleteDuration)
7453
}

controllers/djob_controller_databricks.go

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,9 @@ import (
2121
"fmt"
2222
"reflect"
2323
"strings"
24-
"time"
2524

2625
databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
26+
"github.com/prometheus/client_golang/prometheus"
2727
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2929
"k8s.io/apimachinery/pkg/types"
@@ -139,27 +139,31 @@ func (r *DjobReconciler) delete(instance *databricksv1alpha1.Djob) error {
139139
}
140140

141141
return trackExecutionTime(djobDeleteDuration, func() error {
142-
return r.APIClient.Jobs().Delete(jobID)
142+
err := r.APIClient.Jobs().Delete(jobID)
143+
trackSuccessFailure(err, djobCounterVec, "delete")
144+
return err
143145
})
144146
}
145147

146148
func (r *DjobReconciler) getJob(jobID int64) (job dbmodels.Job, err error) {
147-
defer trackMillisecondsTaken(time.Now(), djobGetDuration)
149+
timer := prometheus.NewTimer(djobGetDuration)
150+
defer timer.ObserveDuration()
148151

149152
job, err = r.APIClient.Jobs().Get(jobID)
150153

151-
trackSuccessFailure(err, djobGetSuccess, djobGetFailure)
154+
trackSuccessFailure(err, djobCounterVec, "get")
152155

153156
return job, err
154157
}
155158

156159
func (r *DjobReconciler) createJob(instance *databricksv1alpha1.Djob) (job dbmodels.Job, err error) {
157-
defer trackMillisecondsTaken(time.Now(), djobCreateDuration)
160+
timer := prometheus.NewTimer(djobCreateDuration)
161+
defer timer.ObserveDuration()
158162

159163
jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
160164
job, err = r.APIClient.Jobs().Create(jobSettings)
161165

162-
trackSuccessFailure(err, djobCreateSuccess, djobCreateFailure)
166+
trackSuccessFailure(err, djobCounterVec, "create")
163167

164168
return job, err
165169
}

controllers/djob_metrics.go

Lines changed: 11 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -22,53 +22,32 @@ import (
2222
)
2323

2424
var (
25-
djobCreateSuccess = prometheus.NewCounter(
25+
djobCounterVec = prometheus.NewCounterVec(
2626
prometheus.CounterOpts{
27-
Name: "djob_create_success_total",
28-
Help: "Number of create djob success",
29-
},
30-
)
31-
djobCreateFailure = prometheus.NewCounter(
32-
prometheus.CounterOpts{
33-
Name: "djob_create_failures_total",
34-
Help: "Number of create djob failures",
35-
},
36-
)
37-
38-
djobGetSuccess = prometheus.NewCounter(
39-
prometheus.CounterOpts{
40-
Name: "djob_get_success_total",
41-
Help: "Number of get djob success",
42-
},
43-
)
44-
djobGetFailure = prometheus.NewCounter(
45-
prometheus.CounterOpts{
46-
Name: "djob_get_failures_total",
47-
Help: "Number of get djob failures",
27+
Name: metricPrefix + "djob_total",
28+
Help: "Counter related to the dJob CRD partitioned by status and method invoked. Status = success/fail and method indicates REST endpoint",
4829
},
30+
[]string{"status", "method"},
4931
)
5032

5133
djobCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
52-
Name: "djob_creation_duration",
53-
Help: "Duration of DB api djob create calls.",
54-
Buckets: prometheus.LinearBuckets(100, 10, 20),
34+
Name: metricPrefix + "djob_creation_request_duration_seconds",
35+
Help: "Duration of DB api djob create calls.",
5536
})
5637

5738
djobGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
58-
Name: "djob_get_duration",
59-
Help: "Duration of DB api djob get calls.",
60-
Buckets: prometheus.LinearBuckets(100, 10, 20),
39+
Name: metricPrefix + "djob_get_request_duration_seconds",
40+
Help: "Duration of DB api djob get calls.",
6141
})
6242

6343
djobDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
64-
Name: "djob_delete_duration",
65-
Help: "Duration of DB api djob delete calls.",
66-
Buckets: prometheus.LinearBuckets(100, 10, 20),
44+
Name: metricPrefix + "djob_delete_request_duration_seconds",
45+
Help: "Duration of DB api djob delete calls.",
6746
})
6847
)
6948

7049
func init() {
7150
// Register custom metrics with the global prometheus registry
72-
metrics.Registry.MustRegister(djobCreateSuccess, djobCreateFailure, djobGetSuccess, djobGetFailure,
51+
metrics.Registry.MustRegister(djobCounterVec,
7352
djobCreateDuration, djobGetDuration, djobDeleteDuration)
7453
}

controllers/metrics.go

Lines changed: 11 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -17,28 +17,25 @@ limitations under the License.
1717
package controllers
1818

1919
import (
20-
"time"
21-
2220
"github.com/prometheus/client_golang/prometheus"
2321
)
2422

25-
func trackExecutionTime(histogram prometheus.Histogram, f func() error) error {
26-
startTime := time.Now()
27-
28-
defer trackMillisecondsTaken(startTime, histogram)
23+
const (
24+
metricPrefix = "databricks_"
25+
successMetric = "success"
26+
failureMetric = "failure"
27+
)
2928

29+
func trackExecutionTime(histogram prometheus.Histogram, f func() error) error {
30+
timer := prometheus.NewTimer(histogram)
31+
defer timer.ObserveDuration()
3032
return f()
3133
}
3234

33-
func trackMillisecondsTaken(startTime time.Time, histogram prometheus.Histogram) {
34-
duration := float64(time.Since(startTime) / time.Millisecond)
35-
histogram.Observe(duration)
36-
}
37-
38-
func trackSuccessFailure(err error, success prometheus.Counter, failure prometheus.Counter) {
35+
func trackSuccessFailure(err error, counterVec *prometheus.CounterVec, method string) {
3936
if err == nil {
40-
success.Inc()
37+
counterVec.With(prometheus.Labels{"status": successMetric, "method": method}).Inc()
4138
} else {
42-
failure.Inc()
39+
counterVec.With(prometheus.Labels{"status": failureMetric, "method": method}).Inc()
4340
}
4441
}

controllers/run_controller_databricks.go

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ import (
2424
"time"
2525

2626
databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
27+
"github.com/prometheus/client_golang/prometheus"
2728
"github.com/xinsnake/databricks-sdk-golang/azure"
2829
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -112,12 +113,16 @@ func (r *RunReconciler) delete(instance *databricksv1alpha1.Run) error {
112113
time.Sleep(15 * time.Second)
113114

114115
return trackExecutionTime(runDeleteDuration, func() error {
115-
return r.APIClient.Jobs().RunsDelete(runID)
116+
err := r.APIClient.Jobs().RunsDelete(runID)
117+
trackSuccessFailure(err, runCounterVec, "delete")
118+
return err
116119
})
117120
}
118121

119122
func (r *RunReconciler) runUsingRunNow(instance *databricksv1alpha1.Run) (*dbmodels.Run, error) {
120-
defer trackMillisecondsTaken(time.Now(), runNowDuration)
123+
timer := prometheus.NewTimer(runNowDuration)
124+
defer timer.ObserveDuration()
125+
121126
runParameters := dbmodels.RunParameters{
122127
JarParams: instance.Spec.JarParams,
123128
NotebookParams: instance.Spec.NotebookParams,
@@ -142,12 +147,13 @@ func (r *RunReconciler) runUsingRunNow(instance *databricksv1alpha1.Run) (*dbmod
142147
})
143148

144149
run, err := r.APIClient.Jobs().RunNow(k8sJob.Status.JobStatus.JobID, runParameters)
145-
trackSuccessFailure(err, runNowSuccess, runNowFailure)
150+
trackSuccessFailure(err, runCounterVec, "runsnow")
146151
return &run, err
147152
}
148153

149154
func (r *RunReconciler) runUsingRunsSubmit(instance *databricksv1alpha1.Run) (*dbmodels.Run, error) {
150-
defer trackMillisecondsTaken(time.Now(), runSubmitDuration)
155+
timer := prometheus.NewTimer(runSubmitDuration)
156+
defer timer.ObserveDuration()
151157

152158
clusterSpec := dbmodels.ClusterSpec{
153159
NewCluster: instance.Spec.NewCluster,
@@ -162,26 +168,28 @@ func (r *RunReconciler) runUsingRunsSubmit(instance *databricksv1alpha1.Run) (*d
162168
}
163169

164170
run, err := r.APIClient.Jobs().RunsSubmit(instance.Spec.RunName, clusterSpec, jobTask, instance.Spec.TimeoutSeconds)
165-
trackSuccessFailure(err, runSubmitSuccess, runSubmitFailure)
171+
trackSuccessFailure(err, runCounterVec, "runssubmit")
166172
return &run, err
167173
}
168174

169175
func (r *RunReconciler) getRun(runID int64) (dbmodels.Run, error) {
170-
defer trackMillisecondsTaken(time.Now(), runGetDuration)
176+
timer := prometheus.NewTimer(runGetDuration)
177+
defer timer.ObserveDuration()
171178

172179
runOutput, err := r.APIClient.Jobs().RunsGet(runID)
173180

174-
trackSuccessFailure(err, runGetSuccess, runGetFailure)
181+
trackSuccessFailure(err, runCounterVec, "get")
175182

176183
return runOutput, err
177184
}
178185

179186
func (r *RunReconciler) getRunOutput(runID int64) (azure.JobsRunsGetOutputResponse, error) {
180-
defer trackMillisecondsTaken(time.Now(), runGetOutputDuration)
187+
timer := prometheus.NewTimer(runGetOutputDuration)
188+
defer timer.ObserveDuration()
181189

182190
runOutput, err := r.APIClient.Jobs().RunsGetOutput(runID)
183191

184-
trackSuccessFailure(err, runGetOutputSuccess, runGetOutputFailure)
192+
trackSuccessFailure(err, runCounterVec, "getoutput")
185193

186194
return runOutput, err
187195
}

0 commit comments

Comments
 (0)