Skip to content
This repository was archived by the owner on Jan 28, 2022. It is now read-only.

Commit 49d44fa

Browse files
storey247 authored and Azadehkhojandi committed
Extended databricks operator to report metrics into Prometheus (#104)
* Add initial extra metrics to controllers
* Update ServiceMonitor to allow prometheus to connect
1 parent bc604f4 commit 49d44fa

File tree

13 files changed

+424
-53
lines changed

13 files changed

+424
-53
lines changed

.devcontainer/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ RUN apt-get update \
6060
&& curl -sSL -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl \
6161
&& chmod +x /usr/local/bin/kubectl \
6262
# Install Helm
63-
&& curl -s https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash -
63+
&& curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \
64+
&& helm repo add stable https://kubernetes-charts.storage.googleapis.com/
6465

6566
# Enable bash completion
6667
RUN apt-get update && apt install -y bash-completion && echo "source /etc/bash_completion" >> "/root/.bashrc"

Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
IMG ?= controller:latest
44
# Produce CRDs that work back to Kubernetes 1.11 (no version conversion)
55
CRD_OPTIONS ?= "crd:trivialVersions=true"
6+
# Prometheus helm installation name
7+
PROMETHEUS_NAME ?= "prom-azure-databricks-operator"
68

79
# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
810
ifeq (,$(shell go env GOBIN))
@@ -154,6 +156,8 @@ endif
154156

155157
kubectl cluster-info
156158

159+
make install-prometheus
160+
157161
@echo "deploying controller to cluster"
158162
make deploy-controller
159163

@@ -191,6 +195,12 @@ else
191195
@echo "kustomize has been installed"
192196
endif
193197

198+
install-prometheus:
	@echo "installing prometheus"
	# install prometheus (and set to monitor all namespaces in our kind cluster)
	# "upgrade --install" keeps this target re-runnable when the release already exists
	helm upgrade --install ${PROMETHEUS_NAME} stable/prometheus-operator --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false
	@echo "prometheus has been installed"
203+
194204
install-test-dependency:
195205
go get -u github.com/jstemmer/go-junit-report \
196206
&& go get github.com/axw/gocov/gocov \

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ Few topics are discussed in the [resources.md](https://github.com/microsoft/azur
3939
- Kubernetes on WSL
4040
- Build pipelines
4141
- Dev container
42+
- Operator metrics
4243

4344
## Contributing
4445

config/prometheus/monitor.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,9 @@ spec:
1111
endpoints:
1212
- path: /metrics
1313
port: https
14+
scheme: https
15+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
16+
tlsConfig:
17+
insecureSkipVerify: true # Configure certs here if set up for auth_proxy (uses self-signed currently)
1418
selector:
1519
control-plane: controller-manager

controllers/dcluster_controller_databricks.go

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"reflect"
2323

2424
databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
25+
"github.com/prometheus/client_golang/prometheus"
26+
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
2527
)
2628

2729
func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error {
@@ -36,7 +38,7 @@ func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error
3638
}
3739
}
3840

39-
clusterInfo, err := r.APIClient.Clusters().Create(*instance.Spec)
41+
clusterInfo, err := r.createCluster(instance)
4042
if err != nil {
4143
return err
4244
}
@@ -55,7 +57,7 @@ func (r *DclusterReconciler) refresh(instance *databricksv1alpha1.Dcluster) erro
5557
return nil
5658
}
5759

58-
clusterInfo, err := r.APIClient.Clusters().Get(instance.Status.ClusterInfo.ClusterID)
60+
clusterInfo, err := r.getCluster(instance.Status.ClusterInfo.ClusterID)
5961
if err != nil {
6062
return err
6163
}
@@ -78,5 +80,31 @@ func (r *DclusterReconciler) delete(instance *databricksv1alpha1.Dcluster) error
7880
return nil
7981
}
8082

81-
return r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
83+
return trackExecutionTime(dclusterDeleteDuration, func() error {
84+
err := r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
85+
trackSuccessFailure(err, dclusterCounterVec, "delete")
86+
return err
87+
})
88+
}
89+
90+
func (r *DclusterReconciler) getCluster(clusterID string) (cluster dbmodels.ClusterInfo, err error) {
91+
timer := prometheus.NewTimer(dclusterGetDuration)
92+
defer timer.ObserveDuration()
93+
94+
cluster, err = r.APIClient.Clusters().Get(clusterID)
95+
96+
trackSuccessFailure(err, dclusterCounterVec, "get")
97+
98+
return cluster, err
99+
}
100+
101+
func (r *DclusterReconciler) createCluster(instance *databricksv1alpha1.Dcluster) (cluster dbmodels.ClusterInfo, err error) {
102+
timer := prometheus.NewTimer(dclusterCreateDuration)
103+
defer timer.ObserveDuration()
104+
105+
cluster, err = r.APIClient.Clusters().Create(*instance.Spec)
106+
107+
trackSuccessFailure(err, dclusterCounterVec, "create")
108+
109+
return cluster, err
82110
}

controllers/dcluster_metrics.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
Copyright 2019 microsoft.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
"sigs.k8s.io/controller-runtime/pkg/metrics"
22+
)
23+
24+
var (
25+
dclusterCounterVec = prometheus.NewCounterVec(
26+
prometheus.CounterOpts{
27+
Name: metricPrefix + "dcluster_total",
28+
Help: "Counter related to the dCluster CRD partitioned by status and method invoked. Status = success/fail and method indicates REST endpoint",
29+
},
30+
[]string{"status", "method"},
31+
)
32+
33+
dclusterCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
34+
Name: metricPrefix + "dcluster_creation_request_duration_seconds",
35+
Help: "Duration of DB api dcluster create calls.",
36+
})
37+
38+
dclusterGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
39+
Name: metricPrefix + "dcluster_get_request_duration_seconds",
40+
Help: "Duration of DB api dcluster get calls.",
41+
})
42+
43+
dclusterDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
44+
Name: metricPrefix + "dcluster_delete_request_duration_seconds",
45+
Help: "Duration of DB api dcluster delete calls.",
46+
})
47+
)
48+
49+
func init() {
50+
// Register custom metrics with the global prometheus registry
51+
metrics.Registry.MustRegister(dclusterCounterVec,
52+
dclusterCreateDuration, dclusterGetDuration, dclusterDeleteDuration)
53+
}

controllers/djob_controller_databricks.go

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,15 @@ package controllers
1919
import (
2020
"context"
2121
"fmt"
22+
"reflect"
23+
"strings"
24+
2225
databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
26+
"github.com/prometheus/client_golang/prometheus"
27+
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
2328
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2429
"k8s.io/apimachinery/pkg/types"
25-
"reflect"
2630
"sigs.k8s.io/controller-runtime/pkg/client"
27-
"strings"
2831
)
2932

3033
func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error {
@@ -68,11 +71,12 @@ func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error {
6871
}
6972
instance.ObjectMeta.SetOwnerReferences(references)
7073
}
71-
jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
72-
job, err := r.APIClient.Jobs().Create(jobSettings)
74+
job, err := r.createJob(instance)
75+
7376
if err != nil {
7477
return err
7578
}
79+
7680
instance.Spec.Name = instance.GetName()
7781
instance.Status = &databricksv1alpha1.DjobStatus{
7882
JobStatus: &job,
@@ -85,7 +89,8 @@ func (r *DjobReconciler) refresh(instance *databricksv1alpha1.Djob) error {
8589

8690
jobID := instance.Status.JobStatus.JobID
8791

88-
job, err := r.APIClient.Jobs().Get(jobID)
92+
job, err := r.getJob(jobID)
93+
8994
if err != nil {
9095
return err
9196
}
@@ -126,12 +131,39 @@ func (r *DjobReconciler) delete(instance *databricksv1alpha1.Djob) error {
126131
jobID := instance.Status.JobStatus.JobID
127132

128133
// Check if the job exists before trying to delete it
129-
if _, err := r.APIClient.Jobs().Get(jobID); err != nil {
134+
if _, err := r.getJob(jobID); err != nil {
130135
if strings.Contains(err.Error(), "does not exist") {
131136
return nil
132137
}
133138
return err
134139
}
135140

136-
return r.APIClient.Jobs().Delete(jobID)
141+
return trackExecutionTime(djobDeleteDuration, func() error {
142+
err := r.APIClient.Jobs().Delete(jobID)
143+
trackSuccessFailure(err, djobCounterVec, "delete")
144+
return err
145+
})
146+
}
147+
148+
func (r *DjobReconciler) getJob(jobID int64) (job dbmodels.Job, err error) {
149+
timer := prometheus.NewTimer(djobGetDuration)
150+
defer timer.ObserveDuration()
151+
152+
job, err = r.APIClient.Jobs().Get(jobID)
153+
154+
trackSuccessFailure(err, djobCounterVec, "get")
155+
156+
return job, err
157+
}
158+
159+
func (r *DjobReconciler) createJob(instance *databricksv1alpha1.Djob) (job dbmodels.Job, err error) {
160+
timer := prometheus.NewTimer(djobCreateDuration)
161+
defer timer.ObserveDuration()
162+
163+
jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
164+
job, err = r.APIClient.Jobs().Create(jobSettings)
165+
166+
trackSuccessFailure(err, djobCounterVec, "create")
167+
168+
return job, err
137169
}

controllers/djob_metrics.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
Copyright 2019 microsoft.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
"sigs.k8s.io/controller-runtime/pkg/metrics"
22+
)
23+
24+
var (
25+
djobCounterVec = prometheus.NewCounterVec(
26+
prometheus.CounterOpts{
27+
Name: metricPrefix + "djob_total",
28+
Help: "Counter related to the dJob CRD partitioned by status and method invoked. Status = success/fail and method indicates REST endpoint",
29+
},
30+
[]string{"status", "method"},
31+
)
32+
33+
djobCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
34+
Name: metricPrefix + "djob_creation_request_duration_seconds",
35+
Help: "Duration of DB api djob create calls.",
36+
})
37+
38+
djobGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
39+
Name: metricPrefix + "djob_get_request_duration_seconds",
40+
Help: "Duration of DB api djob get calls.",
41+
})
42+
43+
djobDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
44+
Name: metricPrefix + "djob_delete_request_duration_seconds",
45+
Help: "Duration of DB api djob delete calls.",
46+
})
47+
)
48+
49+
func init() {
50+
// Register custom metrics with the global prometheus registry
51+
metrics.Registry.MustRegister(djobCounterVec,
52+
djobCreateDuration, djobGetDuration, djobDeleteDuration)
53+
}

controllers/metrics.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
Copyright 2019 microsoft.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"github.com/prometheus/client_golang/prometheus"
21+
)
22+
23+
// metricPrefix is the common prefix used when naming the operator's metrics.
const metricPrefix = "databricks_"

// Values written to the "status" label by trackSuccessFailure.
const (
	successMetric = "success"
	failureMetric = "failure"
)
28+
29+
func trackExecutionTime(histogram prometheus.Histogram, f func() error) error {
30+
timer := prometheus.NewTimer(histogram)
31+
defer timer.ObserveDuration()
32+
return f()
33+
}
34+
35+
func trackSuccessFailure(err error, counterVec *prometheus.CounterVec, method string) {
36+
if err == nil {
37+
counterVec.With(prometheus.Labels{"status": successMetric, "method": method}).Inc()
38+
} else {
39+
counterVec.With(prometheus.Labels{"status": failureMetric, "method": method}).Inc()
40+
}
41+
}

0 commit comments

Comments
 (0)