Skip to content
This repository was archived by the owner on Jan 28, 2022. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ RUN apt-get update \
&& curl -sSL -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl \
&& chmod +x /usr/local/bin/kubectl \
# Install Helm
&& curl -s https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash -
&& curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \
&& helm repo add stable https://kubernetes-charts.storage.googleapis.com/

# Enable bash completion
RUN apt-get update && apt install -y bash-completion && echo "source /etc/bash_completion" >> "/root/.bashrc"
Expand Down
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
IMG ?= controller:latest
# Produce CRDs that work back to Kubernetes 1.11 (no version conversion)
CRD_OPTIONS ?= "crd:trivialVersions=true"
# Prometheus helm installation name
PROMETHEUS_NAME ?= "prom-azure-databricks-operator"

# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
ifeq (,$(shell go env GOBIN))
Expand Down Expand Up @@ -154,6 +156,8 @@ endif

kubectl cluster-info

make install-prometheus

@echo "deploying controller to cluster"
make deploy-controller

Expand Down Expand Up @@ -191,6 +195,12 @@ else
@echo "kustomize has been installed"
endif

# Install the prometheus-operator chart under the release name $(PROMETHEUS_NAME)
# and configure it to pick up ServiceMonitors from all namespaces in the kind
# cluster. `helm upgrade --install` is used instead of `helm install` so the
# target can be re-run against a cluster that already has the release.
install-prometheus:
	@echo "installing prometheus"
	helm upgrade --install ${PROMETHEUS_NAME} stable/prometheus-operator --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false
	@echo "prometheus has been installed"

install-test-dependency:
go get -u github.com/jstemmer/go-junit-report \
&& go get github.com/axw/gocov/gocov \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Few topics are discussed in the [resources.md](https://github.com/microsoft/azur
- Kubernetes on WSL
- Build pipelines
- Dev container
- Operator metrics

## Contributing

Expand Down
4 changes: 4 additions & 0 deletions config/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,9 @@ spec:
endpoints:
- path: /metrics
port: https
scheme: https
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
tlsConfig:
insecureSkipVerify: true # Configure certs here if set up for auth_proxy (uses self-signed currently)
selector:
control-plane: controller-manager
34 changes: 31 additions & 3 deletions controllers/dcluster_controller_databricks.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"reflect"

databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
)

func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error {
Expand All @@ -36,7 +38,7 @@ func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error
}
}

clusterInfo, err := r.APIClient.Clusters().Create(*instance.Spec)
clusterInfo, err := r.createCluster(instance)
if err != nil {
return err
}
Expand All @@ -55,7 +57,7 @@ func (r *DclusterReconciler) refresh(instance *databricksv1alpha1.Dcluster) erro
return nil
}

clusterInfo, err := r.APIClient.Clusters().Get(instance.Status.ClusterInfo.ClusterID)
clusterInfo, err := r.getCluster(instance.Status.ClusterInfo.ClusterID)
if err != nil {
return err
}
Expand All @@ -78,5 +80,31 @@ func (r *DclusterReconciler) delete(instance *databricksv1alpha1.Dcluster) error
return nil
}

return r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
return trackExecutionTime(dclusterDeleteDuration, func() error {
err := r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
trackSuccessFailure(err, dclusterCounterVec, "delete")
return err
})
}

// getCluster looks up the Databricks cluster identified by clusterID through
// the reconciler's API client. The duration of the call is observed on
// dclusterGetDuration and its outcome is counted on dclusterCounterVec under
// the "get" method label.
func (r *DclusterReconciler) getCluster(clusterID string) (cluster dbmodels.ClusterInfo, err error) {
	// NewTimer runs right away; only ObserveDuration is deferred, so the
	// observation covers everything below.
	defer prometheus.NewTimer(dclusterGetDuration).ObserveDuration()

	cluster, err = r.APIClient.Clusters().Get(clusterID)
	trackSuccessFailure(err, dclusterCounterVec, "get")
	return cluster, err
}

// createCluster submits instance.Spec to the Databricks cluster create
// endpoint through the reconciler's API client. The duration of the call is
// observed on dclusterCreateDuration and its outcome is counted on
// dclusterCounterVec under the "create" method label.
func (r *DclusterReconciler) createCluster(instance *databricksv1alpha1.Dcluster) (cluster dbmodels.ClusterInfo, err error) {
	// NewTimer runs right away; only ObserveDuration is deferred, so the
	// observation covers everything below.
	defer prometheus.NewTimer(dclusterCreateDuration).ObserveDuration()

	cluster, err = r.APIClient.Clusters().Create(*instance.Spec)
	trackSuccessFailure(err, dclusterCounterVec, "create")
	return cluster, err
}
53 changes: 53 additions & 0 deletions controllers/dcluster_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright 2019 microsoft.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Prometheus collectors describing the operator's Databricks cluster API calls.
var (
	// dclusterCounterVec counts cluster API calls, labelled by outcome
	// ("status") and by the operation that was invoked ("method"). The status
	// values documented in Help match the label values emitted by
	// trackSuccessFailure (successMetric / failureMetric).
	dclusterCounterVec = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: metricPrefix + "dcluster_total",
			Help: "Counter related to the dCluster CRD partitioned by status and method invoked. Status = success/failure and method indicates REST endpoint",
		},
		[]string{"status", "method"},
	)

	// dclusterCreateDuration records how long cluster create calls take.
	dclusterCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "dcluster_creation_request_duration_seconds",
		Help: "Duration of DB api dcluster create calls.",
	})

	// dclusterGetDuration records how long cluster get calls take.
	dclusterGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "dcluster_get_request_duration_seconds",
		Help: "Duration of DB api dcluster get calls.",
	})

	// dclusterDeleteDuration records how long cluster delete calls take.
	dclusterDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "dcluster_delete_request_duration_seconds",
		Help: "Duration of DB api dcluster delete calls.",
	})
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(dclusterCounterVec,
dclusterCreateDuration, dclusterGetDuration, dclusterDeleteDuration)
}
46 changes: 39 additions & 7 deletions controllers/djob_controller_databricks.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ package controllers
import (
"context"
"fmt"
"reflect"
"strings"

databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"reflect"
"sigs.k8s.io/controller-runtime/pkg/client"
"strings"
)

func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error {
Expand Down Expand Up @@ -68,11 +71,12 @@ func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error {
}
instance.ObjectMeta.SetOwnerReferences(references)
}
jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
job, err := r.APIClient.Jobs().Create(jobSettings)
job, err := r.createJob(instance)

if err != nil {
return err
}

instance.Spec.Name = instance.GetName()
instance.Status = &databricksv1alpha1.DjobStatus{
JobStatus: &job,
Expand All @@ -85,7 +89,8 @@ func (r *DjobReconciler) refresh(instance *databricksv1alpha1.Djob) error {

jobID := instance.Status.JobStatus.JobID

job, err := r.APIClient.Jobs().Get(jobID)
job, err := r.getJob(jobID)

if err != nil {
return err
}
Expand Down Expand Up @@ -126,12 +131,39 @@ func (r *DjobReconciler) delete(instance *databricksv1alpha1.Djob) error {
jobID := instance.Status.JobStatus.JobID

// Check if the job exists before trying to delete it
if _, err := r.APIClient.Jobs().Get(jobID); err != nil {
if _, err := r.getJob(jobID); err != nil {
if strings.Contains(err.Error(), "does not exist") {
return nil
}
return err
}

return r.APIClient.Jobs().Delete(jobID)
return trackExecutionTime(djobDeleteDuration, func() error {
err := r.APIClient.Jobs().Delete(jobID)
trackSuccessFailure(err, djobCounterVec, "delete")
return err
})
}

// getJob looks up the Databricks job identified by jobID through the
// reconciler's API client. The duration of the call is observed on
// djobGetDuration and its outcome is counted on djobCounterVec under the
// "get" method label.
func (r *DjobReconciler) getJob(jobID int64) (job dbmodels.Job, err error) {
	// NewTimer runs right away; only ObserveDuration is deferred, so the
	// observation covers everything below.
	defer prometheus.NewTimer(djobGetDuration).ObserveDuration()

	job, err = r.APIClient.Jobs().Get(jobID)
	trackSuccessFailure(err, djobCounterVec, "get")
	return job, err
}

// createJob converts instance.Spec into Databricks job settings and submits
// them to the job create endpoint through the reconciler's API client. The
// duration (conversion included) is observed on djobCreateDuration and the
// outcome is counted on djobCounterVec under the "create" method label.
func (r *DjobReconciler) createJob(instance *databricksv1alpha1.Djob) (job dbmodels.Job, err error) {
	// NewTimer runs right away; only ObserveDuration is deferred, so the
	// observation covers everything below.
	defer prometheus.NewTimer(djobCreateDuration).ObserveDuration()

	settings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
	job, err = r.APIClient.Jobs().Create(settings)
	trackSuccessFailure(err, djobCounterVec, "create")
	return job, err
}
53 changes: 53 additions & 0 deletions controllers/djob_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright 2019 microsoft.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Prometheus collectors describing the operator's Databricks job API calls.
var (
	// djobCounterVec counts job API calls, labelled by outcome ("status") and
	// by the operation that was invoked ("method"). The status values
	// documented in Help match the label values emitted by
	// trackSuccessFailure (successMetric / failureMetric).
	djobCounterVec = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: metricPrefix + "djob_total",
			Help: "Counter related to the dJob CRD partitioned by status and method invoked. Status = success/failure and method indicates REST endpoint",
		},
		[]string{"status", "method"},
	)

	// djobCreateDuration records how long job create calls take.
	djobCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "djob_creation_request_duration_seconds",
		Help: "Duration of DB api djob create calls.",
	})

	// djobGetDuration records how long job get calls take.
	djobGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "djob_get_request_duration_seconds",
		Help: "Duration of DB api djob get calls.",
	})

	// djobDeleteDuration records how long job delete calls take.
	djobDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "djob_delete_request_duration_seconds",
		Help: "Duration of DB api djob delete calls.",
	})
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(djobCounterVec,
djobCreateDuration, djobGetDuration, djobDeleteDuration)
}
41 changes: 41 additions & 0 deletions controllers/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Copyright 2019 microsoft.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/prometheus/client_golang/prometheus"
)

// metricPrefix is the common name prefix of the metrics declared in this
// package.
const metricPrefix = "databricks_"

// Values written to the "status" label by trackSuccessFailure.
const (
	successMetric = "success"
	failureMetric = "failure"
)

// trackExecutionTime invokes f and observes how long the call took on
// histogram. Whatever error f returns is handed back to the caller unchanged.
func trackExecutionTime(histogram prometheus.Histogram, f func() error) error {
	// The timer is created here, before f runs; the deferred ObserveDuration
	// fires once f has returned.
	defer prometheus.NewTimer(histogram).ObserveDuration()
	return f()
}

// trackSuccessFailure increments counterVec for the given method, using
// successMetric as the "status" label when err is nil and failureMetric
// otherwise.
func trackSuccessFailure(err error, counterVec *prometheus.CounterVec, method string) {
	status := successMetric
	if err != nil {
		status = failureMetric
	}
	counterVec.With(prometheus.Labels{"status": status, "method": method}).Inc()
}
Loading