diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 1f572e1..9bb915e 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -60,7 +60,8 @@ RUN apt-get update \ && curl -sSL -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl \ && chmod +x /usr/local/bin/kubectl \ # Install Helm - && curl -s https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash - + && curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \ + && helm repo add stable https://kubernetes-charts.storage.googleapis.com/ # Enable bash completion RUN apt-get update && apt install -y bash-completion && echo "source /etc/bash_completion" >> "/root/.bashrc" diff --git a/Makefile b/Makefile index ef33723..ab85ea5 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,8 @@ IMG ?= controller:latest # Produce CRDs that work back to Kubernetes 1.11 (no version conversion) CRD_OPTIONS ?= "crd:trivialVersions=true" +# Prometheus helm installation name +PROMETHEUS_NAME ?= "prom-azure-databricks-operator" # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) ifeq (,$(shell go env GOBIN)) @@ -154,6 +156,8 @@ endif kubectl cluster-info + make install-prometheus + @echo "deploying controller to cluster" make deploy-controller @@ -191,6 +195,12 @@ else @echo "kustomize has been installed" endif +install-prometheus: + @echo "installing prometheus" + # install prometheus (and set to monitor all namespaces in our kind cluster) + helm install ${PROMETHEUS_NAME} stable/prometheus-operator --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false + @echo "prometheus has been installed" + install-test-dependency: go get -u github.com/jstemmer/go-junit-report \ && go get github.com/axw/gocov/gocov \ diff --git a/README.md b/README.md index b9d3cd7..8cddc11 100644 --- a/README.md 
+++ b/README.md @@ -39,6 +39,7 @@ Few topics are discussed in the [resources.md](https://github.com/microsoft/azur - Kubernetes on WSL - Build pipelines - Dev container +- Operator metrics ## Contributing diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml index e2d9b08..5c359b4 100644 --- a/config/prometheus/monitor.yaml +++ b/config/prometheus/monitor.yaml @@ -11,5 +11,9 @@ spec: endpoints: - path: /metrics port: https + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true # Configure certs here if set up for auth_proxy (uses self-signed currently) selector: control-plane: controller-manager diff --git a/controllers/dcluster_controller_databricks.go b/controllers/dcluster_controller_databricks.go index d75a261..654fc72 100644 --- a/controllers/dcluster_controller_databricks.go +++ b/controllers/dcluster_controller_databricks.go @@ -22,6 +22,8 @@ import ( "reflect" databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models" ) func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error { @@ -36,7 +38,7 @@ func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error } } - clusterInfo, err := r.APIClient.Clusters().Create(*instance.Spec) + clusterInfo, err := r.createCluster(instance) if err != nil { return err } @@ -55,7 +57,7 @@ func (r *DclusterReconciler) refresh(instance *databricksv1alpha1.Dcluster) erro return nil } - clusterInfo, err := r.APIClient.Clusters().Get(instance.Status.ClusterInfo.ClusterID) + clusterInfo, err := r.getCluster(instance.Status.ClusterInfo.ClusterID) if err != nil { return err } @@ -78,5 +80,31 @@ func (r *DclusterReconciler) delete(instance *databricksv1alpha1.Dcluster) error return nil } - return 
r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID) + return trackExecutionTime(dclusterDeleteDuration, func() error { + err := r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID) + trackSuccessFailure(err, dclusterCounterVec, "delete") + return err + }) +} + +func (r *DclusterReconciler) getCluster(clusterID string) (cluster dbmodels.ClusterInfo, err error) { + timer := prometheus.NewTimer(dclusterGetDuration) + defer timer.ObserveDuration() + + cluster, err = r.APIClient.Clusters().Get(clusterID) + + trackSuccessFailure(err, dclusterCounterVec, "get") + + return cluster, err +} + +func (r *DclusterReconciler) createCluster(instance *databricksv1alpha1.Dcluster) (cluster dbmodels.ClusterInfo, err error) { + timer := prometheus.NewTimer(dclusterCreateDuration) + defer timer.ObserveDuration() + + cluster, err = r.APIClient.Clusters().Create(*instance.Spec) + + trackSuccessFailure(err, dclusterCounterVec, "create") + + return cluster, err } diff --git a/controllers/dcluster_metrics.go b/controllers/dcluster_metrics.go new file mode 100644 index 0000000..dc381f7 --- /dev/null +++ b/controllers/dcluster_metrics.go @@ -0,0 +1,53 @@ +/* +Copyright 2019 microsoft. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controllers + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + dclusterCounterVec = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: metricPrefix + "dcluster_total", + Help: "Counter related to the dCluster CRD partitioned by status and method invoked. Status = success/fail and method indicates REST endpoint", + }, + []string{"status", "method"}, + ) + + dclusterCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "dcluster_creation_request_duration_seconds", + Help: "Duration of DB api dcluster create calls.", + }) + + dclusterGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "dcluster_get_request_duration_seconds", + Help: "Duration of DB api dcluster get calls.", + }) + + dclusterDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "dcluster_delete_request_duration_seconds", + Help: "Duration of DB api dcluster delete calls.", + }) +) + +func init() { + // Register custom metrics with the global prometheus registry + metrics.Registry.MustRegister(dclusterCounterVec, + dclusterCreateDuration, dclusterGetDuration, dclusterDeleteDuration) +} diff --git a/controllers/djob_controller_databricks.go b/controllers/djob_controller_databricks.go index 785586d..4288808 100644 --- a/controllers/djob_controller_databricks.go +++ b/controllers/djob_controller_databricks.go @@ -19,12 +19,15 @@ package controllers import ( "context" "fmt" + "reflect" + "strings" + databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "reflect" "sigs.k8s.io/controller-runtime/pkg/client" - "strings" ) func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) 
error { @@ -68,11 +71,12 @@ func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error { } instance.ObjectMeta.SetOwnerReferences(references) } - jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec) - job, err := r.APIClient.Jobs().Create(jobSettings) + job, err := r.createJob(instance) + if err != nil { return err } + instance.Spec.Name = instance.GetName() instance.Status = &databricksv1alpha1.DjobStatus{ JobStatus: &job, @@ -85,7 +89,8 @@ func (r *DjobReconciler) refresh(instance *databricksv1alpha1.Djob) error { jobID := instance.Status.JobStatus.JobID - job, err := r.APIClient.Jobs().Get(jobID) + job, err := r.getJob(jobID) + if err != nil { return err } @@ -126,12 +131,39 @@ func (r *DjobReconciler) delete(instance *databricksv1alpha1.Djob) error { jobID := instance.Status.JobStatus.JobID // Check if the job exists before trying to delete it - if _, err := r.APIClient.Jobs().Get(jobID); err != nil { + if _, err := r.getJob(jobID); err != nil { if strings.Contains(err.Error(), "does not exist") { return nil } return err } - return r.APIClient.Jobs().Delete(jobID) + return trackExecutionTime(djobDeleteDuration, func() error { + err := r.APIClient.Jobs().Delete(jobID) + trackSuccessFailure(err, djobCounterVec, "delete") + return err + }) +} + +func (r *DjobReconciler) getJob(jobID int64) (job dbmodels.Job, err error) { + timer := prometheus.NewTimer(djobGetDuration) + defer timer.ObserveDuration() + + job, err = r.APIClient.Jobs().Get(jobID) + + trackSuccessFailure(err, djobCounterVec, "get") + + return job, err +} + +func (r *DjobReconciler) createJob(instance *databricksv1alpha1.Djob) (job dbmodels.Job, err error) { + timer := prometheus.NewTimer(djobCreateDuration) + defer timer.ObserveDuration() + + jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec) + job, err = r.APIClient.Jobs().Create(jobSettings) + + trackSuccessFailure(err, djobCounterVec, "create") + + return job, err } diff --git 
a/controllers/djob_metrics.go b/controllers/djob_metrics.go new file mode 100644 index 0000000..801dfd9 --- /dev/null +++ b/controllers/djob_metrics.go @@ -0,0 +1,53 @@ +/* +Copyright 2019 microsoft. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + djobCounterVec = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: metricPrefix + "djob_total", + Help: "Counter related to the dJob CRD partitioned by status and method invoked. 
Status = success/fail and method indicates REST endpoint", + }, + []string{"status", "method"}, + ) + + djobCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "djob_creation_request_duration_seconds", + Help: "Duration of DB api djob create calls.", + }) + + djobGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "djob_get_request_duration_seconds", + Help: "Duration of DB api djob get calls.", + }) + + djobDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "djob_delete_request_duration_seconds", + Help: "Duration of DB api djob delete calls.", + }) +) + +func init() { + // Register custom metrics with the global prometheus registry + metrics.Registry.MustRegister(djobCounterVec, + djobCreateDuration, djobGetDuration, djobDeleteDuration) +} diff --git a/controllers/metrics.go b/controllers/metrics.go new file mode 100644 index 0000000..8a2d2c2 --- /dev/null +++ b/controllers/metrics.go @@ -0,0 +1,41 @@ +/* +Copyright 2019 microsoft. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controllers + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + metricPrefix = "databricks_" + successMetric = "success" + failureMetric = "failure" +) + +func trackExecutionTime(histogram prometheus.Histogram, f func() error) error { + timer := prometheus.NewTimer(histogram) + defer timer.ObserveDuration() + return f() +} + +func trackSuccessFailure(err error, counterVec *prometheus.CounterVec, method string) { + if err == nil { + counterVec.With(prometheus.Labels{"status": successMetric, "method": method}).Inc() + } else { + counterVec.With(prometheus.Labels{"status": failureMetric, "method": method}).Inc() + } +} diff --git a/controllers/run_controller_databricks.go b/controllers/run_controller_databricks.go index 3ca24b2..af39883 100644 --- a/controllers/run_controller_databricks.go +++ b/controllers/run_controller_databricks.go @@ -24,6 +24,8 @@ import ( "time" databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1" + "github.com/prometheus/client_golang/prometheus" + "github.com/xinsnake/databricks-sdk-golang/azure" dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -33,7 +35,7 @@ import ( func (r *RunReconciler) submit(instance *databricksv1alpha1.Run) error { r.Log.Info(fmt.Sprintf("Submitting run %s", instance.GetName())) - var run dbmodels.Run + var run *dbmodels.Run var err error instance.Spec.RunName = instance.GetName() @@ -42,50 +44,16 @@ func (r *RunReconciler) submit(instance *databricksv1alpha1.Run) error { // otherwise submit it as RunNow under the job, and make the // job the owner of the run if instance.Spec.JobName != "" { - runParameters := dbmodels.RunParameters{ - JarParams: instance.Spec.JarParams, - NotebookParams: instance.Spec.NotebookParams, - PythonParams: instance.Spec.PythonParams, - SparkSubmitParams: instance.Spec.SparkSubmitParams, - } - - // Here we set the owner attribute - k8sJobNamespacedName := 
types.NamespacedName{Namespace: instance.GetNamespace(), Name: instance.Spec.JobName} - var k8sJob databricksv1alpha1.Djob - if err = r.Client.Get(context.Background(), k8sJobNamespacedName, &k8sJob); err != nil { - return err - } - instance.ObjectMeta.SetOwnerReferences([]metav1.OwnerReference{ - { - APIVersion: "v1alpha1", // TODO should this be a referenced value? - Kind: "Djob", // TODO should this be a referenced value? - Name: k8sJob.GetName(), - UID: k8sJob.GetUID(), - }, - }) - - run, err = r.APIClient.Jobs().RunNow(k8sJob.Status.JobStatus.JobID, runParameters) + run, err = r.runUsingRunNow(instance) } else { - clusterSpec := dbmodels.ClusterSpec{ - NewCluster: instance.Spec.NewCluster, - ExistingClusterID: instance.Spec.ExistingClusterID, - Libraries: instance.Spec.Libraries, - } - jobTask := dbmodels.JobTask{ - NotebookTask: instance.Spec.NotebookTask, - SparkJarTask: instance.Spec.SparkJarTask, - SparkPythonTask: instance.Spec.SparkPythonTask, - SparkSubmitTask: instance.Spec.SparkSubmitTask, - } - run, err = r.APIClient.Jobs(). 
- RunsSubmit(instance.Spec.RunName, clusterSpec, jobTask, instance.Spec.TimeoutSeconds) + run, err = r.runUsingRunsSubmit(instance) } if err != nil { return err } - runOutput, err := r.APIClient.Jobs().RunsGetOutput(run.RunID) + runOutput, err := r.getRunOutput(run.RunID) if err != nil { return err } @@ -99,7 +67,7 @@ func (r *RunReconciler) refresh(instance *databricksv1alpha1.Run) error { runID := instance.Status.Metadata.RunID - runOutput, err := r.APIClient.Jobs().RunsGetOutput(runID) + runOutput, err := r.getRunOutput(runID) if err != nil { return err } @@ -130,7 +98,7 @@ func (r *RunReconciler) delete(instance *databricksv1alpha1.Run) error { runID := instance.Status.Metadata.RunID // Check if the run exists before trying to delete it - if _, err := r.APIClient.Jobs().RunsGet(runID); err != nil { + if _, err := r.getRun(runID); err != nil { if strings.Contains(err.Error(), "does not exist") { return nil } @@ -144,5 +112,84 @@ func (r *RunReconciler) delete(instance *databricksv1alpha1.Run) error { // It takes time for DataBricks to cancel a run time.Sleep(15 * time.Second) - return r.APIClient.Jobs().RunsDelete(runID) + return trackExecutionTime(runDeleteDuration, func() error { + err := r.APIClient.Jobs().RunsDelete(runID) + trackSuccessFailure(err, runCounterVec, "delete") + return err + }) +} + +func (r *RunReconciler) runUsingRunNow(instance *databricksv1alpha1.Run) (*dbmodels.Run, error) { + timer := prometheus.NewTimer(runNowDuration) + defer timer.ObserveDuration() + + runParameters := dbmodels.RunParameters{ + JarParams: instance.Spec.JarParams, + NotebookParams: instance.Spec.NotebookParams, + PythonParams: instance.Spec.PythonParams, + SparkSubmitParams: instance.Spec.SparkSubmitParams, + } + + // Here we set the owner attribute + k8sJobNamespacedName := types.NamespacedName{Namespace: instance.GetNamespace(), Name: instance.Spec.JobName} + var k8sJob databricksv1alpha1.Djob + if err := r.Client.Get(context.Background(), k8sJobNamespacedName, 
&k8sJob); err != nil { + return nil, err + } + + instance.ObjectMeta.SetOwnerReferences([]metav1.OwnerReference{ + { + APIVersion: "v1alpha1", // TODO should this be a referenced value? + Kind: "Djob", // TODO should this be a referenced value? + Name: k8sJob.GetName(), + UID: k8sJob.GetUID(), + }, + }) + + run, err := r.APIClient.Jobs().RunNow(k8sJob.Status.JobStatus.JobID, runParameters) + trackSuccessFailure(err, runCounterVec, "runsnow") + return &run, err +} + +func (r *RunReconciler) runUsingRunsSubmit(instance *databricksv1alpha1.Run) (*dbmodels.Run, error) { + timer := prometheus.NewTimer(runSubmitDuration) + defer timer.ObserveDuration() + + clusterSpec := dbmodels.ClusterSpec{ + NewCluster: instance.Spec.NewCluster, + ExistingClusterID: instance.Spec.ExistingClusterID, + Libraries: instance.Spec.Libraries, + } + jobTask := dbmodels.JobTask{ + NotebookTask: instance.Spec.NotebookTask, + SparkJarTask: instance.Spec.SparkJarTask, + SparkPythonTask: instance.Spec.SparkPythonTask, + SparkSubmitTask: instance.Spec.SparkSubmitTask, + } + + run, err := r.APIClient.Jobs().RunsSubmit(instance.Spec.RunName, clusterSpec, jobTask, instance.Spec.TimeoutSeconds) + trackSuccessFailure(err, runCounterVec, "runssubmit") + return &run, err +} + +func (r *RunReconciler) getRun(runID int64) (dbmodels.Run, error) { + timer := prometheus.NewTimer(runGetDuration) + defer timer.ObserveDuration() + + runOutput, err := r.APIClient.Jobs().RunsGet(runID) + + trackSuccessFailure(err, runCounterVec, "get") + + return runOutput, err +} + +func (r *RunReconciler) getRunOutput(runID int64) (azure.JobsRunsGetOutputResponse, error) { + timer := prometheus.NewTimer(runGetOutputDuration) + defer timer.ObserveDuration() + + runOutput, err := r.APIClient.Jobs().RunsGetOutput(runID) + + trackSuccessFailure(err, runCounterVec, "getoutput") + + return runOutput, err } diff --git a/controllers/run_metrics.go b/controllers/run_metrics.go new file mode 100644 index 0000000..903cd2a --- /dev/null +++ 
b/controllers/run_metrics.go @@ -0,0 +1,63 @@ +/* +Copyright 2019 microsoft. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + runCounterVec = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: metricPrefix + "run_total", + Help: "Counter related to the Run CRD partitioned by status and method invoked. Status = success/fail and method indicates REST endpoint", + }, + []string{"status", "method"}, + ) + + runSubmitDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "run_submit_request_duration_seconds", + Help: "Duration of DB api run submit calls.", + }) + + runNowDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "run_now_request_duration_seconds", + Help: "Duration of DB api run now calls.", + }) + + runGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "run_get_request_duration_seconds", + Help: "Duration of DB api run get calls.", + }) + + runGetOutputDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "run_get_output_request_duration_seconds", + Help: "Duration of DB api run get output calls.", + }) + + runDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: metricPrefix + "run_delete_request_duration_seconds", + Help: "Duration of DB api run delete calls.", + }) +) + +func 
init() { + // Register custom metrics with the global prometheus registry + metrics.Registry.MustRegister(runCounterVec, runSubmitDuration, + runNowDuration, runGetDuration, runGetOutputDuration, runDeleteDuration) +} diff --git a/docs/resources.md b/docs/resources.md index 6ae391d..60a7f9d 100644 --- a/docs/resources.md +++ b/docs/resources.md @@ -32,4 +32,41 @@ More info: ## Build pipelines - [Create a pipeline and add a status badge to Github](https://docs.microsoft.com/en-us/azure/devops/pipelines/create-first-pipeline?view=azure-devops&tabs=tfs-2018-2) -- [Customize status badge with shields.io](https://shields.io/) \ No newline at end of file +- [Customize status badge with shields.io](https://shields.io/) + +## Operator metrics + +- Operator telemetry metrics are exposed via standard [Prometheus](https://prometheus.io/) format endpoints. +- [Prometheus-Operator](https://github.com/coreos/prometheus-operator) is included as part of the operator deployment via Helm chart. + - Prometheus configuration is generated via the `config/default/kustomization.yaml` + - Installation of Prometheus-Operator can be manually triggered via command `make install-prometheus` + - If you don't want Prometheus-Operator configuration generated, it can be disabled by commenting out the line indicated in `config/default/kustomization.yaml` + - *NOTE:* If you don't have the Prometheus-Operator installed, the ServiceMonitor CRD will not be available to you +- Custom metrics exposed by the Operator can be found by searching for `databricks_` inside the Prometheus web ui +- Metrics follow the naming guidelines recommended by Prometheus + +### How to access the Prometheus instance +- Have the operator installed and running locally. 
See [deploy.md](https://github.com/microsoft/azure-databricks-operator/blob/master/docs/deploy.md) +- Determine the name of Prometheus service running in your cluster (by default this will be prom-azure-databricks-oper-prometheus) +- Port forward localhost:9090 to your service: `kubectl port-forward service/prom-azure-databricks-oper-prometheus 9090:9090` + - If using VSCode and Dev Container, you may need to expose the internal port out to your host machine (Command Palette > Remote Containers Forward Port From Container) +- Using a browser navigate to `http://localhost:9090` to view the Prometheus dashboard +- For more information regarding the usage of Prometheus please view the [docs here](https://prometheus.io/) + +### How to scrape the metrics from a single instance of the Operator running on a Pod: +- Have the operator installed and running locally. See [deploy.md](https://github.com/microsoft/azure-databricks-operator/blob/master/docs/deploy.md) +- Determine the name of the pod running your operator: `kubectl get pods -n azure-databricks-operator-system` +- Port forward localhost:8080 to your pod: `kubectl port-forward -n azure-databricks-operator-system pod/azure-databricks-operator-controller-manager- 8080:8080` +- Open another terminal and curl request the metric endpoint: `curl localhost:8080/metrics` + +### Counter metrics +Counter metrics take the format `databricks_[x]_total` where: +- x: Object being manipulated; example: `dcluster` + +Counter metrics have labels that show breakdown by: +- status (success | failure) +- method (the action being performed via REST call example: get, create, delete) + +Histogram metrics take the format `databricks_[x]_[action]_request_duration_seconds` where: +- x: Object being manipulated; example: `dcluster` +- action: Action being performed; example: `create` diff --git a/go.mod b/go.mod index 38e16ab..6ddaaca 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/matm/gocov-html 
v0.0.0-20191111163307-9ee104d84c82 // indirect github.com/onsi/ginkgo v1.10.3 github.com/onsi/gomega v1.7.0 + github.com/prometheus/client_golang v0.9.2 github.com/spf13/pflag v1.0.5 // indirect github.com/xinsnake/databricks-sdk-golang v0.1.2 golang.org/x/crypto v0.0.0-20191112222119-e1110fd1c708 // indirect