From 424c7913f6641fc19e203341f23d6488245edb2b Mon Sep 17 00:00:00 2001 From: Benny Zlotnik Date: Thu, 23 Apr 2026 13:49:42 +0300 Subject: [PATCH 1/3] feat: add optional Prometheus monitoring via OperatorConfig Add MonitoringConfig to OperatorConfigSpec that deploys a ServiceMonitor, metrics token Secret, Role, and ClusterRoleBinding when enabled. Cleanup removes all resources when disabled. Uses unstructured types to avoid hard dependency on Prometheus Operator CRDs. Assisted-by: claude-sonnet-4-6 Signed-off-by: Benny Zlotnik --- api/v1alpha1/operatorconfig_types.go | 28 ++++ api/v1alpha1/zz_generated.deepcopy.go | 20 +++ ....sdv.cloud.redhat.com_operatorconfigs.yaml | 22 ++++ config/rbac/role.yaml | 12 ++ .../controller/operatorconfig/controller.go | 98 +++++++++++++- .../controller/operatorconfig/resources.go | 121 ++++++++++++++++++ .../operatorconfig/resources_test.go | 96 ++++++++++++++ 7 files changed, 396 insertions(+), 1 deletion(-) diff --git a/api/v1alpha1/operatorconfig_types.go b/api/v1alpha1/operatorconfig_types.go index d1ac7bcf..b8af2a9a 100644 --- a/api/v1alpha1/operatorconfig_types.go +++ b/api/v1alpha1/operatorconfig_types.go @@ -436,6 +436,27 @@ func (c *WorkspacesConfig) GetAutoPauseTimeoutMinutes() int32 { return DefaultAutoPauseTimeoutMinutes } +// MonitoringConfig defines configuration for Prometheus metrics collection +type MonitoringConfig struct { + // Enabled determines if a ServiceMonitor should be deployed for Prometheus scraping + // +kubebuilder:default=false + Enabled bool `json:"enabled"` + + // Interval defines the Prometheus scrape interval + // Default: "30s" + // +optional + // +kubebuilder:validation:Pattern=`^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$` + Interval string `json:"interval,omitempty"` +} + +// GetInterval returns the scrape interval, falling back to "30s" +func (c *MonitoringConfig) GetInterval() string { + if c != nil && c.Interval != "" { + return c.Interval + } + 
return "30s" +} + // OperatorConfigSpec defines the desired state of OperatorConfig type OperatorConfigSpec struct { // OSBuilds defines the configuration for OS build operations @@ -461,6 +482,10 @@ type OperatorConfigSpec struct { // Workspaces defines configuration for developer workspaces // +optional Workspaces *WorkspacesConfig `json:"workspaces,omitempty"` + + // Monitoring defines configuration for Prometheus metrics collection + // +optional + Monitoring *MonitoringConfig `json:"monitoring,omitempty"` } // OSBuildsConfig defines configuration for OS build operations @@ -642,6 +667,9 @@ type OperatorConfigStatus struct { // UserNamespacesSupported indicates if the cluster supports user namespaces in pods // (SCC userNamespaceLevel field). When false, workspace pods use privileged mode. UserNamespacesSupported bool `json:"userNamespacesSupported,omitempty"` + + // MonitoringEnabled indicates if the ServiceMonitor is currently deployed + MonitoringEnabled bool `json:"monitoringEnabled,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 7f2f4975..d666f053 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1039,6 +1039,21 @@ func (in *JumpstarterTargetMapping) DeepCopy() *JumpstarterTargetMapping { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MonitoringConfig) DeepCopyInto(out *MonitoringConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MonitoringConfig. +func (in *MonitoringConfig) DeepCopy() *MonitoringConfig { + if in == nil { + return nil + } + out := new(MonitoringConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *OSBuildsConfig) DeepCopyInto(out *OSBuildsConfig) { *out = *in @@ -1170,6 +1185,11 @@ func (in *OperatorConfigSpec) DeepCopyInto(out *OperatorConfigSpec) { *out = new(WorkspacesConfig) (*in).DeepCopyInto(*out) } + if in.Monitoring != nil { + in, out := &in.Monitoring, &out.Monitoring + *out = new(MonitoringConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OperatorConfigSpec. diff --git a/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml b/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml index a7a884fd..16ce483e 100644 --- a/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml +++ b/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml @@ -684,6 +684,24 @@ spec: exporter configurations type: object type: object + monitoring: + description: Monitoring defines configuration for Prometheus metrics + collection + properties: + enabled: + default: false + description: Enabled determines if a ServiceMonitor should be + deployed for Prometheus scraping + type: boolean + interval: + description: |- + Interval defines the Prometheus scrape interval + Default: "30s" + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + required: + - enabled + type: object osBuilds: description: OSBuilds defines the configuration for OS build operations properties: @@ -1065,6 +1083,10 @@ spec: message: description: Message provides detail about the current phase type: string + monitoringEnabled: + description: MonitoringEnabled indicates if the ServiceMonitor is + currently deployed + type: boolean observedGeneration: description: ObservedGeneration is the most recent generation observed by the controller. 
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index fecc483e..3db1cae9 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -135,6 +135,18 @@ rules: - imagestreamtags verbs: - delete +- apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - networking.k8s.io resources: diff --git a/internal/controller/operatorconfig/controller.go b/internal/controller/operatorconfig/controller.go index 41b02f1f..2e7de0d2 100644 --- a/internal/controller/operatorconfig/controller.go +++ b/internal/controller/operatorconfig/controller.go @@ -23,7 +23,9 @@ import ( "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -197,6 +199,7 @@ type OperatorConfigReconciler struct { // +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=tekton.dev,resources=tasks;pipelines;pipelineruns,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=get;list;watch;create;update;patch;delete;use +// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create;update;patch;delete // Reconcile reconciles the OperatorConfig resource lifecycle. 
func (r *OperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -233,6 +236,10 @@ func (r *OperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reque log.Error(err, "Failed to cleanup OSBuilds") return ctrl.Result{}, err } + if err := r.cleanupServiceMonitor(ctx, config); err != nil { + log.Error(err, "Failed to cleanup ServiceMonitor") + return ctrl.Result{}, err + } log.Info("Removing finalizer") controllerutil.RemoveFinalizer(config, finalizerName) if err := r.Update(ctx, config); err != nil { @@ -293,11 +300,33 @@ func (r *OperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reque statusChanged = true } + // Reconcile monitoring (ServiceMonitor) + if config.Spec.Monitoring != nil && config.Spec.Monitoring.Enabled { + if err := r.deployServiceMonitor(ctx, config); err != nil { + log.Error(err, "Failed to deploy ServiceMonitor") + return ctrl.Result{}, fmt.Errorf("failed to deploy ServiceMonitor: %w", err) + } + if !config.Status.MonitoringEnabled { + config.Status.MonitoringEnabled = true + statusChanged = true + } + } else { + if err := r.cleanupServiceMonitor(ctx, config); err != nil { + log.Error(err, "Failed to cleanup ServiceMonitor") + return ctrl.Result{}, fmt.Errorf("failed to cleanup ServiceMonitor: %w", err) + } + if config.Status.MonitoringEnabled { + config.Status.MonitoringEnabled = false + statusChanged = true + } + } + if statusChanged { log.Info("Updating status", "phase", config.Status.Phase, "osBuildsDeployed", config.Status.OSBuildsDeployed, - "jumpstarterAvailable", config.Status.JumpstarterAvailable) + "jumpstarterAvailable", config.Status.JumpstarterAvailable, + "monitoringEnabled", config.Status.MonitoringEnabled) if err := r.Status().Update(ctx, config); err != nil { log.Error(err, "Failed to update status") return ctrl.Result{}, err @@ -1007,6 +1036,73 @@ func (r *OperatorConfigReconciler) cleanupWorkspaceInfra(ctx context.Context, co return nil } +func (r 
*OperatorConfigReconciler) deployServiceMonitor(ctx context.Context, config *automotivev1alpha1.OperatorConfig) error { + tokenSecret := r.buildMetricsTokenSecret(config.Namespace) + if err := r.createOrUpdate(ctx, tokenSecret, config); err != nil { + return fmt.Errorf("failed to create/update metrics token secret: %w", err) + } + + role := r.buildMetricsReaderRole(config.Namespace) + if err := r.createOrUpdate(ctx, role, config); err != nil { + return fmt.Errorf("failed to create/update metrics reader role: %w", err) + } + + binding := r.buildMetricsReaderRoleBinding(config.Namespace) + if err := r.createOrUpdate(ctx, binding, config); err != nil { + return fmt.Errorf("failed to create/update metrics reader role binding: %w", err) + } + + clusterBinding := r.buildMetricsReaderClusterRoleBinding(config.Namespace) + if err := r.createOrUpdate(ctx, clusterBinding, config); err != nil { + return fmt.Errorf("failed to create/update metrics reader cluster role binding: %w", err) + } + + sm := r.buildServiceMonitor(config.Namespace, config.Spec.Monitoring) + return r.createOrUpdate(ctx, sm, config) +} + +func (r *OperatorConfigReconciler) cleanupServiceMonitor(ctx context.Context, config *automotivev1alpha1.OperatorConfig) error { + sm := &unstructured.Unstructured{} + sm.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "monitoring.coreos.com", + Version: "v1", + Kind: "ServiceMonitor", + }) + sm.SetName(serviceMonitorName) + sm.SetNamespace(config.Namespace) + if err := r.Delete(ctx, sm); err != nil && !errors.IsNotFound(err) && !apimeta.IsNoMatchError(err) { + return fmt.Errorf("failed to delete ServiceMonitor: %w", err) + } + + secret := &corev1.Secret{} + secret.Name = serviceMonitorTokenSecret + secret.Namespace = config.Namespace + if err := r.Delete(ctx, secret); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete metrics token secret: %w", err) + } + + role := &rbacv1.Role{} + role.Name = metricsReaderRoleName + role.Namespace = 
config.Namespace + if err := r.Delete(ctx, role); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete metrics reader role: %w", err) + } + + binding := &rbacv1.RoleBinding{} + binding.Name = metricsReaderRoleName + binding.Namespace = config.Namespace + if err := r.Delete(ctx, binding); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete metrics reader role binding: %w", err) + } + + clusterBinding := &rbacv1.ClusterRoleBinding{} + clusterBinding.Name = metricsReaderBindingName + if err := r.Delete(ctx, clusterBinding); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete metrics reader cluster role binding: %w", err) + } + return nil +} + func (r *OperatorConfigReconciler) createOrUpdateTask(ctx context.Context, task *tektonv1.Task) error { return r.createOrUpdate(ctx, task, nil) } diff --git a/internal/controller/operatorconfig/resources.go b/internal/controller/operatorconfig/resources.go index 7ae4a671..abedbdc6 100644 --- a/internal/controller/operatorconfig/resources.go +++ b/internal/controller/operatorconfig/resources.go @@ -16,6 +16,8 @@ import ( networkingv1 "k8s.io/api/networking/v1" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" ) @@ -26,6 +28,10 @@ const ( sccPrivilegedRoleName = "ado-build-privileged" workspaceServiceAccountName = "ado-workspace" workspaceSCCName = "ado-workspace-scc" + serviceMonitorName = "ado-operator-metrics" + serviceMonitorTokenSecret = "ado-operator-metrics-token" + metricsReaderRoleName = "ado-metrics-reader" + metricsReaderBindingName = "ado-metrics-reader" ) // getOperatorImage returns the operator image from env var, then config, then default constant @@ -1062,3 +1068,118 @@ func (r *OperatorConfigReconciler) buildWorkspaceSCCPrivileged() *securityv1.Sec }, } } + +func 
(r *OperatorConfigReconciler) buildMetricsTokenSecret(namespace string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceMonitorTokenSecret, + Namespace: namespace, + Annotations: map[string]string{ + "kubernetes.io/service-account.name": "ado-operator", + }, + }, + Type: corev1.SecretTypeServiceAccountToken, + } +} + +func (r *OperatorConfigReconciler) buildMetricsReaderRole(namespace string) *rbacv1.Role { + return &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: metricsReaderRoleName, + Namespace: namespace, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"secrets"}, + ResourceNames: []string{serviceMonitorTokenSecret}, + Verbs: []string{"get"}, + }, + }, + } +} + +func (r *OperatorConfigReconciler) buildMetricsReaderRoleBinding(namespace string) *rbacv1.RoleBinding { + return &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: metricsReaderRoleName, + Namespace: namespace, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "Role", + Name: metricsReaderRoleName, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: "prometheus-user-workload", + Namespace: "openshift-user-workload-monitoring", + }, + }, + } +} + +func (r *OperatorConfigReconciler) buildMetricsReaderClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: metricsReaderBindingName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "ClusterRole", + Name: "metrics-reader", + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: "ado-operator", + Namespace: namespace, + }, + }, + } +} + +func (r *OperatorConfigReconciler) buildServiceMonitor(namespace string, config *automotivev1alpha1.MonitoringConfig) *unstructured.Unstructured { + interval := config.GetInterval() + + sm := &unstructured.Unstructured{} + 
sm.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "monitoring.coreos.com", + Version: "v1", + Kind: "ServiceMonitor", + }) + sm.SetName(serviceMonitorName) + sm.SetNamespace(namespace) + sm.SetLabels(map[string]string{ + "control-plane": "operator", + "app.kubernetes.io/name": "automotive-dev-operator", + "app.kubernetes.io/managed-by": "operator", + "app.kubernetes.io/component": "monitoring", + }) + + sm.Object["spec"] = map[string]interface{}{ + "selector": map[string]interface{}{ + "matchLabels": map[string]interface{}{ + "control-plane": "operator", + }, + }, + "endpoints": []interface{}{ + map[string]interface{}{ + "path": "/metrics", + "port": "https", + "scheme": "https", + "interval": interval, + "bearerTokenSecret": map[string]interface{}{ + "name": serviceMonitorTokenSecret, + "key": "token", + }, + "tlsConfig": map[string]interface{}{ + "insecureSkipVerify": true, + }, + }, + }, + } + + return sm +} diff --git a/internal/controller/operatorconfig/resources_test.go b/internal/controller/operatorconfig/resources_test.go index fa5f75fc..bd36dfc5 100644 --- a/internal/controller/operatorconfig/resources_test.go +++ b/internal/controller/operatorconfig/resources_test.go @@ -24,6 +24,7 @@ import ( . 
"github.com/onsi/gomega" //nolint:revive "gopkg.in/yaml.v3" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" ) func TestResources(t *testing.T) { @@ -162,6 +163,101 @@ var _ = Describe("OperatorConfig Resources", func() { }) }) + Describe("buildServiceMonitor", func() { + It("should have correct GVK for ServiceMonitor", func() { + config := &automotivev1alpha1.MonitoringConfig{Enabled: true} + sm := r.buildServiceMonitor("test-ns", config) + Expect(sm.GetKind()).To(Equal("ServiceMonitor")) + Expect(sm.GroupVersionKind().Group).To(Equal("monitoring.coreos.com")) + Expect(sm.GroupVersionKind().Version).To(Equal("v1")) + }) + + It("should select services with control-plane=operator label", func() { + config := &automotivev1alpha1.MonitoringConfig{Enabled: true} + sm := r.buildServiceMonitor("test-ns", config) + selector := sm.Object["spec"].(map[string]any)["selector"].(map[string]any) + matchLabels := selector["matchLabels"].(map[string]any) + Expect(matchLabels["control-plane"]).To(Equal("operator")) + }) + + It("should use bearerTokenSecret referencing the token secret", func() { + config := &automotivev1alpha1.MonitoringConfig{Enabled: true} + sm := r.buildServiceMonitor("test-ns", config) + endpoints := sm.Object["spec"].(map[string]any)["endpoints"].([]any) + ep := endpoints[0].(map[string]any) + tokenRef := ep["bearerTokenSecret"].(map[string]any) + Expect(tokenRef["name"]).To(Equal(serviceMonitorTokenSecret)) + Expect(tokenRef["key"]).To(Equal("token")) + }) + + It("should use default interval when not specified", func() { + config := &automotivev1alpha1.MonitoringConfig{Enabled: true} + sm := r.buildServiceMonitor("test-ns", config) + endpoints := sm.Object["spec"].(map[string]any)["endpoints"].([]any) + ep := endpoints[0].(map[string]any) + Expect(ep["interval"]).To(Equal("30s")) + }) + + It("should use custom interval when specified", func() { + config := &automotivev1alpha1.MonitoringConfig{Enabled: true, Interval: "15s"} + sm := 
r.buildServiceMonitor("test-ns", config) + endpoints := sm.Object["spec"].(map[string]any)["endpoints"].([]any) + ep := endpoints[0].(map[string]any) + Expect(ep["interval"]).To(Equal("15s")) + }) + + It("should scrape /metrics on port https", func() { + config := &automotivev1alpha1.MonitoringConfig{Enabled: true} + sm := r.buildServiceMonitor("test-ns", config) + endpoints := sm.Object["spec"].(map[string]any)["endpoints"].([]any) + ep := endpoints[0].(map[string]any) + Expect(ep["path"]).To(Equal("/metrics")) + Expect(ep["port"]).To(Equal("https")) + Expect(ep["scheme"]).To(Equal("https")) + }) + }) + + Describe("buildMetricsTokenSecret", func() { + It("should be a ServiceAccountToken type referencing ado-operator", func() { + secret := r.buildMetricsTokenSecret("test-ns") + Expect(secret.Type).To(Equal(corev1.SecretTypeServiceAccountToken)) + Expect(secret.Annotations["kubernetes.io/service-account.name"]).To(Equal("ado-operator")) + }) + }) + + Describe("buildMetricsReaderRoleBinding", func() { + It("should grant prometheus-user-workload SA from the correct namespace", func() { + binding := r.buildMetricsReaderRoleBinding("test-ns") + Expect(binding.Subjects).To(HaveLen(1)) + Expect(binding.Subjects[0]).To(Equal(rbacv1.Subject{ + Kind: "ServiceAccount", + Name: "prometheus-user-workload", + Namespace: "openshift-user-workload-monitoring", + })) + }) + }) + + Describe("buildMetricsReaderClusterRoleBinding", func() { + It("should reference metrics-reader ClusterRole and ado-operator SA", func() { + binding := r.buildMetricsReaderClusterRoleBinding("test-ns") + Expect(binding.RoleRef.Name).To(Equal("metrics-reader")) + Expect(binding.RoleRef.Kind).To(Equal("ClusterRole")) + Expect(binding.Subjects).To(HaveLen(1)) + Expect(binding.Subjects[0].Name).To(Equal("ado-operator")) + Expect(binding.Subjects[0].Namespace).To(Equal("test-ns")) + }) + }) + + Describe("buildMetricsReaderRole", func() { + It("should scope secret access to only the metrics token secret", func() 
{ + role := r.buildMetricsReaderRole("test-ns") + Expect(role.Rules).To(HaveLen(1)) + Expect(role.Rules[0].Resources).To(Equal([]string{"secrets"})) + Expect(role.Rules[0].ResourceNames).To(Equal([]string{serviceMonitorTokenSecret})) + Expect(role.Rules[0].Verbs).To(Equal([]string{"get"})) + }) + }) + Describe("buildBuildControllerDeployment", func() { It("should use ado-build-controller service account", func() { deployment := r.buildBuildControllerDeployment("test-namespace", defaultTestConfig()) From e3fa7ae8613cc8fe0b0d1a0961431b5a44b55f8b Mon Sep 17 00:00:00 2001 From: Benny Zlotnik Date: Thu, 23 Apr 2026 13:49:49 +0300 Subject: [PATCH 2/3] add active builds count Signed-off-by: Benny Zlotnik Assisted-by: claude-opus-4.6 --- .../samples/automotive_v1_operatorconfig.yaml | 7 ++++ internal/controller/imagebuild/controller.go | 29 +++++++++++++++++ internal/controller/imagebuild/metrics.go | 22 +++++++++++++ .../controller/imagebuild/metrics_test.go | 32 +++++++++++++++++++ 4 files changed, 90 insertions(+) diff --git a/config/samples/automotive_v1_operatorconfig.yaml b/config/samples/automotive_v1_operatorconfig.yaml index 6cb3aed5..8c11f25d 100644 --- a/config/samples/automotive_v1_operatorconfig.yaml +++ b/config/samples/automotive_v1_operatorconfig.yaml @@ -50,6 +50,13 @@ spec: # value: "automotive" # effect: "NoExecute" + # Monitoring configuration for Prometheus ServiceMonitor + # Requires user-workload-monitoring enabled on the cluster + # monitoring: + # enabled: true + # # Optional: scrape interval (default: 30s) + # # interval: "15s" + # BuildAPI configuration for the Build API server buildAPI: # Optional: Authentication configuration for OIDC/JWT providers diff --git a/internal/controller/imagebuild/controller.go b/internal/controller/imagebuild/controller.go index dd36c4bd..b5ef3c3e 100644 --- a/internal/controller/imagebuild/controller.go +++ b/internal/controller/imagebuild/controller.go @@ -35,6 +35,7 @@ import ( ctrl 
"sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/manager" ) const ( @@ -719,6 +720,7 @@ func (r *ImageBuildReconciler) checkBuildProgress( log.Error(err, "Failed to patch status to Completed") return ctrl.Result{}, err } + adjustActiveBuildsGauge(phaseBuilding, phaseCompleted) recordBuildMetrics(fresh, pipelineRun, buildStatusSuccess) if fresh.Spec.IsFlashEnabled() { r.recordPipelineFlashMetrics(ctx, fresh, pipelineRun, buildStatusSuccess) @@ -1992,6 +1994,10 @@ func (r *ImageBuildReconciler) deleteSecret( // SetupWithManager sets up the controller with the Manager. func (r *ImageBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { + if err := mgr.Add(r.seedActiveBuildsGauge(mgr)); err != nil { + return fmt.Errorf("failed to register ActiveBuilds seeder: %w", err) + } + builder := ctrl.NewControllerManagedBy(mgr). For(&automotivev1alpha1.ImageBuild{}). Owns(&tektonv1.PipelineRun{}). 
@@ -2005,6 +2011,28 @@ func (r *ImageBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { return builder.Complete(r) } +func (r *ImageBuildReconciler) seedActiveBuildsGauge(mgr ctrl.Manager) manager.RunnableFunc { + return func(ctx context.Context) error { + if !mgr.GetCache().WaitForCacheSync(ctx) { + return fmt.Errorf("cache sync failed") + } + var builds automotivev1alpha1.ImageBuildList + if err := mgr.GetClient().List(ctx, &builds); err != nil { + r.Log.Error(err, "Failed to seed ActiveBuilds gauge") + return nil + } + var active float64 + for i := range builds.Items { + if builds.Items[i].Status.Phase == phaseBuilding { + active++ + } + } + ActiveBuilds.Set(active) + r.Log.Info("Seeded ActiveBuilds gauge", "active", active) + return nil + } +} + func isTaskRunCompleted(taskRun *tektonv1.TaskRun) bool { return taskRun.Status.CompletionTime != nil } @@ -2431,6 +2459,7 @@ func (r *ImageBuildReconciler) updateStatus( if err := r.Status().Patch(ctx, fresh, patch); err != nil { return err } + adjustActiveBuildsGauge(oldPhase, phase) if oldPhase != phase || oldMessage != message { r.emitEventf( fresh, diff --git a/internal/controller/imagebuild/metrics.go b/internal/controller/imagebuild/metrics.go index 5bccca7e..b047d6dd 100644 --- a/internal/controller/imagebuild/metrics.go +++ b/internal/controller/imagebuild/metrics.go @@ -49,6 +49,16 @@ var ( []string{"mode", "distro", "target", "format", "arch", "status"}, ) + // ActiveBuilds tracks the number of currently in-progress builds. + ActiveBuilds = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "active", + Help: "Number of currently in-progress builds", + }, + ) + // FlashTotal counts pipeline-triggered flash operations by status. 
FlashTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -78,7 +88,19 @@ func init() { BuildDuration, BuildPhaseDuration, BuildTotal, + ActiveBuilds, FlashTotal, FlashDuration, ) } + +func adjustActiveBuildsGauge(oldPhase, newPhase string) { + if oldPhase == newPhase { + return + } + if newPhase == phaseBuilding { + ActiveBuilds.Inc() + } else if oldPhase == phaseBuilding { + ActiveBuilds.Dec() + } +} diff --git a/internal/controller/imagebuild/metrics_test.go b/internal/controller/imagebuild/metrics_test.go index 7a5af4a4..7b952f39 100644 --- a/internal/controller/imagebuild/metrics_test.go +++ b/internal/controller/imagebuild/metrics_test.go @@ -11,6 +11,38 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func gaugeValue(g prometheus.Gauge) float64 { + m := &io_prometheus_client.Metric{} + if err := g.Write(m); err != nil { + return 0 + } + return m.GetGauge().GetValue() +} + +func TestAdjustActiveBuildsGauge(t *testing.T) { + ActiveBuilds.Set(0) + + adjustActiveBuildsGauge("", "Building") + if v := gaugeValue(ActiveBuilds); v != 1 { + t.Errorf("after entering Building: got %v, want 1", v) + } + + adjustActiveBuildsGauge("Building", "Building") + if v := gaugeValue(ActiveBuilds); v != 1 { + t.Errorf("same phase should not change gauge: got %v, want 1", v) + } + + adjustActiveBuildsGauge("Building", "Completed") + if v := gaugeValue(ActiveBuilds); v != 0 { + t.Errorf("after leaving Building: got %v, want 0", v) + } + + adjustActiveBuildsGauge("Completed", "Failed") + if v := gaugeValue(ActiveBuilds); v != 0 { + t.Errorf("non-Building transition should not change gauge: got %v, want 0", v) + } +} + // counterValue returns the current value of a counter with the given labels.
func counterValue(cv *prometheus.CounterVec, labels ...string) float64 { m := &io_prometheus_client.Metric{} From 24ca7152f6927c3659b3a98676629d8ea2d14f47 Mon Sep 17 00:00:00 2001 From: Benny Zlotnik Date: Thu, 23 Apr 2026 13:49:55 +0300 Subject: [PATCH 3/3] add sample perses dashboard Signed-off-by: Benny Zlotnik Assisted-by: claude-opus-4.6 --- .../samples/monitoring/perses-dashboard.yaml | 474 ++++++++++++++++++ config/samples/monitoring/uiplugin.yaml | 20 + internal/controller/imagebuild/controller.go | 2 +- .../controller/operatorconfig/controller.go | 26 +- 4 files changed, 512 insertions(+), 10 deletions(-) create mode 100644 config/samples/monitoring/perses-dashboard.yaml create mode 100644 config/samples/monitoring/uiplugin.yaml diff --git a/config/samples/monitoring/perses-dashboard.yaml b/config/samples/monitoring/perses-dashboard.yaml new file mode 100644 index 00000000..25ffcb76 --- /dev/null +++ b/config/samples/monitoring/perses-dashboard.yaml @@ -0,0 +1,474 @@ +# Automotive Dev Operator - Perses Dashboard +# +# Prerequisites: +# 1. Cluster Observability Operator (COO) installed +# 2. UIPlugin with Perses enabled: +# oc apply -f uiplugin.yaml +# 3. Operator monitoring enabled in OperatorConfig: +# spec: +# monitoring: +# enabled: true +# +# Usage: +# oc apply -f perses-dashboard.yaml +# +# Note: Queries use namespace "automotive-dev-operator-system". +# Update if operator is deployed to a different namespace. 
+# +# View in OCP Console: Observe → Dashboards (Perses) +--- +apiVersion: perses.dev/v1alpha2 +kind: PersesDashboard +metadata: + name: automotive-dev-operator + namespace: openshift-cluster-observability-operator + labels: + app.kubernetes.io/part-of: automotive-dev-operator +spec: + config: + display: + name: "Automotive Dev Operator" + duration: "6h" + refreshInterval: "30s" + panels: + activeBuilds: + kind: Panel + spec: + display: + name: "Active Builds" + plugin: + kind: "StatChart" + spec: + calculation: "last-number" + format: + unit: "decimal" + decimalPlaces: 0 + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'sum(ado_build_active{namespace="automotive-dev-operator-system"}) or vector(0)' + successRate: + kind: Panel + spec: + display: + name: "Success Rate" + plugin: + kind: "StatChart" + spec: + calculation: "last-number" + format: + unit: "percent-decimal" + decimalPlaces: 1 + thresholds: + steps: + - value: 0 + color: "#e74c3c" + - value: 0.8 + color: "#f39c12" + - value: 0.95 + color: "#27ae60" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: '(sum(ado_build_total{namespace="automotive-dev-operator-system",status="success"}) or vector(0)) / clamp_min(sum(ado_build_total{namespace="automotive-dev-operator-system"}), 1)' + avgDuration: + kind: Panel + spec: + display: + name: "Avg Build Duration" + plugin: + kind: "StatChart" + spec: + calculation: "last-number" + format: + unit: "seconds" + decimalPlaces: 0 + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: '(sum(ado_build_duration_seconds_sum{namespace="automotive-dev-operator-system"}) or vector(0)) / clamp_min(sum(ado_build_duration_seconds_count{namespace="automotive-dev-operator-system"}), 1)' + totalBuilds: + kind: Panel + spec: + display: + name: "Total Builds" + plugin: + kind: "StatChart" + spec: + calculation: 
"last-number" + format: + unit: "decimal" + decimalPlaces: 0 + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'sum(ado_build_total{namespace="automotive-dev-operator-system"}) or vector(0)' + failedBuilds: + kind: Panel + spec: + display: + name: "Failed Builds" + plugin: + kind: "StatChart" + spec: + calculation: "last-number" + format: + unit: "decimal" + decimalPlaces: 0 + thresholds: + steps: + - value: 0 + color: "#27ae60" + - value: 1 + color: "#f39c12" + - value: 5 + color: "#e74c3c" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'sum(ado_build_total{namespace="automotive-dev-operator-system",status="failure"}) or vector(0)' + buildRate: + kind: Panel + spec: + display: + name: "Build Rate" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "builds/sec" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'sum by (status) (rate(ado_build_total{namespace="automotive-dev-operator-system"}[5m]))' + seriesNameFormat: "{{status}}" + durationByTarget: + kind: Panel + spec: + display: + name: "Build Duration by Target" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "seconds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: '(sum by (target) (rate(ado_build_duration_seconds_sum{namespace="automotive-dev-operator-system"}[5m])) or vector(0)) / clamp_min(sum by (target) (rate(ado_build_duration_seconds_count{namespace="automotive-dev-operator-system"}[5m])), 1)' + seriesNameFormat: "{{target}}" + phaseDuration: + kind: Panel + spec: + display: + name: "Build Phase Duration" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "seconds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: 
"TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: '(sum by (phase) (rate(ado_build_phase_duration_seconds_sum{namespace="automotive-dev-operator-system"}[5m])) or vector(0)) / clamp_min(sum by (phase) (rate(ado_build_phase_duration_seconds_count{namespace="automotive-dev-operator-system"}[5m])), 1)' + seriesNameFormat: "{{phase}}" + buildsByTarget: + kind: Panel + spec: + display: + name: "Builds by Target" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "builds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'round(sum by (target) (ado_build_total{namespace="automotive-dev-operator-system"})) or vector(0)' + seriesNameFormat: "{{target}}" + buildsByMode: + kind: Panel + spec: + display: + name: "Builds by Mode" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "builds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'round(sum by (mode) (ado_build_total{namespace="automotive-dev-operator-system"})) or vector(0)' + seriesNameFormat: "{{mode}}" + buildsByDistro: + kind: Panel + spec: + display: + name: "Builds by Distro" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "builds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'round(sum by (distro) (ado_build_total{namespace="automotive-dev-operator-system"})) or vector(0)' + seriesNameFormat: "{{distro}}" + buildsByArch: + kind: Panel + spec: + display: + name: "Builds by Architecture" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "builds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'round(sum 
by (arch) (ado_build_total{namespace="automotive-dev-operator-system"})) or vector(0)' + seriesNameFormat: "{{arch}}" + flashTotal: + kind: Panel + spec: + display: + name: "Flash Operations" + plugin: + kind: "StatChart" + spec: + calculation: "last-number" + format: + unit: "decimal" + decimalPlaces: 0 + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: 'sum(ado_flash_total{namespace="automotive-dev-operator-system"}) or vector(0)' + flashSuccessRate: + kind: Panel + spec: + display: + name: "Flash Success Rate" + plugin: + kind: "StatChart" + spec: + calculation: "last-number" + format: + unit: "percent-decimal" + decimalPlaces: 1 + thresholds: + steps: + - value: 0 + color: "#e74c3c" + - value: 0.8 + color: "#f39c12" + - value: 0.95 + color: "#27ae60" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: '(sum(ado_flash_total{namespace="automotive-dev-operator-system",status="success"}) or vector(0)) / clamp_min(sum(ado_flash_total{namespace="automotive-dev-operator-system"}), 1)' + flashDuration: + kind: Panel + spec: + display: + name: "Flash Duration by Target" + plugin: + kind: "TimeSeriesChart" + spec: + yAxis: + label: "seconds" + legend: + position: "bottom" + mode: "list" + queries: + - kind: "TimeSeriesQuery" + spec: + plugin: + kind: "PrometheusTimeSeriesQuery" + spec: + query: '(sum by (target) (rate(ado_flash_duration_seconds_sum{namespace="automotive-dev-operator-system"}[5m])) or vector(0)) / clamp_min(sum by (target) (rate(ado_flash_duration_seconds_count{namespace="automotive-dev-operator-system"}[5m])), 1)' + seriesNameFormat: "{{target}}" + layouts: + - kind: "Grid" + spec: + display: + title: "Overview" + items: + - x: 0 + y: 0 + width: 8 + height: 4 + content: + $ref: "#/spec/panels/activeBuilds" + - x: 8 + y: 0 + width: 8 + height: 4 + content: + $ref: "#/spec/panels/successRate" + - x: 16 + y: 0 + width: 8 + height: 4 + 
content: + $ref: "#/spec/panels/avgDuration" + - kind: "Grid" + spec: + display: + title: "Build Details" + items: + - x: 0 + y: 0 + width: 12 + height: 4 + content: + $ref: "#/spec/panels/totalBuilds" + - x: 12 + y: 0 + width: 12 + height: 4 + content: + $ref: "#/spec/panels/failedBuilds" + - x: 0 + y: 4 + width: 24 + height: 8 + content: + $ref: "#/spec/panels/buildRate" + - kind: "Grid" + spec: + display: + title: "Build Duration" + items: + - x: 0 + y: 0 + width: 12 + height: 8 + content: + $ref: "#/spec/panels/durationByTarget" + - x: 12 + y: 0 + width: 12 + height: 8 + content: + $ref: "#/spec/panels/phaseDuration" + - kind: "Grid" + spec: + display: + title: "Build Breakdown" + items: + - x: 0 + y: 0 + width: 12 + height: 8 + content: + $ref: "#/spec/panels/buildsByTarget" + - x: 12 + y: 0 + width: 12 + height: 8 + content: + $ref: "#/spec/panels/buildsByMode" + - x: 0 + y: 8 + width: 12 + height: 8 + content: + $ref: "#/spec/panels/buildsByDistro" + - x: 12 + y: 8 + width: 12 + height: 8 + content: + $ref: "#/spec/panels/buildsByArch" + - kind: "Grid" + spec: + display: + title: "Flash Operations" + items: + - x: 0 + y: 0 + width: 8 + height: 4 + content: + $ref: "#/spec/panels/flashTotal" + - x: 8 + y: 0 + width: 8 + height: 4 + content: + $ref: "#/spec/panels/flashSuccessRate" + - x: 16 + y: 0 + width: 8 + height: 8 + content: + $ref: "#/spec/panels/flashDuration" diff --git a/config/samples/monitoring/uiplugin.yaml b/config/samples/monitoring/uiplugin.yaml new file mode 100644 index 00000000..b4d032f8 --- /dev/null +++ b/config/samples/monitoring/uiplugin.yaml @@ -0,0 +1,20 @@ +# Enable Perses dashboards in OpenShift Console +# +# Prerequisite: Cluster Observability Operator (COO) must be installed. +# +# This creates a UIPlugin that tells COO to: +# 1. Deploy a Perses instance +# 2. 
Register a console plugin adding "Dashboards (Perses)" under Observe +# +# Usage: +# oc apply -f uiplugin.yaml +--- +apiVersion: observability.openshift.io/v1alpha1 +kind: UIPlugin +metadata: + name: monitoring +spec: + type: Monitoring + monitoring: + perses: + enabled: true diff --git a/internal/controller/imagebuild/controller.go b/internal/controller/imagebuild/controller.go index b5ef3c3e..3f6b04ef 100644 --- a/internal/controller/imagebuild/controller.go +++ b/internal/controller/imagebuild/controller.go @@ -2019,7 +2019,7 @@ func (r *ImageBuildReconciler) seedActiveBuildsGauge(mgr ctrl.Manager) manager.R var builds automotivev1alpha1.ImageBuildList if err := mgr.GetClient().List(ctx, &builds); err != nil { r.Log.Error(err, "Failed to seed ActiveBuilds gauge") - return nil + return err } var active float64 for i := range builds.Items { diff --git a/internal/controller/operatorconfig/controller.go b/internal/controller/operatorconfig/controller.go index 2e7de0d2..ceff146f 100644 --- a/internal/controller/operatorconfig/controller.go +++ b/internal/controller/operatorconfig/controller.go @@ -302,12 +302,13 @@ func (r *OperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reque // Reconcile monitoring (ServiceMonitor) if config.Spec.Monitoring != nil && config.Spec.Monitoring.Enabled { - if err := r.deployServiceMonitor(ctx, config); err != nil { + deployed, err := r.deployServiceMonitor(ctx, config) + if err != nil { log.Error(err, "Failed to deploy ServiceMonitor") return ctrl.Result{}, fmt.Errorf("failed to deploy ServiceMonitor: %w", err) } - if !config.Status.MonitoringEnabled { - config.Status.MonitoringEnabled = true + if config.Status.MonitoringEnabled != deployed { + config.Status.MonitoringEnabled = deployed statusChanged = true } } else { @@ -1036,29 +1037,36 @@ func (r *OperatorConfigReconciler) cleanupWorkspaceInfra(ctx context.Context, co return nil } -func (r *OperatorConfigReconciler) deployServiceMonitor(ctx context.Context, config 
*automotivev1alpha1.OperatorConfig) error { +func (r *OperatorConfigReconciler) deployServiceMonitor(ctx context.Context, config *automotivev1alpha1.OperatorConfig) (bool, error) { tokenSecret := r.buildMetricsTokenSecret(config.Namespace) if err := r.createOrUpdate(ctx, tokenSecret, config); err != nil { - return fmt.Errorf("failed to create/update metrics token secret: %w", err) + return false, fmt.Errorf("failed to create/update metrics token secret: %w", err) } role := r.buildMetricsReaderRole(config.Namespace) if err := r.createOrUpdate(ctx, role, config); err != nil { - return fmt.Errorf("failed to create/update metrics reader role: %w", err) + return false, fmt.Errorf("failed to create/update metrics reader role: %w", err) } binding := r.buildMetricsReaderRoleBinding(config.Namespace) if err := r.createOrUpdate(ctx, binding, config); err != nil { - return fmt.Errorf("failed to create/update metrics reader role binding: %w", err) + return false, fmt.Errorf("failed to create/update metrics reader role binding: %w", err) } clusterBinding := r.buildMetricsReaderClusterRoleBinding(config.Namespace) if err := r.createOrUpdate(ctx, clusterBinding, config); err != nil { - return fmt.Errorf("failed to create/update metrics reader cluster role binding: %w", err) + return false, fmt.Errorf("failed to create/update metrics reader cluster role binding: %w", err) } sm := r.buildServiceMonitor(config.Namespace, config.Spec.Monitoring) - return r.createOrUpdate(ctx, sm, config) + if err := r.createOrUpdate(ctx, sm, config); err != nil { + if apimeta.IsNoMatchError(err) { + r.Log.Info("ServiceMonitor CRD not available, skipping (install Prometheus Operator to enable)") + return false, nil + } + return false, fmt.Errorf("failed to create/update ServiceMonitor: %w", err) + } + return true, nil } func (r *OperatorConfigReconciler) cleanupServiceMonitor(ctx context.Context, config *automotivev1alpha1.OperatorConfig) error {