From 29c1bdd5fa92e425df24c9b4e33cf2cb0f528ea3 Mon Sep 17 00:00:00 2001 From: Gerard Ryan Date: Mon, 27 Jul 2020 23:44:02 +0100 Subject: [PATCH 1/3] Don't use old hardcoded PD API key secret It should be specified in the PDI CR. If the API key can't be loaded from the specified API key, then trigger an alert. --- hack/olm-registry/olm-artifacts-template.yaml | 19 ++++++- .../clusterdeployment_created.go | 6 +-- .../clusterdeployment_deleted.go | 4 +- .../pagerdutyintegration_controller.go | 49 +++++++++++-------- .../pagerdutyintegration_controller_test.go | 10 ++-- pkg/localmetrics/localmetrics.go | 32 ++++++++++-- 6 files changed, 87 insertions(+), 33 deletions(-) diff --git a/hack/olm-registry/olm-artifacts-template.yaml b/hack/olm-registry/olm-artifacts-template.yaml index 4b6d53c3..b0ac2da9 100644 --- a/hack/olm-registry/olm-artifacts-template.yaml +++ b/hack/olm-registry/olm-artifacts-template.yaml @@ -28,7 +28,7 @@ objects: sourceType: grpc image: ${REGISTRY_IMG}:${CHANNEL}-${IMAGE_TAG} displayName: pagerduty-operator Registry - publisher: SRE + publisher: SRE - apiVersion: operators.coreos.com/v1alpha2 kind: OperatorGroup @@ -67,3 +67,20 @@ objects: targetSecretRef: name: pd-secret namespace: openshift-monitoring + +- apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + metadata: + name: pagerduty-integration-api-secret + namespace: pagerduty-operator + spec: + groups: + - name: pagerduty-integration-api-secret + rules: + - alert: PagerDutyIntegrationAPISecretError + annotations: + message: PagerDuty Operator is failing to load PAGERDUTY_API_KEY from Secret specified in PagerDutyIntegration {{ $labels.pagerdutyintegration_name }}. Either the Secret might be missing, or the key might be missing from within the Secret. + expr: pagerdutyintegration_secret_loaded < 1 + for: 15m + labels: + severity: warning diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_created.go b/pkg/controller/pagerdutyintegration/clusterdeployment_created.go index 82b8a099..e1760a5e 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_created.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_created.go @@ -32,7 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -func (r *ReconcilePagerDutyIntegration) handleCreate(pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { +func (r *ReconcilePagerDutyIntegration) handleCreate(pdclient pd.Client, pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { var ( // secretName is the name of the Secret deployed to the target // cluster, and also the name of the SyncSet that causes it to @@ -96,7 +96,7 @@ func (r *ReconcilePagerDutyIntegration) handleCreate(pdi *pagerdutyv1alpha1.Page err = pdData.ParseClusterConfig(r.client, cd.Namespace, configMapName) if err != nil { var createErr error - _, createErr = r.pdclient.CreateService(pdData) + _, createErr = pdclient.CreateService(pdData) if createErr != nil { localmetrics.UpdateMetricPagerDutyCreateFailure(1, ClusterID) return createErr @@ -104,7 +104,7 @@ func (r *ReconcilePagerDutyIntegration) handleCreate(pdi *pagerdutyv1alpha1.Page } localmetrics.UpdateMetricPagerDutyCreateFailure(0, ClusterID) - pdIntegrationKey, err = r.pdclient.GetIntegrationKey(pdData) + pdIntegrationKey, err = pdclient.GetIntegrationKey(pdData) if err != nil { return err } diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go index f846748c..376da35b 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go @@ -28,7 +28,7 @@ import ( "k8s.io/apimachinery/pkg/types" ) -func (r *ReconcilePagerDutyIntegration) handleDelete(pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { +func (r *ReconcilePagerDutyIntegration) handleDelete(pdclient pd.Client, pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { var ( // secretName is the name of the Secret deployed to the target // cluster, and also the name of the SyncSet that causes it to @@ -119,7 +119,7 @@ func (r *ReconcilePagerDutyIntegration) handleDelete(pdi *pagerdutyv1alpha1.Page if deletePDService { // we have everything necessary to attempt deletion of the PD service - err = r.pdclient.DeleteService(pdData) + err = pdclient.DeleteService(pdData) if err != nil { r.reqLogger.Error(err, "Failed cleaning up pagerduty.") } else { diff --git a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go index 642a1b53..f6dd548d 100644 --- a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go +++ b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go @@ -52,29 +52,16 @@ var log = logf.Log.WithName("controller_pagerdutyintegration") // Add creates a new PagerDutyIntegration Controller and adds it to the Manager. The Manager will set fields on the Controller // and Start it when the Manager is Started. func Add(mgr manager.Manager) error { - newRec, err := newReconciler(mgr) - if err != nil { - return err - } - - return add(mgr, newRec) + return add(mgr, newReconciler(mgr)) } // newReconciler returns a new reconcile.Reconciler -func newReconciler(mgr manager.Manager) (reconcile.Reconciler, error) { - tempClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) - if err != nil { - return nil, err - } - - // get PD API key from secret - pdAPIKey, err := utils.LoadSecretData(tempClient, config.PagerDutyAPISecretName, config.OperatorNamespace, config.PagerDutyAPISecretKey) - +func newReconciler(mgr manager.Manager) reconcile.Reconciler { return &ReconcilePagerDutyIntegration{ client: utils.NewClientWithMetricsOrDie(log, mgr, controllerName), scheme: mgr.GetScheme(), - pdclient: pd.NewClient(pdAPIKey), - }, nil + pdclient: pd.NewClient, + } } // add adds a new Controller to mgr with r as the reconcile.Reconciler @@ -159,7 +146,7 @@ type ReconcilePagerDutyIntegration struct { client client.Client scheme *runtime.Scheme reqLogger logr.Logger - pdclient pd.Client + pdclient func(APIKey string) pd.Client } // Reconcile reads that state of the cluster for a PagerDutyIntegration object and makes changes based on the state read @@ -198,15 +185,31 @@ func (r *ReconcilePagerDutyIntegration) Reconcile(request reconcile.Request) (re return r.requeueOnErr(err) } + pdApiKey, err := utils.LoadSecretData( + r.client, + pdi.Spec.PagerdutyApiKeySecretRef.Name, + pdi.Spec.PagerdutyApiKeySecretRef.Namespace, + config.PagerDutyAPISecretKey, + ) + if err != nil { + r.reqLogger.Error(err, "Failed to load PagerDuty API key from Secret listed in PagerDutyIntegration CR") + localmetrics.UpdateMetricPagerDutyIntegrationSecretLoaded(0, pdi.Name) + return r.requeueAfter(10 * time.Minute) + } + localmetrics.UpdateMetricPagerDutyIntegrationSecretLoaded(1, pdi.Name) + pdClient := r.pdclient(pdApiKey) + if pdi.DeletionTimestamp != nil { if utils.HasFinalizer(pdi, config.OperatorFinalizer) { for _, cd := range matchingClusterDeployments.Items { - err := r.handleDelete(pdi, &cd) + err := r.handleDelete(pdClient, pdi, &cd) if err != nil { return r.requeueOnErr(err) } } + localmetrics.DeleteMetricPagerDutyIntegrationSecretLoaded(pdi.Name) + utils.DeleteFinalizer(pdi, config.OperatorFinalizer) err = r.client.Update(context.TODO(), pdi) if err != nil { @@ -226,12 +229,12 @@ func (r *ReconcilePagerDutyIntegration) Reconcile(request reconcile.Request) (re for _, cd := range matchingClusterDeployments.Items { if cd.DeletionTimestamp != nil || cd.Labels[config.ClusterDeploymentNoalertsLabel] == "true" { - err := r.handleDelete(pdi, &cd) + err := r.handleDelete(pdClient, pdi, &cd) if err != nil { return r.requeueOnErr(err) } } else { - err := r.handleCreate(pdi, &cd) + err := r.handleCreate(pdClient, pdi, &cd) if err != nil { return r.requeueOnErr(err) } @@ -259,3 +262,7 @@ func (r *ReconcilePagerDutyIntegration) doNotRequeue() (reconcile.Result, error) func (r *ReconcilePagerDutyIntegration) requeueOnErr(err error) (reconcile.Result, error) { return reconcile.Result{}, err } + +func (r *ReconcilePagerDutyIntegration) requeueAfter(t time.Duration) (reconcile.Result, error) { + return reconcile.Result{RequeueAfter: t}, nil +} diff --git a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go index ed5b7868..25619f3c 100644 --- a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go +++ b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go @@ -26,6 +26,7 @@ import ( pagerdutyapis "github.com/openshift/pagerduty-operator/pkg/apis" pagerdutyv1alpha1 "github.com/openshift/pagerduty-operator/pkg/apis/pagerduty/v1alpha1" "github.com/openshift/pagerduty-operator/pkg/kube" + pd "github.com/openshift/pagerduty-operator/pkg/pagerduty" mockpd "github.com/openshift/pagerduty-operator/pkg/pagerduty/mock" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" @@ -359,6 +360,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { localObjects: []runtime.Object{ uninstalledClusterDeployment(), testPagerDutyIntegration(), + testPDConfigSecret(), }, expectedSyncSets: &SyncSetEntry{}, expectedSecrets: &SecretEntry{}, @@ -426,6 +428,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { localObjects: []runtime.Object{ unlabelledClusterDeployment(), testPagerDutyIntegration(), + testPDConfigSecret(), }, expectedSyncSets: &SyncSetEntry{}, expectedSecrets: &SecretEntry{}, @@ -439,6 +442,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { localObjects: []runtime.Object{ testNoalertsClusterDeployment(), testPagerDutyIntegration(), + testPDConfigSecret(), }, expectedSyncSets: &SyncSetEntry{}, expectedSecrets: &SecretEntry{}, @@ -502,7 +506,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { rpdi := &ReconcilePagerDutyIntegration{ client: mocks.fakeKubeClient, scheme: scheme.Scheme, - pdclient: mocks.mockPDClient, + pdclient: func(s string) pd.Client { return mocks.mockPDClient }, } // Act [2x as first exits early after setting finalizer] @@ -556,7 +560,7 @@ func TestRemoveAlertsAfterCreate(t *testing.T) { rpdi := &ReconcilePagerDutyIntegration{ client: mocks.fakeKubeClient, scheme: scheme.Scheme, - pdclient: mocks.mockPDClient, + pdclient: func(s string) pd.Client { return mocks.mockPDClient }, } // Act (create) [2x as first exits early after setting finalizer] @@ -636,7 +640,7 @@ func TestDeleteSecret(t *testing.T) { rpdi := &ReconcilePagerDutyIntegration{ client: mocks.fakeKubeClient, scheme: scheme.Scheme, - pdclient: mocks.mockPDClient, + pdclient: func(s string) pd.Client { return mocks.mockPDClient }, } // Act (create) [2x as first exits early after setting finalizer] diff --git a/pkg/localmetrics/localmetrics.go b/pkg/localmetrics/localmetrics.go index 64113885..cc694f12 100644 --- a/pkg/localmetrics/localmetrics.go +++ b/pkg/localmetrics/localmetrics.go @@ -69,16 +69,23 @@ var ( Buckets: []float64{1}, }, []string{"controller", "method", "resource", "status"}) + MetricPagerDutyIntegrationSecretLoaded = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "pagerdutyintegration_secret_loaded", + Help: "Metric to track the ability to load the PagerDuty API key from the Secret specified in the PagerDutyIntegration", + ConstLabels: prometheus.Labels{"name": "pagerduty-operator"}, + }, []string{"pagerdutyintegration_name"}) + MetricsList = []prometheus.Collector{ MetricPagerDutyCreateFailure, MetricPagerDutyDeleteFailure, MetricPagerDutyHeartbeat, ApiCallDuration, ReconcileDuration, + MetricPagerDutyIntegrationSecretLoaded, } ) -// UpdateAPIMetrics updates all API endpoint metrics ever 5 minutes +// UpdateAPIMetrics updates all API endpoint metrics every 5 minutes func UpdateAPIMetrics(APIKey string, timer *prometheus.Timer) { d := time.Tick(5 * time.Minute) for range d { @@ -87,14 +94,33 @@ func UpdateAPIMetrics(APIKey string, timer *prometheus.Timer) { } -// UpdateMetricPagerDutyCreateFailure updates guage to 1 when creation fails +// UpdateMetricPagerDutyIntegrationSecretLoaded updates gauge to 1 +// when the PagerDuty API key can be loaded from the Secret specified +// in the PagerDutyIntegration, or to 0 if it fails +func UpdateMetricPagerDutyIntegrationSecretLoaded(x int, pdiName string) { + MetricPagerDutyIntegrationSecretLoaded.With( + prometheus.Labels{"pagerdutyintegration_name": pdiName}, + ).Set(float64(x)) +} + +// DeleteMetricPagerDutyIntegrationSecretLoaded deletes the metric for +// the PagerDutyIntegration name provided. This should be called when +// e.g. the PagerDutyIntegration is being deleted, so that there are +// no irrelevant metrics (which alerts could be firing on). +func DeleteMetricPagerDutyIntegrationSecretLoaded(pdiName string) bool { + return MetricPagerDutyIntegrationSecretLoaded.Delete( + prometheus.Labels{"pagerdutyintegration_name": pdiName}, + ) +} + +// UpdateMetricPagerDutyCreateFailure updates gauge to 1 when creation fails func UpdateMetricPagerDutyCreateFailure(x int, cd string) { MetricPagerDutyCreateFailure.With(prometheus.Labels{ "clusterdeployment_name": cd}).Set( float64(x)) } -// UpdateMetricPagerDutyDeleteFailure updates guage to 1 when deletion fails +// UpdateMetricPagerDutyDeleteFailure updates gauge to 1 when deletion fails func UpdateMetricPagerDutyDeleteFailure(x int, cd string) { MetricPagerDutyDeleteFailure.With(prometheus.Labels{ "clusterdeployment_name": cd}).Set( From 4710f366eee72ec7df66ab26acbdb5e33253c16d Mon Sep 17 00:00:00 2001 From: Gerard Ryan Date: Sat, 1 Aug 2020 21:38:36 +0100 Subject: [PATCH 2/3] Add pdi name label to fail metrics --- .../clusterdeployment_created.go | 4 ++-- .../clusterdeployment_deleted.go | 4 ++-- pkg/localmetrics/localmetrics.go | 18 ++++++++++-------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_created.go b/pkg/controller/pagerdutyintegration/clusterdeployment_created.go index e1760a5e..94021bb1 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_created.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_created.go @@ -98,11 +98,11 @@ func (r *ReconcilePagerDutyIntegration) handleCreate(pdclient pd.Client, pdi *pa var createErr error _, createErr = pdclient.CreateService(pdData) if createErr != nil { - localmetrics.UpdateMetricPagerDutyCreateFailure(1, ClusterID) + localmetrics.UpdateMetricPagerDutyCreateFailure(1, ClusterID, pdi.Name) return createErr } } - localmetrics.UpdateMetricPagerDutyCreateFailure(0, ClusterID) + localmetrics.UpdateMetricPagerDutyCreateFailure(0, ClusterID, pdi.Name) pdIntegrationKey, err = pdclient.GetIntegrationKey(pdData) if err != nil { diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go index 376da35b..298cb30f 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go @@ -155,11 +155,11 @@ func (r *ReconcilePagerDutyIntegration) handleDelete(pdclient pd.Client, pdi *pa utils.DeleteFinalizer(cd, finalizer) err = r.client.Update(context.TODO(), cd) if err != nil { - metrics.UpdateMetricPagerDutyDeleteFailure(1, ClusterID) + metrics.UpdateMetricPagerDutyDeleteFailure(1, ClusterID, pdi.Name) return err } } - metrics.UpdateMetricPagerDutyDeleteFailure(0, ClusterID) + metrics.UpdateMetricPagerDutyDeleteFailure(0, ClusterID, pdi.Name) return nil } diff --git a/pkg/localmetrics/localmetrics.go b/pkg/localmetrics/localmetrics.go index cc694f12..d76b6d4f 100644 --- a/pkg/localmetrics/localmetrics.go +++ b/pkg/localmetrics/localmetrics.go @@ -38,13 +38,13 @@ var ( Name: "pagerduty_create_failure", Help: "Metric for the failure of creating a cluster deployment.", ConstLabels: prometheus.Labels{"name": "pagerduty-operator"}, - }, []string{"clusterdeployment_name"}) + }, []string{"clusterdeployment_name", "pagerdutyintegration_name"}) MetricPagerDutyDeleteFailure = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "pagerduty_delete_failure", Help: "Metric for the failure of deleting a cluster deployment.", ConstLabels: prometheus.Labels{"name": "pagerduty-operator"}, - }, []string{"clusterdeployment_name"}) + }, []string{"clusterdeployment_name", "pagerdutyintegration_name"}) MetricPagerDutyHeartbeat = prometheus.NewSummary(prometheus.SummaryOpts{ Name: "pagerduty_heartbeat", @@ -114,17 +114,19 @@ func DeleteMetricPagerDutyIntegrationSecretLoaded(pdiName string) bool { } // UpdateMetricPagerDutyCreateFailure updates gauge to 1 when creation fails -func UpdateMetricPagerDutyCreateFailure(x int, cd string) { +func UpdateMetricPagerDutyCreateFailure(x int, cd string, pdiName string) { MetricPagerDutyCreateFailure.With(prometheus.Labels{ - "clusterdeployment_name": cd}).Set( - float64(x)) + "clusterdeployment_name": cd, + "pagerdutyintegration_name": pdiName, + }).Set(float64(x)) } // UpdateMetricPagerDutyDeleteFailure updates gauge to 1 when deletion fails -func UpdateMetricPagerDutyDeleteFailure(x int, cd string) { +func UpdateMetricPagerDutyDeleteFailure(x int, cd string, pdiName string) { MetricPagerDutyDeleteFailure.With(prometheus.Labels{ - "clusterdeployment_name": cd}).Set( - float64(x)) + "clusterdeployment_name": cd, + "pagerdutyintegration_name": pdiName, + }).Set(float64(x)) } // SetReconcileDuration Push the duration of the reconcile iteration From 94c0be8c70c91b3b2830dd15592beb30105487bf Mon Sep 17 00:00:00 2001 From: Gerard Ryan Date: Sat, 1 Aug 2020 22:16:46 +0100 Subject: [PATCH 3/3] Ensure deletion of old finalizer --- .../pagerdutyintegration/clusterdeployment_deleted.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go index 298cb30f..0db109ab 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go @@ -159,6 +159,17 @@ func (r *ReconcilePagerDutyIntegration) handleDelete(pdclient pd.Client, pdi *pa return err } } + + if utils.HasFinalizer(cd, config.OperatorFinalizer) { + r.reqLogger.Info("Deleting old PD finalizer from ClusterDeployment", "Namespace", cd.Namespace, "Name", cd.Name) + utils.DeleteFinalizer(cd, config.OperatorFinalizer) + err = r.client.Update(context.TODO(), cd) + if err != nil { + metrics.UpdateMetricPagerDutyDeleteFailure(1, ClusterID, pdi.Name) + return err + } + } + metrics.UpdateMetricPagerDutyDeleteFailure(0, ClusterID, pdi.Name) return nil