From a62bfd4aaad11b436b734a3c3412724e0f3a56cc Mon Sep 17 00:00:00 2001
From: Patryk Wasielewski
Date: Tue, 22 Jul 2025 14:15:19 +0200
Subject: [PATCH 1/6] initial draft of shc upgrade process

---
 api/v4/searchheadcluster_types.go             |  17 +
 ...erprise.splunk.com_searchheadclusters.yaml |   8 +
 .../controller/clustermanager_controller.go   |  11 +-
 .../controller/clustermaster_controller.go    |   3 +-
 .../controller/indexercluster_controller.go   |   3 +-
 .../controller/licensemanager_controller.go   |   3 +-
 .../controller/licensemaster_controller.go    |   3 +-
 internal/controller/metric.go                 |  54 ----
 .../monitoringconsole_controller.go           |   3 +-
 .../searchheadcluster_controller.go           |   3 +-
 internal/controller/standalone_controller.go  |   3 +-
 pkg/splunk/client/enterprise.go               |  22 ++
 pkg/splunk/client/metrics/metrics.go          | 117 +++++++
 pkg/splunk/common/types.go                    |   2 +
 pkg/splunk/enterprise/indexercluster.go       |   4 +
 pkg/splunk/enterprise/searchheadcluster.go    | 227 +-------------
 .../enterprise/searchheadclusterpodmanager.go | 295 ++++++++++++++++++
 pkg/splunk/splkcontroller/statefulset.go      |  14 +
 pkg/splunk/splkcontroller/statefulset_test.go |  10 +
 19 files changed, 513 insertions(+), 289 deletions(-)
 delete mode 100644 internal/controller/metric.go
 create mode 100644 pkg/splunk/client/metrics/metrics.go
 create mode 100644 pkg/splunk/enterprise/searchheadclusterpodmanager.go

diff --git a/api/v4/searchheadcluster_types.go b/api/v4/searchheadcluster_types.go
index e3a7585db..cabae7e17 100644
--- a/api/v4/searchheadcluster_types.go
+++ b/api/v4/searchheadcluster_types.go
@@ -50,6 +50,9 @@ type SearchHeadClusterSpec struct {

     // Splunk Deployer Node Affinity
     DeployerNodeAffinity *corev1.NodeAffinity `json:"deployerNodeAffinity,omitempty"`
+
+    // upgrade strategy for the search head cluster
+    // UpgradeStrategy UpgradeStrategy `json:"upgradeStrategy,omitempty"`
 }

 // SearchHeadClusterMemberStatus is used to track the status of each search head cluster member
@@ -128,8 +131,22 @@ type SearchHeadClusterStatus struct {

     // Auxillary message describing CR status
     Message string `json:"message"`
+
+    UpgradePhase UpgradePhase `json:"upgradePhase"`
+
+    UpgradeStartTimestamp int64 `json:"upgradeStartTimestamp"`
+
+    UpgradeEndTimestamp int64 `json:"upgradeEndTimestamp"`
+
 }

+type UpgradePhase string
+
+const (
+    UpgradePhaseUpgrading UpgradePhase = "Upgrading"
+    UpgradePhaseUpgraded UpgradePhase = "Upgraded"
+)
+
 // SearchHeadCluster is the Schema for a Splunk Enterprise search head cluster
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
 // +k8s:openapi-gen=true
diff --git a/config/crd/bases/enterprise.splunk.com_searchheadclusters.yaml b/config/crd/bases/enterprise.splunk.com_searchheadclusters.yaml
index 2e8e8695f..adfde431a 100644
--- a/config/crd/bases/enterprise.splunk.com_searchheadclusters.yaml
+++ b/config/crd/bases/enterprise.splunk.com_searchheadclusters.yaml
@@ -9470,6 +9470,14 @@ spec:
             telAppInstalled:
               description: Telemetry App installation flag
               type: boolean
+            upgradeEndTimestamp:
+              format: int64
+              type: integer
+            upgradePhase:
+              type: string
+            upgradeStartTimestamp:
+              format: int64
+              type: integer
           type: object
         type: object
     served: true
diff --git a/internal/controller/clustermanager_controller.go b/internal/controller/clustermanager_controller.go
index a75ea7a26..692c7366c 100644
--- a/internal/controller/clustermanager_controller.go
+++ b/internal/controller/clustermanager_controller.go
@@ -25,6 +25,7 @@ import (

     "github.com/pkg/errors"
     enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -71,7 +72,7 @@ type ClusterManagerReconciler struct { // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *ClusterManagerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // your logic here - reconcileCounters.With(getPrometheusLabels(req, "ClusterManager")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "ClusterManager")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "ClusterManager") reqLogger := log.FromContext(ctx) @@ -160,9 +161,9 @@ func (r *ClusterManagerReconciler) SetupWithManager(mgr ctrl.Manager) error { // recordInstrumentionData Record api profiling information to prometheus func recordInstrumentionData(start time.Time, req ctrl.Request, module string, name string) { - metricLabels := getPrometheusLabels(req, name) - metricLabels[labelModuleName] = module - metricLabels[labelMethodName] = name + metricLabels := metrics.GetPrometheusLabels(req, name) + metricLabels[metrics.LabelModuleName] = module + metricLabels[metrics.LabelMethodName] = name value := float64(time.Since(start) / time.Millisecond) - apiTotalTimeMetricEvents.With(metricLabels).Set(value) + metrics.ApiTotalTimeMetricEvents.With(metricLabels).Set(value) } diff --git a/internal/controller/clustermaster_controller.go b/internal/controller/clustermaster_controller.go index b967f5921..ef162cbc9 100644 --- a/internal/controller/clustermaster_controller.go +++ b/internal/controller/clustermaster_controller.go @@ -26,6 +26,7 @@ import ( "github.com/pkg/errors" enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -72,7 +73,7 @@ type ClusterMasterReconciler struct { // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *ClusterMasterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { // your logic here - reconcileCounters.With(getPrometheusLabels(req, "ClusterMaster")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "ClusterMaster")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "ClusterMaster") reqLogger := log.FromContext(ctx) diff --git a/internal/controller/indexercluster_controller.go b/internal/controller/indexercluster_controller.go index b458a02e8..027e28fab 100644 --- a/internal/controller/indexercluster_controller.go +++ b/internal/controller/indexercluster_controller.go @@ -26,6 +26,7 @@ import ( "github.com/pkg/errors" enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -71,7 +72,7 @@ type IndexerClusterReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *IndexerClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - reconcileCounters.With(getPrometheusLabels(req, 
"IndexerCluster")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "IndexerCluster")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "IndexerCluster") reqLogger := log.FromContext(ctx) diff --git a/internal/controller/licensemanager_controller.go b/internal/controller/licensemanager_controller.go index 8e7f72f90..aa162cf48 100644 --- a/internal/controller/licensemanager_controller.go +++ b/internal/controller/licensemanager_controller.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -70,7 +71,7 @@ type LicenseManagerReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *LicenseManagerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - reconcileCounters.With(getPrometheusLabels(req, "LicenseManager")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "LicenseManager")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "LicenseManager") reqLogger := log.FromContext(ctx) diff --git a/internal/controller/licensemaster_controller.go b/internal/controller/licensemaster_controller.go index 3c21aaf70..f87c2dd4a 100644 --- a/internal/controller/licensemaster_controller.go +++ b/internal/controller/licensemaster_controller.go @@ -26,6 +26,7 @@ import ( enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterpriseApi "github.com/splunk/splunk-operator/api/v4" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -71,7 +72,7 @@ type LicenseMasterReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *LicenseMasterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - reconcileCounters.With(getPrometheusLabels(req, "LicenseMaster")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "LicenseMaster")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "LicenseMaster") reqLogger := log.FromContext(ctx) diff --git a/internal/controller/metric.go b/internal/controller/metric.go deleted file mode 100644 index c5f5a25cb..000000000 --- a/internal/controller/metric.go +++ /dev/null @@ -1,54 +0,0 @@ -package controller - -import ( - "github.com/prometheus/client_golang/prometheus" - "sigs.k8s.io/controller-runtime/pkg/metrics" - "sigs.k8s.io/controller-runtime/pkg/reconcile" -) - -const ( - labelNamespace = "namespace" - labelName = "name" - labelKind = "kind" - labelErrorType = "error_type" - labelMethodName = "api" - labelModuleName = "module" - labelResourceVersion = "resource_version" -) - -var reconcileCounters = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "splunk_operator_reconcile_total", - Help: "The number of times reconciled by this controller", -}, []string{labelNamespace, labelName, labelKind}) - -var reconcileErrorCounter = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "splunk_operator_reconcile_error_total", - Help: "The number of times 
the operator has failed to reconcile", -}) - -var actionFailureCounters = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "splunk_operator_error_total", - Help: "The number of times operator has entered an error state", -}, []string{labelErrorType}) - -var apiTotalTimeMetricEvents = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "splunk_operator_module_duration_in_milliseconds", - Help: "The time it takes to complete each call in standalone (in milliseconds)", -}, []string{labelNamespace, labelName, labelKind, labelModuleName, labelMethodName}) - -func getPrometheusLabels(request reconcile.Request, kind string) prometheus.Labels { - return prometheus.Labels{ - labelNamespace: request.Namespace, - labelName: request.Name, - labelKind: kind, - } -} - -func init() { - metrics.Registry.MustRegister( - reconcileCounters, - reconcileErrorCounter, - actionFailureCounters, - apiTotalTimeMetricEvents, - ) -} diff --git a/internal/controller/monitoringconsole_controller.go b/internal/controller/monitoringconsole_controller.go index e86c2e8ff..2040d8d14 100644 --- a/internal/controller/monitoringconsole_controller.go +++ b/internal/controller/monitoringconsole_controller.go @@ -26,6 +26,7 @@ import ( "github.com/pkg/errors" enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -71,7 +72,7 @@ type MonitoringConsoleReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *MonitoringConsoleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - reconcileCounters.With(getPrometheusLabels(req, "MonitoringConsole")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "MonitoringConsole")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "MonitoringConsole") reqLogger := log.FromContext(ctx) reqLogger = reqLogger.WithValues("monitoringconsole", req.NamespacedName) diff --git a/internal/controller/searchheadcluster_controller.go b/internal/controller/searchheadcluster_controller.go index a7c5283c9..509f8bf86 100644 --- a/internal/controller/searchheadcluster_controller.go +++ b/internal/controller/searchheadcluster_controller.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -70,7 +71,7 @@ type SearchHeadClusterReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *SearchHeadClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - reconcileCounters.With(getPrometheusLabels(req, "SearchHeadCluster")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "SearchHeadCluster")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "SearchHeadCluster") reqLogger := log.FromContext(ctx) diff --git a/internal/controller/standalone_controller.go b/internal/controller/standalone_controller.go index e3ce23463..b26bc6e70 100644 --- 
a/internal/controller/standalone_controller.go +++ b/internal/controller/standalone_controller.go @@ -31,6 +31,7 @@ import ( "github.com/pkg/errors" enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" k8serrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" @@ -75,7 +76,7 @@ type StandaloneReconciler struct { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.10.0/pkg/reconcile func (r *StandaloneReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - reconcileCounters.With(getPrometheusLabels(req, "Standalone")).Inc() + metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "Standalone")).Inc() defer recordInstrumentionData(time.Now(), req, "controller", "Standalone") reqLogger := log.FromContext(ctx) diff --git a/pkg/splunk/client/enterprise.go b/pkg/splunk/client/enterprise.go index f5382bf15..e1a8d5dc5 100644 --- a/pkg/splunk/client/enterprise.go +++ b/pkg/splunk/client/enterprise.go @@ -307,6 +307,28 @@ func (c *SplunkClient) SetSearchHeadDetention(detain bool) error { return c.Do(request, expectedStatus, nil) } +// InitiateUpgrade set upgrade banner for a search head cluster +func (c *SplunkClient) InitiateUpgrade() error { + endpoint := fmt.Sprintf("%s/services/shcluster/captain/control/control/upgrade-init", c.ManagementURI) + request, err := http.NewRequest("POST", endpoint, nil) + if err != nil { + return err + } + expectedStatus := []int{200} + return c.Do(request, expectedStatus, nil) +} + +// UpgradeFinalize unset upgrade banner for a search head cluster captain +func (c *SplunkClient) UpgradeFinalize(captainURI string) error { + endpoint := fmt.Sprintf("%s/services/shcluster/captain/control/control/upgrade-finalize", captainURI) + request, err := http.NewRequest("POST", endpoint, nil) + if err != nil { + return err + } + expectedStatus := []int{200} + return c.Do(request, expectedStatus, nil) +} + // RemoveSearchHeadClusterMember removes a search head cluster member. // You can use this on any member of a search head cluster. 
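Note on the two client methods above: they wrap the SHC captain's upgrade control endpoints. As a rough orientation, a minimal standalone sketch of the equivalent REST calls using only the standard library follows — the pod address, admin credentials, and the skipped TLS verification are illustrative assumptions for the sketch, not part of the patch:

package main

import (
    "crypto/tls"
    "fmt"
    "net/http"
)

// postControl issues the same kind of request that SplunkClient.Do performs for
// InitiateUpgrade and UpgradeFinalize: an authenticated POST that must return 200.
func postControl(baseURI, path, user, pass string) error {
    // Assumption: Splunk management ports in these deployments typically use
    // self-signed certificates, so the sketch skips verification.
    client := &http.Client{Transport: &http.Transport{
        TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
    }}
    req, err := http.NewRequest("POST", baseURI+path, nil)
    if err != nil {
        return err
    }
    req.SetBasicAuth(user, pass)
    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != 200 {
        return fmt.Errorf("expected status 200, got %d", resp.StatusCode)
    }
    return nil
}

func main() {
    captain := "https://splunk-stack1-search-head-0:8089" // placeholder address
    // Mark the start of a rolling upgrade on the captain...
    if err := postControl(captain, "/services/shcluster/captain/control/control/upgrade-init", "admin", "changeme"); err != nil {
        fmt.Println("upgrade-init failed:", err)
    }
    // ...and clear the banner once every member is back up.
    if err := postControl(captain, "/services/shcluster/captain/control/control/upgrade-finalize", "admin", "changeme"); err != nil {
        fmt.Println("upgrade-finalize failed:", err)
    }
}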
diff --git a/pkg/splunk/client/metrics/metrics.go b/pkg/splunk/client/metrics/metrics.go
new file mode 100644
index 000000000..bb0bfd080
--- /dev/null
+++ b/pkg/splunk/client/metrics/metrics.go
@@ -0,0 +1,117 @@
+package metrics
+
+import (
+    "time"
+
+    "github.com/prometheus/client_golang/prometheus"
+    "sigs.k8s.io/controller-runtime/pkg/metrics"
+    "sigs.k8s.io/controller-runtime/pkg/reconcile"
+)
+
+const (
+    LabelNamespace       = "namespace"
+    LabelName            = "name"
+    LabelKind            = "kind"
+    LabelErrorType       = "error_type"
+    LabelMethodName      = "api"
+    LabelModuleName      = "module"
+    LabelResourceVersion = "resource_version"
+)
+
+var (
+    upgradeStartTimestamp int64
+    upgradeEndTimestamp   int64
+)
+
+var ReconcileCounters = prometheus.NewCounterVec(prometheus.CounterOpts{
+    Name: "splunk_operator_reconcile_total",
+    Help: "The number of times reconciled by this controller",
+}, []string{LabelNamespace, LabelName, LabelKind})
+
+var ReconcileErrorCounter = prometheus.NewCounter(prometheus.CounterOpts{
+    Name: "splunk_operator_reconcile_error_total",
+    Help: "The number of times the operator has failed to reconcile",
+})
+
+var ActionFailureCounters = prometheus.NewCounterVec(prometheus.CounterOpts{
+    Name: "splunk_operator_error_total",
+    Help: "The number of times operator has entered an error state",
+}, []string{LabelErrorType})
+
+var ApiTotalTimeMetricEvents = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+    Name: "splunk_operator_module_duration_in_milliseconds",
+    Help: "The time it takes to complete each call in standalone (in milliseconds)",
+}, []string{LabelNamespace, LabelName, LabelKind, LabelModuleName, LabelMethodName})
+
+var (
+    UpgradeStartTime = prometheus.NewGauge(prometheus.GaugeOpts{
+        Name: "splunk_upgrade_start_time",
+        Help: "Unix timestamp when the SHC upgrade started",
+    })
+    UpgradeEndTime = prometheus.NewGauge(prometheus.GaugeOpts{
+        Name: "splunk_upgrade_end_time",
+        Help: "Unix timestamp when the SHC upgrade ended",
+    },
+    )
+    ShortSearchSuccessCounter = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Name: "splunk_short_search_success_total",
+            Help: "Total number of successful short searches per search head",
+        },
+        []string{"sh_name"},
+    )
+    ShortSearchFailureCounter = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Name: "splunk_short_search_failure_total",
+            Help: "Total number of failed short searches per search head",
+        },
+        []string{"sh_name"},
+    )
+    TotalSearchSuccessCounter = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Name: "splunk_total_search_success_total",
+            Help: "Total number of successful total searches per search head",
+        },
+        []string{"sh_name"},
+    )
+    TotalSearchFailureCounter = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Name: "splunk_total_search_failure_total",
+            Help: "Total number of failed total searches per search head",
+        },
+        []string{"sh_name"},
+    )
+)
+
+func GetPrometheusLabels(request reconcile.Request, kind string) prometheus.Labels {
+    return prometheus.Labels{
+        LabelNamespace: request.Namespace,
+        LabelName:      request.Name,
+        LabelKind:      kind,
+    }
+}
+
+func RecordUpgradeStartTime() {
+    upgradeStartTimestamp = time.Now().Unix()
+    UpgradeStartTime.Set(float64(upgradeStartTimestamp))
+}
+
+func RecordUpgradeEndTime() {
+    upgradeEndTimestamp = time.Now().Unix()
+    UpgradeEndTime.Set(float64(upgradeEndTimestamp))
+}
+
+func init() {
+    metrics.Registry.MustRegister(
+        ReconcileCounters,
+        ReconcileErrorCounter,
+        ActionFailureCounters,
+        ApiTotalTimeMetricEvents,
+        UpgradeStartTime,
+        UpgradeEndTime,
+        ShortSearchSuccessCounter,
+        ShortSearchFailureCounter,
+        TotalSearchSuccessCounter,
+        TotalSearchFailureCounter,
+    )
+}
diff --git a/pkg/splunk/common/types.go b/pkg/splunk/common/types.go
index f771623de..5de7aed85 100644
--- a/pkg/splunk/common/types.go
+++ b/pkg/splunk/common/types.go
@@ -53,4 +53,6 @@ type StatefulSetPodManager interface {

     // FinishRecycle completes recycle event for pod and returns true, or returns false if nothing to do
     FinishRecycle(context.Context, int32) (bool, error)
+
+    FinishUpgrade(context.Context, int32) (error)
 }
diff --git a/pkg/splunk/enterprise/indexercluster.go b/pkg/splunk/enterprise/indexercluster.go
index d2d7962ce..24db53087 100644
--- a/pkg/splunk/enterprise/indexercluster.go
+++ b/pkg/splunk/enterprise/indexercluster.go
@@ -840,6 +840,10 @@ func (mgr *indexerClusterPodManager) PrepareRecycle(ctx context.Context, n int32
     return mgr.decommission(ctx, n, false)
 }

+func (mgr *indexerClusterPodManager) FinishUpgrade(ctx context.Context, n int32) (error) {
+    return nil
+}
+
 // FinishRecycle for indexerClusterPodManager completes recycle event for indexer pod; it returns true when complete
 func (mgr *indexerClusterPodManager) FinishRecycle(ctx context.Context, n int32) (bool, error) {
     if n >= int32(len(mgr.cr.Status.Peers)) {
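Note on the new metrics package: it exports its collectors so all controllers share one registration path, and RecordUpgradeStartTime/RecordUpgradeEndTime stamp the gauges with Unix timestamps. A condensed sketch of the same pattern follows; the real package registers with controller-runtime's metrics.Registry in init(), so the private registry and the label values below are assumptions made only to keep the example self-contained:

package main

import (
    "fmt"
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

var upgradeStart = prometheus.NewGauge(prometheus.GaugeOpts{
    Name: "splunk_upgrade_start_time",
    Help: "Unix timestamp when the SHC upgrade started",
})

var reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
    Name: "splunk_operator_reconcile_total",
    Help: "The number of times reconciled by this controller",
}, []string{"namespace", "name", "kind"})

func main() {
    // Stand-in for metrics.Registry.MustRegister in the package's init().
    reg := prometheus.NewRegistry()
    reg.MustRegister(upgradeStart, reconcileTotal)

    // Equivalent of RecordUpgradeStartTime: stamp the gauge with "now".
    upgradeStart.Set(float64(time.Now().Unix()))

    // Equivalent of ReconcileCounters.With(GetPrometheusLabels(req, kind)).Inc().
    reconcileTotal.With(prometheus.Labels{
        "namespace": "default",
        "name":      "stack1",
        "kind":      "SearchHeadCluster",
    }).Inc()

    // Gather and print the registered metric family names.
    mfs, _ := reg.Gather()
    for _, mf := range mfs {
        fmt.Println(mf.GetName())
    }
}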
diff --git a/pkg/splunk/enterprise/searchheadcluster.go b/pkg/splunk/enterprise/searchheadcluster.go
index 9e9227847..056ad8f96 100644
--- a/pkg/splunk/enterprise/searchheadcluster.go
+++ b/pkg/splunk/enterprise/searchheadcluster.go
@@ -25,8 +25,6 @@ import (

     enterpriseApi "github.com/splunk/splunk-operator/api/v4"

-    "github.com/go-logr/logr"
-
     splclient "github.com/splunk/splunk-operator/pkg/splunk/client"
     splcommon "github.com/splunk/splunk-operator/pkg/splunk/common"
     splctrl "github.com/splunk/splunk-operator/pkg/splunk/splkcontroller"
@@ -198,7 +196,11 @@ func ApplySearchHeadCluster(ctx context.Context, client splcommon.ControllerClie
     }

     mgr := newSearchHeadClusterPodManager(client, scopedLog, cr, namespaceScopedSecret, splclient.NewSplunkClient)
+
+    // handle SHC upgrade process
     phase, err = mgr.Update(ctx, client, statefulSet, cr.Spec.Replicas)
+
+// phase, err = mgr.ReconcileUpgrade(ctx, client, statefulSet, cr.Spec.Replicas)
     if err != nil {
         return result, err
     }
@@ -255,26 +257,6 @@ func ApplySearchHeadCluster(ctx context.Context, client splcommon.ControllerClie
     return result, nil
 }

-// searchHeadClusterPodManager is used to manage the pods within a search head cluster
-type searchHeadClusterPodManager struct {
-    c               splcommon.ControllerClient
-    log             logr.Logger
-    cr              *enterpriseApi.SearchHeadCluster
-    secrets         *corev1.Secret
-    newSplunkClient func(managementURI, username, password string) *splclient.SplunkClient
-}
-
-// newSerachHeadClusterPodManager function to create pod manager this is added to write unit test case
-var newSearchHeadClusterPodManager = func(client splcommon.ControllerClient, log logr.Logger, cr *enterpriseApi.SearchHeadCluster, secret *corev1.Secret, newSplunkClient NewSplunkClientFunc) searchHeadClusterPodManager {
-    return searchHeadClusterPodManager{
-        log:             log,
-        cr:              cr,
-        secrets:         secret,
-        newSplunkClient: newSplunkClient,
-        c:               client,
-    }
-}
-
 // ApplyShcSecret checks if any of the search heads have a different shc_secret from namespace scoped secret and changes it
 func ApplyShcSecret(ctx context.Context, mgr *searchHeadClusterPodManager, replicas int32, podExecClient splutil.PodExecClientImpl) error {
     // Get namespace scoped secret
@@ -437,207 +419,6 @@ func ApplyShcSecret(ctx context.Context, mgr *searchHeadClusterPodManager, repli
     return nil
 }

-// Update for searchHeadClusterPodManager handles all updates for a statefulset of search heads
-func (mgr *searchHeadClusterPodManager) Update(ctx context.Context, c splcommon.ControllerClient, statefulSet *appsv1.StatefulSet, desiredReplicas int32) (enterpriseApi.Phase, error) {
-    // Assign client
-    if mgr.c == nil {
-        mgr.c = c
-    }
-
-    // update statefulset, if necessary
-    _, err := splctrl.ApplyStatefulSet(ctx, mgr.c, statefulSet)
-    if err != nil {
-        return enterpriseApi.PhaseError, err
-    }
-
-    // for now pass the targetPodName as empty since we are going to fill it in ApplyShcSecret
-    podExecClient := splutil.GetPodExecClient(mgr.c, mgr.cr, "")
-
-    // Check if a recycle of shc pods is necessary(due to shc_secret mismatch with namespace scoped secret)
-    err = ApplyShcSecret(ctx, mgr, desiredReplicas, podExecClient)
-    if err != nil {
-        return enterpriseApi.PhaseError, err
-    }
-
-    // update CR status with SHC information
-    err = mgr.updateStatus(ctx, statefulSet)
-    if err != nil || mgr.cr.Status.ReadyReplicas == 0 || !mgr.cr.Status.Initialized || !mgr.cr.Status.CaptainReady {
-        mgr.log.Info("Search head cluster is not ready", "reason ", err)
-        return enterpriseApi.PhasePending, nil
-    }
-
-    // manage scaling and updates
-    return splctrl.UpdateStatefulSetPods(ctx, mgr.c, statefulSet, mgr, desiredReplicas)
-}
-
-// PrepareScaleDown for searchHeadClusterPodManager prepares search head pod to be removed via scale down event; it returns true when ready
-func (mgr *searchHeadClusterPodManager) PrepareScaleDown(ctx context.Context, n int32) (bool, error) {
-    // start by quarantining the pod
-    result, err := mgr.PrepareRecycle(ctx, n)
-    if err != nil || !result {
-        return result, err
-    }
-
-    // pod is quarantined; decommission it
-    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
-    mgr.log.Info("Removing member from search head cluster", "memberName", memberName)
-    c := mgr.getClient(ctx, n)
-    err = c.RemoveSearchHeadClusterMember()
-    if err != nil {
-        return false, err
-    }
-
-    // all done -> ok to scale down the statefulset
-    return true, nil
-}
-
-// PrepareRecycle for searchHeadClusterPodManager prepares search head pod to be recycled for updates; it returns true when ready
-func (mgr *searchHeadClusterPodManager) PrepareRecycle(ctx context.Context, n int32) (bool, error) {
-    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
-
-    switch mgr.cr.Status.Members[n].Status {
-    case "Up":
-        // Detain search head
-        mgr.log.Info("Detaining search head cluster member", "memberName", memberName)
-        c := mgr.getClient(ctx, n)
-        podExecClient := splutil.GetPodExecClient(mgr.c, mgr.cr, getApplicablePodNameForK8Probes(mgr.cr, n))
-        err := setProbeLevelOnSplunkPod(ctx, podExecClient, livenessProbeLevelOne)
-        if err != nil {
-            // During the Recycle, our reconcile loop is entered multiple times. If the Pod is already down,
-            // there is a chance of readiness probe failing, in which case, even the podExec will not be successful.
-            // So, just log the message, and ignore the error.
-            mgr.log.Info("Setting Probe level failed. Probably, the Pod is already down", "memberName", memberName)
-        }
-
-        return false, c.SetSearchHeadDetention(true)
-
-    case "ManualDetention":
-        // Wait until active searches have drained
-        searchesComplete := mgr.cr.Status.Members[n].ActiveHistoricalSearchCount+mgr.cr.Status.Members[n].ActiveRealtimeSearchCount == 0
-        if searchesComplete {
-            mgr.log.Info("Detention complete", "memberName", memberName)
-        } else {
-            mgr.log.Info("Waiting for active searches to complete", "memberName", memberName)
-        }
-        return searchesComplete, nil
-
-    case "": // this can happen after the member has already been recycled and we're just waiting for state to update
-        mgr.log.Info("Member has empty Status", "memberName", memberName)
-        return false, nil
-    }
-
-    // unhandled status
-    return false, fmt.Errorf("Status=%s", mgr.cr.Status.Members[n].Status)
-}
-
-// FinishRecycle for searchHeadClusterPodManager completes recycle event for search head pod; it returns true when complete
-func (mgr *searchHeadClusterPodManager) FinishRecycle(ctx context.Context, n int32) (bool, error) {
-    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
-
-    switch mgr.cr.Status.Members[n].Status {
-    case "Up":
-        // not in detention
-        return true, nil
-
-    case "ManualDetention":
-        // release from detention
-        mgr.log.Info("Releasing search head cluster member from detention", "memberName", memberName)
-        c := mgr.getClient(ctx, n)
-        return false, c.SetSearchHeadDetention(false)
-    }
-
-    // unhandled status
-    return false, fmt.Errorf("Status=%s", mgr.cr.Status.Members[n].Status)
-}
-
-// getClient for searchHeadClusterPodManager returns a SplunkClient for the member n
-func (mgr *searchHeadClusterPodManager) getClient(ctx context.Context, n int32) *splclient.SplunkClient {
-    reqLogger := log.FromContext(ctx)
-    scopedLog := reqLogger.WithName("searchHeadClusterPodManager.getClient").WithValues("name", mgr.cr.GetName(), "namespace", mgr.cr.GetNamespace())
-
-    // Get Pod Name
-    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
-
-    // Get Fully Qualified Domain Name
-    fqdnName := splcommon.GetServiceFQDN(mgr.cr.GetNamespace(),
-        fmt.Sprintf("%s.%s", memberName, GetSplunkServiceName(SplunkSearchHead, mgr.cr.GetName(), true)))
-
-    // Retrieve admin password from Pod
-    adminPwd, err := splutil.GetSpecificSecretTokenFromPod(ctx, mgr.c, memberName, mgr.cr.GetNamespace(), "password")
-    if err != nil {
-        scopedLog.Error(err, "Couldn't retrieve the admin password from Pod")
-    }
-
-    return mgr.newSplunkClient(fmt.Sprintf("https://%s:8089", fqdnName), "admin", adminPwd)
-}
-
-// GetSearchHeadClusterMemberInfo used in mocking this function
-var GetSearchHeadClusterMemberInfo = func(ctx context.Context, mgr *searchHeadClusterPodManager, n int32) (*splclient.SearchHeadClusterMemberInfo, error) {
-    c := mgr.getClient(ctx, n)
-    return c.GetSearchHeadClusterMemberInfo()
-}
-
-// GetSearchHeadCaptainInfo used in mocking this function
-var GetSearchHeadCaptainInfo = func(ctx context.Context, mgr *searchHeadClusterPodManager, n int32) (*splclient.SearchHeadCaptainInfo, error) {
-    c := mgr.getClient(ctx, n)
-    return c.GetSearchHeadCaptainInfo()
-}
-
-// updateStatus for searchHeadClusterPodManager uses the REST API to update the status for a SearcHead custom resource
-func (mgr *searchHeadClusterPodManager) updateStatus(ctx context.Context, statefulSet *appsv1.StatefulSet) error {
-    // populate members status using REST API to get search head cluster member info
-    mgr.cr.Status.Captain = ""
-    mgr.cr.Status.CaptainReady = false
-    mgr.cr.Status.ReadyReplicas = statefulSet.Status.ReadyReplicas
-    if mgr.cr.Status.ReadyReplicas == 0 {
-        return nil
-    }
-    gotCaptainInfo := false
-    for n := int32(0); n < statefulSet.Status.Replicas; n++ {
-        memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
-        memberStatus := enterpriseApi.SearchHeadClusterMemberStatus{Name: memberName}
-        memberInfo, err := GetSearchHeadClusterMemberInfo(ctx, mgr, n)
-        if err == nil {
-            memberStatus.Status = memberInfo.Status
-            memberStatus.Adhoc = memberInfo.Adhoc
-            memberStatus.Registered = memberInfo.Registered
-            memberStatus.ActiveHistoricalSearchCount = memberInfo.ActiveHistoricalSearchCount
-            memberStatus.ActiveRealtimeSearchCount = memberInfo.ActiveRealtimeSearchCount
-        } else {
-            mgr.log.Error(err, "Unable to retrieve search head cluster member info", "memberName", memberName)
-        }
-
-        if err == nil && !gotCaptainInfo {
-            // try querying captain api; note that this should work on any node
-            captainInfo, err := GetSearchHeadCaptainInfo(ctx, mgr, n)
-            if err == nil {
-                mgr.cr.Status.Captain = captainInfo.Label
-                mgr.cr.Status.CaptainReady = captainInfo.ServiceReady
-                mgr.cr.Status.Initialized = captainInfo.Initialized
-                mgr.cr.Status.MinPeersJoined = captainInfo.MinPeersJoined
-                mgr.cr.Status.MaintenanceMode = captainInfo.MaintenanceMode
-                gotCaptainInfo = true
-            } else {
-                mgr.cr.Status.CaptainReady = false
-                mgr.log.Error(err, "Unable to retrieve captain info", "memberName", memberName)
-            }
-        }
-
-        if n < int32(len(mgr.cr.Status.Members)) {
-            mgr.cr.Status.Members[n] = memberStatus
-        } else {
-            mgr.cr.Status.Members = append(mgr.cr.Status.Members, memberStatus)
-        }
-    }
-
-    // truncate any extra members that we didn't check (leftover from scale down)
-    if statefulSet.Status.Replicas < int32(len(mgr.cr.Status.Members)) {
-        mgr.cr.Status.Members = mgr.cr.Status.Members[:statefulSet.Status.Replicas]
-    }
-
-    return nil
-}
-
 // getSearchHeadStatefulSet returns a Kubernetes StatefulSet object for Splunk Enterprise search heads.
 func getSearchHeadStatefulSet(ctx context.Context, client splcommon.ControllerClient, cr *enterpriseApi.SearchHeadCluster) (*appsv1.StatefulSet, error) {
diff --git a/pkg/splunk/enterprise/searchheadclusterpodmanager.go b/pkg/splunk/enterprise/searchheadclusterpodmanager.go
new file mode 100644
index 000000000..6216964ed
--- /dev/null
+++ b/pkg/splunk/enterprise/searchheadclusterpodmanager.go
@@ -0,0 +1,295 @@
+package enterprise
+
+import (
+    "context"
+    "fmt"
+    "time"
+
+    "github.com/go-logr/logr"
+    enterpriseApi "github.com/splunk/splunk-operator/api/v4"
+    splclient "github.com/splunk/splunk-operator/pkg/splunk/client"
+    metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    splcommon "github.com/splunk/splunk-operator/pkg/splunk/common"
+    splctrl "github.com/splunk/splunk-operator/pkg/splunk/splkcontroller"
+    splutil "github.com/splunk/splunk-operator/pkg/splunk/util"
+    appsv1 "k8s.io/api/apps/v1"
+    corev1 "k8s.io/api/core/v1"
+    "sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+// searchHeadClusterPodManager is used to manage the pods within a search head cluster
+type searchHeadClusterPodManager struct {
+    c               splcommon.ControllerClient
+    log             logr.Logger
+    cr              *enterpriseApi.SearchHeadCluster
+    secrets         *corev1.Secret
+    newSplunkClient func(managementURI, username, password string) *splclient.SplunkClient
+}
+
+// newSerachHeadClusterPodManager function to create pod manager this is added to write unit test case
+var newSearchHeadClusterPodManager = func(client splcommon.ControllerClient, log logr.Logger, cr *enterpriseApi.SearchHeadCluster, secret *corev1.Secret, newSplunkClient NewSplunkClientFunc) searchHeadClusterPodManager {
+    return searchHeadClusterPodManager{
+        log:             log,
+        cr:              cr,
+        secrets:         secret,
+        newSplunkClient: newSplunkClient,
+        c:               client,
+    }
+}
+
+// Update for searchHeadClusterPodManager handles all updates for a statefulset of search heads
+func (mgr *searchHeadClusterPodManager) Update(ctx context.Context, c splcommon.ControllerClient, statefulSet *appsv1.StatefulSet, desiredReplicas int32) (enterpriseApi.Phase, error) {
+    // Assign client
+    if mgr.c == nil {
+        mgr.c = c
+    }
+
+    // update statefulset, if necessary
+    _, err := splctrl.ApplyStatefulSet(ctx, mgr.c, statefulSet)
+    if err != nil {
+        return enterpriseApi.PhaseError, err
+    }
+
+    // for now pass the targetPodName as empty since we are going to fill it in ApplyShcSecret
+    podExecClient := splutil.GetPodExecClient(mgr.c, mgr.cr, "")
+
+    // Check if a recycle of shc pods is necessary(due to shc_secret mismatch with namespace scoped secret)
+    err = ApplyShcSecret(ctx, mgr, desiredReplicas, podExecClient)
+    if err != nil {
+        return enterpriseApi.PhaseError, err
+    }
+
+    // update CR status with SHC information
+    err = mgr.updateStatus(ctx, statefulSet)
+    if err != nil || mgr.cr.Status.ReadyReplicas == 0 || !mgr.cr.Status.Initialized || !mgr.cr.Status.CaptainReady {
+        mgr.log.Info("Search head cluster is not ready", "reason ", err)
+        return enterpriseApi.PhasePending, nil
+    }
+
+    // manage scaling and updates
+    return splctrl.UpdateStatefulSetPods(ctx, mgr.c, statefulSet, mgr, desiredReplicas)
+}
+
+// PrepareScaleDown for searchHeadClusterPodManager prepares search head pod to be removed via scale down event; it returns true when ready
+func (mgr *searchHeadClusterPodManager) PrepareScaleDown(ctx context.Context, n int32) (bool, error) {
+    // start by quarantining the pod
+    result, err := mgr.PrepareRecycle(ctx, n)
+    if err != nil || !result {
+        return result, err
+    }
+
+    // pod is quarantined; decommission it
+    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
+    mgr.log.Info("Removing member from search head cluster", "memberName", memberName)
+    c := mgr.getClient(ctx, n)
+    err = c.RemoveSearchHeadClusterMember()
+    if err != nil {
+        return false, err
+    }
+
+    // all done -> ok to scale down the statefulset
+    return true, nil
+}
+
+// PrepareRecycle for searchHeadClusterPodManager prepares search head pod to be recycled for updates; it returns true when ready
+func (mgr *searchHeadClusterPodManager) PrepareRecycle(ctx context.Context, n int32) (bool, error) {
+    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
+
+    switch mgr.cr.Status.Members[n].Status {
+    case "Up":
+        // Detain search head
+        mgr.log.Info("Detaining search head cluster member", "memberName", memberName)
+        c := mgr.getClient(ctx, n)
+
+        podExecClient := splutil.GetPodExecClient(mgr.c, mgr.cr, getApplicablePodNameForK8Probes(mgr.cr, n))
+
+        err := setProbeLevelOnSplunkPod(ctx, podExecClient, livenessProbeLevelOne)
+
+        if err != nil {
+            // During the Recycle, our reconcile loop is entered multiple times. If the Pod is already down,
+            // there is a chance of readiness probe failing, in which case, even the podExec will not be successful.
+            // So, just log the message, and ignore the error.
+            mgr.log.Info("Setting Probe level failed. Probably, the Pod is already down", "memberName", memberName)
+        }
+
+        mgr.log.Info("Setting Upgrade banner")
+        err = c.InitiateUpgrade()
+
+        if err != nil {
+            mgr.log.Info("Setting upgrade banner failed.")
+            return false, err
+        }
+
+        start := mgr.cr.Status.UpgradeStartTimestamp
+        end := mgr.cr.Status.UpgradeEndTimestamp
+
+        // todo: add Upgraded state as a requirement?
+        if end >= start {
+            // todo: check if the timestamps are overwritten by updateStatus
+            currentTime := time.Now().Unix()
+            mgr.cr.Status.UpgradeStartTimestamp = currentTime
+
+            metrics.UpgradeStartTime.Set(float64(currentTime))
+
+// mgr.cr.Status.UpgradePhase = "Upgrading"
+            mgr.cr.Status.UpgradePhase = enterpriseApi.UpgradePhaseUpgrading
+        }
+
+        return false, c.SetSearchHeadDetention(true)
+
+    case "ManualDetention":
+        // Wait until active searches have drained
+        searchesComplete := mgr.cr.Status.Members[n].ActiveHistoricalSearchCount+mgr.cr.Status.Members[n].ActiveRealtimeSearchCount == 0
+        if searchesComplete {
+            mgr.log.Info("Detention complete", "memberName", memberName)
+        } else {
+            mgr.log.Info("Waiting for active searches to complete", "memberName", memberName)
+        }
+        return searchesComplete, nil
+
+    case "": // this can happen after the member has already been recycled and we're just waiting for state to update
+        mgr.log.Info("Member has empty Status", "memberName", memberName)
+        return false, nil
+    }
+
+    // unhandled status
+    return false, fmt.Errorf("Status=%s", mgr.cr.Status.Members[n].Status)
+}
+
+// FinishRecycle for searchHeadClusterPodManager completes recycle event for search head pod; it returns true when complete
+func (mgr *searchHeadClusterPodManager) FinishRecycle(ctx context.Context, n int32) (bool, error) {
+    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
+
+    switch mgr.cr.Status.Members[n].Status {
+    case "Up":
+        // not in detention
+        return true, nil
+
+    case "ManualDetention":
+        // release from detention
+        mgr.log.Info("Releasing search head cluster member from detention", "memberName", memberName)
+        c := mgr.getClient(ctx, n)
+        return false, c.SetSearchHeadDetention(false)
+    }
+
+    // unhandled status
+    return false, fmt.Errorf("Status=%s", mgr.cr.Status.Members[n].Status)
+}
+
+func (mgr *searchHeadClusterPodManager) FinishUpgrade(ctx context.Context, n int32) error {
+
+    if mgr.cr.Status.UpgradePhase == enterpriseApi.UpgradePhaseUpgrading {
+        c := mgr.getClient(ctx, n)
+
+        // stop gathering metrics
+        currentTime := time.Now().Unix()
+        mgr.cr.Status.UpgradeEndTimestamp = currentTime
+
+        metrics.UpgradeEndTime.Set(float64(currentTime))
+        // metrics still
+
+        i, err := c.GetSearchHeadCaptainInfo()
+        if err != nil {
+            mgr.log.Info("Getting Search Head Captain Info failed")
+            return err
+        }
+
+        captainURI := i.PeerSchemeHostPort
+
+        // revert upgrade state status
+        mgr.cr.Status.UpgradePhase = enterpriseApi.UpgradePhaseUpgraded
+
+        mgr.log.Info("Finalize Upgrade - unset banner")
+        return c.UpgradeFinalize(captainURI)
+    }
+
+    return nil
+}
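Note on the phase bookkeeping: stripped of the REST and Kubernetes plumbing, the logic shared by PrepareRecycle and FinishUpgrade above reduces to a small state machine. A self-contained sketch follows — the UpgradePhase names come from the patch, while the shcStatus struct and helper names are illustrative:

package main

import (
    "fmt"
    "time"
)

type UpgradePhase string

const (
    UpgradePhaseUpgrading UpgradePhase = "Upgrading"
    UpgradePhaseUpgraded  UpgradePhase = "Upgraded"
)

type shcStatus struct {
    UpgradePhase          UpgradePhase
    UpgradeStartTimestamp int64
    UpgradeEndTimestamp   int64
}

// beginUpgrade mirrors the PrepareRecycle logic: a new upgrade window is opened
// only when the previous one is closed (end >= start), so recycling the second
// and later members of the same rolling upgrade does not reset the start time.
func beginUpgrade(s *shcStatus) {
    if s.UpgradeEndTimestamp >= s.UpgradeStartTimestamp {
        s.UpgradeStartTimestamp = time.Now().Unix()
        s.UpgradePhase = UpgradePhaseUpgrading
    }
}

// endUpgrade mirrors FinishUpgrade: it only acts while an upgrade is in flight,
// which keeps the call idempotent once the phase flips to Upgraded.
func endUpgrade(s *shcStatus) {
    if s.UpgradePhase != UpgradePhaseUpgrading {
        return
    }
    s.UpgradeEndTimestamp = time.Now().Unix()
    s.UpgradePhase = UpgradePhaseUpgraded
}

func main() {
    var s shcStatus
    beginUpgrade(&s) // first member recycled: window opens
    beginUpgrade(&s) // subsequent members: no-op, start stays put
    endUpgrade(&s)   // all pods ready: window closes
    fmt.Printf("%s: %d -> %d\n", s.UpgradePhase, s.UpgradeStartTimestamp, s.UpgradeEndTimestamp)
}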
+// getClient for searchHeadClusterPodManager returns a SplunkClient for the member n
+func (mgr *searchHeadClusterPodManager) getClient(ctx context.Context, n int32) *splclient.SplunkClient {
+    reqLogger := log.FromContext(ctx)
+    scopedLog := reqLogger.WithName("searchHeadClusterPodManager.getClient").WithValues("name", mgr.cr.GetName(), "namespace", mgr.cr.GetNamespace())
+
+    // Get Pod Name
+    memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
+
+    // Get Fully Qualified Domain Name
+    fqdnName := splcommon.GetServiceFQDN(mgr.cr.GetNamespace(),
+        fmt.Sprintf("%s.%s", memberName, GetSplunkServiceName(SplunkSearchHead, mgr.cr.GetName(), true)))
+
+    // Retrieve admin password from Pod
+    adminPwd, err := splutil.GetSpecificSecretTokenFromPod(ctx, mgr.c, memberName, mgr.cr.GetNamespace(), "password")
+    if err != nil {
+        scopedLog.Error(err, "Couldn't retrieve the admin password from Pod")
+    }
+
+    return mgr.newSplunkClient(fmt.Sprintf("https://%s:8089", fqdnName), "admin", adminPwd)
+}
+
+// GetSearchHeadClusterMemberInfo used in mocking this function
+var GetSearchHeadClusterMemberInfo = func(ctx context.Context, mgr *searchHeadClusterPodManager, n int32) (*splclient.SearchHeadClusterMemberInfo, error) {
+    c := mgr.getClient(ctx, n)
+    return c.GetSearchHeadClusterMemberInfo()
+}
+
+// GetSearchHeadCaptainInfo used in mocking this function
+var GetSearchHeadCaptainInfo = func(ctx context.Context, mgr *searchHeadClusterPodManager, n int32) (*splclient.SearchHeadCaptainInfo, error) {
+    c := mgr.getClient(ctx, n)
+    return c.GetSearchHeadCaptainInfo()
+}
+
+// updateStatus for searchHeadClusterPodManager uses the REST API to update the status for a SearcHead custom resource
+func (mgr *searchHeadClusterPodManager) updateStatus(ctx context.Context, statefulSet *appsv1.StatefulSet) error {
+    // populate members status using REST API to get search head cluster member info
+    mgr.cr.Status.Captain = ""
+    mgr.cr.Status.CaptainReady = false
+    mgr.cr.Status.ReadyReplicas = statefulSet.Status.ReadyReplicas
+    if mgr.cr.Status.ReadyReplicas == 0 {
+        return nil
+    }
+    gotCaptainInfo := false
+    for n := int32(0); n < statefulSet.Status.Replicas; n++ {
+        memberName := GetSplunkStatefulsetPodName(SplunkSearchHead, mgr.cr.GetName(), n)
+        memberStatus := enterpriseApi.SearchHeadClusterMemberStatus{Name: memberName}
+        memberInfo, err := GetSearchHeadClusterMemberInfo(ctx, mgr, n)
+        if err == nil {
+            memberStatus.Status = memberInfo.Status
+            memberStatus.Adhoc = memberInfo.Adhoc
+            memberStatus.Registered = memberInfo.Registered
+            memberStatus.ActiveHistoricalSearchCount = memberInfo.ActiveHistoricalSearchCount
+            memberStatus.ActiveRealtimeSearchCount = memberInfo.ActiveRealtimeSearchCount
+        } else {
+            mgr.log.Error(err, "Unable to retrieve search head cluster member info", "memberName", memberName)
+        }
+
+        if err == nil && !gotCaptainInfo {
+            // try querying captain api; note that this should work on any node
+            captainInfo, err := GetSearchHeadCaptainInfo(ctx, mgr, n)
+            if err == nil {
+                mgr.cr.Status.Captain = captainInfo.Label
+                mgr.cr.Status.CaptainReady = captainInfo.ServiceReady
+                mgr.cr.Status.Initialized = captainInfo.Initialized
+                mgr.cr.Status.MinPeersJoined = captainInfo.MinPeersJoined
+                mgr.cr.Status.MaintenanceMode = captainInfo.MaintenanceMode
+                gotCaptainInfo = true
+            } else {
+                mgr.cr.Status.CaptainReady = false
+                mgr.log.Error(err, "Unable to retrieve captain info", "memberName", memberName)
+            }
+        }
+
+        if n < int32(len(mgr.cr.Status.Members)) {
+            mgr.cr.Status.Members[n] = memberStatus
+        } else {
+            mgr.cr.Status.Members = append(mgr.cr.Status.Members, memberStatus)
+        }
+    }
+
+    // truncate any extra members that we didn't check (leftover from scale down)
+    if statefulSet.Status.Replicas < int32(len(mgr.cr.Status.Members)) {
+        mgr.cr.Status.Members = mgr.cr.Status.Members[:statefulSet.Status.Replicas]
+    }
+
+    return nil
+}
diff --git a/pkg/splunk/splkcontroller/statefulset.go b/pkg/splunk/splkcontroller/statefulset.go
index aa1c74ed0..a9f7f6073 100644
--- a/pkg/splunk/splkcontroller/statefulset.go
+++ b/pkg/splunk/splkcontroller/statefulset.go
@@ -60,6 +60,10 @@ func (mgr *DefaultStatefulSetPodManager) FinishRecycle(ctx context.Context, n in
     return true, nil
 }

+func (mgr *DefaultStatefulSetPodManager) FinishUpgrade(ctx context.Context, n int32) (error) {
+    return nil
+}
+
 // ApplyStatefulSet creates or updates a Kubernetes StatefulSet
 func ApplyStatefulSet(ctx context.Context, c splcommon.ControllerClient, revised *appsv1.StatefulSet) (enterpriseApi.Phase, error) {
     namespacedName := types.NamespacedName{Namespace: revised.GetNamespace(), Name: revised.GetName()}
@@ -252,6 +256,16 @@ func UpdateStatefulSetPods(ctx context.Context, c splcommon.ControllerClient, st

     // all is good!
     scopedLog.Info("All pods are ready")
+
+    // Unset upgrade banner from the captain
+    err = mgr.FinishUpgrade(ctx, 0)
+    if err != nil {
+        scopedLog.Error(err, "Unable to unset upgrade banner")
+        return enterpriseApi.PhaseError, err
+    }
+
+    scopedLog.Info("Statefulset - Phase Ready")
+
     return enterpriseApi.PhaseReady, nil
 }
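Note on the hook above: FinishUpgrade is called from the path where UpdateStatefulSetPods has already verified that every pod is ready, so the banner is cleared exactly once per completed rolling restart. A trimmed sketch of that control flow follows — the interface method matches the patch, while the stub manager, phase strings, and readiness flag are stand-ins:

package main

import (
    "context"
    "fmt"
)

// StatefulSetPodManager shows only the method the patch adds to the interface.
type StatefulSetPodManager interface {
    FinishUpgrade(ctx context.Context, n int32) error
}

// noopManager plays the role of DefaultStatefulSetPodManager, whose
// FinishUpgrade simply returns nil.
type noopManager struct{}

func (m noopManager) FinishUpgrade(ctx context.Context, n int32) error { return nil }

// updatePods is a stand-in for UpdateStatefulSetPods: scaling and recycling are
// elided, leaving only the "all pods are ready" tail where the upgrade banner
// is cleared via member 0.
func updatePods(ctx context.Context, mgr StatefulSetPodManager, allPodsReady bool) (string, error) {
    if !allPodsReady {
        return "PhaseUpdating", nil
    }
    if err := mgr.FinishUpgrade(ctx, 0); err != nil {
        return "PhaseError", fmt.Errorf("unable to unset upgrade banner: %w", err)
    }
    return "PhaseReady", nil
}

func main() {
    phase, err := updatePods(context.Background(), noopManager{}, true)
    fmt.Println(phase, err)
}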
scopedLog.Info("All pods are ready") + + // Unset upgrade banner from the captain + err = mgr.FinishUpgrade(ctx, 0) + if err != nil { + scopedLog.Error(err, "Unable to unset upgrade banner") + return enterpriseApi.PhaseError, err + } + + scopedLog.Info("Statefulset - Phase Ready") + return enterpriseApi.PhaseReady, nil } diff --git a/pkg/splunk/splkcontroller/statefulset_test.go b/pkg/splunk/splkcontroller/statefulset_test.go index f51dff717..aa2c926ed 100644 --- a/pkg/splunk/splkcontroller/statefulset_test.go +++ b/pkg/splunk/splkcontroller/statefulset_test.go @@ -75,6 +75,16 @@ func (mgr *errTestPodManager) FinishRecycle(ctx context.Context, n int32) (bool, return true, errors.New(splcommon.Rerr) } +func (mgr *errTestPodManager) FinishUpgrade(ctx context.Context, n int32) (error) { + // Induce not ready error + if ctx.Value("errKey") == "errVal" { + return nil + } + + return errors.New(splcommon.Rerr) +} + + func TestApplyStatefulSet(t *testing.T) { ctx := context.TODO() funcCalls := []spltest.MockFuncCall{{MetaName: "*v1.StatefulSet-test-splunk-stack1-indexer"}} From 072c87a88bce6b89724f3ca6c6923a7e78718037 Mon Sep 17 00:00:00 2001 From: Patryk Wasielewski Date: Thu, 24 Jul 2025 13:49:11 +0200 Subject: [PATCH 2/6] lint changes --- api/v4/searchheadcluster_types.go | 3 +-- internal/controller/clustermanager_controller.go | 2 +- internal/controller/clustermaster_controller.go | 2 +- internal/controller/common/predicate.go | 2 +- internal/controller/indexercluster_controller.go | 2 +- internal/controller/licensemanager_controller.go | 2 +- internal/controller/licensemaster_controller.go | 2 +- internal/controller/monitoringconsole_controller.go | 2 +- internal/controller/searchheadcluster_controller.go | 2 +- internal/controller/standalone_controller.go | 2 +- pkg/splunk/common/types.go | 2 +- pkg/splunk/enterprise/configuration_test.go | 12 ++++++------ pkg/splunk/enterprise/indexercluster.go | 2 +- pkg/splunk/enterprise/searchheadcluster.go | 2 +- pkg/splunk/enterprise/searchheadclusterpodmanager.go | 4 ++-- pkg/splunk/splkcontroller/statefulset.go | 2 +- pkg/splunk/splkcontroller/statefulset_test.go | 3 +-- 17 files changed, 23 insertions(+), 25 deletions(-) diff --git a/api/v4/searchheadcluster_types.go b/api/v4/searchheadcluster_types.go index cabae7e17..c866c6e09 100644 --- a/api/v4/searchheadcluster_types.go +++ b/api/v4/searchheadcluster_types.go @@ -137,14 +137,13 @@ type SearchHeadClusterStatus struct { UpgradeStartTimestamp int64 `json:"upgradeStartTimestamp"` UpgradeEndTimestamp int64 `json:"upgradeEndTimestamp"` - } type UpgradePhase string const ( UpgradePhaseUpgrading UpgradePhase = "Upgrading" - UpgradePhaseUpgraded UpgradePhase = "Upgraded" + UpgradePhaseUpgraded UpgradePhase = "Upgraded" ) // SearchHeadCluster is the Schema for a Splunk Enterprise search head cluster diff --git a/internal/controller/clustermanager_controller.go b/internal/controller/clustermanager_controller.go index 692c7366c..f149bf129 100644 --- a/internal/controller/clustermanager_controller.go +++ b/internal/controller/clustermanager_controller.go @@ -24,8 +24,8 @@ import ( "github.com/splunk/splunk-operator/internal/controller/common" "github.com/pkg/errors" - enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" + enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" diff --git 
diff --git a/internal/controller/clustermaster_controller.go b/internal/controller/clustermaster_controller.go
index ef162cbc9..9f261f85b 100644
--- a/internal/controller/clustermaster_controller.go
+++ b/internal/controller/clustermaster_controller.go
@@ -25,8 +25,8 @@ import (
     "github.com/pkg/errors"
     enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
diff --git a/internal/controller/common/predicate.go b/internal/controller/common/predicate.go
index 13ce095bb..9ef3c2df2 100644
--- a/internal/controller/common/predicate.go
+++ b/internal/controller/common/predicate.go
@@ -30,7 +30,7 @@ func LabelChangedPredicate() predicate.Predicate {
 func GenerationChangedPredicate() predicate.Predicate {
     return predicate.Funcs{
         UpdateFunc: func(e event.UpdateEvent) bool {
-            return !reflect.DeepEqual(e.ObjectOld.GetGeneration(), e.ObjectNew.GetGeneration()) 
+            return !reflect.DeepEqual(e.ObjectOld.GetGeneration(), e.ObjectNew.GetGeneration())
         },
         DeleteFunc: func(e event.DeleteEvent) bool {
             // Evaluates to false if the object has been confirmed deleted.
diff --git a/internal/controller/indexercluster_controller.go b/internal/controller/indexercluster_controller.go
index 027e28fab..bc9a6c9f5 100644
--- a/internal/controller/indexercluster_controller.go
+++ b/internal/controller/indexercluster_controller.go
@@ -25,8 +25,8 @@ import (
     "github.com/pkg/errors"
     enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
diff --git a/internal/controller/licensemanager_controller.go b/internal/controller/licensemanager_controller.go
index aa162cf48..cb749a736 100644
--- a/internal/controller/licensemanager_controller.go
+++ b/internal/controller/licensemanager_controller.go
@@ -24,8 +24,8 @@ import (
     "github.com/splunk/splunk-operator/internal/controller/common"

     "github.com/pkg/errors"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
diff --git a/internal/controller/licensemaster_controller.go b/internal/controller/licensemaster_controller.go
index f87c2dd4a..717632e33 100644
--- a/internal/controller/licensemaster_controller.go
+++ b/internal/controller/licensemaster_controller.go
@@ -25,8 +25,8 @@ import (
     "github.com/pkg/errors"
     enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3"
     enterpriseApi "github.com/splunk/splunk-operator/api/v4"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
diff --git a/internal/controller/monitoringconsole_controller.go b/internal/controller/monitoringconsole_controller.go
index 2040d8d14..3767fcac2 100644
--- a/internal/controller/monitoringconsole_controller.go
+++ b/internal/controller/monitoringconsole_controller.go
@@ -25,8 +25,8 @@ import (
     "github.com/pkg/errors"
     enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
diff --git a/internal/controller/searchheadcluster_controller.go b/internal/controller/searchheadcluster_controller.go
index 509f8bf86..53d50fab9 100644
--- a/internal/controller/searchheadcluster_controller.go
+++ b/internal/controller/searchheadcluster_controller.go
@@ -24,8 +24,8 @@ import (
     "github.com/splunk/splunk-operator/internal/controller/common"

     "github.com/pkg/errors"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
diff --git a/internal/controller/standalone_controller.go b/internal/controller/standalone_controller.go
index b26bc6e70..93e85b7f0 100644
--- a/internal/controller/standalone_controller.go
+++ b/internal/controller/standalone_controller.go
@@ -30,8 +30,8 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/reconcile"

     "github.com/pkg/errors"
-    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"
+    enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
     k8serrors "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apimachinery/pkg/runtime"
     ctrl "sigs.k8s.io/controller-runtime"
diff --git a/pkg/splunk/common/types.go b/pkg/splunk/common/types.go
index 5de7aed85..22322d88a 100644
--- a/pkg/splunk/common/types.go
+++ b/pkg/splunk/common/types.go
@@ -54,5 +54,5 @@ type StatefulSetPodManager interface {
     // FinishRecycle completes recycle event for pod and returns true, or returns false if nothing to do
     FinishRecycle(context.Context, int32) (bool, error)

-    FinishUpgrade(context.Context, int32) (error)
+    FinishUpgrade(context.Context, int32) error
 }
diff --git a/pkg/splunk/enterprise/configuration_test.go b/pkg/splunk/enterprise/configuration_test.go
index 94736c758..9cf285cc3 100644
--- a/pkg/splunk/enterprise/configuration_test.go
+++ b/pkg/splunk/enterprise/configuration_test.go
@@ -1430,10 +1430,10 @@ func TestAddStorageVolumes(t *testing.T) {

     // test if adminManagedPV logic works
     labels = map[string]string{
-        "app.kubernetes.io/component": "indexer",
-        "app.kubernetes.io/instance": "splunk-CM-cluster-manager",
+        "app.kubernetes.io/component":  "indexer",
+        "app.kubernetes.io/instance":   "splunk-CM-cluster-manager",
         "app.kubernetes.io/managed-by": "splunk-operator",
-        "app.kubernetes.io/name": "cluster-manager",
+        "app.kubernetes.io/name":       "cluster-manager",
     }

     // adjust CR annotations
@@ -1466,10 +1466,10 @@ func TestAddStorageVolumes(t *testing.T) {
             APIVersion: "apps/v1",
         },
         ObjectMeta: metav1.ObjectMeta{
-            Name: "test-statefulset",
-            Namespace: cr.GetNamespace(),
+            Name:        "test-statefulset",
+            Namespace:   cr.GetNamespace(),
             Annotations: cr.GetAnnotations(),
-            Labels: cr.GetLabels(),
+            Labels:      cr.GetLabels(),
         },
         Spec: appsv1.StatefulSetSpec{
             Replicas: &replicas,
diff --git a/pkg/splunk/enterprise/indexercluster.go b/pkg/splunk/enterprise/indexercluster.go
index 24db53087..2d135d84f 100644
--- a/pkg/splunk/enterprise/indexercluster.go
+++ b/pkg/splunk/enterprise/indexercluster.go
@@ -840,7 +840,7 @@ func (mgr *indexerClusterPodManager) PrepareRecycle(ctx context.Context, n int32
     return mgr.decommission(ctx, n, false)
 }

-func (mgr *indexerClusterPodManager) FinishUpgrade(ctx context.Context, n int32) (error) {
+func (mgr *indexerClusterPodManager) FinishUpgrade(ctx context.Context, n int32) error {
     return nil
 }
diff --git a/pkg/splunk/enterprise/searchheadcluster.go b/pkg/splunk/enterprise/searchheadcluster.go
index 056ad8f96..0c146576b 100644
--- a/pkg/splunk/enterprise/searchheadcluster.go
+++ b/pkg/splunk/enterprise/searchheadcluster.go
@@ -200,7 +200,7 @@ func ApplySearchHeadCluster(ctx context.Context, client splcommon.ControllerClie
     // handle SHC upgrade process
     phase, err = mgr.Update(ctx, client, statefulSet, cr.Spec.Replicas)

-// phase, err = mgr.ReconcileUpgrade(ctx, client, statefulSet, cr.Spec.Replicas)
+    // phase, err = mgr.ReconcileUpgrade(ctx, client, statefulSet, cr.Spec.Replicas)
     if err != nil {
         return result, err
     }
diff --git a/pkg/splunk/enterprise/searchheadclusterpodmanager.go b/pkg/splunk/enterprise/searchheadclusterpodmanager.go
index 6216964ed..9d341e9dd 100644
--- a/pkg/splunk/enterprise/searchheadclusterpodmanager.go
+++ b/pkg/splunk/enterprise/searchheadclusterpodmanager.go
@@ -131,7 +131,7 @@ func (mgr *searchHeadClusterPodManager) PrepareRecycle(ctx context.Context, n in
             metrics.UpgradeStartTime.Set(float64(currentTime))

-// mgr.cr.Status.UpgradePhase = "Upgrading"
+            // mgr.cr.Status.UpgradePhase = "Upgrading"
             mgr.cr.Status.UpgradePhase = enterpriseApi.UpgradePhaseUpgrading
         }

@@ -202,7 +202,7 @@ func (mgr *searchHeadClusterPodManager) FinishUpgrade(ctx context.Context, n int
         mgr.log.Info("Finalize Upgrade - unset banner")
         return c.UpgradeFinalize(captainURI)
     }
-    
+
     return nil
 }
diff --git a/pkg/splunk/splkcontroller/statefulset.go b/pkg/splunk/splkcontroller/statefulset.go
index a9f7f6073..0ddcd7792 100644
--- a/pkg/splunk/splkcontroller/statefulset.go
+++ b/pkg/splunk/splkcontroller/statefulset.go
@@ -60,7 +60,7 @@ func (mgr *DefaultStatefulSetPodManager) FinishRecycle(ctx context.Context, n in
     return true, nil
 }

-func (mgr *DefaultStatefulSetPodManager) FinishUpgrade(ctx context.Context, n int32) (error) {
+func (mgr *DefaultStatefulSetPodManager) FinishUpgrade(ctx context.Context, n int32) error {
     return nil
 }
diff --git a/pkg/splunk/splkcontroller/statefulset_test.go b/pkg/splunk/splkcontroller/statefulset_test.go
index aa2c926ed..bd9b605a4 100644
--- a/pkg/splunk/splkcontroller/statefulset_test.go
+++ b/pkg/splunk/splkcontroller/statefulset_test.go
@@ -75,7 +75,7 @@ func (mgr *errTestPodManager) FinishRecycle(ctx context.Context, n int32) (bool,
     return true, errors.New(splcommon.Rerr)
 }

-func (mgr *errTestPodManager) FinishUpgrade(ctx context.Context, n int32) (error) {
+func (mgr *errTestPodManager) FinishUpgrade(ctx context.Context, n int32) error {
     // Induce not ready error
     if ctx.Value("errKey") == "errVal" {
         return nil
@@ -84,7 +84,6 @@ func (mgr *errTestPodManager) FinishUpgrade(ctx context.Context, n int32) (error
     return errors.New(splcommon.Rerr)
 }

-
 func TestApplyStatefulSet(t *testing.T) {
     ctx := context.TODO()
     funcCalls := []spltest.MockFuncCall{{MetaName: "*v1.StatefulSet-test-splunk-stack1-indexer"}}
Sep 17 00:00:00 2001 From: Patryk Wasielewski Date: Tue, 29 Jul 2025 13:17:59 +0200 Subject: [PATCH 3/6] fix unit test --- pkg/splunk/enterprise/searchheadcluster_test.go | 12 ++++++++++-- pkg/splunk/enterprise/searchheadclusterpodmanager.go | 4 +++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pkg/splunk/enterprise/searchheadcluster_test.go b/pkg/splunk/enterprise/searchheadcluster_test.go index 8562d4fd3..17477376f 100644 --- a/pkg/splunk/enterprise/searchheadcluster_test.go +++ b/pkg/splunk/enterprise/searchheadcluster_test.go @@ -208,6 +208,7 @@ func searchHeadClusterPodManagerTester(t *testing.T, method string, mockHandlers } mockSplunkClient := &spltest.MockHTTPClient{} mockSplunkClient.AddHandlers(mockHandlers...) + mgr := &searchHeadClusterPodManager{ log: scopedLog, cr: &cr, @@ -318,11 +319,18 @@ func TestSearchHeadClusterPodManager(t *testing.T) { // test pod needs update => transition to detention mockHandlers = append(mockHandlers, spltest.MockHTTPHandler{ Method: "POST", - URL: "https://splunk-stack1-search-head-0.splunk-stack1-search-head-headless.test.svc.cluster.local:8089/services/shcluster/member/control/control/set_manual_detention?manual_detention=on", + URL: "https://splunk-stack1-search-head-0.splunk-stack1-search-head-headless.test.svc.cluster.local:8089/services/shcluster/captain/control/control/upgrade-init", Status: 200, Err: nil, Body: ``, - }) + }, spltest.MockHTTPHandler{ + Method: "POST", + URL: "https://splunk-stack1-search-head-0.splunk-stack1-search-head-headless.test.svc.cluster.local:8089/services/shcluster/member/control/control/set_manual_detention?manual_detention=on", + Status: 200, + Err: nil, + Body: ``, + }, + ) pod.ObjectMeta.Labels["controller-revision-hash"] = "v0" method = "searchHeadClusterPodManager.Update(Quarantine Pod)" wantCalls = map[string][]spltest.MockFuncCall{"Get": {funcCalls[0], funcCalls[1], funcCalls[1], funcCalls[2], funcCalls[2], funcCalls[5], funcCalls[2], funcCalls[2]}, "Create": {funcCalls[1]}} diff --git a/pkg/splunk/enterprise/searchheadclusterpodmanager.go b/pkg/splunk/enterprise/searchheadclusterpodmanager.go index 9d341e9dd..a814c3589 100644 --- a/pkg/splunk/enterprise/searchheadclusterpodmanager.go +++ b/pkg/splunk/enterprise/searchheadclusterpodmanager.go @@ -178,6 +178,8 @@ func (mgr *searchHeadClusterPodManager) FinishRecycle(ctx context.Context, n int func (mgr *searchHeadClusterPodManager) FinishUpgrade(ctx context.Context, n int32) error { + reqLogger := log.FromContext(ctx) + if mgr.cr.Status.UpgradePhase == enterpriseApi.UpgradePhaseUpgrading { c := mgr.getClient(ctx, n) @@ -199,7 +201,7 @@ func (mgr *searchHeadClusterPodManager) FinishUpgrade(ctx context.Context, n int // revert upgrade state status mgr.cr.Status.UpgradePhase = enterpriseApi.UpgradePhaseUpgraded - mgr.log.Info("Finalize Upgrade - unset banner") + reqLogger.Info("Finalize Upgrade - unset banner") return c.UpgradeFinalize(captainURI) } From 2d78b2c0345fd3f20766b6d32deca943a186891c Mon Sep 17 00:00:00 2001 From: Patryk Wasielewski Date: Wed, 30 Jul 2025 13:58:15 +0200 Subject: [PATCH 4/6] cleanup code --- api/v4/searchheadcluster_types.go | 3 - pkg/splunk/client/metrics/metrics.go | 66 +++++++------------ pkg/splunk/enterprise/searchheadcluster.go | 1 - .../enterprise/searchheadclusterpodmanager.go | 10 +++ 4 files changed, 33 insertions(+), 47 deletions(-) diff --git a/api/v4/searchheadcluster_types.go b/api/v4/searchheadcluster_types.go index c866c6e09..67bdd24ba 100644 --- a/api/v4/searchheadcluster_types.go +++ 
b/api/v4/searchheadcluster_types.go @@ -50,9 +50,6 @@ type SearchHeadClusterSpec struct { // Splunk Deployer Node Affinity DeployerNodeAffinity *corev1.NodeAffinity `json:"deployerNodeAffinity,omitempty"` - - // upgrade strategy for the search head cluster - // UpgradeStrategy UpgradeStrategy `json:"upgradeStrategy,omitempty"` } // SearchHeadClusterMemberStatus is used to track the status of each search head cluster member diff --git a/pkg/splunk/client/metrics/metrics.go b/pkg/splunk/client/metrics/metrics.go index bb0bfd080..4cf1f22db 100644 --- a/pkg/splunk/client/metrics/metrics.go +++ b/pkg/splunk/client/metrics/metrics.go @@ -43,45 +43,27 @@ var ApiTotalTimeMetricEvents = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Help: "The time it takes to complete each call in standalone (in milliseconds)", }, []string{LabelNamespace, LabelName, LabelKind, LabelModuleName, LabelMethodName}) -var ( - UpgradeStartTime = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "splunk_upgrade_start_time", - Help: "Unix timestamp when the SHC upgrade started", - }) - UpgradeEndTime = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "splunk_upgrade_end_time", - Help: "Unix timestamp when the SHC upgrade ended", - }, - ) - ShortSearchSuccessCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "splunk_short_search_success_total", - Help: "Total number of successful short searches per search head", - }, - []string{"sh_name"}, - ) - ShortSearchFailureCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "splunk_short_search_failure_total", - Help: "Total number of failed short searches per search head", - }, - []string{"sh_name"}, - ) - TotalSearchSuccessCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "splunk_total_search_success_total", - Help: "Total number of successful total searches per search head", - }, - []string{"sh_name"}, - ) - TotalSearchFailureCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "splunk_total_search_failure_total", - Help: "Total number of failed total searches per search head", - }, - []string{"sh_name"}, - ) -) +var UpgradeStartTime = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "splunk_upgrade_start_time", + Help: "Unix timestamp when the SHC upgrade started", +}) + +var UpgradeEndTime = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "splunk_upgrade_end_time", + Help: "Unix timestamp when the SHC upgrade ended", +}) + +var ActiveHistoricalSearchCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "splunk_active_historical_search_count", + Help: "Number of active historical searches per search head", + }, []string{"sh_name"}) + +var ActiveRealtimeSearchCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "splunk_active_realtime_search_count", + Help: "Number of active realtime searches per search head", + }, []string{"sh_name"}) func GetPrometheusLabels(request reconcile.Request, kind string) prometheus.Labels { return prometheus.Labels{ @@ -109,9 +91,7 @@ func init() { ApiTotalTimeMetricEvents, UpgradeStartTime, UpgradeEndTime, - ShortSearchSuccessCounter, - ShortSearchFailureCounter, - TotalSearchSuccessCounter, - TotalSearchFailureCounter, + ActiveHistoricalSearchCount, + ActiveRealtimeSearchCount, ) } diff --git a/pkg/splunk/enterprise/searchheadcluster.go b/pkg/splunk/enterprise/searchheadcluster.go index 0c146576b..ec6156a56 100644 --- a/pkg/splunk/enterprise/searchheadcluster.go +++ b/pkg/splunk/enterprise/searchheadcluster.go @@ -200,7 +200,6 @@ func ApplySearchHeadCluster(ctx
context.Context, client splcommon.ControllerClie // handle SHC upgrade process phase, err = mgr.Update(ctx, client, statefulSet, cr.Spec.Replicas) - // phase, err = mgr.ReconcileUpgrade(ctx, client, statefulSet, cr.Spec.Replicas) if err != nil { return result, err } diff --git a/pkg/splunk/enterprise/searchheadclusterpodmanager.go b/pkg/splunk/enterprise/searchheadclusterpodmanager.go index a814c3589..60f757777 100644 --- a/pkg/splunk/enterprise/searchheadclusterpodmanager.go +++ b/pkg/splunk/enterprise/searchheadclusterpodmanager.go @@ -6,6 +6,7 @@ import ( "time" "github.com/go-logr/logr" + "github.com/prometheus/client_golang/prometheus" enterpriseApi "github.com/splunk/splunk-operator/api/v4" splclient "github.com/splunk/splunk-operator/pkg/splunk/client" metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" @@ -138,6 +139,15 @@ func (mgr *searchHeadClusterPodManager) PrepareRecycle(ctx context.Context, n in return false, c.SetSearchHeadDetention(true) case "ManualDetention": + + metrics.ActiveHistoricalSearchCount.With(prometheus.Labels{ + "sh_name": mgr.cr.Status.Members[n].Name, + }).Set(float64(mgr.cr.Status.Members[n].ActiveHistoricalSearchCount)) + + metrics.ActiveRealtimeSearchCount.With(prometheus.Labels{ + "sh_name": mgr.cr.Status.Members[n].Name, + }).Set(float64(mgr.cr.Status.Members[n].ActiveRealtimeSearchCount)) + // Wait until active searches have drained searchesComplete := mgr.cr.Status.Members[n].ActiveHistoricalSearchCount+mgr.cr.Status.Members[n].ActiveRealtimeSearchCount == 0 if searchesComplete { From b2833cf945d7706f74ec3775762ff716307bb4bb Mon Sep 17 00:00:00 2001 From: Patryk Wasielewski Date: Fri, 1 Aug 2025 15:43:26 +0200 Subject: [PATCH 5/6] fix endpoints --- pkg/splunk/client/enterprise.go | 10 ++++++---- pkg/splunk/common/types.go | 1 + .../enterprise/searchheadclusterpodmanager.go | 18 +++++------------- pkg/splunk/splkcontroller/statefulset.go | 5 +++-- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pkg/splunk/client/enterprise.go b/pkg/splunk/client/enterprise.go index e1a8d5dc5..8bc36b08a 100644 --- a/pkg/splunk/client/enterprise.go +++ b/pkg/splunk/client/enterprise.go @@ -307,7 +307,8 @@ func (c *SplunkClient) SetSearchHeadDetention(detain bool) error { return c.Do(request, expectedStatus, nil) } -// InitiateUpgrade set upgrade banner for a search head cluster +// InitiateUpgrade initiates the rolling upgrade process for a search head cluster +// This endpoint proxies the request to the cluster captain func (c *SplunkClient) InitiateUpgrade() error { endpoint := fmt.Sprintf("%s/services/shcluster/captain/control/control/upgrade-init", c.ManagementURI) request, err := http.NewRequest("POST", endpoint, nil) if err != nil { return err @@ -318,9 +319,10 @@ func (c *SplunkClient) InitiateUpgrade() error { return c.Do(request, expectedStatus, nil) } -// UpgradeFinalize unset upgrade banner for a search head cluster captain -func (c *SplunkClient) UpgradeFinalize(captainURI string) error { - endpoint := fmt.Sprintf("%s/services/shcluster/captain/control/control/upgrade-finalize", captainURI) +// FinalizeUpgrade finalizes the rolling upgrade process for a search head cluster +// This endpoint proxies the request to the cluster captain +func (c *SplunkClient) FinalizeUpgrade() error { + endpoint := fmt.Sprintf("%s/services/shcluster/captain/control/control/upgrade-finalize", c.ManagementURI) request, err := http.NewRequest("POST", endpoint, nil) if err != nil { return err } diff --git a/pkg/splunk/common/types.go b/pkg/splunk/common/types.go index 22322d88a..25b353276
100644 --- a/pkg/splunk/common/types.go +++ b/pkg/splunk/common/types.go @@ -54,5 +54,6 @@ type StatefulSetPodManager interface { // FinishRecycle completes recycle event for pod and returns true, or returns false if nothing to do FinishRecycle(context.Context, int32) (bool, error) + // FinishUpgrade finishes the rolling upgrade process; it returns an error if the upgrade cannot be finished FinishUpgrade(context.Context, int32) error } diff --git a/pkg/splunk/enterprise/searchheadclusterpodmanager.go b/pkg/splunk/enterprise/searchheadclusterpodmanager.go index 60f757777..1797925e0 100644 --- a/pkg/splunk/enterprise/searchheadclusterpodmanager.go +++ b/pkg/splunk/enterprise/searchheadclusterpodmanager.go @@ -113,11 +113,11 @@ func (mgr *searchHeadClusterPodManager) PrepareRecycle(ctx context.Context, n in mgr.log.Info("Setting Probe level failed. Probably, the Pod is already down", "memberName", memberName) } - mgr.log.Info("Setting Upgrade banner") + mgr.log.Info("Initiating rolling upgrade process") err = c.InitiateUpgrade() if err != nil { - mgr.log.Info("Setting upgrade banner failed.") + mgr.log.Info("Initialization of rolling upgrade failed.") return false, err } @@ -190,6 +190,7 @@ func (mgr *searchHeadClusterPodManager) FinishUpgrade(ctx context.Context, n int reqLogger := log.FromContext(ctx) + // check if the SHC is in an upgrade process if mgr.cr.Status.UpgradePhase == enterpriseApi.UpgradePhaseUpgrading { c := mgr.getClient(ctx, n) @@ -198,21 +199,12 @@ func (mgr *searchHeadClusterPodManager) FinishUpgrade(ctx context.Context, n int currentTime := time.Now().Unix() mgr.cr.Status.UpgradeEndTimestamp = currentTime metrics.UpgradeEndTime.Set(float64(currentTime)) - // metrics still - - i, err := c.GetSearchHeadCaptainInfo() - if err != nil { - mgr.log.Info("Getting Search Head Captain Info failed") - return err - } - - captainURI := i.PeerSchemeHostPort // revert upgrade state status mgr.cr.Status.UpgradePhase = enterpriseApi.UpgradePhaseUpgraded - reqLogger.Info("Finalize Upgrade - unset banner") - return c.UpgradeFinalize(captainURI) + reqLogger.Info("Finalize Upgrade") + return c.FinalizeUpgrade() } return nil diff --git a/pkg/splunk/splkcontroller/statefulset.go b/pkg/splunk/splkcontroller/statefulset.go index 0ddcd7792..2c8e2804a 100644 --- a/pkg/splunk/splkcontroller/statefulset.go +++ b/pkg/splunk/splkcontroller/statefulset.go @@ -257,10 +257,11 @@ func UpdateStatefulSetPods(ctx context.Context, c splcommon.ControllerClient, st // all is good!
scopedLog.Info("All pods are ready") - // Unset upgrade banner from the captain + // Finalize rolling upgrade process + // It uses first pod to get a client err = mgr.FinishUpgrade(ctx, 0) if err != nil { - scopedLog.Error(err, "Unable to unset upgrade banner") + scopedLog.Error(err, "Unable to finalize rolling upgrade process") return enterpriseApi.PhaseError, err } From 62b71401618dcfdf4a73cc24ab6633d0d2192e1d Mon Sep 17 00:00:00 2001 From: Patryk Wasielewski Date: Mon, 4 Aug 2025 15:32:54 +0200 Subject: [PATCH 6/6] remove todos --- pkg/splunk/enterprise/searchheadclusterpodmanager.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/splunk/enterprise/searchheadclusterpodmanager.go b/pkg/splunk/enterprise/searchheadclusterpodmanager.go index 1797925e0..093ce9fe9 100644 --- a/pkg/splunk/enterprise/searchheadclusterpodmanager.go +++ b/pkg/splunk/enterprise/searchheadclusterpodmanager.go @@ -124,15 +124,12 @@ func (mgr *searchHeadClusterPodManager) PrepareRecycle(ctx context.Context, n in start := mgr.cr.Status.UpgradeStartTimestamp end := mgr.cr.Status.UpgradeEndTimestamp - // todo: add Upgraded state as a requirement? if end >= start { - // todo: check if the timestamps are overwritten by updateStatus currentTime := time.Now().Unix() mgr.cr.Status.UpgradeStartTimestamp = currentTime metrics.UpgradeStartTime.Set(float64(currentTime)) - // mgr.cr.Status.UpgradePhase = "Upgrading" mgr.cr.Status.UpgradePhase = enterpriseApi.UpgradePhaseUpgrading }