From e7d580e3209de5fc9528f5e07bc492955e376211 Mon Sep 17 00:00:00 2001 From: Benny Zlotnik Date: Wed, 25 Mar 2026 11:24:12 +0200 Subject: [PATCH] add auto-stop for idle workspaces Signed-off-by: Benny Zlotnik Assisted-by: claude-opus-4.6 --- api/v1alpha1/operatorconfig_types.go | 20 ++ api/v1alpha1/workspace_types.go | 13 ++ api/v1alpha1/zz_generated.deepcopy.go | 11 +- cmd/caib/workspace/workspace.go | 15 ++ cmd/main.go | 7 +- ....sdv.cloud.redhat.com_operatorconfigs.yaml | 9 + ...otive.sdv.cloud.redhat.com_workspaces.yaml | 15 ++ internal/buildapi/workspace.go | 110 ++++++++--- internal/controller/workspace/controller.go | 179 +++++++++++++++++- .../controller/workspace/controller_test.go | 15 +- 10 files changed, 349 insertions(+), 45 deletions(-) diff --git a/api/v1alpha1/operatorconfig_types.go b/api/v1alpha1/operatorconfig_types.go index f2fe202f..bef38fa9 100644 --- a/api/v1alpha1/operatorconfig_types.go +++ b/api/v1alpha1/operatorconfig_types.go @@ -56,6 +56,9 @@ const ( // DefaultWorkspacePVCSize is the default PVC size for workspace storage DefaultWorkspacePVCSize = "10Gi" + + // DefaultAutoPauseTimeoutMinutes is the default idle timeout in minutes before a workspace is auto-paused + DefaultAutoPauseTimeoutMinutes int32 = 30 ) // ImagesConfig defines container image references used by the operator @@ -351,6 +354,14 @@ type WorkspacesConfig struct { // BuildCacheSize is the size of the PVC created for build cache persistence (default: "20Gi") // +optional BuildCacheSize string `json:"buildCacheSize,omitempty"` + + // AutoPauseTimeoutMinutes is the cluster-wide default idle timeout in minutes before + // a workspace is automatically paused. Overridden per-workspace via spec.autoPauseTimeoutMinutes. + // Must be > 0. To disable auto-pause for a specific workspace, set its + // spec.autoPauseTimeoutMinutes to 0. 
+	// Default: 30
+	// +optional
+	AutoPauseTimeoutMinutes int32 `json:"autoPauseTimeoutMinutes,omitempty"`
 }
 
 // GetToolchainImage returns the toolchain image, falling back to the default
@@ -406,6 +417,15 @@ func (c *WorkspacesConfig) GetTolerations() []corev1.Toleration {
 	return nil
 }
 
+// GetAutoPauseTimeoutMinutes reports the cluster-wide auto-pause timeout in minutes.
+// A nil receiver or a non-positive value yields DefaultAutoPauseTimeoutMinutes.
+func (c *WorkspacesConfig) GetAutoPauseTimeoutMinutes() int32 {
+	if c == nil || c.AutoPauseTimeoutMinutes <= 0 {
+		return DefaultAutoPauseTimeoutMinutes
+	}
+	return c.AutoPauseTimeoutMinutes
+}
+
 // OperatorConfigSpec defines the desired state of OperatorConfig
 type OperatorConfigSpec struct {
 	// OSBuilds defines the configuration for OS build operations
diff --git a/api/v1alpha1/workspace_types.go b/api/v1alpha1/workspace_types.go
index 0b634598..6dba7032 100644
--- a/api/v1alpha1/workspace_types.go
+++ b/api/v1alpha1/workspace_types.go
@@ -64,6 +64,14 @@ type WorkspaceSpec struct {
 	// When true, the controller deletes the pod but preserves the PVC.
 	// +optional
 	Stopped bool `json:"stopped,omitempty"`
+
+	// AutoPauseTimeoutMinutes overrides the global auto-pause timeout for this workspace.
+	// nil = use global default from OperatorConfig (default: 30 minutes)
+	// 0 = disable auto-pause for this workspace
+	// >0 = custom timeout in minutes
+	// +kubebuilder:validation:Minimum=0
+	// +optional
+	AutoPauseTimeoutMinutes *int32 `json:"autoPauseTimeoutMinutes,omitempty"`
 }
 
 // WorkspaceStatus defines the observed state of a Workspace.
@@ -85,6 +93,11 @@ type WorkspaceStatus struct {
 	// Created lazily on first build referencing this workspace.
 	// +optional
 	BuildCachePVCName string `json:"buildCachePVCName,omitempty"`
+
+	// LastActivityTime is the last time activity was detected in the workspace pod.
+	// Used by the auto-pause controller to determine idle duration.
+ // +optional + LastActivityTime *metav1.Time `json:"lastActivityTime,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 4a5108d7..4340f06d 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1268,7 +1268,7 @@ func (in *Workspace) DeepCopyInto(out *Workspace) { out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Workspace. @@ -1336,6 +1336,11 @@ func (in *WorkspaceSpec) DeepCopyInto(out *WorkspaceSpec) { (*out)[key] = val } } + if in.AutoPauseTimeoutMinutes != nil { + in, out := &in.AutoPauseTimeoutMinutes, &out.AutoPauseTimeoutMinutes + *out = new(int32) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceSpec. @@ -1351,6 +1356,10 @@ func (in *WorkspaceSpec) DeepCopy() *WorkspaceSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) { *out = *in + if in.LastActivityTime != nil { + in, out := &in.LastActivityTime, &out.LastActivityTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceStatus. 
diff --git a/cmd/caib/workspace/workspace.go b/cmd/caib/workspace/workspace.go index abd8f312..b4138900 100644 --- a/cmd/caib/workspace/workspace.go +++ b/cmd/caib/workspace/workspace.go @@ -50,6 +50,9 @@ var ( // wait flag (shared by create and start) waitForRunningFlag bool + // auto-pause flag + autoPauseTimeout int + // deploy flags artifactMappings []string ) @@ -126,6 +129,7 @@ Examples: cmd.Flags().StringVar(&cpuRequest, "cpu", "", "CPU request/limit (e.g., \"1\", \"500m\")") cmd.Flags().StringVar(&memoryRequest, "memory", "", "memory request/limit (e.g., \"2Gi\", \"512Mi\")") cmd.Flags().BoolVar(&tmpfsBuildDir, "tmpfs", false, "mount a tmpfs volume at /tmp/build for faster compilation (uses RAM)") + cmd.Flags().IntVar(&autoPauseTimeout, "auto-pause-timeout", -1, "auto-pause timeout in minutes (0=disable, -1=use global default)") cmd.Flags().BoolVarP(&waitForRunningFlag, "wait", "w", true, "wait for workspace to be running") return cmd @@ -290,6 +294,13 @@ func runCreate(_ *cobra.Command, args []string) { Memory: memoryRequest, TmpfsBuildDir: tmpfsBuildDir, } + if autoPauseTimeout < -1 { + handleError(fmt.Errorf("--auto-pause-timeout must be >= -1")) + } + if autoPauseTimeout >= 0 { + v := int32(autoPauseTimeout) + req.AutoPauseTimeoutMinutes = &v + } var resp *buildapitypes.WorkspaceResponse err = caibcommon.ExecuteWithReauth(serverURL, &authToken, insecureSkipTLS, func(client *buildapiclient.Client) error { @@ -381,6 +392,10 @@ func runShow(_ *cobra.Command, args []string) { if ws.Age != "" { fmt.Printf("Age: %s\n", ws.Age) } + fmt.Printf("Auto-pause: %s\n", ws.AutoPauseTimeout) + if ws.LastActivity != "" { + fmt.Printf("Last active: %s\n", ws.LastActivity) + } } func runDelete(_ *cobra.Command, args []string) { diff --git a/cmd/main.go b/cmd/main.go index 4ae79192..3c85a2db 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -281,9 +281,10 @@ func main() { } workspaceReconciler := &workspace.Reconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - 
Log: ctrl.Log.WithName("controllers").WithName("Workspace"), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: ctrl.Log.WithName("controllers").WithName("Workspace"), + RestConfig: mgr.GetConfig(), } if err = workspaceReconciler.SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Workspace") diff --git a/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml b/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml index a63b6884..99fdca5d 100644 --- a/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml +++ b/config/crd/bases/automotive.sdv.cloud.redhat.com_operatorconfigs.yaml @@ -823,6 +823,15 @@ spec: workspaces: description: Workspaces defines configuration for developer workspaces properties: + autoPauseTimeoutMinutes: + description: |- + AutoPauseTimeoutMinutes is the cluster-wide default idle timeout in minutes before + a workspace is automatically paused. Overridden per-workspace via spec.autoPauseTimeoutMinutes. + Must be > 0. To disable auto-pause for a specific workspace, set its + spec.autoPauseTimeoutMinutes to 0. + Default: 30 + format: int32 + type: integer buildCacheSize: description: 'BuildCacheSize is the size of the PVC created for build cache persistence (default: "20Gi")' diff --git a/config/crd/bases/automotive.sdv.cloud.redhat.com_workspaces.yaml b/config/crd/bases/automotive.sdv.cloud.redhat.com_workspaces.yaml index 0f5f82ac..c5850255 100644 --- a/config/crd/bases/automotive.sdv.cloud.redhat.com_workspaces.yaml +++ b/config/crd/bases/automotive.sdv.cloud.redhat.com_workspaces.yaml @@ -59,6 +59,15 @@ spec: description: Architecture is the target architecture (e.g., "arm64", "amd64") type: string + autoPauseTimeoutMinutes: + description: |- + AutoPauseTimeoutMinutes overrides the global auto-pause timeout for this workspace. 
+ nil = use global default from OperatorConfig (default: 30 minutes) + 0 = disable auto-pause for this workspace + >0 = custom timeout in minutes + format: int32 + minimum: 0 + type: integer clientConfigSecretRef: description: |- ClientConfigSecretRef is the name of the Secret containing the Jumpstarter client config @@ -169,6 +178,12 @@ spec: BuildCachePVCName is the name of the PVC used for build cache storage. Created lazily on first build referencing this workspace. type: string + lastActivityTime: + description: |- + LastActivityTime is the last time activity was detected in the workspace pod. + Used by the auto-pause controller to determine idle duration. + format: date-time + type: string message: description: Message provides additional detail about the current phase diff --git a/internal/buildapi/workspace.go b/internal/buildapi/workspace.go index a9e23dd4..75f9b596 100644 --- a/internal/buildapi/workspace.go +++ b/internal/buildapi/workspace.go @@ -37,25 +37,28 @@ func shellQuote(s string) string { // WorkspaceRequest is the payload to create a workspace. 
type WorkspaceRequest struct { - Name string `json:"name"` - FromBuild string `json:"fromBuild,omitempty"` // ImageBuild name to extract lease from - Lease string `json:"lease,omitempty"` // Direct lease ID - Arch string `json:"architecture,omitempty"` - Image string `json:"toolchainImage,omitempty"` - ClientConfig string `json:"clientConfig,omitempty"` // Base64-encoded Jumpstarter client config - CPU string `json:"cpu,omitempty"` // CPU request (e.g., "1", "500m") - Memory string `json:"memory,omitempty"` // Memory request (e.g., "2Gi", "512Mi") - TmpfsBuildDir bool `json:"tmpfsBuildDir,omitempty"` // Mount tmpfs at /tmp/build for fast compilation + Name string `json:"name"` + FromBuild string `json:"fromBuild,omitempty"` // ImageBuild name to extract lease from + Lease string `json:"lease,omitempty"` // Direct lease ID + Arch string `json:"architecture,omitempty"` + Image string `json:"toolchainImage,omitempty"` + ClientConfig string `json:"clientConfig,omitempty"` // Base64-encoded Jumpstarter client config + CPU string `json:"cpu,omitempty"` // CPU request (e.g., "1", "500m") + Memory string `json:"memory,omitempty"` // Memory request (e.g., "2Gi", "512Mi") + TmpfsBuildDir bool `json:"tmpfsBuildDir,omitempty"` // Mount tmpfs at /tmp/build for fast compilation + AutoPauseTimeoutMinutes *int32 `json:"autoPauseTimeoutMinutes,omitempty"` // nil = use global default, 0 = disable } // WorkspaceResponse is returned by workspace operations. 
type WorkspaceResponse struct { - Name string `json:"name"` - Phase string `json:"phase"` - Lease string `json:"lease,omitempty"` - Arch string `json:"architecture"` - PodName string `json:"podName,omitempty"` - Age string `json:"age,omitempty"` + Name string `json:"name"` + Phase string `json:"phase"` + Lease string `json:"lease,omitempty"` + Arch string `json:"architecture"` + PodName string `json:"podName,omitempty"` + Age string `json:"age,omitempty"` + AutoPauseTimeout string `json:"autoPauseTimeout,omitempty"` // e.g., "30m", "disabled" + LastActivity string `json:"lastActivity,omitempty"` // e.g., "2m ago", "just now" } // WorkspaceExecRequest is the payload to execute a command in a workspace. @@ -283,16 +286,17 @@ func (a *APIServer) createWorkspace(c *gin.Context) { Namespace: namespace, }, Spec: automotivev1alpha1.WorkspaceSpec{ - Architecture: arch, - Image: image, - LeaseID: leaseID, - Owner: requester, - ClientConfigSecretRef: jmpClientSecret, - PVCSize: pvcSize, - Resources: resources, - StorageClass: wsConfig.GetStorageClass(), - NodeSelector: wsConfig.GetNodeSelector(), - TmpfsBuildDir: req.TmpfsBuildDir, + Architecture: arch, + Image: image, + LeaseID: leaseID, + Owner: requester, + ClientConfigSecretRef: jmpClientSecret, + PVCSize: pvcSize, + Resources: resources, + StorageClass: wsConfig.GetStorageClass(), + NodeSelector: wsConfig.GetNodeSelector(), + TmpfsBuildDir: req.TmpfsBuildDir, + AutoPauseTimeoutMinutes: req.AutoPauseTimeoutMinutes, }, } if err := k8sClient.Create(c.Request.Context(), ws); err != nil { @@ -459,6 +463,24 @@ func (a *APIServer) getOwnedWorkspace(c *gin.Context, name string) (*automotivev return ws, nil } +// touchWorkspaceActivity updates LastActivityTime on the workspace status. +// Called from handlers that represent actual workspace usage (exec, shell, sync, deploy) +// so the auto-pause controller knows the workspace is in use. 
+func (a *APIServer) touchWorkspaceActivity(c *gin.Context, ws *automotivev1alpha1.Workspace) {
+	if ws.Spec.Stopped {
+		return
+	}
+	k8sClient, clientErr := getClientFromRequest(c)
+	if clientErr != nil {
+		return // best-effort, don't fail the operation
+	}
+	// Record "now" as the last activity; a failed patch is deliberately ignored.
+	before := ws.DeepCopy()
+	stamp := metav1.Now()
+	ws.Status.LastActivityTime = &stamp
+	_ = k8sClient.Status().Patch(c.Request.Context(), ws, client.MergeFrom(before))
+}
+
 func (a *APIServer) syncWorkspace(c *gin.Context, name string) {
 	ws, err := a.getOwnedWorkspace(c, name)
 	if err != nil {
@@ -468,6 +490,7 @@ func (a *APIServer) syncWorkspace(c *gin.Context, name string) {
 		c.JSON(http.StatusConflict, gin.H{"error": fmt.Sprintf("workspace %q is not running (phase: %s)", name, ws.Status.Phase)})
 		return
 	}
+	a.touchWorkspaceActivity(c, ws)
 
 	namespace := ws.Namespace
 	podName := ws.Status.PodName
@@ -590,6 +613,7 @@ func (a *APIServer) execWorkspace(c *gin.Context, name string) {
 		c.JSON(http.StatusConflict, gin.H{"error": fmt.Sprintf("workspace %q is not running (phase: %s)", name, ws.Status.Phase)})
 		return
 	}
+	a.touchWorkspaceActivity(c, ws)
 
 	restCfg, err := getRESTConfigFromRequest(c)
 	if err != nil {
@@ -618,6 +642,7 @@ func (a *APIServer) shellWorkspace(c *gin.Context, name string) {
 		c.JSON(http.StatusConflict, gin.H{"error": fmt.Sprintf("workspace %q is not running (phase: %s)", name, ws.Status.Phase)})
 		return
 	}
+	a.touchWorkspaceActivity(c, ws)
 
 	namespace := ws.Namespace
 	podName := ws.Status.PodName
@@ -744,6 +769,7 @@ func (a *APIServer) deployWorkspace(c *gin.Context, name string) {
 		c.JSON(http.StatusConflict, gin.H{"error": fmt.Sprintf("workspace %q is not running (phase: %s)", name, ws.Status.Phase)})
 		return
 	}
+	a.touchWorkspaceActivity(c, ws)
 
 	if ws.Spec.LeaseID == "" {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "no Jumpstarter lease associated with this workspace"})
@@ -914,13 +940,35 @@ func workspaceResponseFromCR(ws *automotivev1alpha1.Workspace) WorkspaceResponse
 	if !ws.CreationTimestamp.IsZero() {
 		age =
time.Since(ws.CreationTimestamp.Time).Truncate(time.Second).String() } + var autoPauseTimeout string + switch { + case ws.Spec.AutoPauseTimeoutMinutes == nil: + autoPauseTimeout = "default" + case *ws.Spec.AutoPauseTimeoutMinutes == 0: + autoPauseTimeout = "disabled" + default: + autoPauseTimeout = fmt.Sprintf("%dm", *ws.Spec.AutoPauseTimeoutMinutes) + } + + lastActivity := "" + if ws.Status.LastActivityTime != nil { + elapsed := time.Since(ws.Status.LastActivityTime.Time) + if elapsed < time.Minute { + lastActivity = "just now" + } else { + lastActivity = elapsed.Truncate(time.Minute).String() + " ago" + } + } + return WorkspaceResponse{ - Name: ws.Name, - Phase: phase, - Lease: ws.Spec.LeaseID, - Arch: ws.Spec.Architecture, - PodName: ws.Status.PodName, - Age: age, + Name: ws.Name, + Phase: phase, + Lease: ws.Spec.LeaseID, + Arch: ws.Spec.Architecture, + PodName: ws.Status.PodName, + Age: age, + AutoPauseTimeout: autoPauseTimeout, + LastActivity: lastActivity, } } diff --git a/internal/controller/workspace/controller.go b/internal/controller/workspace/controller.go index d7fd4b56..d3bc5d0e 100644 --- a/internal/controller/workspace/controller.go +++ b/internal/controller/workspace/controller.go @@ -2,8 +2,13 @@ package workspace import ( + "bytes" "context" "fmt" + "net/http" + "strings" + "sync" + "time" automotivev1alpha1 "github.com/centos-automotive-suite/automotive-dev-operator/api/v1alpha1" "github.com/go-logr/logr" @@ -12,6 +17,10 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + kscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -28,14 +37,20 @@ const ( // Reconciler reconciles a Workspace object. 
type Reconciler struct { client.Client - Scheme *runtime.Scheme - Log logr.Logger + Scheme *runtime.Scheme + Log logr.Logger + RestConfig *rest.Config + + clientset kubernetes.Interface + clientsetErr error + clientsetOnce sync.Once } // +kubebuilder:rbac:groups=automotive.sdv.cloud.redhat.com,resources=workspaces,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=automotive.sdv.cloud.redhat.com,resources=workspaces/status,verbs=get;update;patch // +kubebuilder:rbac:groups=automotive.sdv.cloud.redhat.com,resources=workspaces/finalizers,verbs=update // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="",resources=pods/exec,verbs=create // +kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;create;delete // Reconcile handles Workspace CR changes. @@ -65,7 +80,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu if err := r.deleteWorkspacePod(ctx, ws, log); err != nil { return ctrl.Result{}, err } - return ctrl.Result{}, r.setStatus(ctx, ws, "Stopped", "") + // Preserve existing message (e.g., auto-pause reason) if already Stopped + msg := ws.Status.Message + return ctrl.Result{}, r.setStatus(ctx, ws, "Stopped", msg) } pod, err := r.ensurePod(ctx, ws, log) @@ -94,7 +111,16 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } } - return ctrl.Result{}, r.setStatus(ctx, ws, phase, msg) + if err := r.setStatus(ctx, ws, phase, msg); err != nil { + return ctrl.Result{}, err + } + + // Auto-pause check: only for Running workspaces + if phase == "Running" { + return r.checkAutoPause(ctx, ws, log) + } + + return ctrl.Result{}, nil } func (r *Reconciler) ensurePVC(ctx context.Context, ws *automotivev1alpha1.Workspace) error { @@ -345,9 +371,154 @@ func (r *Reconciler) setStatus(ctx context.Context, ws *automotivev1alpha1.Works ws.Status.Phase = phase ws.Status.Message = message ws.Status.PodName = 
podName
+	if phase == "Stopped" || phase == "Pending" || phase == "Creating" {
+		ws.Status.LastActivityTime = nil
+	}
 	return r.Status().Patch(ctx, ws, patch)
 }
 
+// getAutoPauseTimeout returns the effective auto-pause timeout for a workspace.
+// Precedence: spec override, the "config" OperatorConfig, the default. Returns 0 if disabled.
+func (r *Reconciler) getAutoPauseTimeout(ctx context.Context, ws *automotivev1alpha1.Workspace) time.Duration {
+	if override := ws.Spec.AutoPauseTimeoutMinutes; override != nil {
+		if *override <= 0 {
+			return 0
+		}
+		return time.Duration(*override) * time.Minute
+	}
+
+	oc := &automotivev1alpha1.OperatorConfig{}
+	if err := r.Get(ctx, client.ObjectKey{Name: "config", Namespace: ws.Namespace}, oc); err == nil {
+		// GetAutoPauseTimeoutMinutes accepts a nil receiver and then reports the default.
+		return time.Duration(oc.Spec.Workspaces.GetAutoPauseTimeoutMinutes()) * time.Minute
+	}
+	return time.Duration(automotivev1alpha1.DefaultAutoPauseTimeoutMinutes) * time.Minute
+}
+
+// checkAutoPause checks if a Running workspace should be auto-paused due to inactivity.
+// It stops the workspace once idle for the effective timeout, else schedules a recheck.
+func (r *Reconciler) checkAutoPause(ctx context.Context, ws *automotivev1alpha1.Workspace, log logr.Logger) (ctrl.Result, error) {
+	timeout := r.getAutoPauseTimeout(ctx, ws)
+	if timeout == 0 {
+		return ctrl.Result{}, nil
+	}
+
+	checkInterval := timeout / 3
+	if maxInterval := 5 * time.Minute; checkInterval > maxInterval {
+		checkInterval = maxInterval
+	}
+
+	active, err := r.isWorkspaceActive(ctx, ws)
+	if err != nil {
+		log.V(1).Info("Failed to check workspace activity, will retry", "error", err)
+		return ctrl.Result{RequeueAfter: checkInterval}, nil
+	}
+
+	// Active or first idle check: requeue, refreshing the activity timestamp only when
+	// it is unset or older than checkInterval to avoid an API write on every check.
+	if active || ws.Status.LastActivityTime == nil {
+		last := ws.Status.LastActivityTime
+		if last == nil || time.Since(last.Time) > checkInterval {
+			now := metav1.Now()
+			patch := client.MergeFrom(ws.DeepCopy())
+			ws.Status.LastActivityTime = &now
+			if err := r.Status().Patch(ctx, ws, patch); err != nil {
+				return ctrl.Result{}, err
+			}
+		}
+		return ctrl.Result{RequeueAfter: checkInterval}, nil
+	}
+
+	// Workspace is idle — check if timeout has expired
+	idleDuration := time.Since(ws.Status.LastActivityTime.Time)
+	if idleDuration < timeout {
+		return ctrl.Result{RequeueAfter: timeout - idleDuration}, nil
+	}
+
+	log.Info("Auto-pausing idle workspace", "workspace", ws.Name,
+		"idleDuration", idleDuration.Truncate(time.Second), "timeout", timeout)
+
+	// Optimistic lock: if the object changed after ws was read (e.g. a fresh
+	// status.lastActivityTime), the patch conflicts and the returned error triggers a retry.
+	specPatch := client.MergeFromWithOptions(ws.DeepCopy(), client.MergeFromWithOptimisticLock{})
+	ws.Spec.Stopped = true
+	if err := r.Patch(ctx, ws, specPatch); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to auto-pause workspace: %w", err)
+	}
+
+	msg := fmt.Sprintf("Auto-paused after %s of inactivity", idleDuration.Truncate(time.Minute))
+	return ctrl.Result{}, r.setStatus(ctx, ws, "Stopped", msg)
+}
+
+// getClientset returns the cached Kubernetes clientset, creating it on first use.
+// A nil RestConfig is reported as an error rather than handed to NewForConfig.
+func (r *Reconciler) getClientset() (kubernetes.Interface, error) {
+	r.clientsetOnce.Do(func() {
+		if r.RestConfig == nil {
+			r.clientsetErr = fmt.Errorf("rest config is not set")
+			return
+		}
+		r.clientset, r.clientsetErr = kubernetes.NewForConfig(r.RestConfig)
+	})
+	if r.clientsetErr != nil {
+		return nil, fmt.Errorf("creating clientset: %w", r.clientsetErr)
+	}
+	return r.clientset, nil
+}
+
+// isWorkspaceActive execs into the workspace pod to check for user activity.
+// Returns true if active sessions or build processes are detected.
+func (r *Reconciler) isWorkspaceActive(ctx context.Context, ws *automotivev1alpha1.Workspace) (bool, error) {
+	cs, err := r.getClientset()
+	if err != nil {
+		return false, err
+	}
+
+	// The probe prints "active" when either signal fires and "idle" otherwise:
+	//   1. Active pts sessions (caib workspace shell connections).
+	//   2. Exec'd processes: in Kubernetes these have PPID=0 inside the container
+	//      PID namespace (their real parent is outside). PID 1 is the entrypoint,
+	//      so any other PPID=0 process (besides this check) is user activity.
+	probe := `pts=$(ls /dev/pts/ 2>/dev/null | grep -cE '^[0-9]+$'); ` +
+		`if [ "$pts" -gt 0 ]; then echo active; exit 0; fi; ` +
+		`extra=$(ps -eo pid,ppid --no-headers | awk '$1 != 1 && $2 == 0 {c++} END {print c+0}'); ` +
+		`if [ "$extra" -gt 1 ]; then echo active; exit 0; fi; ` +
+		`echo idle`
+
+	// Run the probe without stdin or a TTY; only its output streams are needed.
+	execOpts := &corev1.PodExecOptions{
+		Container: containerName,
+		Command:   []string{"/bin/sh", "-c", probe},
+		Stdin:     false,
+		Stdout:    true,
+		Stderr:    true,
+		TTY:       false,
+	}
+	target := cs.CoreV1().RESTClient().Post().
+		Resource("pods").Namespace(ws.Namespace).Name("workspace-" + ws.Name).
+		SubResource("exec").
+		VersionedParams(execOpts, kscheme.ParameterCodec).
+		URL()
+
+	spdy, err := remotecommand.NewSPDYExecutor(r.RestConfig, http.MethodPost, target)
+	if err != nil {
+		return false, fmt.Errorf("creating executor: %w", err)
+	}
+
+	// Cap the probe at ten seconds.
+	probeCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
+	var outBuf, errBuf bytes.Buffer
+	streams := remotecommand.StreamOptions{Stdout: &outBuf, Stderr: &errBuf}
+	if err := spdy.StreamWithContext(probeCtx, streams); err != nil {
+		return false, fmt.Errorf("exec failed: %w (stderr: %s)", err, errBuf.String())
+	}
+
+	// Any output other than "active" (after trimming) is treated as idle.
+	return strings.TrimSpace(outBuf.String()) == "active", nil
+}
+
 // SetupWithManager sets up the controller with the Manager.
 func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
 	return ctrl.NewControllerManagedBy(mgr).
diff --git a/internal/controller/workspace/controller_test.go b/internal/controller/workspace/controller_test.go index d4809240..1f0bca26 100644 --- a/internal/controller/workspace/controller_test.go +++ b/internal/controller/workspace/controller_test.go @@ -18,7 +18,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" ) -const phaseStopped = "Stopped" +const ( + phaseRunning = "Running" + phaseStopped = "Stopped" +) func newTestScheme() *runtime.Scheme { scheme := runtime.NewScheme() @@ -55,7 +58,7 @@ func runningWorkspace(name, namespace string) (*automotivev1alpha1.Workspace, *c Architecture: "amd64", }, Status: automotivev1alpha1.WorkspaceStatus{ - Phase: "Running", + Phase: phaseRunning, PVCName: name + pvcSuffix, PodName: "workspace-" + name, }, @@ -276,7 +279,7 @@ func TestSetStatus_StoppedClearsPodName(t *testing.T) { Namespace: "default", }, Status: automotivev1alpha1.WorkspaceStatus{ - Phase: "Running", + Phase: phaseRunning, PodName: "workspace-test-ws", }, } @@ -316,7 +319,7 @@ func TestSetStatus_RunningSetsPodName(t *testing.T) { r, fc := newTestReconciler(ws) ctx := context.Background() - err := r.setStatus(ctx, ws, "Running", "") + err := r.setStatus(ctx, ws, phaseRunning, "") if err != nil { t.Fatalf("setStatus() error = %v", err) } @@ -328,8 +331,8 @@ func TestSetStatus_RunningSetsPodName(t *testing.T) { if updated.Status.PodName != "workspace-test-ws" { t.Errorf("expected PodName %q, got %q", "workspace-test-ws", updated.Status.PodName) } - if updated.Status.Phase != "Running" { - t.Errorf("expected phase %q, got %q", "Running", updated.Status.Phase) + if updated.Status.Phase != phaseRunning { + t.Errorf("expected phase %q, got %q", phaseRunning, updated.Status.Phase) } }