Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ spec:
- --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
- --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
{{- end }}
{{- if not .Values.namespaceRestriction.enabled }}
- --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
{{- end }}
command:
- /manager
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

{{- if .Values.namespaceRestriction.enabled }}
# Namespace-restricted mode: Role + ServiceAccount + RoleBinding
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: planner-serviceaccount
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
{{- if .Values.dynamo.dockerRegistry.useKubernetesSecret }}
imagePullSecrets:
- name: {{ include "dynamo-operator.componentsDockerRegistrySecretName" . }}
Expand All @@ -27,7 +32,9 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: planner-role
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
- apiGroups: ["nvidia.com"]
resources: ["dynamocomponentdeployments", "dynamographdeployments"]
Expand All @@ -37,12 +44,28 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: planner-binding
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
subjects:
- kind: ServiceAccount
name: planner-serviceaccount
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
roleRef:
kind: Role
name: planner-role
apiGroup: rbac.authorization.k8s.io
apiGroup: rbac.authorization.k8s.io
{{- else }}
# Cluster-wide mode: ClusterRole for planner
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "dynamo-operator.fullname" . }}-planner
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
- apiGroups: ["nvidia.com"]
resources: ["dynamocomponentdeployments", "dynamographdeployments"]
verbs: ["get", "list", "create", "update", "patch"]
{{- end }}
11 changes: 11 additions & 0 deletions deploy/cloud/operator/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/rbac"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secret"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secrets"
istioclientsetscheme "istio.io/client-go/pkg/clientset/versioned/scheme"
Expand Down Expand Up @@ -137,6 +138,7 @@ func main() {
var prometheusEndpoint string
var mpiRunSecretName string
var mpiRunSecretNamespace string
var plannerClusterRoleName string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand Down Expand Up @@ -175,6 +177,8 @@ func main() {
"Name of the secret containing the SSH key for MPI Run (required)")
flag.StringVar(&mpiRunSecretNamespace, "mpi-run-ssh-secret-namespace", "",
"Namespace where the MPI SSH secret is located (required)")
flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
"Name of the ClusterRole for planner (cluster-wide mode only)")
opts := zap.Options{
Development: true,
}
Expand Down Expand Up @@ -225,6 +229,9 @@ func main() {
MpiRun: commonController.MpiRunConfig{
SecretName: mpiRunSecretName,
},
RBAC: commonController.RBACConfig{
PlannerClusterRoleName: plannerClusterRoleName,
},
}

mainCtx := ctrl.SetupSignalHandler()
Expand Down Expand Up @@ -421,13 +428,17 @@ func main() {
os.Exit(1)
}

// Initialize RBAC manager for cross-namespace resource management
rbacManager := rbac.NewManager(mgr.GetClient())

if err = (&controller.DynamoGraphDeploymentReconciler{
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeployment"),
Config: ctrlConfig,
DockerSecretRetriever: dockerSecretRetriever,
ScaleClient: scaleClient,
MPISecretReplicator: mpiSecretReplicator,
RBACMgr: rbacManager,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment")
os.Exit(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ type etcdStorage interface {
DeleteKeys(ctx context.Context, prefix string) error
}

// rbacManager interface for managing RBAC resources
type rbacManager interface {
EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}

// DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
type DynamoGraphDeploymentReconciler struct {
client.Client
Expand All @@ -71,6 +76,7 @@ type DynamoGraphDeploymentReconciler struct {
DockerSecretRetriever dockerSecretRetriever
ScaleClient scale.ScalesGetter
MPISecretReplicator *secret.SecretReplicator
RBACMgr rbacManager
}

// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -158,6 +164,19 @@ type Resource interface {
func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)

// Ensure planner RBAC exists in cluster-wide mode
if r.Config.RestrictedNamespace == "" {
if err := r.RBACMgr.EnsureServiceAccountWithRBAC(
ctx,
dynamoDeployment.Namespace,
consts.PlannerServiceAccountName,
r.Config.RBAC.PlannerClusterRoleName,
); err != nil {
logger.Error(err, "Failed to ensure planner RBAC")
return "", "", "", fmt.Errorf("failed to ensure planner RBAC: %w", err)
}
}

// Reconcile top-level PVCs first
err := r.reconcilePVCs(ctx, dynamoDeployment)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ type Config struct {
// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
PrometheusEndpoint string
MpiRun MpiRunConfig
// RBAC configuration for cross-namespace resource management
RBAC RBACConfig
}

// RBACConfig holds configuration for RBAC management
type RBACConfig struct {
// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only)
PlannerClusterRoleName string
}

type IngressConfig struct {
Expand Down
142 changes: 142 additions & 0 deletions deploy/cloud/operator/internal/rbac/manager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

package rbac

import (
"context"
"fmt"

corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

// Manager handles dynamic RBAC creation for cluster-wide operator installations.
// It provisions per-target-namespace ServiceAccounts and RoleBindings that bind
// to a pre-existing ClusterRole (created by Helm).
type Manager struct {
	// client is the controller-runtime client used to read and write the RBAC objects.
	client client.Client
}

// NewManager returns a Manager that uses the given controller-runtime client
// to create and reconcile RBAC resources.
func NewManager(c client.Client) *Manager {
	m := &Manager{client: c}
	return m
}

// EnsureServiceAccountWithRBAC creates or updates a ServiceAccount and RoleBinding
// in the target namespace. This should ONLY be called in cluster-wide mode.
//
// In cluster-wide mode, the operator dynamically creates:
// - ServiceAccount in the target namespace
// - RoleBinding in the target namespace that binds the SA to a ClusterRole
//
// The ClusterRole must already exist (created by Helm).
//
// Parameters:
// - ctx: context
// - targetNamespace: namespace to create RBAC resources in
// - serviceAccountName: name of the ServiceAccount to create
// - clusterRoleName: name of the ClusterRole to bind to (must exist)
func (m *Manager) EnsureServiceAccountWithRBAC(
	ctx context.Context,
	targetNamespace string,
	serviceAccountName string,
	clusterRoleName string,
) error {
	if err := m.ensureServiceAccount(ctx, targetNamespace, serviceAccountName); err != nil {
		return err
	}
	return m.ensureRoleBinding(ctx, targetNamespace, serviceAccountName, clusterRoleName)
}

// managedLabels returns the common labels stamped on every resource this manager creates.
func managedLabels(serviceAccountName string) map[string]string {
	return map[string]string{
		"app.kubernetes.io/managed-by": "dynamo-operator",
		"app.kubernetes.io/component":  "rbac",
		"app.kubernetes.io/name":       serviceAccountName,
	}
}

// ensureServiceAccount creates the ServiceAccount in targetNamespace if it does
// not already exist. An existing ServiceAccount is left untouched.
func (m *Manager) ensureServiceAccount(ctx context.Context, targetNamespace, serviceAccountName string) error {
	logger := log.FromContext(ctx)

	sa := &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      serviceAccountName,
			Namespace: targetNamespace,
			Labels:    managedLabels(serviceAccountName),
		},
	}

	if err := m.client.Get(ctx, client.ObjectKeyFromObject(sa), sa); err != nil {
		if !apierrors.IsNotFound(err) {
			return fmt.Errorf("failed to get service account: %w", err)
		}
		// ServiceAccount doesn't exist, create it
		if err := m.client.Create(ctx, sa); err != nil {
			return fmt.Errorf("failed to create service account: %w", err)
		}
		logger.V(1).Info("ServiceAccount created",
			"serviceAccount", serviceAccountName,
			"namespace", targetNamespace)
		return nil
	}

	logger.V(1).Info("ServiceAccount already exists",
		"serviceAccount", serviceAccountName,
		"namespace", targetNamespace)
	return nil
}

// ensureRoleBinding creates or reconciles the RoleBinding that binds the
// ServiceAccount to the ClusterRole. RoleRef is immutable on RoleBindings, so
// if the referenced ClusterRole has changed the stale binding is deleted and
// recreated rather than updated in place (an in-place RoleRef update would be
// rejected by the API server, and only touching Subjects would leave the
// binding pointing at the old ClusterRole forever).
func (m *Manager) ensureRoleBinding(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error {
	logger := log.FromContext(ctx)

	roleBindingName := fmt.Sprintf("%s-binding", serviceAccountName)
	rb := &rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{
			Name:      roleBindingName,
			Namespace: targetNamespace,
			Labels:    managedLabels(serviceAccountName),
		},
		Subjects: []rbacv1.Subject{{
			Kind:      "ServiceAccount",
			Name:      serviceAccountName,
			Namespace: targetNamespace,
		}},
		RoleRef: rbacv1.RoleRef{
			APIGroup: "rbac.authorization.k8s.io",
			Kind:     "ClusterRole",
			Name:     clusterRoleName,
		},
	}

	existingRB := &rbacv1.RoleBinding{}
	if err := m.client.Get(ctx, client.ObjectKeyFromObject(rb), existingRB); err != nil {
		if !apierrors.IsNotFound(err) {
			return fmt.Errorf("failed to get role binding: %w", err)
		}
		// RoleBinding doesn't exist, create it
		if err := m.client.Create(ctx, rb); err != nil {
			return fmt.Errorf("failed to create role binding: %w", err)
		}
		logger.V(1).Info("RoleBinding created",
			"roleBinding", roleBindingName,
			"clusterRole", clusterRoleName,
			"namespace", targetNamespace)
		return nil
	}

	// RoleRef drifted: delete and recreate, since RoleRef is immutable.
	if existingRB.RoleRef.Name != clusterRoleName || existingRB.RoleRef.Kind != "ClusterRole" {
		if err := m.client.Delete(ctx, existingRB); err != nil && !apierrors.IsNotFound(err) {
			return fmt.Errorf("failed to delete stale role binding: %w", err)
		}
		if err := m.client.Create(ctx, rb); err != nil {
			return fmt.Errorf("failed to recreate role binding: %w", err)
		}
		logger.V(1).Info("RoleBinding recreated with new RoleRef",
			"roleBinding", roleBindingName,
			"clusterRole", clusterRoleName,
			"namespace", targetNamespace)
		return nil
	}

	// Subjects drifted: an in-place update is allowed for Subjects.
	if len(existingRB.Subjects) != 1 ||
		existingRB.Subjects[0].Name != serviceAccountName ||
		existingRB.Subjects[0].Namespace != targetNamespace {
		existingRB.Subjects = rb.Subjects
		if err := m.client.Update(ctx, existingRB); err != nil {
			return fmt.Errorf("failed to update role binding: %w", err)
		}
		logger.V(1).Info("RoleBinding updated",
			"roleBinding", roleBindingName,
			"clusterRole", clusterRoleName,
			"namespace", targetNamespace)
		return nil
	}

	logger.V(1).Info("RoleBinding already up-to-date",
		"roleBinding", roleBindingName,
		"clusterRole", clusterRoleName,
		"namespace", targetNamespace)
	return nil
}
Loading
Loading