
Commit bfbcae7

fix: setup planner serviceaccount in cluster-wide deployment (#3520)
Signed-off-by: Julien Mancuso <[email protected]>
1 parent 07a6474 commit bfbcae7

11 files changed: +1277 −12 lines

deploy/cloud/helm/platform/Chart.yaml

Lines changed: 2 additions & 2 deletions
@@ -35,11 +35,11 @@ dependencies:
     repository: "https://charts.bitnami.com/bitnami"
     condition: etcd.enabled
   - name: kai-scheduler
-    version: v0.9.2
+    version: v0.9.4
     repository: oci://ghcr.io/nvidia/kai-scheduler
     condition: kai-scheduler.enabled
   - name: grove-charts
     alias: grove
-    version: v0.1.0-alpha.1
+    version: v0.1.0-alpha.2
     repository: oci://ghcr.io/nvidia/grove
     condition: grove.enabled

deploy/cloud/helm/platform/README.md

Lines changed: 2 additions & 2 deletions
@@ -89,8 +89,8 @@ The chart includes built-in validation to prevent all operator conflicts:
 | file://components/operator | dynamo-operator | 0.5.0 |
 | https://charts.bitnami.com/bitnami | etcd | 12.0.18 |
 | https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 |
-| oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.1 |
-| oci://ghcr.io/nvidia/kai-scheduler | kai-scheduler | v0.9.2 |
+| oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.2 |
+| oci://ghcr.io/nvidia/kai-scheduler | kai-scheduler | v0.9.4 |
 
 ## Values
 

deploy/cloud/helm/platform/components/operator/templates/deployment.yaml

Lines changed: 3 additions & 0 deletions
@@ -124,6 +124,9 @@ spec:
             - --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
             - --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
             {{- end }}
+            {{- if not .Values.namespaceRestriction.enabled }}
+            - --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
+            {{- end }}
           command:
             - /manager
           env:

deploy/cloud/helm/platform/components/operator/templates/planner.yaml

Lines changed: 28 additions & 5 deletions
@@ -12,12 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+{{- if .Values.namespaceRestriction.enabled }}
+# Namespace-restricted mode: Role + ServiceAccount + RoleBinding
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: planner-serviceaccount
-  namespace: {{ .Values.namespace }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
 {{- if .Values.dynamo.dockerRegistry.useKubernetesSecret }}
 imagePullSecrets:
 - name: {{ include "dynamo-operator.componentsDockerRegistrySecretName" . }}
@@ -27,7 +32,9 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
   name: planner-role
-  namespace: {{ .Values.namespace }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
 rules:
 - apiGroups: ["nvidia.com"]
   resources: ["dynamocomponentdeployments", "dynamographdeployments"]
@@ -37,12 +44,28 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
   name: planner-binding
-  namespace: {{ .Values.namespace }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
 subjects:
 - kind: ServiceAccount
   name: planner-serviceaccount
-  namespace: {{ .Values.namespace }}
+  namespace: {{ .Release.Namespace }}
 roleRef:
   kind: Role
   name: planner-role
-  apiGroup: rbac.authorization.k8s.io
+  apiGroup: rbac.authorization.k8s.io
+{{- else }}
+# Cluster-wide mode: ClusterRole for planner
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-planner
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+rules:
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamocomponentdeployments", "dynamographdeployments"]
+  verbs: ["get", "list", "create", "update", "patch"]
+{{- end }}

deploy/cloud/operator/cmd/main.go

Lines changed: 17 additions & 0 deletions
@@ -60,6 +60,7 @@ import (
 	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
 	commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
 	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
+	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/rbac"
 	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secret"
 	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secrets"
 	istioclientsetscheme "istio.io/client-go/pkg/clientset/versioned/scheme"
@@ -116,6 +117,7 @@ func init() {
 	//+kubebuilder:scaffold:scheme
 }
 
+//nolint:gocyclo
 func main() {
 	var metricsAddr string
 	var enableLeaderElection bool
@@ -137,6 +139,7 @@
 	var prometheusEndpoint string
 	var mpiRunSecretName string
 	var mpiRunSecretNamespace string
+	var plannerClusterRoleName string
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -175,12 +178,19 @@
 		"Name of the secret containing the SSH key for MPI Run (required)")
 	flag.StringVar(&mpiRunSecretNamespace, "mpi-run-ssh-secret-namespace", "",
 		"Namespace where the MPI SSH secret is located (required)")
+	flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
+		"Name of the ClusterRole for planner (cluster-wide mode only)")
 	opts := zap.Options{
 		Development: true,
 	}
 	opts.BindFlags(flag.CommandLine)
 	flag.Parse()
 
+	if restrictedNamespace == "" && plannerClusterRoleName == "" {
+		setupLog.Error(nil, "planner-cluster-role-name is required in cluster-wide mode")
+		os.Exit(1)
+	}
+
 	// Validate modelExpressURL if provided
 	if modelExpressURL != "" {
 		if _, err := url.Parse(modelExpressURL); err != nil {
@@ -225,6 +235,9 @@
 		MpiRun: commonController.MpiRunConfig{
 			SecretName: mpiRunSecretName,
 		},
+		RBAC: commonController.RBACConfig{
+			PlannerClusterRoleName: plannerClusterRoleName,
+		},
 	}
 
 	mainCtx := ctrl.SetupSignalHandler()
@@ -421,13 +434,17 @@
 		os.Exit(1)
 	}
 
+	// Initialize RBAC manager for cross-namespace resource management
+	rbacManager := rbac.NewManager(mgr.GetClient())
+
 	if err = (&controller.DynamoGraphDeploymentReconciler{
 		Client: mgr.GetClient(),
 		Recorder: mgr.GetEventRecorderFor("dynamographdeployment"),
 		Config: ctrlConfig,
 		DockerSecretRetriever: dockerSecretRetriever,
 		ScaleClient: scaleClient,
 		MPISecretReplicator: mpiSecretReplicator,
+		RBACManager: rbacManager,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment")
 		os.Exit(1)

deploy/cloud/operator/go.mod

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ toolchain go1.24.3
 
 require (
 	emperror.dev/errors v0.8.1
-	github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1
+	github.com/NVIDIA/grove/operator/api v0.1.0-alpha.2
 	github.com/bsm/gomega v1.27.10
 	github.com/go-logr/logr v1.4.2
 	github.com/google/go-cmp v0.7.0

deploy/cloud/operator/go.sum

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
 emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
-github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1 h1:4DE6ZGa/3muBa5gk1GtJskMVss6GjeCPpn+xTnR1h9w=
-github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
+github.com/NVIDIA/grove/operator/api v0.1.0-alpha.2 h1:Xg2hrC5eKJ0jhStGFoGX0DnUdt/75K5fJvOxKkN54oU=
+github.com/NVIDIA/grove/operator/api v0.1.0-alpha.2/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=

deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go

Lines changed: 25 additions & 0 deletions
@@ -63,6 +63,11 @@ type etcdStorage interface {
 	DeleteKeys(ctx context.Context, prefix string) error
 }
 
+// rbacManager interface for managing RBAC resources
+type rbacManager interface {
+	EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
+}
+
 // DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
 type DynamoGraphDeploymentReconciler struct {
 	client.Client
@@ -71,6 +76,7 @@ type DynamoGraphDeploymentReconciler struct {
 	DockerSecretRetriever dockerSecretRetriever
 	ScaleClient scale.ScalesGetter
 	MPISecretReplicator *secret.SecretReplicator
+	RBACManager rbacManager
 }
 
 // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
@@ -158,6 +164,25 @@ type Resource interface {
 func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
 	logger := log.FromContext(ctx)
 
+	// Ensure planner RBAC exists in cluster-wide mode
+	if r.Config.RestrictedNamespace == "" {
+		if r.RBACManager == nil {
+			return "", "", "", fmt.Errorf("RBAC manager not initialized in cluster-wide mode")
+		}
+		if r.Config.RBAC.PlannerClusterRoleName == "" {
+			return "", "", "", fmt.Errorf("planner ClusterRole name is required in cluster-wide mode")
+		}
+		if err := r.RBACManager.EnsureServiceAccountWithRBAC(
+			ctx,
+			dynamoDeployment.Namespace,
+			consts.PlannerServiceAccountName,
+			r.Config.RBAC.PlannerClusterRoleName,
+		); err != nil {
+			logger.Error(err, "Failed to ensure planner RBAC")
+			return "", "", "", fmt.Errorf("failed to ensure planner RBAC: %w", err)
+		}
+	}
+
 	// Reconcile top-level PVCs first
 	err := r.reconcilePVCs(ctx, dynamoDeployment)
 	if err != nil {
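
Note: the new internal/rbac package referenced above (rbac.NewManager, RBACManager) is among the files in this commit that are not reproduced in this excerpt. As orientation only, here is a minimal sketch of what EnsureServiceAccountWithRBAC plausibly does, given the interface signature in the controller: create or update the planner ServiceAccount in the target namespace and bind it to the operator-managed ClusterRole with a namespaced RoleBinding. The Manager struct, the RoleBinding name, and the use of controllerutil.CreateOrUpdate are assumptions, not the commit's actual implementation.

package rbac

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// Manager creates namespace-scoped RBAC objects for planner pods.
// Sketch only: the real implementation lives in internal/rbac (not shown here).
type Manager struct {
	client client.Client
}

func NewManager(c client.Client) *Manager {
	return &Manager{client: c}
}

// EnsureServiceAccountWithRBAC creates (or updates) the ServiceAccount in the
// target namespace and a RoleBinding that grants it the given ClusterRole,
// scoped to that namespace.
func (m *Manager) EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error {
	sa := &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{Name: serviceAccountName, Namespace: targetNamespace},
	}
	if _, err := controllerutil.CreateOrUpdate(ctx, m.client, sa, func() error { return nil }); err != nil {
		return err
	}

	// Hypothetical binding name; the actual naming convention may differ.
	rb := &rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{Name: serviceAccountName + "-binding", Namespace: targetNamespace},
	}
	_, err := controllerutil.CreateOrUpdate(ctx, m.client, rb, func() error {
		rb.Subjects = []rbacv1.Subject{{
			Kind:      rbacv1.ServiceAccountKind,
			Name:      serviceAccountName,
			Namespace: targetNamespace,
		}}
		rb.RoleRef = rbacv1.RoleRef{
			APIGroup: rbacv1.GroupName,
			Kind:     "ClusterRole",
			Name:     clusterRoleName,
		}
		return nil
	})
	return err
}

In namespace-restricted mode these objects come from the Helm template above instead, so the controller skips this path entirely.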

deploy/cloud/operator/internal/controller_common/predicate.go

Lines changed: 8 additions & 0 deletions
@@ -66,6 +66,14 @@ type Config struct {
 	// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
 	PrometheusEndpoint string
 	MpiRun MpiRunConfig
+	// RBAC configuration for cross-namespace resource management
+	RBAC RBACConfig
+}
+
+// RBACConfig holds configuration for RBAC management
+type RBACConfig struct {
+	// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only)
+	PlannerClusterRoleName string
 }
 
 type IngressConfig struct {
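
Note: throughout the operator, an empty RestrictedNamespace in this Config means cluster-wide mode (see the r.Config.RestrictedNamespace == "" check in the controller diff above). A hypothetical helper on controller_common.Config, not part of this commit, that would make the convention explicit:

// ClusterWide reports whether the operator runs without a namespace
// restriction. In that mode the planner ClusterRole name must be supplied
// via --planner-cluster-role-name; otherwise the Helm chart renders a
// namespaced Role/RoleBinding for the planner and the flag is not needed.
func (c Config) ClusterWide() bool {
	return c.RestrictedNamespace == ""
}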
