4 changes: 2 additions & 2 deletions deploy/cloud/helm/platform/Chart.yaml
@@ -35,11 +35,11 @@ dependencies:
repository: "https://charts.bitnami.com/bitnami"
condition: etcd.enabled
- name: kai-scheduler
version: v0.9.2
version: v0.9.4
repository: oci://ghcr.io/nvidia/kai-scheduler
condition: kai-scheduler.enabled
- name: grove-charts
alias: grove
version: v0.1.0-alpha.1
version: v0.1.0-alpha.2
repository: oci://ghcr.io/nvidia/grove
condition: grove.enabled
4 changes: 2 additions & 2 deletions deploy/cloud/helm/platform/README.md
@@ -89,8 +89,8 @@ The chart includes built-in validation to prevent all operator conflicts:
| file://components/operator | dynamo-operator | 0.5.0 |
| https://charts.bitnami.com/bitnami | etcd | 12.0.18 |
| https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 |
| oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.1 |
| oci://ghcr.io/nvidia/kai-scheduler | kai-scheduler | v0.9.2 |
| oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.2 |
| oci://ghcr.io/nvidia/kai-scheduler | kai-scheduler | v0.9.4 |

## Values

@@ -124,6 +124,9 @@ spec:
- --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
- --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
{{- end }}
{{- if not .Values.namespaceRestriction.enabled }}
- --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
{{- end }}
command:
- /manager
env:
@@ -12,12 +12,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

{{- if .Values.namespaceRestriction.enabled }}
# Namespace-restricted mode: Role + ServiceAccount + RoleBinding
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: planner-serviceaccount
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
{{- if .Values.dynamo.dockerRegistry.useKubernetesSecret }}
imagePullSecrets:
- name: {{ include "dynamo-operator.componentsDockerRegistrySecretName" . }}
@@ -27,7 +32,9 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: planner-role
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
- apiGroups: ["nvidia.com"]
resources: ["dynamocomponentdeployments", "dynamographdeployments"]
@@ -37,12 +44,28 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: planner-binding
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
subjects:
- kind: ServiceAccount
name: planner-serviceaccount
namespace: {{ .Values.namespace }}
namespace: {{ .Release.Namespace }}
roleRef:
kind: Role
name: planner-role
apiGroup: rbac.authorization.k8s.io
apiGroup: rbac.authorization.k8s.io
{{- else }}
# Cluster-wide mode: ClusterRole for planner
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "dynamo-operator.fullname" . }}-planner
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
- apiGroups: ["nvidia.com"]
resources: ["dynamocomponentdeployments", "dynamographdeployments"]
verbs: ["get", "list", "create", "update", "patch"]
{{- end }}
17 changes: 17 additions & 0 deletions deploy/cloud/operator/cmd/main.go
@@ -60,6 +60,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/rbac"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secret"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/secrets"
istioclientsetscheme "istio.io/client-go/pkg/clientset/versioned/scheme"
@@ -116,6 +117,7 @@ func init() {
//+kubebuilder:scaffold:scheme
}

//nolint:gocyclo
func main() {
var metricsAddr string
var enableLeaderElection bool
@@ -137,6 +139,7 @@ func main() {
var prometheusEndpoint string
var mpiRunSecretName string
var mpiRunSecretNamespace string
var plannerClusterRoleName string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -175,12 +178,19 @@ func main() {
"Name of the secret containing the SSH key for MPI Run (required)")
flag.StringVar(&mpiRunSecretNamespace, "mpi-run-ssh-secret-namespace", "",
"Namespace where the MPI SSH secret is located (required)")
flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
"Name of the ClusterRole for planner (cluster-wide mode only)")
opts := zap.Options{
Development: true,
}
opts.BindFlags(flag.CommandLine)
flag.Parse()

if restrictedNamespace == "" && plannerClusterRoleName == "" {
setupLog.Error(nil, "planner-cluster-role-name is required in cluster-wide mode")
os.Exit(1)
}

// Validate modelExpressURL if provided
if modelExpressURL != "" {
if _, err := url.Parse(modelExpressURL); err != nil {
@@ -225,6 +235,9 @@ func main() {
MpiRun: commonController.MpiRunConfig{
SecretName: mpiRunSecretName,
},
RBAC: commonController.RBACConfig{
PlannerClusterRoleName: plannerClusterRoleName,
},
}

mainCtx := ctrl.SetupSignalHandler()
@@ -421,13 +434,17 @@ func main() {
os.Exit(1)
}

// Initialize RBAC manager for cross-namespace resource management
rbacManager := rbac.NewManager(mgr.GetClient())

if err = (&controller.DynamoGraphDeploymentReconciler{
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeployment"),
Config: ctrlConfig,
DockerSecretRetriever: dockerSecretRetriever,
ScaleClient: scaleClient,
MPISecretReplicator: mpiSecretReplicator,
RBACManager: rbacManager,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment")
os.Exit(1)
2 changes: 1 addition & 1 deletion deploy/cloud/operator/go.mod
@@ -6,7 +6,7 @@ toolchain go1.24.3

require (
emperror.dev/errors v0.8.1
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.2
github.com/bsm/gomega v1.27.10
github.com/go-logr/logr v1.4.2
github.com/google/go-cmp v0.7.0
4 changes: 2 additions & 2 deletions deploy/cloud/operator/go.sum
@@ -1,7 +1,7 @@
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1 h1:4DE6ZGa/3muBa5gk1GtJskMVss6GjeCPpn+xTnR1h9w=
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.2 h1:Xg2hrC5eKJ0jhStGFoGX0DnUdt/75K5fJvOxKkN54oU=
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.2/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
@@ -63,6 +63,11 @@ type etcdStorage interface {
DeleteKeys(ctx context.Context, prefix string) error
}

// rbacManager interface for managing RBAC resources
type rbacManager interface {
EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}

// DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
type DynamoGraphDeploymentReconciler struct {
client.Client
@@ -71,6 +76,7 @@ type DynamoGraphDeploymentReconciler struct {
DockerSecretRetriever dockerSecretRetriever
ScaleClient scale.ScalesGetter
MPISecretReplicator *secret.SecretReplicator
RBACManager rbacManager
}

// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
@@ -158,6 +164,25 @@ type Resource interface {
func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)

// Ensure planner RBAC exists in cluster-wide mode
if r.Config.RestrictedNamespace == "" {
if r.RBACManager == nil {
return "", "", "", fmt.Errorf("RBAC manager not initialized in cluster-wide mode")
}
if r.Config.RBAC.PlannerClusterRoleName == "" {
return "", "", "", fmt.Errorf("planner ClusterRole name is required in cluster-wide mode")
}
if err := r.RBACManager.EnsureServiceAccountWithRBAC(
ctx,
dynamoDeployment.Namespace,
consts.PlannerServiceAccountName,
r.Config.RBAC.PlannerClusterRoleName,
); err != nil {
logger.Error(err, "Failed to ensure planner RBAC")
return "", "", "", fmt.Errorf("failed to ensure planner RBAC: %w", err)
}
}

// Reconcile top-level PVCs first
err := r.reconcilePVCs(ctx, dynamoDeployment)
if err != nil {
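Note on the new `internal/rbac` package: its implementation is not included in this diff excerpt, so the sketch below only illustrates what `rbac.NewManager` and `EnsureServiceAccountWithRBAC` plausibly do in cluster-wide mode: ensure the planner ServiceAccount exists in the DynamoGraphDeployment's namespace and bind it to the chart-created ClusterRole through a namespaced RoleBinding, so the cluster-scoped permissions apply only inside that namespace. The `Manager` struct shape, the `"-binding"` name suffix, and the use of `controllerutil.CreateOrUpdate` are illustrative assumptions, not the actual code.

```go
package rbac

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// Manager ensures per-namespace planner RBAC objects exist (sketch only).
type Manager struct {
	client client.Client
}

// NewManager returns a Manager backed by the given controller-runtime client.
func NewManager(c client.Client) *Manager {
	return &Manager{client: c}
}

// EnsureServiceAccountWithRBAC creates (or updates) the ServiceAccount in the
// target namespace and binds it to the named ClusterRole with a RoleBinding,
// which scopes the ClusterRole's permissions to that namespace only.
func (m *Manager) EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error {
	sa := &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{Name: serviceAccountName, Namespace: targetNamespace},
	}
	if _, err := controllerutil.CreateOrUpdate(ctx, m.client, sa, func() error { return nil }); err != nil {
		return err
	}

	// Hypothetical binding name; the real operator may use a different convention.
	rb := &rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{Name: serviceAccountName + "-binding", Namespace: targetNamespace},
	}
	_, err := controllerutil.CreateOrUpdate(ctx, m.client, rb, func() error {
		rb.Subjects = []rbacv1.Subject{{
			Kind:      rbacv1.ServiceAccountKind,
			Name:      serviceAccountName,
			Namespace: targetNamespace,
		}}
		rb.RoleRef = rbacv1.RoleRef{
			APIGroup: rbacv1.GroupName,
			Kind:     "ClusterRole",
			Name:     clusterRoleName,
		}
		return nil
	})
	return err
}
```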
@@ -66,6 +66,14 @@ type Config struct {
// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
PrometheusEndpoint string
MpiRun MpiRunConfig
// RBAC configuration for cross-namespace resource management
RBAC RBACConfig
}

// RBACConfig holds configuration for RBAC management
type RBACConfig struct {
// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only)
PlannerClusterRoleName string
}

type IngressConfig struct {