From a400b94df4c7998d7c2b9474a4d58ba061e42392 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Tue, 7 Oct 2025 16:49:03 -0700 Subject: [PATCH 1/3] feat: initial dynamomodel work Signed-off-by: Hannah Zhang --- .../templates/nvidia.com_dynamomodels.yaml | 218 +++++++++ deploy/cloud/operator/PROJECT | 8 + .../dynamocomponentdeployment_types.go | 5 + .../api/v1alpha1/dynamomodel_types.go | 188 ++++++++ deploy/cloud/operator/cmd/main.go | 7 + .../crd/bases/nvidia.com_dynamomodels.yaml | 219 +++++++++ .../dynamographdeployment_controller.go | 60 ++- .../controller/dynamomodel_controller.go | 453 ++++++++++++++++++ docs/kubernetes/README.md | 4 +- docs/kubernetes/dynamomodel.md | 273 +++++++++++ 10 files changed, 1433 insertions(+), 2 deletions(-) create mode 100644 deploy/cloud/helm/crds/templates/nvidia.com_dynamomodels.yaml create mode 100644 deploy/cloud/operator/api/v1alpha1/dynamomodel_types.go create mode 100644 deploy/cloud/operator/config/crd/bases/nvidia.com_dynamomodels.yaml create mode 100644 deploy/cloud/operator/internal/controller/dynamomodel_controller.go create mode 100644 docs/kubernetes/dynamomodel.md diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamomodels.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamomodels.yaml new file mode 100644 index 0000000000..69da21c121 --- /dev/null +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamomodels.yaml @@ -0,0 +1,218 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: dynamomodels.nvidia.com +spec: + group: nvidia.com + names: + kind: DynamoModel + listKind: DynamoModelList + plural: dynamomodels + shortNames: + - dm + singular: dynamomodel + scope: Namespaced + versions: + - name: v1alpha1 + additionalPrinterColumns: + - jsonPath: .status.state + name: State + type: string + - jsonPath: .spec.name + name: Model + type: string + - jsonPath: .spec.version + name: Version + type: string + - jsonPath: .status.pvcName + name: PVC + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + schema: + openAPIV3Schema: + description: |- + DynamoModel is the Schema for the dynamomodels API. + It provides a high-level abstraction for managing model artifacts cached in PVCs in the cluster. + All jobs referencing the same DynamoModel are guaranteed to use the same artifact, + preventing drift and simplifying maintenance. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the desired state for this model. + properties: + downloaderRef: + description: |- + DownloaderRef is an optional reference to a custom downloader or workflow + (e.g., MLFlow or internal tools). Provides extensibility for specialized workflows + (internal or third-party). + type: string + name: + description: |- + Name is the canonical model name (matches external model repo, e.g. HuggingFace, NGC). + Example: "meta-llama/Llama-3.3-70B-Instruct" + type: string + pvc: + description: PVC defines the persistent volume claim configuration for storing the model. + properties: + create: + default: true + description: Create indicates whether to create a new PVC or use an existing one. + type: boolean + name: + description: Name is the name of the PVC. If not specified, defaults to the DynamoModel name. + type: string + size: + anyOf: + - type: integer + - type: string + description: Size of the volume, used during PVC creation. Required when create is true. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + storageClass: + description: StorageClass to be used for PVC creation. Required when create is true. + type: string + volumeAccessMode: + default: ReadWriteMany + description: VolumeAccessMode is the volume access mode of the PVC. Defaults to ReadWriteMany. + type: string + required: + - create + type: object + secretRef: + description: |- + SecretRef is an optional reference to a secret needed for accessing the source URL + (private repo, S3 credentials, etc.) 
+ type: string + sourceURL: + description: |- + SourceURL is the source location of model weights (can be HF, S3, NGC). + Ensures flexibility in downstream storage strategies; permits flexible source management and credential injection. + Examples: "hf://meta-llama/Llama-3.3-70B-Instruct", "s3://bucket/path/to/model", "ngc://nvidia/model" + type: string + version: + description: |- + Version is a version pin (e.g., SHA or tag from source repository). + This solves version drift by pinning deployments and benchmarking jobs to the same model artifact. + type: string + required: + - name + - pvc + - sourceURL + type: object + status: + description: Status reflects the current observed state of this model. + properties: + conditions: + description: Conditions contains the latest observed conditions of the model. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. 
+ Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + downloadJobName: + description: DownloadJobName is the name of the Job created to download the model. + type: string + lastDownloadTime: + description: LastDownloadTime is the timestamp of the last successful download. + format: date-time + type: string + pvcName: + description: PVCName is the name of the PVC created or used for this model. + type: string + state: + description: |- + State is a high-level textual status of the model lifecycle. 
+ Possible values: "Pending", "Downloading", "Ready", "Failed" + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deploy/cloud/operator/PROJECT b/deploy/cloud/operator/PROJECT index a86430a2c2..38caf65e60 100644 --- a/deploy/cloud/operator/PROJECT +++ b/deploy/cloud/operator/PROJECT @@ -24,4 +24,12 @@ resources: kind: DynamoGraphDeployment path: github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1 version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + controller: true + domain: nvidia.com + kind: DynamoModel + path: github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1 + version: v1alpha1 version: "3" diff --git a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go index b475e90a59..9d69e11062 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go @@ -91,6 +91,11 @@ type DynamoComponentDeploymentSharedSpec struct { EnvFromSecret *string `json:"envFromSecret,omitempty"` // VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. VolumeMounts []VolumeMount `json:"volumeMounts,omitempty"` + // ModelRef references a DynamoModel resource that provides the model artifact for this component. + // When specified, the controller will wait for the model to be ready and automatically mount + // the model's PVC to the component. + // +kubebuilder:validation:Optional + ModelRef string `json:"modelRef,omitempty"` // Ingress config to expose the component outside the cluster (or through a service mesh). 
Ingress *IngressSpec `json:"ingress,omitempty"` diff --git a/deploy/cloud/operator/api/v1alpha1/dynamomodel_types.go b/deploy/cloud/operator/api/v1alpha1/dynamomodel_types.go new file mode 100644 index 0000000000..a927c04d23 --- /dev/null +++ b/deploy/cloud/operator/api/v1alpha1/dynamomodel_types.go @@ -0,0 +1,188 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// DynamoModelSpec defines the desired state of DynamoModel. +type DynamoModelSpec struct { + // Name is the canonical model name (matches external model repo, e.g. HuggingFace, NGC). + // Example: "meta-llama/Llama-3.3-70B-Instruct" + // +kubebuilder:validation:Required + Name string `json:"name"` + + // Version is a version pin (e.g., SHA or tag from source repository). + // This solves version drift by pinning deployments and benchmarking jobs to the same model artifact. + // +kubebuilder:validation:Optional + Version string `json:"version,omitempty"` + + // SourceURL is the source location of model weights (can be HF, S3, NGC). + // Ensures flexibility in downstream storage strategies; permits flexible source management and credential injection. 
+ // Examples: "hf://meta-llama/Llama-3.3-70B-Instruct", "s3://bucket/path/to/model", "ngc://nvidia/model" + // +kubebuilder:validation:Required + SourceURL string `json:"sourceURL"` + + // SecretRef is an optional reference to a secret needed for accessing the source URL + // (private repo, S3 credentials, etc.) + // +kubebuilder:validation:Optional + SecretRef string `json:"secretRef,omitempty"` + + // DownloaderRef is an optional reference to a custom downloader or workflow + // (e.g., MLFlow or internal tools). Provides extensibility for specialized workflows + // (internal or third-party). + // +kubebuilder:validation:Optional + DownloaderRef string `json:"downloaderRef,omitempty"` + + // PVC defines the persistent volume claim configuration for storing the model. + // +kubebuilder:validation:Required + PVC PVCSpec `json:"pvc"` +} + +// PVCSpec defines the PVC configuration for model storage. +type PVCSpec struct { + // Create indicates whether to create a new PVC or use an existing one. + // +kubebuilder:default=true + Create *bool `json:"create,omitempty"` + + // Name is the name of the PVC. If not specified, defaults to the DynamoModel name. + // +kubebuilder:validation:Optional + Name string `json:"name,omitempty"` + + // StorageClass to be used for PVC creation. Required when create is true. + // +kubebuilder:validation:Optional + StorageClass string `json:"storageClass,omitempty"` + + // Size of the volume, used during PVC creation. Required when create is true. + // +kubebuilder:validation:Optional + Size resource.Quantity `json:"size,omitempty"` + + // VolumeAccessMode is the volume access mode of the PVC. Defaults to ReadWriteMany. + // +kubebuilder:default=ReadWriteMany + // +kubebuilder:validation:Optional + VolumeAccessMode corev1.PersistentVolumeAccessMode `json:"volumeAccessMode,omitempty"` +} + +// DynamoModelStatus defines the observed state of DynamoModel. 
+type DynamoModelStatus struct {
+	// State is a high-level textual status of the model lifecycle.
+	// Possible values: "Pending", "Downloading", "Ready", "Failed"
+	State string `json:"state,omitempty"`
+
+	// Conditions contains the latest observed conditions of the model.
+	// +kubebuilder:validation:Optional
+	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
+
+	// PVCName is the name of the PVC created or used for this model.
+	// +kubebuilder:validation:Optional
+	PVCName string `json:"pvcName,omitempty"`
+
+	// DownloadJobName is the name of the Job created to download the model.
+	// +kubebuilder:validation:Optional
+	DownloadJobName string `json:"downloadJobName,omitempty"`
+
+	// LastDownloadTime is the timestamp of the last successful download.
+	// +kubebuilder:validation:Optional
+	LastDownloadTime *metav1.Time `json:"lastDownloadTime,omitempty"`
+}
+
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +kubebuilder:resource:shortName=dm
+// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state`
+// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.name`
+// +kubebuilder:printcolumn:name="Version",type=string,JSONPath=`.spec.version`
+// +kubebuilder:printcolumn:name="PVC",type=string,JSONPath=`.status.pvcName`
+// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
+// DynamoModel is the Schema for the dynamomodels API.
+// It provides a high-level abstraction for managing model artifacts cached in PVCs in the cluster.
+// All jobs referencing the same DynamoModel are guaranteed to use the same artifact,
+// preventing drift and simplifying maintenance.
+type DynamoModel struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Spec defines the desired state for this model.
+	Spec DynamoModelSpec `json:"spec,omitempty"`
+	// Status reflects the current observed state of this model.
+	Status DynamoModelStatus `json:"status,omitempty"`
+}
+
+// +kubebuilder:object:root=true
+
+// DynamoModelList contains a list of DynamoModel.
+type DynamoModelList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []DynamoModel `json:"items"`
+}
+
+func init() {
+	SchemeBuilder.Register(&DynamoModel{}, &DynamoModelList{})
+}
+
+// SetState sets the state of the DynamoModel.
+func (m *DynamoModel) SetState(state string) {
+	m.Status.State = state
+}
+
+// GetSpec returns the spec of the DynamoModel.
+func (m *DynamoModel) GetSpec() any {
+	return m.Spec
+}
+
+// SetSpec sets the spec of the DynamoModel.
+//
+// spec may be a DynamoModelSpec value (the dynamic type GetSpec returns) or a
+// non-nil *DynamoModelSpec. Any other dynamic type leaves the current spec
+// untouched instead of panicking on a failed type assertion.
+func (m *DynamoModel) SetSpec(spec any) {
+	switch s := spec.(type) {
+	case DynamoModelSpec:
+		m.Spec = s
+	case *DynamoModelSpec:
+		if s != nil {
+			m.Spec = *s
+		}
+	}
+}
+
+// AddStatusCondition adds a status condition, or replaces the existing
+// condition that has the same Type.
+//
+// When the condition being replaced already carries the same Status, its
+// LastTransitionTime is kept, so the timestamp only moves when the status
+// actually transitions and repeated calls with an unchanged status do not
+// modify the stored timestamp.
+func (m *DynamoModel) AddStatusCondition(condition metav1.Condition) {
+	for i := range m.Status.Conditions {
+		existing := &m.Status.Conditions[i]
+		if existing.Type != condition.Type {
+			continue
+		}
+		if existing.Status == condition.Status && !existing.LastTransitionTime.IsZero() {
+			condition.LastTransitionTime = existing.LastTransitionTime
+		}
+		*existing = condition
+		return
+	}
+	// No condition of this type yet; append also handles a nil slice.
+	m.Status.Conditions = append(m.Status.Conditions, condition)
+}
+
+// IsReady returns true if the model is in Ready state.
+//
+// The model counts as ready when Status.State is "Ready" or when its "Ready"
+// condition has status True.
+// NOTE(review): the state string is written by the controller from a constant
+// declared outside this file; confirm it equals "Ready". The condition check
+// keeps IsReady correct either way.
+func (m *DynamoModel) IsReady() bool {
+	if m.Status.State == "Ready" {
+		return true
+	}
+	for i := range m.Status.Conditions {
+		if cond := &m.Status.Conditions[i]; cond.Type == "Ready" {
+			return cond.Status == metav1.ConditionTrue
+		}
+	}
+	return false
+}
+
+// GetPVCName returns the PVC name for this model.
+func (m *DynamoModel) GetPVCName() string { + if m.Status.PVCName != "" { + return m.Status.PVCName + } + if m.Spec.PVC.Name != "" { + return m.Spec.PVC.Name + } + return m.Name +} diff --git a/deploy/cloud/operator/cmd/main.go b/deploy/cloud/operator/cmd/main.go index bc55f36eb2..9b6dfa24fa 100644 --- a/deploy/cloud/operator/cmd/main.go +++ b/deploy/cloud/operator/cmd/main.go @@ -432,6 +432,13 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment") os.Exit(1) } + if err = (&controller.DynamoModelReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "DynamoModel") + os.Exit(1) + } //+kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamomodels.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamomodels.yaml new file mode 100644 index 0000000000..965860d906 --- /dev/null +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamomodels.yaml @@ -0,0 +1,219 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: dynamomodels.nvidia.com +spec: + group: nvidia.com + names: + kind: DynamoModel + listKind: DynamoModelList + plural: dynamomodels + shortNames: + - dm + singular: dynamomodel + scope: Namespaced + versions: + - name: v1alpha1 + additionalPrinterColumns: + - jsonPath: .status.state + name: State + type: string + - jsonPath: .spec.name + name: Model + type: string + - jsonPath: .spec.version + name: Version + type: string + - jsonPath: .status.pvcName + name: PVC + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + schema: + openAPIV3Schema: + description: |- + DynamoModel is the Schema for the dynamomodels API. + It provides a high-level abstraction for managing model artifacts cached in PVCs in the cluster. + All jobs referencing the same DynamoModel are guaranteed to use the same artifact, + preventing drift and simplifying maintenance. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the desired state for this model. 
+ properties: + downloaderRef: + description: |- + DownloaderRef is an optional reference to a custom downloader or workflow + (e.g., MLFlow or internal tools). Provides extensibility for specialized workflows + (internal or third-party). + type: string + name: + description: |- + Name is the canonical model name (matches external model repo, e.g. HuggingFace, NGC). + Example: "meta-llama/Llama-3.3-70B-Instruct" + type: string + pvc: + description: PVC defines the persistent volume claim configuration for storing the model. + properties: + create: + default: true + description: Create indicates whether to create a new PVC or use an existing one. + type: boolean + name: + description: Name is the name of the PVC. If not specified, defaults to the DynamoModel name. + type: string + size: + anyOf: + - type: integer + - type: string + description: Size of the volume, used during PVC creation. Required when create is true. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + storageClass: + description: StorageClass to be used for PVC creation. Required when create is true. + type: string + volumeAccessMode: + default: ReadWriteMany + description: VolumeAccessMode is the volume access mode of the PVC. Defaults to ReadWriteMany. + type: string + required: + - create + type: object + secretRef: + description: |- + SecretRef is an optional reference to a secret needed for accessing the source URL + (private repo, S3 credentials, etc.) + type: string + sourceURL: + description: |- + SourceURL is the source location of model weights (can be HF, S3, NGC). + Ensures flexibility in downstream storage strategies; permits flexible source management and credential injection. 
+ Examples: "hf://meta-llama/Llama-3.3-70B-Instruct", "s3://bucket/path/to/model", "ngc://nvidia/model" + type: string + version: + description: |- + Version is a version pin (e.g., SHA or tag from source repository). + This solves version drift by pinning deployments and benchmarking jobs to the same model artifact. + type: string + required: + - name + - pvc + - sourceURL + type: object + status: + description: Status reflects the current observed state of this model. + properties: + conditions: + description: Conditions contains the latest observed conditions of the model. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. 
+ maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + downloadJobName: + description: DownloadJobName is the name of the Job created to download the model. + type: string + lastDownloadTime: + description: LastDownloadTime is the timestamp of the last successful download. + format: date-time + type: string + pvcName: + description: PVCName is the name of the PVC created or used for this model. + type: string + state: + description: |- + State is a high-level textual status of the model lifecycle. 
+ Possible values: "Pending", "Downloading", "Ready", "Failed" + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} + diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 34a343fd17..8fc08061d8 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -76,6 +76,8 @@ type DynamoGraphDeploymentReconciler struct { // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamomodels,verbs=get;list;watch +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamomodels/status,verbs=get // +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch @@ -158,8 +160,19 @@ type Resource interface { func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) { logger := log.FromContext(ctx) + // Check if all referenced models are ready + modelsReady, notReadyModels, err := r.checkModelReferences(ctx, dynamoDeployment) + if err != nil { + logger.Error(err, "Failed to check model references") + return "", "", "", fmt.Errorf("failed to check model references: %w", err) + } + if !modelsReady { + logger.Info("Waiting for models to be ready", 
"notReadyModels", notReadyModels) + return PendingState, "WaitingForModels", Message(fmt.Sprintf("Waiting for models to be ready: %v", notReadyModels)), nil + } + // Reconcile top-level PVCs first - err := r.reconcilePVCs(ctx, dynamoDeployment) + err = r.reconcilePVCs(ctx, dynamoDeployment) if err != nil { logger.Error(err, "Failed to reconcile top-level PVCs") return "", "", "", fmt.Errorf("failed to reconcile top-level PVCs: %w", err) @@ -522,3 +535,48 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder { return r.Recorder } + +// checkModelReferences checks if all referenced DynamoModels are ready +func (r *DynamoGraphDeploymentReconciler) checkModelReferences(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (bool, []string, error) { + logger := log.FromContext(ctx) + notReadyModels := []string{} + + // Collect all model references from services + modelRefs := make(map[string]bool) + for serviceName, serviceSpec := range dynamoDeployment.Spec.Services { + if serviceSpec != nil && serviceSpec.ModelRef != "" { + modelRefs[serviceSpec.ModelRef] = true + logger.Info("Found model reference", "service", serviceName, "modelRef", serviceSpec.ModelRef) + } + } + + // If no model references, return true + if len(modelRefs) == 0 { + return true, notReadyModels, nil + } + + // Check each referenced model + for modelRef := range modelRefs { + model := &nvidiacomv1alpha1.DynamoModel{} + err := r.Get(ctx, types.NamespacedName{ + Name: modelRef, + Namespace: dynamoDeployment.Namespace, + }, model) + + if err != nil { + if errors.IsNotFound(err) { + logger.Error(err, "Referenced model not found", "modelRef", modelRef) + return false, append(notReadyModels, modelRef), fmt.Errorf("model %s not found", modelRef) + } + logger.Error(err, "Failed to get model", "modelRef", modelRef) + return false, notReadyModels, err + } + + if !model.IsReady() { 
+ logger.Info("Model not ready", "modelRef", modelRef, "state", model.Status.State) + notReadyModels = append(notReadyModels, modelRef) + } + } + + return len(notReadyModels) == 0, notReadyModels, nil +} diff --git a/deploy/cloud/operator/internal/controller/dynamomodel_controller.go b/deploy/cloud/operator/internal/controller/dynamomodel_controller.go new file mode 100644 index 0000000000..4f3558f595 --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dynamomodel_controller.go @@ -0,0 +1,453 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "fmt" + "strings" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" +) + +const ( + dynamoModelFinalizerName = "nvidia.com/dynamomodel-finalizer" +) + +// DynamoModelReconciler reconciles a DynamoModel object +type DynamoModelReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamomodels,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamomodels/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamomodels/finalizers,verbs=update +// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
+func (r *DynamoModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + var err error + reason := Reason("undefined") + message := Message("") + state := PendingState + + // Retrieve the DynamoModel CRD + dynamoModel := &nvidiacomv1alpha1.DynamoModel{} + if err = r.Get(ctx, req.NamespacedName, dynamoModel); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + defer func() { + if err != nil { + state = FailedState + message = Message(err.Error()) + logger.Error(err, "Reconciliation failed") + } + dynamoModel.SetState(string(state)) + + readyStatus := metav1.ConditionFalse + if state == ReadyState { + readyStatus = metav1.ConditionTrue + } + + // Update Ready condition + dynamoModel.AddStatusCondition(metav1.Condition{ + Type: "Ready", + Status: readyStatus, + Reason: string(reason), + Message: string(message), + LastTransitionTime: metav1.Now(), + }) + + err = r.Status().Update(ctx, dynamoModel) + if err != nil { + logger.Error(err, "Unable to update the CRD status", "crd", req.NamespacedName, "state", state, "reason", reason, "message", message) + } + logger.Info("Reconciliation done") + }() + + // Handle finalizer + deleted, err := commonController.HandleFinalizer(ctx, dynamoModel, r.Client, r) + if err != nil { + logger.Error(err, "failed to handle the finalizer") + reason = "failed_to_handle_the_finalizer" + return ctrl.Result{}, err + } + if deleted { + return ctrl.Result{}, nil + } + + // Reconcile resources + state, reason, message, err = r.reconcileResources(ctx, dynamoModel) + if err != nil { + logger.Error(err, "failed to reconcile the resources") + reason = "failed_to_reconcile_the_resources" + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// reconcileResources handles the main reconciliation logic for DynamoModel +func (r *DynamoModelReconciler) reconcileResources(ctx context.Context, dynamoModel *nvidiacomv1alpha1.DynamoModel) (State, Reason, 
Message, error) { + logger := log.FromContext(ctx) + + // Step 1: Reconcile PVC + pvc, err := r.reconcilePVC(ctx, dynamoModel) + if err != nil { + return FailedState, "PVCReconciliationFailed", Message(err.Error()), err + } + + // Update status with PVC name + if dynamoModel.Status.PVCName != pvc.Name { + dynamoModel.Status.PVCName = pvc.Name + } + + // Step 2: Reconcile download Job + job, err := r.reconcileDownloadJob(ctx, dynamoModel, pvc) + if err != nil { + return FailedState, "JobReconciliationFailed", Message(err.Error()), err + } + + // Update status with Job name + if dynamoModel.Status.DownloadJobName != job.Name { + dynamoModel.Status.DownloadJobName = job.Name + } + + // Step 3: Check Job status + if job.Status.Succeeded > 0 { + // Job completed successfully + logger.Info("Model download completed successfully", "model", dynamoModel.Name) + if dynamoModel.Status.LastDownloadTime == nil { + now := metav1.Now() + dynamoModel.Status.LastDownloadTime = &now + } + return ReadyState, "ModelReady", "Model downloaded and ready", nil + } else if job.Status.Failed > 0 { + // Job failed + return FailedState, "DownloadFailed", "Model download job failed", fmt.Errorf("download job failed") + } else if job.Status.Active > 0 { + // Job is still running + return PendingState, "Downloading", "Model download in progress", nil + } + + // Job is pending + return PendingState, "JobPending", "Model download job is pending", nil +} + +// reconcilePVC creates or retrieves the PVC for the model +func (r *DynamoModelReconciler) reconcilePVC(ctx context.Context, dynamoModel *nvidiacomv1alpha1.DynamoModel) (*corev1.PersistentVolumeClaim, error) { + logger := log.FromContext(ctx) + + pvcName := dynamoModel.GetPVCName() + pvc := &corev1.PersistentVolumeClaim{} + pvcNamespacedName := types.NamespacedName{Name: pvcName, Namespace: dynamoModel.Namespace} + + err := r.Get(ctx, pvcNamespacedName, pvc) + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Unable to retrieve 
PVC", "pvcName", pvcName) + return nil, err + } + + // If PVC does not exist, create it + if errors.IsNotFound(err) { + create := true + if dynamoModel.Spec.PVC.Create != nil { + create = *dynamoModel.Spec.PVC.Create + } + + if !create { + logger.Error(err, "PVC does not exist and create is not enabled", "pvcName", pvcName) + return nil, fmt.Errorf("PVC %s does not exist and create is disabled", pvcName) + } + + pvc = r.constructPVC(dynamoModel) + if err := controllerutil.SetControllerReference(dynamoModel, pvc, r.Scheme); err != nil { + logger.Error(err, "Failed to set controller reference for PVC", "pvcName", pvcName) + return nil, err + } + + err = r.Create(ctx, pvc) + if err != nil { + logger.Error(err, "Failed to create PVC", "pvcName", pvcName) + return nil, err + } + logger.Info("PVC created", "pvcName", pvcName, "namespace", dynamoModel.Namespace) + } + + return pvc, nil +} + +// constructPVC creates a PVC object from the DynamoModel spec +func (r *DynamoModelReconciler) constructPVC(dynamoModel *nvidiacomv1alpha1.DynamoModel) *corev1.PersistentVolumeClaim { + pvcName := dynamoModel.GetPVCName() + storageClassName := dynamoModel.Spec.PVC.StorageClass + accessMode := dynamoModel.Spec.PVC.VolumeAccessMode + if accessMode == "" { + accessMode = corev1.ReadWriteMany + } + + return &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: pvcName, + Namespace: dynamoModel.Namespace, + Labels: map[string]string{ + "app.kubernetes.io/managed-by": "dynamo-operator", + "app.kubernetes.io/component": "model-storage", + "dynamo.nvidia.com/model": dynamoModel.Name, + }, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + AccessModes: []corev1.PersistentVolumeAccessMode{accessMode}, + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: dynamoModel.Spec.PVC.Size, + }, + }, + StorageClassName: &storageClassName, + }, + } +} + +// reconcileDownloadJob creates or retrieves the download Job for the model +func 
(r *DynamoModelReconciler) reconcileDownloadJob(ctx context.Context, dynamoModel *nvidiacomv1alpha1.DynamoModel, pvc *corev1.PersistentVolumeClaim) (*batchv1.Job, error) { + logger := log.FromContext(ctx) + + jobName := fmt.Sprintf("%s-download", dynamoModel.Name) + job := &batchv1.Job{} + jobNamespacedName := types.NamespacedName{Name: jobName, Namespace: dynamoModel.Namespace} + + err := r.Get(ctx, jobNamespacedName, job) + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Unable to retrieve Job", "jobName", jobName) + return nil, err + } + + // If Job does not exist, create it + if errors.IsNotFound(err) { + job = r.constructDownloadJob(dynamoModel, pvc) + if err := controllerutil.SetControllerReference(dynamoModel, job, r.Scheme); err != nil { + logger.Error(err, "Failed to set controller reference for Job", "jobName", jobName) + return nil, err + } + + err = r.Create(ctx, job) + if err != nil { + logger.Error(err, "Failed to create Job", "jobName", jobName) + return nil, err + } + logger.Info("Download Job created", "jobName", jobName, "namespace", dynamoModel.Namespace) + } + + return job, nil +} + +// constructDownloadJob creates a Job object for downloading the model +func (r *DynamoModelReconciler) constructDownloadJob(dynamoModel *nvidiacomv1alpha1.DynamoModel, pvc *corev1.PersistentVolumeClaim) *batchv1.Job { + jobName := fmt.Sprintf("%s-download", dynamoModel.Name) + backoffLimit := int32(3) + completions := int32(1) + parallelism := int32(1) + + // Parse source URL to determine download strategy + downloadScript := r.generateDownloadScript(dynamoModel) + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: jobName, + Namespace: dynamoModel.Namespace, + Labels: map[string]string{ + "app.kubernetes.io/managed-by": "dynamo-operator", + "app.kubernetes.io/component": "model-downloader", + "dynamo.nvidia.com/model": dynamoModel.Name, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: &backoffLimit, + Completions: &completions, + 
Parallelism: &parallelism, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": "model-download", + "dynamo.nvidia.com/model": dynamoModel.Name, + }, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "model-download", + Image: "python:3.10-slim", + Command: []string{"sh", "-c"}, + Args: []string{downloadScript}, + Env: r.generateEnvVars(dynamoModel), + VolumeMounts: []corev1.VolumeMount{ + { + Name: "model-cache", + MountPath: "/model-cache", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "model-cache", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvc.Name, + }, + }, + }, + }, + }, + }, + }, + } + + // Add secret reference if specified + if dynamoModel.Spec.SecretRef != "" { + job.Spec.Template.Spec.Containers[0].EnvFrom = []corev1.EnvFromSource{ + { + SecretRef: &corev1.SecretEnvSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: dynamoModel.Spec.SecretRef, + }, + }, + }, + } + } + + return job +} + +// generateDownloadScript generates the download script based on the source URL +func (r *DynamoModelReconciler) generateDownloadScript(dynamoModel *nvidiacomv1alpha1.DynamoModel) string { + sourceURL := dynamoModel.Spec.SourceURL + + // Determine the download method based on the source URL prefix + if strings.HasPrefix(sourceURL, "hf://") { + // HuggingFace download + modelName := strings.TrimPrefix(sourceURL, "hf://") + script := ` +set -eux +pip install --no-cache-dir huggingface_hub hf_transfer +export HF_HUB_ENABLE_HF_TRANSFER=1 +huggingface-cli download $MODEL_NAME --cache-dir /model-cache +` + return script + } else if strings.HasPrefix(sourceURL, "s3://") { + // S3 download + script := ` +set -eux +pip install --no-cache-dir awscli +aws s3 sync $SOURCE_URL /model-cache --no-progress +` + return script + } else if strings.HasPrefix(sourceURL, 
"ngc://") { + // NGC download + script := ` +set -eux +pip install --no-cache-dir ngc-cli +ngc registry model download-version $MODEL_NAME --dest /model-cache +` + return script + } + + // Default: generic download (assumes HTTP/HTTPS URL) + script := ` +set -eux +pip install --no-cache-dir wget +wget -P /model-cache $SOURCE_URL +` + return script +} + +// generateEnvVars generates environment variables for the download job +func (r *DynamoModelReconciler) generateEnvVars(dynamoModel *nvidiacomv1alpha1.DynamoModel) []corev1.EnvVar { + envVars := []corev1.EnvVar{ + { + Name: "SOURCE_URL", + Value: dynamoModel.Spec.SourceURL, + }, + } + + // Add model name for HuggingFace downloads + if strings.HasPrefix(dynamoModel.Spec.SourceURL, "hf://") { + modelName := strings.TrimPrefix(dynamoModel.Spec.SourceURL, "hf://") + envVars = append(envVars, corev1.EnvVar{ + Name: "MODEL_NAME", + Value: modelName, + }) + } + + // Add version if specified + if dynamoModel.Spec.Version != "" { + envVars = append(envVars, corev1.EnvVar{ + Name: "MODEL_VERSION", + Value: dynamoModel.Spec.Version, + }) + } + + return envVars +} + +// SetupWithManager sets up the controller with the Manager. +func (r *DynamoModelReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&nvidiacomv1alpha1.DynamoModel{}). + Owns(&corev1.PersistentVolumeClaim{}). + Owns(&batchv1.Job{}). 
+ Complete(r) +} + +// Cleanup implements the Cleanup interface for finalizer handling +func (r *DynamoModelReconciler) Cleanup(ctx context.Context, obj client.Object) error { + logger := log.FromContext(ctx) + dynamoModel := obj.(*nvidiacomv1alpha1.DynamoModel) + + logger.Info("Cleaning up DynamoModel resources", "name", dynamoModel.Name, "namespace", dynamoModel.Namespace) + + // The PVC and Job will be automatically deleted due to owner references + // Additional cleanup logic can be added here if needed + + return nil +} + +// GetFinalizerName returns the finalizer name for this controller +func (r *DynamoModelReconciler) GetFinalizerName() string { + return dynamoModelFinalizerName +} + diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index acc276fae0..fecc0d3cf5 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -84,7 +84,8 @@ Refer to the [API Reference and Documentation](/docs/kubernetes/api_reference.md For detailed technical specifications of Dynamo's Kubernetes resources: -- **[API Reference](/docs/kubernetes/api_reference.md)** - Complete CRD field specifications for `DynamoGraphDeployment` and `DynamoComponentDeployment` +- **[API Reference](/docs/kubernetes/api_reference.md)** - Complete CRD field specifications for `DynamoGraphDeployment`, `DynamoComponentDeployment`, and `DynamoModel` +- **[DynamoModel Guide](/docs/kubernetes/dynamomodel.md)** - Model artifact management with version pinning and automated downloads - **[Operator Guide](/docs/kubernetes/dynamo_operator.md)** - Dynamo operator configuration and management - **[Create Deployment](/docs/kubernetes/create_deployment.md)** - Step-by-step deployment creation examples @@ -170,6 +171,7 @@ Key customization points include: ## Additional Resources - **[Examples](/examples/README.md)** - Complete working examples +- **[DynamoModel Examples](/examples/deployments/README.md)** - Model artifact management examples - **[Create Custom 
Deployments](/docs/kubernetes/create_deployment.md)** - Build your own CRDs - **[Operator Documentation](/docs/kubernetes/dynamo_operator.md)** - How the platform works - **[Helm Charts](/deploy/helm/README.md)** - For advanced users diff --git a/docs/kubernetes/dynamomodel.md b/docs/kubernetes/dynamomodel.md new file mode 100644 index 0000000000..6479c799f4 --- /dev/null +++ b/docs/kubernetes/dynamomodel.md @@ -0,0 +1,273 @@ + + +# DynamoModel: Model Artifact Management + +## Overview + +`DynamoModel` is a Kubernetes Custom Resource Definition (CRD) that provides a high-level abstraction for managing model artifacts cached in PVCs within your cluster. It solves the critical problem of **model version drift** by ensuring all deployments and benchmarking jobs referencing the same `DynamoModel` use identical model artifacts. + +## Why DynamoModel? + +### Problem Statement + +Without `DynamoModel`, teams face several challenges: + +1. **Version Drift**: Different jobs might download different versions of a model, leading to inconsistent results +2. **Manual PVC Management**: Teams must manually create PVCs, download models, and track versions +3. **Duplicate Downloads**: Multiple jobs download the same model repeatedly, wasting time and bandwidth +4. **No Version Pinning**: Difficult to ensure deployments and benchmarks use the exact same model artifact + +### Solution + +`DynamoModel` provides: + +- **Version Pinning**: Pin deployments to specific model versions (SHA or tag) +- **Automated Downloads**: Automatically downloads and caches models in PVCs +- **Guaranteed Consistency**: All jobs referencing the same `DynamoModel` use identical artifacts +- **Flexible Sources**: Support for HuggingFace, S3, NGC, and custom sources +- **Simplified Management**: Declarative model management with Kubernetes-native tooling + +## Key Features + +### 1. 
Model Name and Version Pinning + +```yaml +spec: + name: meta-llama/Llama-3.3-70B-Instruct + version: abcd12345 # Source SHA, avoids drift +``` + +Enables version pinning, avoiding drift/inconsistency in deployments versus benchmarking. + +### 2. Flexible Source Management + +```yaml +spec: + sourceURL: hf://meta-llama/Llama-3.3-70B-Instruct + # Or: s3://bucket/path/to/model + # Or: ngc://nvidia/model +``` + +Supports multiple source types with automatic protocol detection. + +### 3. Credential Injection + +```yaml +spec: + secretRef: llama-hf-secret +``` + +Securely inject credentials for private repositories. + +### 4. Extensibility + +```yaml +spec: + downloaderRef: custom-downloader # Optional +``` + +Plug in custom downloaders or workflows (e.g., MLFlow or internal tools). + +## Quick Start + +### Step 1: Create a Model Secret (if needed) + +For private models, create a secret with your credentials: + +```bash +kubectl create secret generic llama-hf-secret \ + --from-literal=HF_TOKEN="your-huggingface-token" \ + -n your-namespace +``` + +### Step 2: Define a DynamoModel + +Create a `DynamoModel` resource: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: llama-3-70b-instruct-v1 + namespace: your-namespace +spec: + name: meta-llama/Llama-3.3-70B-Instruct + version: abcd12345 + sourceURL: hf://meta-llama/Llama-3.3-70B-Instruct + secretRef: llama-hf-secret + pvc: + create: true + storageClass: your-storage-class + size: 200Gi + volumeAccessMode: ReadWriteMany +``` + +Apply it: + +```bash +kubectl apply -f dynamomodel.yaml +``` + +### Step 3: Check Model Status + +```bash +# Check status +kubectl get dynamomodel llama-3-70b-instruct-v1 -n your-namespace + +# Watch download progress +kubectl get dynamomodel llama-3-70b-instruct-v1 -n your-namespace -w + +# View detailed status +kubectl describe dynamomodel llama-3-70b-instruct-v1 -n your-namespace + +# Check download job logs +kubectl logs job/llama-3-70b-instruct-v1-download -n 
your-namespace +``` + +### Step 4: Reference in DynamoGraphDeployment + +Once the model is ready, reference it in your deployment: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: vllm-disagg + namespace: your-namespace +spec: + services: + VllmDecodeWorker: + modelRef: llama-3-70b-instruct-v1 + replicas: 2 + # ... other configuration +``` + +The controller will automatically: +1. Wait for the model to be ready +2. Mount the model's PVC to the service +3. Ensure all replicas use the same model artifact + +## API Reference + +### DynamoModelSpec + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `name` | string | Yes | Canonical model name (e.g., "meta-llama/Llama-3.3-70B-Instruct") | +| `version` | string | No | Version pin (SHA or tag) to prevent drift | +| `sourceURL` | string | Yes | Source location (hf://, s3://, ngc://) | +| `secretRef` | string | No | Reference to secret for credentials | +| `downloaderRef` | string | No | Reference to custom downloader | +| `pvc` | PVCSpec | Yes | PVC configuration for model storage | + +### PVCSpec + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `create` | bool | No | true | Whether to create a new PVC | +| `name` | string | No | model name | Name of the PVC | +| `storageClass` | string | Yes* | - | Storage class for PVC creation | +| `size` | Quantity | Yes* | - | Size of the volume | +| `volumeAccessMode` | string | No | ReadWriteMany | Volume access mode | + +\* Required when `create` is true + +### DynamoModelStatus + +| Field | Type | Description | +|-------|------|-------------| +| `state` | string | Lifecycle state: "Pending", "Downloading", "Ready", "Failed" | +| `conditions` | []Condition | Detailed status conditions | +| `pvcName` | string | Name of the created/used PVC | +| `downloadJobName` | string | Name of the download Job | +| `lastDownloadTime` | Time | Timestamp 
of last successful download | + +## Supported Source Types + +### HuggingFace + +```yaml +sourceURL: hf://meta-llama/Llama-3.3-70B-Instruct +secretRef: hf-token-secret # Optional for public models +``` + +Downloads using `huggingface-cli` with HF Transfer enabled for faster downloads. + +### S3 + +```yaml +sourceURL: s3://my-bucket/models/llama-70b +secretRef: aws-credentials # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY +``` + +Downloads using AWS CLI. + +### NGC (NVIDIA GPU Cloud) + +```yaml +sourceURL: ngc://nvidia/llama-70b +secretRef: ngc-api-key +``` + +Downloads using NGC CLI. + +### HTTP/HTTPS + +```yaml +sourceURL: https://example.com/models/model.tar.gz +``` + +Generic HTTP download using wget. + +## Advanced Usage + +### Using Existing PVC + +If you already have a PVC with a model: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: existing-model +spec: + name: my-org/my-model + sourceURL: hf://my-org/my-model + pvc: + create: false + name: existing-model-pvc +``` + +### Custom Downloader + +For specialized workflows: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: custom-model +spec: + name: my-org/custom-model + sourceURL: custom://my-internal-registry/model + downloaderRef: mlflow-downloader + pvc: + create: true + storageClass: fast-ssd + size: 500Gi +``` From 31dd961ed070e4209f2c654e9d27ba4dafdb5d55 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 9 Oct 2025 10:20:19 -0700 Subject: [PATCH 2/3] feat: add model examples to components Signed-off-by: Hannah Zhang --- components/models/README.md | 102 +++++++++++++++++++++++++++++ components/models/llama-3-70b.yaml | 28 ++++++++ components/models/qwen3-0.6b.yaml | 28 ++++++++ docs/kubernetes/dynamomodel.md | 64 ++++++++++++++++-- 4 files changed, 216 insertions(+), 6 deletions(-) create mode 100644 components/models/README.md create mode 100644 components/models/llama-3-70b.yaml create mode 100644 components/models/qwen3-0.6b.yaml 
diff --git a/components/models/README.md b/components/models/README.md new file mode 100644 index 0000000000..73db05728a --- /dev/null +++ b/components/models/README.md @@ -0,0 +1,102 @@ +# DynamoModel Definitions + +This directory contains pre-configured `DynamoModel` resources for commonly used models. + +## Available Models + +### Qwen 3 0.6B +**File:** `qwen3-0.6b.yaml` +- **Size:** ~2GB +- **Use Case:** Testing, development, lightweight inference +- **Public:** Yes (no authentication required) + +```bash +kubectl apply -f qwen3-0.6b.yaml -n your-namespace +``` + +### Llama 3.3 70B Instruct +**File:** `llama-3-70b.yaml` +- **Size:** ~140GB +- **Use Case:** Production inference, high-quality responses +- **Public:** Gated (requires HuggingFace token) + +```bash +# Create secret first +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="your-token" \ + -n your-namespace + +kubectl apply -f llama-3-70b.yaml -n your-namespace +``` + +## Usage + +### 1. Deploy Model + +```bash +kubectl apply -f <model-file>.yaml -n your-namespace +``` + +### 2. Check Status + +```bash +# Watch model download progress +kubectl get dynamomodel -n your-namespace -w + +# Check detailed status +kubectl describe dynamomodel qwen3-0.6b -n your-namespace + +# View download logs +kubectl logs job/qwen3-0.6b-download -n your-namespace -f +``` + +### 3. 
Reference in Deployment + +Once the model state is "Ready", reference it in your `DynamoGraphDeployment`: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment +spec: + modelRef: qwen3-0.6b # Reference the model by name + backendFramework: vllm + services: + VllmWorker: + replicas: 1 + resources: + limits: + nvidia.com/gpu: "1" +``` + +## Customization + +Update the following fields based on your cluster: + +- **`storageClass`**: Use your cluster's available storage class +- **`size`**: Adjust based on model requirements +- **`version`**: Pin to specific commit SHA for production +- **`secretRef`**: Add if model requires authentication + +## Adding New Models + +Create a new YAML file following this template: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: my-model +spec: + name: organization/model-name + version: commit-sha # Optional + sourceURL: hf://organization/model-name + secretRef: secret-name # Optional + pvc: + create: true + storageClass: your-storage-class + size: XXGi + volumeAccessMode: ReadWriteMany +``` + diff --git a/components/models/llama-3-70b.yaml b/components/models/llama-3-70b.yaml new file mode 100644 index 0000000000..a3f340d0d6 --- /dev/null +++ b/components/models/llama-3-70b.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# DynamoModel for Llama 3.3 70B Instruct - production-ready large model +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: llama-3-70b-instruct +spec: + # Canonical model name from HuggingFace + name: meta-llama/Llama-3.3-70B-Instruct + + # Version pin - use specific SHA for production + version: main + + # Source URL - HuggingFace Hub + sourceURL: hf://meta-llama/Llama-3.3-70B-Instruct + + # Secret reference for authentication (required for gated models) + secretRef: hf-token-secret + + # PVC configuration for model storage + pvc: + create: true + storageClass: standard # Update with your storage class + size: 200Gi # Large model requires significant storage + volumeAccessMode: ReadWriteMany # Required for multi-replica deployments + diff --git a/components/models/qwen3-0.6b.yaml b/components/models/qwen3-0.6b.yaml new file mode 100644 index 0000000000..47c73ce6d6 --- /dev/null +++ b/components/models/qwen3-0.6b.yaml @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# DynamoModel for Qwen 3 0.6B - lightweight model for testing and development +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: qwen3-0.6b +spec: + # Canonical model name from HuggingFace + name: Qwen/Qwen3-0.6B + + # Version pin (optional) - use a specific commit SHA for reproducibility + # version: main + + # Source URL - HuggingFace Hub + sourceURL: hf://Qwen/Qwen3-0.6B + + # Secret reference for authentication (optional for public models) + # secretRef: hf-token-secret + + # PVC configuration for model storage + pvc: + create: true + storageClass: standard # Update with your storage class + size: 10Gi # Small model, only needs ~2GB but allocate extra space + volumeAccessMode: ReadWriteMany # Required for multi-replica deployments + diff --git a/docs/kubernetes/dynamomodel.md b/docs/kubernetes/dynamomodel.md index 6479c799f4..6839f75d60 100644 --- a/docs/kubernetes/dynamomodel.md +++ b/docs/kubernetes/dynamomodel.md @@ -21,8 +21,6 @@ limitations under the License. `DynamoModel` is a Kubernetes Custom Resource Definition (CRD) that provides a high-level abstraction for managing model artifacts cached in PVCs within your cluster. It solves the critical problem of **model version drift** by ensuring all deployments and benchmarking jobs referencing the same `DynamoModel` use identical model artifacts. -## Why DynamoModel? - ### Problem Statement Without `DynamoModel`, teams face several challenges: @@ -83,6 +81,54 @@ spec: Plug in custom downloaders or workflows (e.g., MLFlow or internal tools). +## How It Works + +### User Flow + +1. **Create Model Definition:** + ```bash + kubectl apply -f components/models/qwen3-0.6b.yaml + ``` + +2. **Watch Model Download:** + ```bash + kubectl get dynamomodel qwen3-0.6b -w + ``` + +3. 
**Reference in Deployment:** + ```yaml + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeployment + spec: + modelRef: qwen3-0.6b + backendFramework: vllm + ``` + +### Controller Flow + +1. **DGD Controller** checks if `modelRef` is specified +2. Waits for `DynamoModel` to reach "Ready" state +3. Passes model name to Grove/Component pod generation +4. **Backend-specific logic** injects model arguments: + - vLLM: `--model Qwen/Qwen3-0.6B` + - SGLang/TRT-LLM: `--model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B` +5. Sets `HF_HOME=/model-cache` so backends can resolve models +6. Auto-mounts model PVC to `/model-cache` + +### Model Resolution + +When `HF_HOME=/model-cache` is set and backends use the canonical model name: +``` +/model-cache/ + models--Qwen--Qwen3-0.6B/ + snapshots/ + <commit-hash>/ + config.json + model-*.safetensors +``` + +Backends automatically resolve `Qwen/Qwen3-0.6B` → `/model-cache/models--Qwen--Qwen3-0.6B/snapshots/<commit-hash>/` + ## Quick Start ### Step 1: Create a Model Secret (if needed) @@ -150,17 +196,23 @@ metadata: name: vllm-disagg namespace: your-namespace spec: + modelRef: llama-3-70b-instruct-v1 # Reference at top-level + backendFramework: vllm services: VllmDecodeWorker: - modelRef: llama-3-70b-instruct-v1 replicas: 2 - # ... other configuration + resources: + limits: + nvidia.com/gpu: "2" + # Model arguments will be auto-injected by the controller ``` The controller will automatically: 1. Wait for the model to be ready -2. Mount the model's PVC to the service -3. Ensure all replicas use the same model artifact +2. Inject the appropriate model arguments for your backend +3. Mount the model's PVC to `/model-cache` +4. Set `HF_HOME=/model-cache` for model resolution +5. 
Ensure all replicas use the same model artifact ## API Reference From ba965f51ad1a8d74e7057c59e7ca0e5f4164cf0c Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 9 Oct 2025 10:24:06 -0700 Subject: [PATCH 3/3] feat: move modelRef to DGD top level Signed-off-by: Hannah Zhang --- .../dynamocomponentdeployment_types.go | 5 ---- .../v1alpha1/dynamographdeployment_types.go | 5 ++++ .../dynamographdeployment_controller.go | 21 ++++---------- .../controller/dynamomodel_controller.go | 29 +++++++++++++------ 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go index 9d69e11062..b475e90a59 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go @@ -91,11 +91,6 @@ type DynamoComponentDeploymentSharedSpec struct { EnvFromSecret *string `json:"envFromSecret,omitempty"` // VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. VolumeMounts []VolumeMount `json:"volumeMounts,omitempty"` - // ModelRef references a DynamoModel resource that provides the model artifact for this component. - // When specified, the controller will wait for the model to be ready and automatically mount - // the model's PVC to the component. - // +kubebuilder:validation:Optional - ModelRef string `json:"modelRef,omitempty"` // Ingress config to expose the component outside the cluster (or through a service mesh). 
Ingress *IngressSpec `json:"ingress,omitempty"` diff --git a/deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go index 2d1a64a9fb..42f5d7c009 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go @@ -49,6 +49,11 @@ type DynamoGraphDeploymentSpec struct { // BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). // +kubebuilder:validation:Enum=sglang;vllm;trtllm BackendFramework string `json:"backendFramework,omitempty"` + // ModelRef references a DynamoModel resource that provides the model artifact for this deployment. + // When specified, the controller will wait for the model to be ready and automatically + // configure all services with the appropriate model paths and mount the model's PVC. + // +kubebuilder:validation:Optional + ModelRef string `json:"modelRef,omitempty"` } // DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment. 
diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 8fc08061d8..af75d25136 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -541,22 +541,11 @@ func (r *DynamoGraphDeploymentReconciler) checkModelReferences(ctx context.Conte logger := log.FromContext(ctx) notReadyModels := []string{} - // Collect all model references from services - modelRefs := make(map[string]bool) - for serviceName, serviceSpec := range dynamoDeployment.Spec.Services { - if serviceSpec != nil && serviceSpec.ModelRef != "" { - modelRefs[serviceSpec.ModelRef] = true - logger.Info("Found model reference", "service", serviceName, "modelRef", serviceSpec.ModelRef) - } - } - - // If no model references, return true - if len(modelRefs) == 0 { - return true, notReadyModels, nil - } + // Check top-level modelRef + if dynamoDeployment.Spec.ModelRef != "" { + modelRef := dynamoDeployment.Spec.ModelRef + logger.Info("Found top-level model reference", "modelRef", modelRef) - // Check each referenced model - for modelRef := range modelRefs { model := &nvidiacomv1alpha1.DynamoModel{} err := r.Get(ctx, types.NamespacedName{ Name: modelRef, @@ -566,7 +555,7 @@ func (r *DynamoGraphDeploymentReconciler) checkModelReferences(ctx context.Conte if err != nil { if errors.IsNotFound(err) { logger.Error(err, "Referenced model not found", "modelRef", modelRef) - return false, append(notReadyModels, modelRef), fmt.Errorf("model %s not found", modelRef) + return false, []string{modelRef}, fmt.Errorf("model %s not found", modelRef) } logger.Error(err, "Failed to get model", "modelRef", modelRef) return false, notReadyModels, err diff --git a/deploy/cloud/operator/internal/controller/dynamomodel_controller.go 
b/deploy/cloud/operator/internal/controller/dynamomodel_controller.go index 4f3558f595..c52726fb61 100644 --- a/deploy/cloud/operator/internal/controller/dynamomodel_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamomodel_controller.go @@ -359,14 +359,18 @@ func (r *DynamoModelReconciler) generateDownloadScript(dynamoModel *nvidiacomv1a // Determine the download method based on the source URL prefix if strings.HasPrefix(sourceURL, "hf://") { - // HuggingFace download - modelName := strings.TrimPrefix(sourceURL, "hf://") - script := ` + // HuggingFace download using modern 'hf download' command + // Build revision flag if version is specified + revisionFlag := "" + if dynamoModel.Spec.Version != "" { + revisionFlag = fmt.Sprintf("--revision $MODEL_REVISION") + } + + script := fmt.Sprintf(` set -eux pip install --no-cache-dir huggingface_hub hf_transfer -export HF_HUB_ENABLE_HF_TRANSFER=1 -huggingface-cli download $MODEL_NAME --cache-dir /model-cache -` +hf download $MODEL_NAME %s --exclude "original/*" --exclude "metal/*" +`, revisionFlag) return script } else if strings.HasPrefix(sourceURL, "s3://") { // S3 download @@ -402,6 +406,14 @@ func (r *DynamoModelReconciler) generateEnvVars(dynamoModel *nvidiacomv1alpha1.D Name: "SOURCE_URL", Value: dynamoModel.Spec.SourceURL, }, + { + Name: "HF_HOME", + Value: "/model-cache", + }, + { + Name: "HF_HUB_ENABLE_HF_TRANSFER", + Value: "1", + }, } // Add model name for HuggingFace downloads @@ -413,10 +425,10 @@ func (r *DynamoModelReconciler) generateEnvVars(dynamoModel *nvidiacomv1alpha1.D }) } - // Add version if specified + // Add version/revision if specified if dynamoModel.Spec.Version != "" { envVars = append(envVars, corev1.EnvVar{ - Name: "MODEL_VERSION", + Name: "MODEL_REVISION", Value: dynamoModel.Spec.Version, }) } @@ -450,4 +462,3 @@ func (r *DynamoModelReconciler) Cleanup(ctx context.Context, obj client.Object) func (r *DynamoModelReconciler) GetFinalizerName() string { return 
dynamoModelFinalizerName } -