Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions deploy/cloud/helm/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
export ENABLE_GROVE="${ENABLE_GROVE:=false}"

# Add command line options
INTERACTIVE=false
Expand Down Expand Up @@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
echo "INSTALL_CRDS: $INSTALL_CRDS"

envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${ENABLE_GROVE}' < dynamo-platform-values.yaml > generated-values.yaml
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:"
cat generated-values.yaml

Expand Down
1 change: 0 additions & 1 deletion deploy/cloud/helm/dynamo-platform-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ dynamo-operator:

dynamo:
enableLWS: ${ENABLE_LWS}
enableGrove: ${ENABLE_GROVE}
ingress:
enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ spec:
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
{{- if .Values.dynamo.enableGrove }}
- --enable-grove
{{- if .Values.dynamo.groveTerminationDelay }}
- --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
{{- end }}
command:
- /manager
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ rules:
- patch
- update
- watch
{{- if .Values.dynamo.enableGrove }}
- apiGroups:
- grove.io
resources:
Expand All @@ -129,7 +128,6 @@ rules:
- patch
- update
- watch
{{- end }}
- apiGroups:
- apps
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ dynamo:
annotations: {}

enableLWS: false
enableGrove: false
groveTerminationDelay: 15m

internalImages:
debugger: python:3.12-slim
Expand Down
2 changes: 1 addition & 1 deletion deploy/cloud/helm/platform/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dynamo-operator:
imagePullSecrets: []
dynamo:
enableLWS: false
enableGrove: false
groveTerminationDelay: 15m
internalImages:
debugger: python:3.12-slim
enableRestrictedSecurityContext: false
Expand Down
28 changes: 20 additions & 8 deletions deploy/cloud/operator/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
clientv3 "go.etcd.io/etcd/client/v3"
corev1 "k8s.io/api/core/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth"
Expand All @@ -50,6 +51,7 @@ import (

grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
Expand All @@ -73,6 +75,10 @@ func init() {
utilruntime.Must(volcanoscheme.AddToScheme(scheme))

utilruntime.Must(grovev1alpha1.AddToScheme(scheme))

utilruntime.Must(apiextensionsv1.AddToScheme(scheme))

utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand All @@ -92,7 +98,7 @@ func main() {
var ingressControllerTLSSecretName string
var ingressHostSuffix string
var enableLWS bool
var enableGrove bool
var groveTerminationDelay time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand Down Expand Up @@ -120,22 +126,23 @@ func main() {
"The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
flag.BoolVar(&enableGrove, "enable-grove", false,
"If set, enable grove")
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
"The termination delay for Grove PodGangSets")
opts := zap.Options{
Development: true,
}
opts.BindFlags(flag.CommandLine)
flag.Parse()

utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))

ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace,
EnableLWS: enableLWS,
EnableGrove: enableGrove,
EtcdAddress: etcdAddr,
NatsAddress: natsAddr,
Grove: commonController.GroveConfig{
Enabled: false, // Will be set after Grove discovery
TerminationDelay: groveTerminationDelay,
},
EtcdAddress: etcdAddr,
NatsAddress: natsAddr,
IngressConfig: commonController.IngressConfig{
VirtualServiceGateway: istioVirtualServiceGateway,
IngressControllerClassName: ingressControllerClassName,
Expand Down Expand Up @@ -201,6 +208,11 @@ func main() {
os.Exit(1)
}

// Detect Grove availability using discovery client
setupLog.Info("Detecting Grove availability...")
groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr)
ctrlConfig.Grove.Enabled = groveEnabled

// Create etcd client
cli, err := clientv3.New(clientv3.Config{
Endpoints: []string{etcdAddr},
Expand Down
4 changes: 4 additions & 0 deletions deploy/cloud/operator/internal/consts/consts.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package consts

import "time"

const (
HPACPUDefaultAverageUtilization = 80

Expand Down Expand Up @@ -37,4 +39,6 @@ const (
PlannerServiceAccountName = "planner-serviceaccount"

DefaultIngressSuffix = "local"

DefaultGroveTerminationDelay = 15 * time.Minute
)
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ type Resource interface {

func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)
if r.Config.EnableGrove {
if r.Config.Grove.Enabled {
// check if explicit opt out of grove
if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse {
logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation")
Expand Down Expand Up @@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc: func(ge event.GenericEvent) bool { return true },
})).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))
if r.Config.EnableGrove {
if r.Config.Grove.Enabled {
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the pod gang set
CreateFunc: func(ce event.CreateEvent) bool { return false },
Expand Down
49 changes: 48 additions & 1 deletion deploy/cloud/operator/internal/controller_common/predicate.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,28 @@ package controller_common
import (
"context"
"strings"
"time"

"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/discovery"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)

type GroveConfig struct {
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
Enabled bool
// TerminationDelay configures the termination delay for Grove PodGangSets
TerminationDelay time.Duration
}

type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace string
EnableLWS bool
EnableGrove bool
Grove GroveConfig
EtcdAddress string
NatsAddress string
IngressConfig IngressConfig
Expand All @@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool {
return i.VirtualServiceGateway != ""
}

// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
logger := log.FromContext(ctx)

// Use the discovery client to check if Grove API groups are available
cfg := mgr.GetConfig()
if cfg == nil {
logger.Info("Grove detection failed, no discovery client available")
return false
}

// Try to create a discovery client
discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
if err != nil {
logger.Error(err, "Grove detection failed, could not create discovery client")
return false
}

// Check if grove.io API group is available
apiGroups, err := discoveryClient.ServerGroups()
if err != nil {
logger.Error(err, "Grove detection failed, could not list server groups")
return false
}

for _, group := range apiGroups.Groups {
if group.Name == "grove.io" {
logger.Info("Grove is available, grove.io API group found")
return true
}
}

logger.Info("Grove not available, grove.io API group not found")
return false
}

func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background())
Expand Down
3 changes: 3 additions & 0 deletions deploy/cloud/operator/internal/dynamo/graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn
gangSet.Name = dynamoDeployment.Name
gangSet.Namespace = dynamoDeployment.Namespace
gangSet.Spec.Replicas = 1
if controllerConfig.Grove.TerminationDelay > 0 {
gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay}
}
for componentName, component := range dynamoDeployment.Spec.Services {
container := corev1.Container{
Name: "main",
Expand Down
5 changes: 5 additions & 0 deletions deploy/cloud/operator/internal/dynamo/graph_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"reflect"
"sort"
"testing"
"time"

grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
Expand Down Expand Up @@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
},
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Spec: grovev1alpha1.PodGangSetSpec{
Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{
TerminationDelay: &metav1.Duration{Duration: 15 * time.Minute},
Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
{
Name: "frontend",
Expand Down
Loading