From 6865663ba1e48a36b8f70ab5631438cda175019a Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 16 Aug 2024 00:16:14 +0100 Subject: [PATCH 01/12] KEP-2170: Add TrainJob and TrainingRuntime APIs Signed-off-by: Andrey Velichkevich --- go.mod | 17 +- go.sum | 35 +-- .../v2alpha1/trainingruntime_types.go | 170 +++++++++++++ .../kubeflow.org/v2alpha1/trainjob_types.go | 226 ++++++++++++++++++ 4 files changed, 423 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index 2790c7a228..eb2f0afcbc 100644 --- a/go.mod +++ b/go.mod @@ -5,8 +5,8 @@ go 1.22 require ( github.com/go-logr/logr v1.4.1 github.com/google/go-cmp v0.6.0 - github.com/onsi/ginkgo/v2 v2.14.0 - github.com/onsi/gomega v1.30.0 + github.com/onsi/ginkgo/v2 v2.17.1 + github.com/onsi/gomega v1.32.0 github.com/open-policy-agent/cert-controller v0.10.1 github.com/prometheus/client_golang v1.18.0 github.com/sirupsen/logrus v1.9.0 @@ -19,7 +19,8 @@ require ( k8s.io/klog/v2 v2.110.1 k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 k8s.io/utils v0.0.0-20230726121419-3b25d923346b - sigs.k8s.io/controller-runtime v0.17.2 + sigs.k8s.io/controller-runtime v0.17.3 + sigs.k8s.io/jobset v0.5.2 sigs.k8s.io/scheduler-plugins v0.28.9 sigs.k8s.io/yaml v1.4.0 volcano.sh/apis v1.9.0 @@ -44,8 +45,8 @@ require ( github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect - github.com/google/uuid v1.3.0 // indirect - github.com/imdario/mergo v0.3.13 // indirect + github.com/google/uuid v1.3.1 // indirect + github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect @@ -61,7 +62,7 @@ require ( github.com/spf13/pflag v1.0.5 // indirect go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect - golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect golang.org/x/mod v0.16.0 // indirect golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect @@ -76,8 +77,8 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.29.0 // indirect - k8s.io/component-base v0.29.0 // indirect + k8s.io/apiextensions-apiserver v0.29.2 // indirect + k8s.io/component-base v0.29.2 // indirect k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect diff --git a/go.sum b/go.sum index 75b92b0eeb..da8a571436 100644 --- a/go.sum +++ b/go.sum @@ -50,11 +50,11 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= +github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= -github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg= +github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= +github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -80,10 +80,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= -github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= -github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= -github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= +github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= +github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= +github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= github.com/open-policy-agent/cert-controller v0.10.1 h1:RXSYoyn8FdCenWecRP//UV5nbVfmstNpj4kHQFkvPK4= github.com/open-policy-agent/cert-controller v0.10.1/go.mod h1:4uRbBLY5DsPOog+a9pqk3JLxuuhrWsbUedQW65HcLTI= github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4 h1:5dum5SLEz+95JDLkMls7Z7IDPjvSq3UhJSFe4f5einQ= @@ -130,8 +130,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 h1:tnebWN09GYg9OLPss1KXj8txwZc6X6uMr6VFdcGNbHw= -golang.org/x/exp v0.0.0-20220827204233-334a2380cb91/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= @@ -191,21 +191,20 @@ gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.29.3 h1:2ORfZ7+bGC3YJqGpV0KSDDEVf8hdGQ6A03/50vj8pmw= k8s.io/api v0.29.3/go.mod h1:y2yg2NTyHUUkIoTC+phinTnEa3KFM6RZ3szxt014a80= -k8s.io/apiextensions-apiserver v0.29.0 h1:0VuspFG7Hj+SxyF/Z/2T0uFbI5gb5LRgEyUVE3Q4lV0= -k8s.io/apiextensions-apiserver v0.29.0/go.mod h1:TKmpy3bTS0mr9pylH0nOt/QzQRrW7/h7yLdRForMZwc= +k8s.io/apiextensions-apiserver v0.29.2 h1:UK3xB5lOWSnhaCk0RFZ0LUacPZz9RY4wi/yt2Iu+btg= +k8s.io/apiextensions-apiserver v0.29.2/go.mod h1:aLfYjpA5p3OwtqNXQFkhJ56TB+spV8Gc4wfMhUA3/b8= k8s.io/apimachinery v0.29.3 h1:2tbx+5L7RNvqJjn7RIuIKu9XTsIZ9Z5wX2G22XAa5EU= k8s.io/apimachinery v0.29.3/go.mod h1:hx/S4V2PNW4OMg3WizRrHutyB5la0iCUbZym+W0EQIU= k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg= k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0= k8s.io/code-generator v0.29.3 h1:m7E25/t9R9NvejspO2zBdyu+/Gl0Z5m7dCRc680KS14= k8s.io/code-generator v0.29.3/go.mod h1:x47ofBhN4gxYFcxeKA1PYXeaPreAGaDN85Y/lNUsPoM= -k8s.io/component-base v0.29.0 h1:T7rjd5wvLnPBV1vC4zWd/iWRbV8Mdxs+nGaoaFzGw3s= -k8s.io/component-base v0.29.0/go.mod h1:sADonFTQ9Zc9yFLghpDpmNXEdHyQmFIGbiuZbqAXQ1M= +k8s.io/component-base v0.29.2 h1:lpiLyuvPA9yV1aQwGLENYyK7n/8t6l3nn3zAtFTJYe8= +k8s.io/component-base v0.29.2/go.mod h1:BfB3SLrefbZXiBfbM+2H1dlat21Uewg/5qtKOl8degM= k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 h1:pWEwq4Asjm4vjW7vcsmijwBhOr1/shsbSYiWXmNGlks= k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y= @@ -217,8 +216,10 @@ k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/A k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.17.2 h1:FwHwD1CTUemg0pW2otk7/U5/i5m2ymzvOXdbeGOUvw0= -sigs.k8s.io/controller-runtime v0.17.2/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s= +sigs.k8s.io/controller-runtime v0.17.3 h1:65QmN7r3FWgTxDMz9fvGnO1kbf2nu+acg9p2R9oYYYk= +sigs.k8s.io/controller-runtime v0.17.3/go.mod h1:N0jpP5Lo7lMTF9aL56Z/B2oWBJjey6StQM0jRbKQXtY= +sigs.k8s.io/jobset v0.5.2 h1:276q5Pi/ErLYj+GQ0ydEXR6tx3LwBhEzHLQv+k8bYF4= +sigs.k8s.io/jobset v0.5.2/go.mod h1:Vg99rj/6OoGvy1uvywGEHOcVLCWWJYkJtisKqdWzcFw= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/scheduler-plugins v0.28.9 h1:1/bXRoXuSUFr1FLqxrzScdyZMl/G1psuDJcDKYxTo+Q= diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index ab0377d028..95f596373d 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -15,3 +15,173 @@ limitations under the License. */ package v2alpha1 + +import ( + autoscalingv2 "k8s.io/api/autoscaling/v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" +) + +// ClusterTrainingRuntime represents a training runtime which can be referenced as part of +// `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced +// by TrainJob that created in *any* namespace. +type ClusterTrainingRuntime struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired ClusterTrainingRuntime. + Spec TrainingRuntimeSpec `json:"spec,omitempty"` +} + +// ClusterTrainingRuntimeList is a collection of cluster training runtimes. +type ClusterTrainingRuntimeList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + metav1.ListMeta `json:"metadata,omitempty"` + + // List of ClusterTrainingRuntimes. + Items []ClusterTrainingRuntime `json:"items"` +} + +// TrainingRuntime represents a training runtime which can be referenced as part of +// `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced +// by TrainJob that created in the *same* namespace as the TrainingRuntime. +type TrainingRuntime struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired TrainingRuntime. + Spec TrainingRuntimeSpec `json:"spec"` +} + +// TrainingRuntimeList is a collection of training runtimes. +type TrainingRuntimeList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + metav1.ListMeta `json:"metadata,omitempty"` + + // List of TrainingRuntimes. + Items []TrainingRuntime `json:"items"` +} + +// TrainingRuntimeSpec represents a specification of the desired training runtime. +type TrainingRuntimeSpec struct { + // Configuration for the runtime-specific parameters, such as Torch or MPI. + MLSpec *MLSpec `json:"mlSpec,omitempty"` + + // Number of training nodes. + // Defaults to 1. + NumNodes *int32 `json:"numNodes,omitempty"` + + // JobSet configuration which will be used by TrainJob. + JobSetSpec *jobsetv1alpha2.JobSetSpec `json:",inline"` + + // Configuration for the PodGroup to enable gang-scheduling via supported plugins. + PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"` +} + +// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling. +type PodGroupSpec struct { + // Plugin for the gang-scheduling. + Plugin GangSchedulerPlugin `json:"plugin"` + + // Time threshold to schedule PodGroup for gang-scheduling. + ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"` +} + +// GangSchedulerPlugin represents one of the supported gang-scheduling plugins. +type GangSchedulerPlugin string + +const ( + // Volcano plugin for gang-scheduling. + GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano" + + // Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling. + GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling" +) + +// MLSpec represents the runtime-specific configuration for various technologies. +// One of the following specs can be set. +type MLSpec struct { + // Configuration for the PyTorch runtime. + TorchSpec *TorchSpec `json:"torchSpec,omitempty"` + + // Configuration for the MPI Runtime. + MPISpec *MPISpec `json:"mpiSpec,omitempty"` +} + +// TorchSpec represents a PyTorch runtime configuration. +type TorchSpec struct { + // Number of processes per node. + // This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. + // Supported values: `auto`, `cpu`, `gpu`, or int value. + // Defaults to `auto`. + NumProcPerNode *string `json:"numProcPerNode,omitempty"` + + // Whether to run single-node multi-worker training. + // This value is inserted into the `--standalone` argument of the `torchrun` CLI. + // Defaults to false. + Standalone *bool `json:"standalone,omitempty"` + + // Elastic policy for the PyTorch training. + ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"` +} + +// TorchElasticPolicy represents a configuration for the PyTorch elastic training. +// If this policy is set, the `.spec.numNodes` parameter must be omitted, since min and max node +// is used to configure the `torchrun` CLI argument: `--nnodes=minNodes:maxNodes`. +// Only `c10d` backend is supported for the Rendezvous communication. +type TorchElasticPolicy struct { + // How many times the training job can be restarted. + // This value is inserted into the `--max-restarts` argument of the `torchrun` CLI and + // the `.spec.failurePolicy.maxRestarts` parameter of the training Job. + MaxRestarts *int32 `json:"maxRestarts,omitempty"` + + // Lower limit for the number of nodes to which training job can scale down. + MinNodes *int32 `json:"minNodes,omitempty"` + + // Upper limit for the number of nodes to which training job can scale up. + MaxNodes *int32 `json:"maxNodes,omitempty"` + + // Specification which are used to calculate the desired number of nodes. See the individual + // metric source types for more information about how each type of metric must respond. + // The HPA will be created to perform auto-scaling. + Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` +} + +// MPISpec represents a MPI runtime configuration. +type MPISpec struct { + // Number of processes per node. + // This value is equal to the number of slots for each node in the hostfile. + NumProcPerNode *int32 `json:"numProcPerNode,omitempty"` + + // Implementation name for the MPI to create the appropriate hostfile. + MPIImplementation *MPIImplementation `json:"mpiImplementation"` + + // Directory where SSH keys are mounted. + SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"` + + // Whether to run training process on the launcher Job. + // Defaults to false. + RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"` +} + +// MPIImplementation represents one of the supported MPI implementations. +type MPIImplementation string + +const ( + MPIImplementationOpenMPI MPIImplementation = "OpenMPI" + MPIImplementationIntel MPIImplementation = "Intel" + MPIImplementationMPICH MPIImplementation = "MPICH" +) + +// TODO: Enable this after controller implementation. +// func init() { +// SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{}) +// } diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index ab0377d028..700d398c5c 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -15,3 +15,229 @@ limitations under the License. */ package v2alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" +) + +// TrainJob represents configuration of a training job. +type TrainJob struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired TrainJob. + Spec TrainJobSpec `json:"spec,omitempty"` + + // Current status of TrainJob. + Status TrainJobStatus `json:"status,omitempty"` +} + +// TrainJobSpec represents specification of the desired TrainJob. +type TrainJobSpec struct { + // Reference to the training runtime. + TrainingRuntimeRef TrainingRuntimeRef `json:"trainingRuntimeRef"` + + // Configuration of the desired trainer. + Trainer *Trainer `json:"trainer,omitempty"` + + // Configuration of the training dataset. + DatasetConfig *DatasetConfig `json:"datasetConfig,omitempty"` + + // Configuration of the pre-trained and trained model. + ModelConfig *ModelConfig `json:"modelConfig,omitempty"` + + // Labels to apply for the derivative JobSet and Jobs. + // They will be merged with the TrainingRuntime values. + Labels map[string]string `json:"labels,omitempty"` + + // Annotations to apply for the derivative JobSet and Jobs. + // They will be merged with the TrainingRuntime values. + Annotations map[string]string `json:"annotations,omitempty"` + + // Custom overrides for the training runtime. + PodSpecOverrides []PodSpecOverrides `json:"podSpecOverrides,omitempty"` + + // Whether the controller should suspend the running TrainJob. + // Defaults to false. + Suspend *bool `json:"suspend,omitempty"` + + // ManagedBy field indicates the controller that manages a TrainJob. + ManagedBy *string `json:"managedBy,omitempty"` +} + +// TrainingRuntimeRef represents the reference to the existing training runtime. +type TrainingRuntimeRef struct { + // Name of the runtime being referenced. + // When namespaced-scoped TrainingRuntime is used, the TrainJob must have + // the same namespace as the deployed runtime. + Name string `json:"name"` + + // APIGroup of the runtime being referenced. + // Defaults to `kubeflow.org`. + APIGroup *string `json:"apiGroup,omitempty"` + + // Kind of the runtime being referenced. + // It must be one of TrainingRuntime or ClusterTrainingRuntime. + // Defaults to ClusterTrainingRuntime. + Kind *string `json:"kind,omitempty"` +} + +// Trainer represents the desired trainer configuration. +// Every training runtime contains `trainer` container which represents Trainer. +type Trainer struct { + // Docker image for the training container. + Image string `json:"image,omitempty"` + + // Entrypoint commands for the training container. + Command []string `json:"command,omitempty"` + + // Arguments to the entrypoint for the training container. + Args []string `json:"args,omitempty"` + + // List of environment variables to set in the training container. + // These values will be merged with the TrainingRuntime's trainer environments. + Env []corev1.EnvVar `json:"env,omitempty"` + + // Number of training nodes. + // TODO (andreyvelich): Do we want to support dynamic num of nodes in TrainJob for PyTorch elastic: `--nnodes=1:4` ? + NumNodes *int32 `json:"numNodes,omitempty"` + + // Compute resources for each training node. + ResourcesPerNode corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"` + + // Number of processes/workers/slots on every training node. + // For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. + // For the MPI runtime only int value can be set. + NumProcPerNode *string `json:"numProcPerNode,omitempty"` +} + +// DatasetConfig represents the desired dataset configuration. +// When this API is used, the training runtime must have +// the `dataset-initializer` container in the `Initializer` Job. +type DatasetConfig struct { + // Storage uri for the dataset provider. + StorageUri string `json:"storageUri"` + + // List of environment variables to set in the dataset initializer container. + // These values will be merged with the TrainingRuntime's dataset initializer environments. + Env []corev1.EnvVar `json:"env,omitempty"` + + // Reference to the TrainJob's secrets to download dataset. + SecretRef corev1.SecretReference `json:"secretRef,omitempty"` +} + +// ModelConfig represents the desired model configuration. +type ModelConfig struct { + // Configuration of the pre-trained model. + // When this API is used, the training runtime must have + // the `model-initializer` container in the `Initializer` Job. + Input *InputModel `json:"input,omitempty"` + + // Configuration of the trained model. + // When this API is used, the training runtime must have + // the `model-exporter` container in the `Exporter` Job. + Output *OutputModel `json:"output,omitempty"` +} + +// InputModel represents the desired pre-trained model configuration. +type InputModel struct { + // Storage uri for the model provider. + StorageUri string `json:"storageUri"` + + // List of environment variables to set in the model initializer container. + // These values will be merged with the TrainingRuntime's model initializer environments. + Env []corev1.EnvVar `json:"env,omitempty"` + + // Reference to the TrainJob's secrets to download model. + SecretRef corev1.SecretReference `json:"secretRef,omitempty"` +} + +// OutputModel represents the desired trained model configuration. +type OutputModel struct { + // Storage uri for the model exporter. + StorageUri string `json:"storageUri"` + + // List of environment variables to set in the model exporter container. + // These values will be merged with the TrainingRuntime's model exporter environments. + Env []corev1.EnvVar `json:"env,omitempty"` + + // Reference to the TrainJob's secrets to export model. + SecretRef corev1.SecretReference `json:"secretRef,omitempty"` +} + +// PodSpecOverrides represents the custom overrides that will be applied for the TrainJob's resources. +type PodSpecOverrides struct { + // Names of the training job replicas in the training runtime template to apply the overrides. + TargetReplicatedJobs []string `json:"targetReplicatedJobs"` + + // Overrides for the containers in the desired job templates. + Containers []ContainerOverrides `json:"containers,omitempty"` + + // Overrides for the init container in the desired job templates. + InitContainers []ContainerOverrides `json:"initContainers,omitempty"` + + // Overrides for the Pod volume configuration. + Volumes []corev1.Volume `json:"volumes,omitempty"` + + // Override for the service account. + ServiceAccountName string `json:"serviceAccountName,omitempty"` + + // Override for the node selector to place Pod on the specific mode. + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + + // Override for the Pod's tolerations. + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +// ContainerOverrides represents parameters that can be overridden using PodSpecOverrides. +// Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence. +type ContainerOverrides struct { + // Name for the container. TrainingRuntime must have this container. + Name string `json:"name"` + + // Entrypoint commands for the training container. + Command []string `json:"command,omitempty"` + + // Arguments to the entrypoint for the training container. + Args []string `json:"args,omitempty"` + + // List of environment variables to set in the container. + // These values will be merged with the TrainingRuntime's environments. + Env []corev1.EnvVar `json:"env,omitempty"` + + // List of sources to populate environment variables in the container. + // These values will be merged with the TrainingRuntime's environments. + EnvFrom []corev1.EnvFromSource `json:"envFrom,omitempty"` + + // Pod volumes to mount into the container's filesystem. + VolumeMounts []corev1.VolumeMount `json:"volumeMounts,omitempty"` +} + +// TrainJobStatus represents the current status of TrainJob. +type TrainJobStatus struct { + // Conditions for the TrainJob. + Conditions []metav1.Condition `json:"conditions,omitempty"` + + // ReplicatedJobsStatus tracks the number of Jobs for each replicatedJob in TrainJob. + ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"` +} + +// TranJobList is a collection of training jobs. +type TranJobList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + metav1.ListMeta `json:"metadata,omitempty"` + + // List of TrainJobs. + Items []TrainJob `json:"items"` +} + +// TODO: Enable this after controller implementation. +// func init() { +// SchemeBuilder.Register(&TrainJob{}, &TranJobList{}) +// } From ed830c8885149f659bc5cd8eb4bd41a407c7ba19 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 16 Aug 2024 13:42:14 +0100 Subject: [PATCH 02/12] Fix TrainJobList Signed-off-by: Andrey Velichkevich --- pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index 700d398c5c..df6348ff20 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -226,8 +226,8 @@ type TrainJobStatus struct { ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"` } -// TranJobList is a collection of training jobs. -type TranJobList struct { +// TrainJobList is a collection of training jobs. +type TrainJobList struct { metav1.TypeMeta `json:",inline"` // Standard list metadata. @@ -239,5 +239,5 @@ type TranJobList struct { // TODO: Enable this after controller implementation. // func init() { -// SchemeBuilder.Register(&TrainJob{}, &TranJobList{}) +// SchemeBuilder.Register(&TrainJob{}, &TrainJobList{}) // } From 66e7049e159c9755656cde335dee95213a5cb7e6 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 16 Aug 2024 14:38:32 +0100 Subject: [PATCH 03/12] Register APIs with scheme Signed-off-by: Andrey Velichkevich --- .../v2alpha1/groupversion_info.go | 36 + .../v2alpha1/trainingruntime_types.go | 16 +- .../kubeflow.org/v2alpha1/trainjob_types.go | 34 +- .../v2alpha1/zz_generated.deepcopy.go | 736 ++++++++++++++++++ 4 files changed, 804 insertions(+), 18 deletions(-) create mode 100644 pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go create mode 100644 pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go diff --git a/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go new file mode 100644 index 0000000000..450a22ad3d --- /dev/null +++ b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go @@ -0,0 +1,36 @@ +/* +Copyright 2024 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v2alpha1 contains API Schema definitions for the kubeflow.org v2alpha1 API group +// +kubebuilder:object:generate=true +// +groupName=kubeflow.org +package v2alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects. + GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v2alpha1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme. + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index 95f596373d..c767b1a4bb 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -22,6 +22,8 @@ import ( jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) +// +kubebuilder:object:root=true + // ClusterTrainingRuntime represents a training runtime which can be referenced as part of // `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced // by TrainJob that created in *any* namespace. @@ -35,6 +37,8 @@ type ClusterTrainingRuntime struct { Spec TrainingRuntimeSpec `json:"spec,omitempty"` } +// +kubebuilder:object:root=true + // ClusterTrainingRuntimeList is a collection of cluster training runtimes. type ClusterTrainingRuntimeList struct { metav1.TypeMeta `json:",inline"` @@ -46,6 +50,8 @@ type ClusterTrainingRuntimeList struct { Items []ClusterTrainingRuntime `json:"items"` } +// +kubebuilder:object:root=true + // TrainingRuntime represents a training runtime which can be referenced as part of // `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced // by TrainJob that created in the *same* namespace as the TrainingRuntime. @@ -59,6 +65,8 @@ type TrainingRuntime struct { Spec TrainingRuntimeSpec `json:"spec"` } +// +kubebuilder:object:root=true + // TrainingRuntimeList is a collection of training runtimes. type TrainingRuntimeList struct { metav1.TypeMeta `json:",inline"` @@ -121,6 +129,7 @@ type TorchSpec struct { // Number of processes per node. // This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. // Supported values: `auto`, `cpu`, `gpu`, or int value. + // TODO (andreyvelich): Add kubebuilder validation. // Defaults to `auto`. NumProcPerNode *string `json:"numProcPerNode,omitempty"` @@ -181,7 +190,6 @@ const ( MPIImplementationMPICH MPIImplementation = "MPICH" ) -// TODO: Enable this after controller implementation. -// func init() { -// SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{}) -// } +func init() { + SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{}) +} diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index df6348ff20..12e2da5a5f 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -22,6 +22,11 @@ import ( jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.conditions[-1:].type` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + // TrainJob represents configuration of a training job. type TrainJob struct { metav1.TypeMeta `json:",inline"` @@ -36,6 +41,19 @@ type TrainJob struct { Status TrainJobStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// TrainJobList is a collection of training jobs. +type TrainJobList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + metav1.ListMeta `json:"metadata,omitempty"` + + // List of TrainJobs. + Items []TrainJob `json:"items"` +} + // TrainJobSpec represents specification of the desired TrainJob. type TrainJobSpec struct { // Reference to the training runtime. @@ -226,18 +244,6 @@ type TrainJobStatus struct { ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"` } -// TrainJobList is a collection of training jobs. -type TrainJobList struct { - metav1.TypeMeta `json:",inline"` - - // Standard list metadata. - metav1.ListMeta `json:"metadata,omitempty"` - - // List of TrainJobs. - Items []TrainJob `json:"items"` +func init() { + SchemeBuilder.Register(&TrainJob{}, &TrainJobList{}) } - -// TODO: Enable this after controller implementation. -// func init() { -// SchemeBuilder.Register(&TrainJob{}, &TrainJobList{}) -// } diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go new file mode 100644 index 0000000000..cea5c2bde4 --- /dev/null +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -0,0 +1,736 @@ +//go:build !ignore_autogenerated + +// Copyright 2023 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by controller-gen. DO NOT EDIT. + +package v2alpha1 + +import ( + "k8s.io/api/autoscaling/v2" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/jobset/api/jobset/v1alpha2" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterTrainingRuntime) DeepCopyInto(out *ClusterTrainingRuntime) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterTrainingRuntime. +func (in *ClusterTrainingRuntime) DeepCopy() *ClusterTrainingRuntime { + if in == nil { + return nil + } + out := new(ClusterTrainingRuntime) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterTrainingRuntime) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterTrainingRuntimeList) DeepCopyInto(out *ClusterTrainingRuntimeList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterTrainingRuntime, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterTrainingRuntimeList. +func (in *ClusterTrainingRuntimeList) DeepCopy() *ClusterTrainingRuntimeList { + if in == nil { + return nil + } + out := new(ClusterTrainingRuntimeList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterTrainingRuntimeList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ContainerOverrides) DeepCopyInto(out *ContainerOverrides) { + *out = *in + if in.Command != nil { + in, out := &in.Command, &out.Command + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.EnvFrom != nil { + in, out := &in.EnvFrom, &out.EnvFrom + *out = make([]v1.EnvFromSource, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.VolumeMounts != nil { + in, out := &in.VolumeMounts, &out.VolumeMounts + *out = make([]v1.VolumeMount, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerOverrides. +func (in *ContainerOverrides) DeepCopy() *ContainerOverrides { + if in == nil { + return nil + } + out := new(ContainerOverrides) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.SecretRef = in.SecretRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetConfig. +func (in *DatasetConfig) DeepCopy() *DatasetConfig { + if in == nil { + return nil + } + out := new(DatasetConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InputModel) DeepCopyInto(out *InputModel) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.SecretRef = in.SecretRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InputModel. +func (in *InputModel) DeepCopy() *InputModel { + if in == nil { + return nil + } + out := new(InputModel) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MLSpec) DeepCopyInto(out *MLSpec) { + *out = *in + if in.TorchSpec != nil { + in, out := &in.TorchSpec, &out.TorchSpec + *out = new(TorchSpec) + (*in).DeepCopyInto(*out) + } + if in.MPISpec != nil { + in, out := &in.MPISpec, &out.MPISpec + *out = new(MPISpec) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpec. +func (in *MLSpec) DeepCopy() *MLSpec { + if in == nil { + return nil + } + out := new(MLSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MPISpec) DeepCopyInto(out *MPISpec) { + *out = *in + if in.NumProcPerNode != nil { + in, out := &in.NumProcPerNode, &out.NumProcPerNode + *out = new(int32) + **out = **in + } + if in.MPIImplementation != nil { + in, out := &in.MPIImplementation, &out.MPIImplementation + *out = new(MPIImplementation) + **out = **in + } + if in.SSHAuthMountPath != nil { + in, out := &in.SSHAuthMountPath, &out.SSHAuthMountPath + *out = new(string) + **out = **in + } + if in.RunLauncherAsNode != nil { + in, out := &in.RunLauncherAsNode, &out.RunLauncherAsNode + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPISpec. +func (in *MPISpec) DeepCopy() *MPISpec { + if in == nil { + return nil + } + out := new(MPISpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelConfig) DeepCopyInto(out *ModelConfig) { + *out = *in + if in.Input != nil { + in, out := &in.Input, &out.Input + *out = new(InputModel) + (*in).DeepCopyInto(*out) + } + if in.Output != nil { + in, out := &in.Output, &out.Output + *out = new(OutputModel) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelConfig. +func (in *ModelConfig) DeepCopy() *ModelConfig { + if in == nil { + return nil + } + out := new(ModelConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OutputModel) DeepCopyInto(out *OutputModel) { + *out = *in + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.SecretRef = in.SecretRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OutputModel. +func (in *OutputModel) DeepCopy() *OutputModel { + if in == nil { + return nil + } + out := new(OutputModel) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodGroupSpec) DeepCopyInto(out *PodGroupSpec) { + *out = *in + if in.ScheduleTimeoutSeconds != nil { + in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupSpec. +func (in *PodGroupSpec) DeepCopy() *PodGroupSpec { + if in == nil { + return nil + } + out := new(PodGroupSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodSpecOverrides) DeepCopyInto(out *PodSpecOverrides) { + *out = *in + if in.TargetReplicatedJobs != nil { + in, out := &in.TargetReplicatedJobs, &out.TargetReplicatedJobs + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Containers != nil { + in, out := &in.Containers, &out.Containers + *out = make([]ContainerOverrides, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.InitContainers != nil { + in, out := &in.InitContainers, &out.InitContainers + *out = make([]ContainerOverrides, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Volumes != nil { + in, out := &in.Volumes, &out.Volumes + *out = make([]v1.Volume, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]v1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodSpecOverrides. +func (in *PodSpecOverrides) DeepCopy() *PodSpecOverrides { + if in == nil { + return nil + } + out := new(PodSpecOverrides) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TorchElasticPolicy) DeepCopyInto(out *TorchElasticPolicy) { + *out = *in + if in.MaxRestarts != nil { + in, out := &in.MaxRestarts, &out.MaxRestarts + *out = new(int32) + **out = **in + } + if in.MinNodes != nil { + in, out := &in.MinNodes, &out.MinNodes + *out = new(int32) + **out = **in + } + if in.MaxNodes != nil { + in, out := &in.MaxNodes, &out.MaxNodes + *out = new(int32) + **out = **in + } + if in.Metrics != nil { + in, out := &in.Metrics, &out.Metrics + *out = make([]v2.MetricSpec, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchElasticPolicy. +func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy { + if in == nil { + return nil + } + out := new(TorchElasticPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TorchSpec) DeepCopyInto(out *TorchSpec) { + *out = *in + if in.NumProcPerNode != nil { + in, out := &in.NumProcPerNode, &out.NumProcPerNode + *out = new(string) + **out = **in + } + if in.Standalone != nil { + in, out := &in.Standalone, &out.Standalone + *out = new(bool) + **out = **in + } + if in.ElasticPolicy != nil { + in, out := &in.ElasticPolicy, &out.ElasticPolicy + *out = new(TorchElasticPolicy) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchSpec. +func (in *TorchSpec) DeepCopy() *TorchSpec { + if in == nil { + return nil + } + out := new(TorchSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainJob) DeepCopyInto(out *TrainJob) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJob. +func (in *TrainJob) DeepCopy() *TrainJob { + if in == nil { + return nil + } + out := new(TrainJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TrainJob) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainJobList) DeepCopyInto(out *TrainJobList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TrainJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobList. +func (in *TrainJobList) DeepCopy() *TrainJobList { + if in == nil { + return nil + } + out := new(TrainJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TrainJobList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainJobSpec) DeepCopyInto(out *TrainJobSpec) { + *out = *in + in.TrainingRuntimeRef.DeepCopyInto(&out.TrainingRuntimeRef) + if in.Trainer != nil { + in, out := &in.Trainer, &out.Trainer + *out = new(Trainer) + (*in).DeepCopyInto(*out) + } + if in.DatasetConfig != nil { + in, out := &in.DatasetConfig, &out.DatasetConfig + *out = new(DatasetConfig) + (*in).DeepCopyInto(*out) + } + if in.ModelConfig != nil { + in, out := &in.ModelConfig, &out.ModelConfig + *out = new(ModelConfig) + (*in).DeepCopyInto(*out) + } + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Annotations != nil { + in, out := &in.Annotations, &out.Annotations + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.PodSpecOverrides != nil { + in, out := &in.PodSpecOverrides, &out.PodSpecOverrides + *out = make([]PodSpecOverrides, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Suspend != nil { + in, out := &in.Suspend, &out.Suspend + *out = new(bool) + **out = **in + } + if in.ManagedBy != nil { + in, out := &in.ManagedBy, &out.ManagedBy + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobSpec. +func (in *TrainJobSpec) DeepCopy() *TrainJobSpec { + if in == nil { + return nil + } + out := new(TrainJobSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainJobStatus) DeepCopyInto(out *TrainJobStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.ReplicatedJobsStatus != nil { + in, out := &in.ReplicatedJobsStatus, &out.ReplicatedJobsStatus + *out = make([]v1alpha2.ReplicatedJobStatus, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobStatus. +func (in *TrainJobStatus) DeepCopy() *TrainJobStatus { + if in == nil { + return nil + } + out := new(TrainJobStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Trainer) DeepCopyInto(out *Trainer) { + *out = *in + if in.Command != nil { + in, out := &in.Command, &out.Command + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]v1.EnvVar, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.NumNodes != nil { + in, out := &in.NumNodes, &out.NumNodes + *out = new(int32) + **out = **in + } + in.ResourcesPerNode.DeepCopyInto(&out.ResourcesPerNode) + if in.NumProcPerNode != nil { + in, out := &in.NumProcPerNode, &out.NumProcPerNode + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Trainer. +func (in *Trainer) DeepCopy() *Trainer { + if in == nil { + return nil + } + out := new(Trainer) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainingRuntime) DeepCopyInto(out *TrainingRuntime) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntime. +func (in *TrainingRuntime) DeepCopy() *TrainingRuntime { + if in == nil { + return nil + } + out := new(TrainingRuntime) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TrainingRuntime) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainingRuntimeList) DeepCopyInto(out *TrainingRuntimeList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]TrainingRuntime, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeList. +func (in *TrainingRuntimeList) DeepCopy() *TrainingRuntimeList { + if in == nil { + return nil + } + out := new(TrainingRuntimeList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TrainingRuntimeList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainingRuntimeRef) DeepCopyInto(out *TrainingRuntimeRef) { + *out = *in + if in.APIGroup != nil { + in, out := &in.APIGroup, &out.APIGroup + *out = new(string) + **out = **in + } + if in.Kind != nil { + in, out := &in.Kind, &out.Kind + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeRef. +func (in *TrainingRuntimeRef) DeepCopy() *TrainingRuntimeRef { + if in == nil { + return nil + } + out := new(TrainingRuntimeRef) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) { + *out = *in + if in.MLSpec != nil { + in, out := &in.MLSpec, &out.MLSpec + *out = new(MLSpec) + (*in).DeepCopyInto(*out) + } + if in.NumNodes != nil { + in, out := &in.NumNodes, &out.NumNodes + *out = new(int32) + **out = **in + } + if in.JobSetSpec != nil { + in, out := &in.JobSetSpec, &out.JobSetSpec + *out = new(v1alpha2.JobSetSpec) + (*in).DeepCopyInto(*out) + } + if in.PodGroupSpec != nil { + in, out := &in.PodGroupSpec, &out.PodGroupSpec + *out = new(PodGroupSpec) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeSpec. +func (in *TrainingRuntimeSpec) DeepCopy() *TrainingRuntimeSpec { + if in == nil { + return nil + } + out := new(TrainingRuntimeSpec) + in.DeepCopyInto(out) + return out +} From bfa1f200911b126dc4cbc0c05e85464a4585ca2f Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 16 Aug 2024 14:44:57 +0100 Subject: [PATCH 04/12] Add SchemeGroupVersion Signed-off-by: Andrey Velichkevich --- pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go index 450a22ad3d..a4cc2fd7d5 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go +++ b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go @@ -31,6 +31,9 @@ var ( // SchemeBuilder is used to add go types to the GroupVersionKind scheme. SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + // SchemeGroupVersion is alias to GroupVersion for client-go libraries. + SchemeGroupVersion = GroupVersion + // AddToScheme adds the types in this group-version to the given scheme. AddToScheme = SchemeBuilder.AddToScheme ) From 2bf13c9af60a028242d86afeb406b78f94b83751 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 19 Aug 2024 16:57:42 +0100 Subject: [PATCH 05/12] Fix TrainingRuntimeSpec omitempty Signed-off-by: Andrey Velichkevich --- pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index c767b1a4bb..dce5aeb421 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -62,7 +62,7 @@ type TrainingRuntime struct { metav1.ObjectMeta `json:"metadata,omitempty"` // Specification of the desired TrainingRuntime. - Spec TrainingRuntimeSpec `json:"spec"` + Spec TrainingRuntimeSpec `json:"spec,omitempty"` } // +kubebuilder:object:root=true From 72a933e500ed04f359d91d9b94c2f6a059bcbcf6 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 19 Aug 2024 17:52:37 +0100 Subject: [PATCH 06/12] Generate manifests only for v1 Signed-off-by: Andrey Velichkevich --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1727392003..4cb00d8578 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ help: ## Display this help. ##@ Development manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." \ + $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/apis/kubeflow.org/v1/..." \ output:crd:artifacts:config=manifests/base/crds \ output:rbac:artifacts:config=manifests/base/rbac \ output:webhook:artifacts:config=manifests/base/webhook From 9d0a6865de8b604ad3ed0db67e90817d5f4ed38d Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Tue, 20 Aug 2024 17:22:36 +0100 Subject: [PATCH 07/12] Fix pointers for APIs Signed-off-by: Andrey Velichkevich --- .../v2alpha1/trainingruntime_types.go | 3 ++- pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index dce5aeb421..0695b27567 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -171,7 +171,8 @@ type MPISpec struct { NumProcPerNode *int32 `json:"numProcPerNode,omitempty"` // Implementation name for the MPI to create the appropriate hostfile. - MPIImplementation *MPIImplementation `json:"mpiImplementation"` + // Defaults to OpenMPI. + MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"` // Directory where SSH keys are mounted. SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"` diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index 12e2da5a5f..99189df4f7 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -108,7 +108,7 @@ type TrainingRuntimeRef struct { // Every training runtime contains `trainer` container which represents Trainer. type Trainer struct { // Docker image for the training container. - Image string `json:"image,omitempty"` + Image *string `json:"image,omitempty"` // Entrypoint commands for the training container. Command []string `json:"command,omitempty"` @@ -125,7 +125,7 @@ type Trainer struct { NumNodes *int32 `json:"numNodes,omitempty"` // Compute resources for each training node. - ResourcesPerNode corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"` + ResourcesPerNode *corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"` // Number of processes/workers/slots on every training node. // For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. @@ -138,14 +138,14 @@ type Trainer struct { // the `dataset-initializer` container in the `Initializer` Job. type DatasetConfig struct { // Storage uri for the dataset provider. - StorageUri string `json:"storageUri"` + StorageUri *string `json:"storageUri,omitempty"` // List of environment variables to set in the dataset initializer container. // These values will be merged with the TrainingRuntime's dataset initializer environments. Env []corev1.EnvVar `json:"env,omitempty"` // Reference to the TrainJob's secrets to download dataset. - SecretRef corev1.SecretReference `json:"secretRef,omitempty"` + SecretRef *corev1.SecretReference `json:"secretRef,omitempty"` } // ModelConfig represents the desired model configuration. @@ -164,27 +164,27 @@ type ModelConfig struct { // InputModel represents the desired pre-trained model configuration. type InputModel struct { // Storage uri for the model provider. - StorageUri string `json:"storageUri"` + StorageUri *string `json:"storageUri,omitempty"` // List of environment variables to set in the model initializer container. // These values will be merged with the TrainingRuntime's model initializer environments. Env []corev1.EnvVar `json:"env,omitempty"` // Reference to the TrainJob's secrets to download model. - SecretRef corev1.SecretReference `json:"secretRef,omitempty"` + SecretRef *corev1.SecretReference `json:"secretRef,omitempty"` } // OutputModel represents the desired trained model configuration. type OutputModel struct { // Storage uri for the model exporter. - StorageUri string `json:"storageUri"` + StorageUri *string `json:"storageUri,omitempty"` // List of environment variables to set in the model exporter container. // These values will be merged with the TrainingRuntime's model exporter environments. Env []corev1.EnvVar `json:"env,omitempty"` // Reference to the TrainJob's secrets to export model. - SecretRef corev1.SecretReference `json:"secretRef,omitempty"` + SecretRef *corev1.SecretReference `json:"secretRef,omitempty"` } // PodSpecOverrides represents the custom overrides that will be applied for the TrainJob's resources. From 880560c6ebdcd535004c6e13d0e599945ebe0897 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Tue, 20 Aug 2024 18:55:23 +0100 Subject: [PATCH 08/12] Run code-gen Signed-off-by: Andrey Velichkevich --- .../v2alpha1/zz_generated.deepcopy.go | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index cea5c2bde4..06d6934377 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -133,6 +133,11 @@ func (in *ContainerOverrides) DeepCopy() *ContainerOverrides { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) { *out = *in + if in.StorageUri != nil { + in, out := &in.StorageUri, &out.StorageUri + *out = new(string) + **out = **in + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) @@ -140,7 +145,11 @@ func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - out.SecretRef = in.SecretRef + if in.SecretRef != nil { + in, out := &in.SecretRef, &out.SecretRef + *out = new(v1.SecretReference) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetConfig. @@ -156,6 +165,11 @@ func (in *DatasetConfig) DeepCopy() *DatasetConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *InputModel) DeepCopyInto(out *InputModel) { *out = *in + if in.StorageUri != nil { + in, out := &in.StorageUri, &out.StorageUri + *out = new(string) + **out = **in + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) @@ -163,7 +177,11 @@ func (in *InputModel) DeepCopyInto(out *InputModel) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - out.SecretRef = in.SecretRef + if in.SecretRef != nil { + in, out := &in.SecretRef, &out.SecretRef + *out = new(v1.SecretReference) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InputModel. @@ -209,11 +227,6 @@ func (in *MPISpec) DeepCopyInto(out *MPISpec) { *out = new(int32) **out = **in } - if in.MPIImplementation != nil { - in, out := &in.MPIImplementation, &out.MPIImplementation - *out = new(MPIImplementation) - **out = **in - } if in.SSHAuthMountPath != nil { in, out := &in.SSHAuthMountPath, &out.SSHAuthMountPath *out = new(string) @@ -264,6 +277,11 @@ func (in *ModelConfig) DeepCopy() *ModelConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *OutputModel) DeepCopyInto(out *OutputModel) { *out = *in + if in.StorageUri != nil { + in, out := &in.StorageUri, &out.StorageUri + *out = new(string) + **out = **in + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) @@ -271,7 +289,11 @@ func (in *OutputModel) DeepCopyInto(out *OutputModel) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - out.SecretRef = in.SecretRef + if in.SecretRef != nil { + in, out := &in.SecretRef, &out.SecretRef + *out = new(v1.SecretReference) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OutputModel. @@ -577,6 +599,11 @@ func (in *TrainJobStatus) DeepCopy() *TrainJobStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Trainer) DeepCopyInto(out *Trainer) { *out = *in + if in.Image != nil { + in, out := &in.Image, &out.Image + *out = new(string) + **out = **in + } if in.Command != nil { in, out := &in.Command, &out.Command *out = make([]string, len(*in)) @@ -599,7 +626,11 @@ func (in *Trainer) DeepCopyInto(out *Trainer) { *out = new(int32) **out = **in } - in.ResourcesPerNode.DeepCopyInto(&out.ResourcesPerNode) + if in.ResourcesPerNode != nil { + in, out := &in.ResourcesPerNode, &out.ResourcesPerNode + *out = new(v1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode *out = new(string) From 49a004c0c5ccfad9dc48d7fbafa6d35028644cbe Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Wed, 21 Aug 2024 17:39:37 +0100 Subject: [PATCH 09/12] Use pointer for MPIImplementation Signed-off-by: Andrey Velichkevich --- pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go | 2 +- pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index 0695b27567..7a740d11b4 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -172,7 +172,7 @@ type MPISpec struct { // Implementation name for the MPI to create the appropriate hostfile. // Defaults to OpenMPI. - MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"` + MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"` // Directory where SSH keys are mounted. SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"` diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index 06d6934377..5b8870acb6 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -227,6 +227,11 @@ func (in *MPISpec) DeepCopyInto(out *MPISpec) { *out = new(int32) **out = **in } + if in.MPIImplementation != nil { + in, out := &in.MPIImplementation, &out.MPIImplementation + *out = new(MPIImplementation) + **out = **in + } if in.SSHAuthMountPath != nil { in, out := &in.SSHAuthMountPath, &out.SSHAuthMountPath *out = new(string) From c28a1669e7e28647c8f0c796368297447c5e3a56 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Thu, 22 Aug 2024 17:45:13 +0100 Subject: [PATCH 10/12] Update the JobSetTemplate API Signed-off-by: Andrey Velichkevich --- .../v2alpha1/trainingruntime_types.go | 57 +++++++----- .../v2alpha1/zz_generated.deepcopy.go | 88 ++++++++++++------- 2 files changed, 92 insertions(+), 53 deletions(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index 7a740d11b4..ede9920b41 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -80,18 +80,25 @@ type TrainingRuntimeList struct { // TrainingRuntimeSpec represents a specification of the desired training runtime. type TrainingRuntimeSpec struct { - // Configuration for the runtime-specific parameters, such as Torch or MPI. - MLSpec *MLSpec `json:"mlSpec,omitempty"` - // Number of training nodes. - // Defaults to 1. - NumNodes *int32 `json:"numNodes,omitempty"` - - // JobSet configuration which will be used by TrainJob. - JobSetSpec *jobsetv1alpha2.JobSetSpec `json:",inline"` + // Configuration for the model training with ML-specific parameters. + MLSpec *MLSpec `json:"mlSpec,omitempty"` // Configuration for the PodGroup to enable gang-scheduling via supported plugins. PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"` + + // JobSet template which will be used by TrainJob. + Template JobSetTemplateSpec `json:"template"` +} + +// JobSetTemplateSpec represents a template of the desired JobSet. +type JobSetTemplateSpec struct { + // Metadata for custom JobSet's labels and annotations. + // JobSet name and namespace is equal to the TrainJob's name and namespace. + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired JobSet which will be created from TrainJob. + Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"` } // PodGroupSpec represents a PodGroup configuration to enable gang-scheduling. @@ -114,18 +121,31 @@ const ( GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling" ) -// MLSpec represents the runtime-specific configuration for various technologies. -// One of the following specs can be set. +// MLSpec represents configuration for the model trining with ML-specific parameters. type MLSpec struct { + + // Number of training nodes. + // Defaults to 1. + NumNodes *int32 `json:"numNodes,omitempty"` + + // Configuration for the runtime-specific parameters, such as Torch or MPI. + // One of the following spec sources can be set. + MLSpecSource `json:",inline"` +} + +// MLPolicySource represents the runtime-specific configuration for various technologies. +// One of the following specs can be set. +type MLSpecSource struct { + // Configuration for the PyTorch runtime. - TorchSpec *TorchSpec `json:"torchSpec,omitempty"` + Torch *TorchMLSpecSource `json:"torch,omitempty"` // Configuration for the MPI Runtime. - MPISpec *MPISpec `json:"mpiSpec,omitempty"` + MPI *MPIMLSpecSource `json:"mpi,omitempty"` } -// TorchSpec represents a PyTorch runtime configuration. -type TorchSpec struct { +// TorchMLSpecSource represents a PyTorch runtime configuration. +type TorchMLSpecSource struct { // Number of processes per node. // This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. // Supported values: `auto`, `cpu`, `gpu`, or int value. @@ -133,11 +153,6 @@ type TorchSpec struct { // Defaults to `auto`. NumProcPerNode *string `json:"numProcPerNode,omitempty"` - // Whether to run single-node multi-worker training. - // This value is inserted into the `--standalone` argument of the `torchrun` CLI. - // Defaults to false. - Standalone *bool `json:"standalone,omitempty"` - // Elastic policy for the PyTorch training. ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"` } @@ -164,8 +179,8 @@ type TorchElasticPolicy struct { Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` } -// MPISpec represents a MPI runtime configuration. -type MPISpec struct { +// MPIMLSpecSource represents a MPI runtime configuration. +type MPIMLSpecSource struct { // Number of processes per node. // This value is equal to the number of slots for each node in the hostfile. NumProcPerNode *int32 `json:"numProcPerNode,omitempty"` diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index 5b8870acb6..20d4ba0ef0 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -195,18 +195,31 @@ func (in *InputModel) DeepCopy() *InputModel { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MLSpec) DeepCopyInto(out *MLSpec) { +func (in *JobSetTemplateSpec) DeepCopyInto(out *JobSetTemplateSpec) { *out = *in - if in.TorchSpec != nil { - in, out := &in.TorchSpec, &out.TorchSpec - *out = new(TorchSpec) - (*in).DeepCopyInto(*out) + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobSetTemplateSpec. +func (in *JobSetTemplateSpec) DeepCopy() *JobSetTemplateSpec { + if in == nil { + return nil } - if in.MPISpec != nil { - in, out := &in.MPISpec, &out.MPISpec - *out = new(MPISpec) - (*in).DeepCopyInto(*out) + out := new(JobSetTemplateSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MLSpec) DeepCopyInto(out *MLSpec) { + *out = *in + if in.NumNodes != nil { + in, out := &in.NumNodes, &out.NumNodes + *out = new(int32) + **out = **in } + in.MLSpecSource.DeepCopyInto(&out.MLSpecSource) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpec. @@ -220,7 +233,32 @@ func (in *MLSpec) DeepCopy() *MLSpec { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MPISpec) DeepCopyInto(out *MPISpec) { +func (in *MLSpecSource) DeepCopyInto(out *MLSpecSource) { + *out = *in + if in.Torch != nil { + in, out := &in.Torch, &out.Torch + *out = new(TorchMLSpecSource) + (*in).DeepCopyInto(*out) + } + if in.MPI != nil { + in, out := &in.MPI, &out.MPI + *out = new(MPIMLSpecSource) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpecSource. +func (in *MLSpecSource) DeepCopy() *MLSpecSource { + if in == nil { + return nil + } + out := new(MLSpecSource) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MPIMLSpecSource) DeepCopyInto(out *MPIMLSpecSource) { *out = *in if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode @@ -244,12 +282,12 @@ func (in *MPISpec) DeepCopyInto(out *MPISpec) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPISpec. -func (in *MPISpec) DeepCopy() *MPISpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLSpecSource. +func (in *MPIMLSpecSource) DeepCopy() *MPIMLSpecSource { if in == nil { return nil } - out := new(MPISpec) + out := new(MPIMLSpecSource) in.DeepCopyInto(out) return out } @@ -424,18 +462,13 @@ func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TorchSpec) DeepCopyInto(out *TorchSpec) { +func (in *TorchMLSpecSource) DeepCopyInto(out *TorchMLSpecSource) { *out = *in if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode *out = new(string) **out = **in } - if in.Standalone != nil { - in, out := &in.Standalone, &out.Standalone - *out = new(bool) - **out = **in - } if in.ElasticPolicy != nil { in, out := &in.ElasticPolicy, &out.ElasticPolicy *out = new(TorchElasticPolicy) @@ -443,12 +476,12 @@ func (in *TorchSpec) DeepCopyInto(out *TorchSpec) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchSpec. -func (in *TorchSpec) DeepCopy() *TorchSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLSpecSource. +func (in *TorchMLSpecSource) DeepCopy() *TorchMLSpecSource { if in == nil { return nil } - out := new(TorchSpec) + out := new(TorchMLSpecSource) in.DeepCopyInto(out) return out } @@ -744,21 +777,12 @@ func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) { *out = new(MLSpec) (*in).DeepCopyInto(*out) } - if in.NumNodes != nil { - in, out := &in.NumNodes, &out.NumNodes - *out = new(int32) - **out = **in - } - if in.JobSetSpec != nil { - in, out := &in.JobSetSpec, &out.JobSetSpec - *out = new(v1alpha2.JobSetSpec) - (*in).DeepCopyInto(*out) - } if in.PodGroupSpec != nil { in, out := &in.PodGroupSpec, &out.PodGroupSpec *out = new(PodGroupSpec) (*in).DeepCopyInto(*out) } + in.Template.DeepCopyInto(&out.Template) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeSpec. From 06e7653474bcfa269a7e60d820154d8decf850c8 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 26 Aug 2024 20:29:39 +0100 Subject: [PATCH 11/12] Rename PodGroupPolicy and MLPolicy APIs Signed-off-by: Andrey Velichkevich --- .../v2alpha1/trainingruntime_types.go | 60 ++++++----- .../v2alpha1/zz_generated.deepcopy.go | 102 ++++++++++++------ 2 files changed, 102 insertions(+), 60 deletions(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index ede9920b41..4acb3d2d62 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -82,10 +82,10 @@ type TrainingRuntimeList struct { type TrainingRuntimeSpec struct { // Configuration for the model training with ML-specific parameters. - MLSpec *MLSpec `json:"mlSpec,omitempty"` + MLPolicy *MLPolicy `json:"mlPolicy,omitempty"` // Configuration for the PodGroup to enable gang-scheduling via supported plugins. - PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"` + PodGroupPolicy *PodGroupPolicy `json:"podGroupPolicy,omitempty"` // JobSet template which will be used by TrainJob. Template JobSetTemplateSpec `json:"template"` @@ -101,51 +101,57 @@ type JobSetTemplateSpec struct { Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"` } -// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling. -type PodGroupSpec struct { - // Plugin for the gang-scheduling. - Plugin GangSchedulerPlugin `json:"plugin"` +// PodGroupPolicy represents a PodGroup configuration for gang-scheduling. +type PodGroupPolicy struct { - // Time threshold to schedule PodGroup for gang-scheduling. - ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"` + // Configuration for gang-scheduling using various plugins. + PodGroupPolicySource `json:",inline"` } -// GangSchedulerPlugin represents one of the supported gang-scheduling plugins. -type GangSchedulerPlugin string - -const ( - // Volcano plugin for gang-scheduling. - GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano" +// PodGroupPolicySource represents supported plugins for gang-scheduling. +// Only one of its members may be specified. +type PodGroupPolicySource struct { // Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling. - GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling" -) + Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"` + + // TODO (andreyvelich): Add support for Volcano gang-scheduler. +} + +// CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin. +type CoschedulingPodGroupPolicySource struct { + + // Time threshold to schedule PodGroup for gang-scheduling. + // If the scheduling timeout is equal to 0, the default value is used. + // Defaults to 60 seconds. + ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` +} -// MLSpec represents configuration for the model trining with ML-specific parameters. -type MLSpec struct { +// MLPolicy represents configuration for the model trining with ML-specific parameters. +type MLPolicy struct { // Number of training nodes. // Defaults to 1. NumNodes *int32 `json:"numNodes,omitempty"` // Configuration for the runtime-specific parameters, such as Torch or MPI. - // One of the following spec sources can be set. - MLSpecSource `json:",inline"` + // Only one of its members may be specified. + MLPolicySource `json:",inline"` } // MLPolicySource represents the runtime-specific configuration for various technologies. // One of the following specs can be set. -type MLSpecSource struct { +type MLPolicySource struct { // Configuration for the PyTorch runtime. - Torch *TorchMLSpecSource `json:"torch,omitempty"` + Torch *TorchMLPolicySource `json:"torch,omitempty"` // Configuration for the MPI Runtime. - MPI *MPIMLSpecSource `json:"mpi,omitempty"` + MPI *MPIMLPolicySource `json:"mpi,omitempty"` } -// TorchMLSpecSource represents a PyTorch runtime configuration. -type TorchMLSpecSource struct { +// TorchMLPolicySource represents a PyTorch runtime configuration. +type TorchMLPolicySource struct { // Number of processes per node. // This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. // Supported values: `auto`, `cpu`, `gpu`, or int value. @@ -179,8 +185,8 @@ type TorchElasticPolicy struct { Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` } -// MPIMLSpecSource represents a MPI runtime configuration. -type MPIMLSpecSource struct { +// MPIMLPolicySource represents a MPI runtime configuration. +type MPIMLPolicySource struct { // Number of processes per node. // This value is equal to the number of slots for each node in the hostfile. NumProcPerNode *int32 `json:"numProcPerNode,omitempty"` diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index 20d4ba0ef0..db62e89800 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -130,6 +130,26 @@ func (in *ContainerOverrides) DeepCopy() *ContainerOverrides { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CoschedulingPodGroupPolicySource) DeepCopyInto(out *CoschedulingPodGroupPolicySource) { + *out = *in + if in.ScheduleTimeoutSeconds != nil { + in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CoschedulingPodGroupPolicySource. +func (in *CoschedulingPodGroupPolicySource) DeepCopy() *CoschedulingPodGroupPolicySource { + if in == nil { + return nil + } + out := new(CoschedulingPodGroupPolicySource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) { *out = *in @@ -212,53 +232,53 @@ func (in *JobSetTemplateSpec) DeepCopy() *JobSetTemplateSpec { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MLSpec) DeepCopyInto(out *MLSpec) { +func (in *MLPolicy) DeepCopyInto(out *MLPolicy) { *out = *in if in.NumNodes != nil { in, out := &in.NumNodes, &out.NumNodes *out = new(int32) **out = **in } - in.MLSpecSource.DeepCopyInto(&out.MLSpecSource) + in.MLPolicySource.DeepCopyInto(&out.MLPolicySource) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpec. -func (in *MLSpec) DeepCopy() *MLSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLPolicy. +func (in *MLPolicy) DeepCopy() *MLPolicy { if in == nil { return nil } - out := new(MLSpec) + out := new(MLPolicy) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MLSpecSource) DeepCopyInto(out *MLSpecSource) { +func (in *MLPolicySource) DeepCopyInto(out *MLPolicySource) { *out = *in if in.Torch != nil { in, out := &in.Torch, &out.Torch - *out = new(TorchMLSpecSource) + *out = new(TorchMLPolicySource) (*in).DeepCopyInto(*out) } if in.MPI != nil { in, out := &in.MPI, &out.MPI - *out = new(MPIMLSpecSource) + *out = new(MPIMLPolicySource) (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpecSource. -func (in *MLSpecSource) DeepCopy() *MLSpecSource { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLPolicySource. +func (in *MLPolicySource) DeepCopy() *MLPolicySource { if in == nil { return nil } - out := new(MLSpecSource) + out := new(MLPolicySource) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MPIMLSpecSource) DeepCopyInto(out *MPIMLSpecSource) { +func (in *MPIMLPolicySource) DeepCopyInto(out *MPIMLPolicySource) { *out = *in if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode @@ -282,12 +302,12 @@ func (in *MPIMLSpecSource) DeepCopyInto(out *MPIMLSpecSource) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLSpecSource. -func (in *MPIMLSpecSource) DeepCopy() *MPIMLSpecSource { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLPolicySource. +func (in *MPIMLPolicySource) DeepCopy() *MPIMLPolicySource { if in == nil { return nil } - out := new(MPIMLSpecSource) + out := new(MPIMLPolicySource) in.DeepCopyInto(out) return out } @@ -350,21 +370,37 @@ func (in *OutputModel) DeepCopy() *OutputModel { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PodGroupSpec) DeepCopyInto(out *PodGroupSpec) { +func (in *PodGroupPolicy) DeepCopyInto(out *PodGroupPolicy) { *out = *in - if in.ScheduleTimeoutSeconds != nil { - in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds - *out = new(string) - **out = **in + in.PodGroupPolicySource.DeepCopyInto(&out.PodGroupPolicySource) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupPolicy. +func (in *PodGroupPolicy) DeepCopy() *PodGroupPolicy { + if in == nil { + return nil + } + out := new(PodGroupPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodGroupPolicySource) DeepCopyInto(out *PodGroupPolicySource) { + *out = *in + if in.Coscheduling != nil { + in, out := &in.Coscheduling, &out.Coscheduling + *out = new(CoschedulingPodGroupPolicySource) + (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupSpec. -func (in *PodGroupSpec) DeepCopy() *PodGroupSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupPolicySource. +func (in *PodGroupPolicySource) DeepCopy() *PodGroupPolicySource { if in == nil { return nil } - out := new(PodGroupSpec) + out := new(PodGroupPolicySource) in.DeepCopyInto(out) return out } @@ -462,7 +498,7 @@ func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TorchMLSpecSource) DeepCopyInto(out *TorchMLSpecSource) { +func (in *TorchMLPolicySource) DeepCopyInto(out *TorchMLPolicySource) { *out = *in if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode @@ -476,12 +512,12 @@ func (in *TorchMLSpecSource) DeepCopyInto(out *TorchMLSpecSource) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLSpecSource. -func (in *TorchMLSpecSource) DeepCopy() *TorchMLSpecSource { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLPolicySource. +func (in *TorchMLPolicySource) DeepCopy() *TorchMLPolicySource { if in == nil { return nil } - out := new(TorchMLSpecSource) + out := new(TorchMLPolicySource) in.DeepCopyInto(out) return out } @@ -772,14 +808,14 @@ func (in *TrainingRuntimeRef) DeepCopy() *TrainingRuntimeRef { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) { *out = *in - if in.MLSpec != nil { - in, out := &in.MLSpec, &out.MLSpec - *out = new(MLSpec) + if in.MLPolicy != nil { + in, out := &in.MLPolicy, &out.MLPolicy + *out = new(MLPolicy) (*in).DeepCopyInto(*out) } - if in.PodGroupSpec != nil { - in, out := &in.PodGroupSpec, &out.PodGroupSpec - *out = new(PodGroupSpec) + if in.PodGroupPolicy != nil { + in, out := &in.PodGroupPolicy, &out.PodGroupPolicy + *out = new(PodGroupPolicy) (*in).DeepCopyInto(*out) } in.Template.DeepCopyInto(&out.Template) From 7aa4094c8df8e7ca0a6157967b06607e92db25cf Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 26 Aug 2024 21:26:41 +0100 Subject: [PATCH 12/12] Update comments Signed-off-by: Andrey Velichkevich --- pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index 4acb3d2d62..65346d0f6b 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -80,7 +80,6 @@ type TrainingRuntimeList struct { // TrainingRuntimeSpec represents a specification of the desired training runtime. type TrainingRuntimeSpec struct { - // Configuration for the model training with ML-specific parameters. MLPolicy *MLPolicy `json:"mlPolicy,omitempty"` @@ -103,7 +102,6 @@ type JobSetTemplateSpec struct { // PodGroupPolicy represents a PodGroup configuration for gang-scheduling. type PodGroupPolicy struct { - // Configuration for gang-scheduling using various plugins. PodGroupPolicySource `json:",inline"` } @@ -111,7 +109,6 @@ type PodGroupPolicy struct { // PodGroupPolicySource represents supported plugins for gang-scheduling. // Only one of its members may be specified. type PodGroupPolicySource struct { - // Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling. Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"` @@ -119,8 +116,8 @@ type PodGroupPolicySource struct { } // CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin. +// The number of min members in the PodGroupSpec is always equal to the number of nodes. type CoschedulingPodGroupPolicySource struct { - // Time threshold to schedule PodGroup for gang-scheduling. // If the scheduling timeout is equal to 0, the default value is used. // Defaults to 60 seconds. @@ -129,7 +126,6 @@ type CoschedulingPodGroupPolicySource struct { // MLPolicy represents configuration for the model trining with ML-specific parameters. type MLPolicy struct { - // Number of training nodes. // Defaults to 1. NumNodes *int32 `json:"numNodes,omitempty"` @@ -142,7 +138,6 @@ type MLPolicy struct { // MLPolicySource represents the runtime-specific configuration for various technologies. // One of the following specs can be set. type MLPolicySource struct { - // Configuration for the PyTorch runtime. Torch *TorchMLPolicySource `json:"torch,omitempty"`