From 6865663ba1e48a36b8f70ab5631438cda175019a Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Fri, 16 Aug 2024 00:16:14 +0100
Subject: [PATCH 01/12] KEP-2170: Add TrainJob and TrainingRuntime APIs

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 go.mod                                        |  17 +-
 go.sum                                        |  35 +--
 .../v2alpha1/trainingruntime_types.go         | 170 +++++++++++++
 .../kubeflow.org/v2alpha1/trainjob_types.go   | 226 ++++++++++++++++++
 4 files changed, 423 insertions(+), 25 deletions(-)

diff --git a/go.mod b/go.mod
index 2790c7a228..eb2f0afcbc 100644
--- a/go.mod
+++ b/go.mod
@@ -5,8 +5,8 @@ go 1.22
 require (
 	github.com/go-logr/logr v1.4.1
 	github.com/google/go-cmp v0.6.0
-	github.com/onsi/ginkgo/v2 v2.14.0
-	github.com/onsi/gomega v1.30.0
+	github.com/onsi/ginkgo/v2 v2.17.1
+	github.com/onsi/gomega v1.32.0
 	github.com/open-policy-agent/cert-controller v0.10.1
 	github.com/prometheus/client_golang v1.18.0
 	github.com/sirupsen/logrus v1.9.0
@@ -19,7 +19,8 @@ require (
 	k8s.io/klog/v2 v2.110.1
 	k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00
 	k8s.io/utils v0.0.0-20230726121419-3b25d923346b
-	sigs.k8s.io/controller-runtime v0.17.2
+	sigs.k8s.io/controller-runtime v0.17.3
+	sigs.k8s.io/jobset v0.5.2
 	sigs.k8s.io/scheduler-plugins v0.28.9
 	sigs.k8s.io/yaml v1.4.0
 	volcano.sh/apis v1.9.0
@@ -44,8 +45,8 @@ require (
 	github.com/google/gnostic-models v0.6.8 // indirect
 	github.com/google/gofuzz v1.2.0 // indirect
 	github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect
-	github.com/google/uuid v1.3.0 // indirect
-	github.com/imdario/mergo v0.3.13 // indirect
+	github.com/google/uuid v1.3.1 // indirect
+	github.com/imdario/mergo v0.3.16 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
@@ -61,7 +62,7 @@ require (
 	github.com/spf13/pflag v1.0.5 // indirect
 	go.uber.org/atomic v1.11.0 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
-	golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect
+	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
 	golang.org/x/mod v0.16.0 // indirect
 	golang.org/x/net v0.23.0 // indirect
 	golang.org/x/oauth2 v0.12.0 // indirect
@@ -76,8 +77,8 @@ require (
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
-	k8s.io/apiextensions-apiserver v0.29.0 // indirect
-	k8s.io/component-base v0.29.0 // indirect
+	k8s.io/apiextensions-apiserver v0.29.2 // indirect
+	k8s.io/component-base v0.29.2 // indirect
 	k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 // indirect
 	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
 	sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
diff --git a/go.sum b/go.sum
index 75b92b0eeb..da8a571436 100644
--- a/go.sum
+++ b/go.sum
@@ -50,11 +50,11 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
 github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec=
 github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
-github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
-github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4=
+github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
-github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
-github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
+github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
+github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
 github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
 github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
@@ -80,10 +80,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
-github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY=
-github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw=
-github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8=
-github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
+github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8=
+github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs=
+github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk=
+github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg=
 github.com/open-policy-agent/cert-controller v0.10.1 h1:RXSYoyn8FdCenWecRP//UV5nbVfmstNpj4kHQFkvPK4=
 github.com/open-policy-agent/cert-controller v0.10.1/go.mod h1:4uRbBLY5DsPOog+a9pqk3JLxuuhrWsbUedQW65HcLTI=
 github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4 h1:5dum5SLEz+95JDLkMls7Z7IDPjvSq3UhJSFe4f5einQ=
@@ -130,8 +130,8 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 h1:tnebWN09GYg9OLPss1KXj8txwZc6X6uMr6VFdcGNbHw=
-golang.org/x/exp v0.0.0-20220827204233-334a2380cb91/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE=
+golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g=
+golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k=
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic=
@@ -191,21 +191,20 @@ gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
 gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 k8s.io/api v0.29.3 h1:2ORfZ7+bGC3YJqGpV0KSDDEVf8hdGQ6A03/50vj8pmw=
 k8s.io/api v0.29.3/go.mod h1:y2yg2NTyHUUkIoTC+phinTnEa3KFM6RZ3szxt014a80=
-k8s.io/apiextensions-apiserver v0.29.0 h1:0VuspFG7Hj+SxyF/Z/2T0uFbI5gb5LRgEyUVE3Q4lV0=
-k8s.io/apiextensions-apiserver v0.29.0/go.mod h1:TKmpy3bTS0mr9pylH0nOt/QzQRrW7/h7yLdRForMZwc=
+k8s.io/apiextensions-apiserver v0.29.2 h1:UK3xB5lOWSnhaCk0RFZ0LUacPZz9RY4wi/yt2Iu+btg=
+k8s.io/apiextensions-apiserver v0.29.2/go.mod h1:aLfYjpA5p3OwtqNXQFkhJ56TB+spV8Gc4wfMhUA3/b8=
 k8s.io/apimachinery v0.29.3 h1:2tbx+5L7RNvqJjn7RIuIKu9XTsIZ9Z5wX2G22XAa5EU=
 k8s.io/apimachinery v0.29.3/go.mod h1:hx/S4V2PNW4OMg3WizRrHutyB5la0iCUbZym+W0EQIU=
 k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg=
 k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0=
 k8s.io/code-generator v0.29.3 h1:m7E25/t9R9NvejspO2zBdyu+/Gl0Z5m7dCRc680KS14=
 k8s.io/code-generator v0.29.3/go.mod h1:x47ofBhN4gxYFcxeKA1PYXeaPreAGaDN85Y/lNUsPoM=
-k8s.io/component-base v0.29.0 h1:T7rjd5wvLnPBV1vC4zWd/iWRbV8Mdxs+nGaoaFzGw3s=
-k8s.io/component-base v0.29.0/go.mod h1:sADonFTQ9Zc9yFLghpDpmNXEdHyQmFIGbiuZbqAXQ1M=
+k8s.io/component-base v0.29.2 h1:lpiLyuvPA9yV1aQwGLENYyK7n/8t6l3nn3zAtFTJYe8=
+k8s.io/component-base v0.29.2/go.mod h1:BfB3SLrefbZXiBfbM+2H1dlat21Uewg/5qtKOl8degM=
 k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 h1:pWEwq4Asjm4vjW7vcsmijwBhOr1/shsbSYiWXmNGlks=
 k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E=
 k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
@@ -217,8 +216,10 @@ k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/A
 k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA=
 k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI=
 k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
-sigs.k8s.io/controller-runtime v0.17.2 h1:FwHwD1CTUemg0pW2otk7/U5/i5m2ymzvOXdbeGOUvw0=
-sigs.k8s.io/controller-runtime v0.17.2/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s=
+sigs.k8s.io/controller-runtime v0.17.3 h1:65QmN7r3FWgTxDMz9fvGnO1kbf2nu+acg9p2R9oYYYk=
+sigs.k8s.io/controller-runtime v0.17.3/go.mod h1:N0jpP5Lo7lMTF9aL56Z/B2oWBJjey6StQM0jRbKQXtY=
+sigs.k8s.io/jobset v0.5.2 h1:276q5Pi/ErLYj+GQ0ydEXR6tx3LwBhEzHLQv+k8bYF4=
+sigs.k8s.io/jobset v0.5.2/go.mod h1:Vg99rj/6OoGvy1uvywGEHOcVLCWWJYkJtisKqdWzcFw=
 sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
 sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
 sigs.k8s.io/scheduler-plugins v0.28.9 h1:1/bXRoXuSUFr1FLqxrzScdyZMl/G1psuDJcDKYxTo+Q=
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index ab0377d028..95f596373d 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -15,3 +15,173 @@ limitations under the License.
 */
 
 package v2alpha1
+
+import (
+	autoscalingv2 "k8s.io/api/autoscaling/v2"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
+)
+
+// ClusterTrainingRuntime represents a training runtime which can be referenced as part of
+// `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced
+// by TrainJob that created in *any* namespace.
+type ClusterTrainingRuntime struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard object's metadata.
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Specification of the desired ClusterTrainingRuntime.
+	Spec TrainingRuntimeSpec `json:"spec,omitempty"`
+}
+
+// ClusterTrainingRuntimeList is a collection of cluster training runtimes.
+type ClusterTrainingRuntimeList struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard list metadata.
+	metav1.ListMeta `json:"metadata,omitempty"`
+
+	// List of ClusterTrainingRuntimes.
+	Items []ClusterTrainingRuntime `json:"items"`
+}
+
+// TrainingRuntime represents a training runtime which can be referenced as part of
+// `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced
+// by TrainJob that created in the *same* namespace as the TrainingRuntime.
+type TrainingRuntime struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard object's metadata.
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Specification of the desired TrainingRuntime.
+	Spec TrainingRuntimeSpec `json:"spec"`
+}
+
+// TrainingRuntimeList is a collection of training runtimes.
+type TrainingRuntimeList struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard list metadata.
+	metav1.ListMeta `json:"metadata,omitempty"`
+
+	// List of TrainingRuntimes.
+	Items []TrainingRuntime `json:"items"`
+}
+
+// TrainingRuntimeSpec represents a specification of the desired training runtime.
+type TrainingRuntimeSpec struct {
+	// Configuration for the runtime-specific parameters, such as Torch or MPI.
+	MLSpec *MLSpec `json:"mlSpec,omitempty"`
+
+	// Number of training nodes.
+	// Defaults to 1.
+	NumNodes *int32 `json:"numNodes,omitempty"`
+
+	// JobSet configuration which will be used by TrainJob.
+	JobSetSpec *jobsetv1alpha2.JobSetSpec `json:",inline"`
+
+	// Configuration for the PodGroup to enable gang-scheduling via supported plugins.
+	PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"`
+}
+
+// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling.
+type PodGroupSpec struct {
+	// Plugin for the gang-scheduling.
+	Plugin GangSchedulerPlugin `json:"plugin"`
+
+	// Time threshold to schedule PodGroup for gang-scheduling.
+	ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"`
+}
+
+// GangSchedulerPlugin represents one of the supported gang-scheduling plugins.
+type GangSchedulerPlugin string
+
+const (
+	// Volcano plugin for gang-scheduling.
+	GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano"
+
+	// Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling.
+	GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling"
+)
+
+// MLSpec represents the runtime-specific configuration for various technologies.
+// One of the following specs can be set.
+type MLSpec struct {
+	// Configuration for the PyTorch runtime.
+	TorchSpec *TorchSpec `json:"torchSpec,omitempty"`
+
+	// Configuration for the MPI Runtime.
+	MPISpec *MPISpec `json:"mpiSpec,omitempty"`
+}
+
+// TorchSpec represents a PyTorch runtime configuration.
+type TorchSpec struct {
+	// Number of processes per node.
+	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
+	// Supported values: `auto`, `cpu`, `gpu`, or int value.
+	// Defaults to `auto`.
+	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+
+	// Whether to run single-node multi-worker training.
+	// This value is inserted into the `--standalone` argument of the `torchrun` CLI.
+	// Defaults to false.
+	Standalone *bool `json:"standalone,omitempty"`
+
+	// Elastic policy for the PyTorch training.
+	ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
+}
+
+// TorchElasticPolicy represents a configuration for the PyTorch elastic training.
+// If this policy is set, the `.spec.numNodes` parameter must be omitted, since min and max nodes
+// are used to configure the `torchrun` CLI argument: `--nnodes=minNodes:maxNodes`.
+// Only `c10d` backend is supported for the Rendezvous communication.
+type TorchElasticPolicy struct {
+	// How many times the training job can be restarted.
+	// This value is inserted into the `--max-restarts` argument of the `torchrun` CLI and
+	// the `.spec.failurePolicy.maxRestarts` parameter of the training Job.
+	MaxRestarts *int32 `json:"maxRestarts,omitempty"`
+
+	// Lower limit for the number of nodes to which training job can scale down.
+	MinNodes *int32 `json:"minNodes,omitempty"`
+
+	// Upper limit for the number of nodes to which training job can scale up.
+	MaxNodes *int32 `json:"maxNodes,omitempty"`
+
+	// Specifications which are used to calculate the desired number of nodes. See the individual
+	// metric source types for more information about how each type of metric must respond.
+	// The HPA will be created to perform auto-scaling.
+	Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
+}
+
+// MPISpec represents an MPI runtime configuration.
+type MPISpec struct {
+	// Number of processes per node.
+	// This value is equal to the number of slots for each node in the hostfile.
+	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
+
+	// Implementation name for the MPI to create the appropriate hostfile.
+	MPIImplementation *MPIImplementation `json:"mpiImplementation"`
+
+	// Directory where SSH keys are mounted.
+	SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"`
+
+	// Whether to run training process on the launcher Job.
+	// Defaults to false.
+	RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
+}
+
+// MPIImplementation represents one of the supported MPI implementations.
+type MPIImplementation string
+
+const (
+	MPIImplementationOpenMPI MPIImplementation = "OpenMPI"
+	MPIImplementationIntel   MPIImplementation = "Intel"
+	MPIImplementationMPICH   MPIImplementation = "MPICH"
+)
+
+// TODO: Enable this after controller implementation.
+// func init() {
+// 	SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{})
+// }
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
index ab0377d028..700d398c5c 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
@@ -15,3 +15,229 @@ limitations under the License.
 */
 
 package v2alpha1
+
+import (
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
+)
+
+// TrainJob represents configuration of a training job.
+type TrainJob struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard object's metadata.
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Specification of the desired TrainJob.
+	Spec TrainJobSpec `json:"spec,omitempty"`
+
+	// Current status of TrainJob.
+	Status TrainJobStatus `json:"status,omitempty"`
+}
+
+// TrainJobSpec represents specification of the desired TrainJob.
+type TrainJobSpec struct {
+	// Reference to the training runtime.
+	TrainingRuntimeRef TrainingRuntimeRef `json:"trainingRuntimeRef"`
+
+	// Configuration of the desired trainer.
+	Trainer *Trainer `json:"trainer,omitempty"`
+
+	// Configuration of the training dataset.
+	DatasetConfig *DatasetConfig `json:"datasetConfig,omitempty"`
+
+	// Configuration of the pre-trained and trained model.
+	ModelConfig *ModelConfig `json:"modelConfig,omitempty"`
+
+	// Labels to apply for the derivative JobSet and Jobs.
+	// They will be merged with the TrainingRuntime values.
+	Labels map[string]string `json:"labels,omitempty"`
+
+	// Annotations to apply for the derivative JobSet and Jobs.
+	// They will be merged with the TrainingRuntime values.
+	Annotations map[string]string `json:"annotations,omitempty"`
+
+	// Custom overrides for the training runtime.
+	PodSpecOverrides []PodSpecOverrides `json:"podSpecOverrides,omitempty"`
+
+	// Whether the controller should suspend the running TrainJob.
+	// Defaults to false.
+	Suspend *bool `json:"suspend,omitempty"`
+
+	// ManagedBy field indicates the controller that manages a TrainJob.
+	ManagedBy *string `json:"managedBy,omitempty"`
+}
+
+// TrainingRuntimeRef represents the reference to the existing training runtime.
+type TrainingRuntimeRef struct {
+	// Name of the runtime being referenced.
+	// When namespaced-scoped TrainingRuntime is used, the TrainJob must have
+	// the same namespace as the deployed runtime.
+	Name string `json:"name"`
+
+	// APIGroup of the runtime being referenced.
+	// Defaults to `kubeflow.org`.
+	APIGroup *string `json:"apiGroup,omitempty"`
+
+	// Kind of the runtime being referenced.
+	// It must be one of TrainingRuntime or ClusterTrainingRuntime.
+	// Defaults to ClusterTrainingRuntime.
+	Kind *string `json:"kind,omitempty"`
+}
+
+// Trainer represents the desired trainer configuration.
+// Every training runtime contains a `trainer` container which represents Trainer.
+type Trainer struct {
+	// Docker image for the training container.
+	Image string `json:"image,omitempty"`
+
+	// Entrypoint commands for the training container.
+	Command []string `json:"command,omitempty"`
+
+	// Arguments to the entrypoint for the training container.
+	Args []string `json:"args,omitempty"`
+
+	// List of environment variables to set in the training container.
+	// These values will be merged with the TrainingRuntime's trainer environments.
+	Env []corev1.EnvVar `json:"env,omitempty"`
+
+	// Number of training nodes.
+	// TODO (andreyvelich): Do we want to support dynamic num of nodes in TrainJob for PyTorch elastic: `--nnodes=1:4` ?
+	NumNodes *int32 `json:"numNodes,omitempty"`
+
+	// Compute resources for each training node.
+	ResourcesPerNode corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"`
+
+	// Number of processes/workers/slots on every training node.
+	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
+	// For the MPI runtime only int value can be set.
+	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+}
+
+// DatasetConfig represents the desired dataset configuration.
+// When this API is used, the training runtime must have
+// the `dataset-initializer` container in the `Initializer` Job.
+type DatasetConfig struct {
+	// Storage uri for the dataset provider.
+	StorageUri string `json:"storageUri"`
+
+	// List of environment variables to set in the dataset initializer container.
+	// These values will be merged with the TrainingRuntime's dataset initializer environments.
+	Env []corev1.EnvVar `json:"env,omitempty"`
+
+	// Reference to the TrainJob's secrets to download dataset.
+	SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+}
+
+// ModelConfig represents the desired model configuration.
+type ModelConfig struct {
+	// Configuration of the pre-trained model.
+	// When this API is used, the training runtime must have
+	// the `model-initializer` container in the `Initializer` Job.
+	Input *InputModel `json:"input,omitempty"`
+
+	// Configuration of the trained model.
+	// When this API is used, the training runtime must have
+	// the `model-exporter` container in the `Exporter` Job.
+	Output *OutputModel `json:"output,omitempty"`
+}
+
+// InputModel represents the desired pre-trained model configuration.
+type InputModel struct {
+	// Storage uri for the model provider.
+	StorageUri string `json:"storageUri"`
+
+	// List of environment variables to set in the model initializer container.
+	// These values will be merged with the TrainingRuntime's model initializer environments.
+	Env []corev1.EnvVar `json:"env,omitempty"`
+
+	// Reference to the TrainJob's secrets to download model.
+	SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+}
+
+// OutputModel represents the desired trained model configuration.
+type OutputModel struct {
+	// Storage uri for the model exporter.
+	StorageUri string `json:"storageUri"`
+
+	// List of environment variables to set in the model exporter container.
+	// These values will be merged with the TrainingRuntime's model exporter environments.
+	Env []corev1.EnvVar `json:"env,omitempty"`
+
+	// Reference to the TrainJob's secrets to export model.
+	SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+}
+
+// PodSpecOverrides represents the custom overrides that will be applied for the TrainJob's resources.
+type PodSpecOverrides struct {
+	// Names of the training job replicas in the training runtime template to apply the overrides.
+	TargetReplicatedJobs []string `json:"targetReplicatedJobs"`
+
+	// Overrides for the containers in the desired job templates.
+	Containers []ContainerOverrides `json:"containers,omitempty"`
+
+	// Overrides for the init containers in the desired job templates.
+	InitContainers []ContainerOverrides `json:"initContainers,omitempty"`
+
+	// Overrides for the Pod volume configuration.
+	Volumes []corev1.Volume `json:"volumes,omitempty"`
+
+	// Override for the service account.
+	ServiceAccountName string `json:"serviceAccountName,omitempty"`
+
+	// Override for the node selector to place Pod on the specific node.
+	NodeSelector map[string]string `json:"nodeSelector,omitempty"`
+
+	// Override for the Pod's tolerations.
+	Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
+}
+
+// ContainerOverrides represents parameters that can be overridden using PodSpecOverrides.
+// Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
+type ContainerOverrides struct {
+	// Name for the container. TrainingRuntime must have this container.
+	Name string `json:"name"`
+
+	// Entrypoint commands for the training container.
+	Command []string `json:"command,omitempty"`
+
+	// Arguments to the entrypoint for the training container.
+	Args []string `json:"args,omitempty"`
+
+	// List of environment variables to set in the container.
+	// These values will be merged with the TrainingRuntime's environments.
+	Env []corev1.EnvVar `json:"env,omitempty"`
+
+	// List of sources to populate environment variables in the container.
+	// These values will be merged with the TrainingRuntime's environments.
+	EnvFrom []corev1.EnvFromSource `json:"envFrom,omitempty"`
+
+	// Pod volumes to mount into the container's filesystem.
+	VolumeMounts []corev1.VolumeMount `json:"volumeMounts,omitempty"`
+}
+
+// TrainJobStatus represents the current status of TrainJob.
+type TrainJobStatus struct {
+	// Conditions for the TrainJob.
+	Conditions []metav1.Condition `json:"conditions,omitempty"`
+
+	// ReplicatedJobsStatus tracks the number of Jobs for each replicatedJob in TrainJob.
+	ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"`
+}
+
+// TranJobList is a collection of training jobs.
+type TranJobList struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard list metadata.
+	metav1.ListMeta `json:"metadata,omitempty"`
+
+	// List of TrainJobs.
+	Items []TrainJob `json:"items"`
+}
+
+// TODO: Enable this after controller implementation.
+// func init() {
+// 	SchemeBuilder.Register(&TrainJob{}, &TranJobList{})
+// }

From ed830c8885149f659bc5cd8eb4bd41a407c7ba19 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Fri, 16 Aug 2024 13:42:14 +0100
Subject: [PATCH 02/12] Fix TrainJobList

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
index 700d398c5c..df6348ff20 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
@@ -226,8 +226,8 @@ type TrainJobStatus struct {
 	ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"`
 }
 
-// TranJobList is a collection of training jobs.
-type TranJobList struct {
+// TrainJobList is a collection of training jobs.
+type TrainJobList struct {
 	metav1.TypeMeta `json:",inline"`
 
 	// Standard list metadata.
@@ -239,5 +239,5 @@ type TranJobList struct {
 
 // TODO: Enable this after controller implementation.
 // func init() {
-// 	SchemeBuilder.Register(&TrainJob{}, &TranJobList{})
+// 	SchemeBuilder.Register(&TrainJob{}, &TrainJobList{})
 // }

From 66e7049e159c9755656cde335dee95213a5cb7e6 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Fri, 16 Aug 2024 14:38:32 +0100
Subject: [PATCH 03/12] Register APIs with scheme

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 .../v2alpha1/groupversion_info.go             |  36 +
 .../v2alpha1/trainingruntime_types.go         |  16 +-
 .../kubeflow.org/v2alpha1/trainjob_types.go   |  34 +-
 .../v2alpha1/zz_generated.deepcopy.go         | 736 ++++++++++++++++++
 4 files changed, 804 insertions(+), 18 deletions(-)
 create mode 100644 pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go
 create mode 100644 pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

diff --git a/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go
new file mode 100644
index 0000000000..450a22ad3d
--- /dev/null
+++ b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go
@@ -0,0 +1,36 @@
+/*
+Copyright 2024 The Kubeflow Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package v2alpha1 contains API Schema definitions for the kubeflow.org v2alpha1 API group
+// +kubebuilder:object:generate=true
+// +groupName=kubeflow.org
+package v2alpha1
+
+import (
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"sigs.k8s.io/controller-runtime/pkg/scheme"
+)
+
+var (
+	// GroupVersion is group version used to register these objects.
+	GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v2alpha1"}
+
+	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
+	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
+
+	// AddToScheme adds the types in this group-version to the given scheme.
+	AddToScheme = SchemeBuilder.AddToScheme
+)
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index 95f596373d..c767b1a4bb 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -22,6 +22,8 @@ import (
 	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 )
 
+// +kubebuilder:object:root=true
+
 // ClusterTrainingRuntime represents a training runtime which can be referenced as part of
 // `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced
 // by TrainJob that created in *any* namespace.
@@ -35,6 +37,8 @@ type ClusterTrainingRuntime struct {
 	Spec TrainingRuntimeSpec `json:"spec,omitempty"`
 }
 
+// +kubebuilder:object:root=true
+
 // ClusterTrainingRuntimeList is a collection of cluster training runtimes.
 type ClusterTrainingRuntimeList struct {
 	metav1.TypeMeta `json:",inline"`
@@ -46,6 +50,8 @@ type ClusterTrainingRuntimeList struct {
 	Items []ClusterTrainingRuntime `json:"items"`
 }
 
+// +kubebuilder:object:root=true
+
 // TrainingRuntime represents a training runtime which can be referenced as part of
 // `trainingRuntimeRef` API in TrainJob. This resource is a namespaced-scoped and can be referenced
 // by TrainJob that created in the *same* namespace as the TrainingRuntime.
@@ -59,6 +65,8 @@ type TrainingRuntime struct {
 	Spec TrainingRuntimeSpec `json:"spec"`
 }
 
+// +kubebuilder:object:root=true
+
 // TrainingRuntimeList is a collection of training runtimes.
 type TrainingRuntimeList struct {
 	metav1.TypeMeta `json:",inline"`
@@ -121,6 +129,7 @@ type TorchSpec struct {
 	// Number of processes per node.
 	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
 	// Supported values: `auto`, `cpu`, `gpu`, or int value.
+	// TODO (andreyvelich): Add kubebuilder validation.
 	// Defaults to `auto`.
 	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
 
@@ -181,7 +190,6 @@ const (
 	MPIImplementationMPICH   MPIImplementation = "MPICH"
 )
 
-// TODO: Enable this after controller implementation.
-// func init() {
-// 	SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{})
-// }
+func init() {
+	SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{})
+}
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
index df6348ff20..12e2da5a5f 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
@@ -22,6 +22,11 @@ import (
 	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 )
 
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.conditions[-1:].type`
+// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
+
 // TrainJob represents configuration of a training job.
 type TrainJob struct {
 	metav1.TypeMeta `json:",inline"`
@@ -36,6 +41,19 @@ type TrainJob struct {
 	Status TrainJobStatus `json:"status,omitempty"`
 }
 
+// +kubebuilder:object:root=true
+
+// TrainJobList is a collection of training jobs.
+type TrainJobList struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// Standard list metadata.
+	metav1.ListMeta `json:"metadata,omitempty"`
+
+	// List of TrainJobs.
+	Items []TrainJob `json:"items"`
+}
+
 // TrainJobSpec represents specification of the desired TrainJob.
 type TrainJobSpec struct {
 	// Reference to the training runtime.
@@ -226,18 +244,6 @@ type TrainJobStatus struct {
 	ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"`
 }
 
-// TrainJobList is a collection of training jobs.
-type TrainJobList struct {
-	metav1.TypeMeta `json:",inline"`
-
-	// Standard list metadata.
-	metav1.ListMeta `json:"metadata,omitempty"`
-
-	// List of TrainJobs.
-	Items []TrainJob `json:"items"`
+func init() {
+	SchemeBuilder.Register(&TrainJob{}, &TrainJobList{})
 }
-
-// TODO: Enable this after controller implementation.
-// func init() {
-// 	SchemeBuilder.Register(&TrainJob{}, &TrainJobList{})
-// }
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
new file mode 100644
index 0000000000..cea5c2bde4
--- /dev/null
+++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
@@ -0,0 +1,736 @@
+//go:build !ignore_autogenerated
+
+// Copyright 2023 The Kubeflow Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by controller-gen. DO NOT EDIT.
+
+package v2alpha1
+
+import (
+	"k8s.io/api/autoscaling/v2"
+	"k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	runtime "k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/jobset/api/jobset/v1alpha2"
+)
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ClusterTrainingRuntime) DeepCopyInto(out *ClusterTrainingRuntime) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterTrainingRuntime.
+func (in *ClusterTrainingRuntime) DeepCopy() *ClusterTrainingRuntime {
+	if in == nil {
+		return nil
+	}
+	out := new(ClusterTrainingRuntime)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *ClusterTrainingRuntime) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ClusterTrainingRuntimeList) DeepCopyInto(out *ClusterTrainingRuntimeList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]ClusterTrainingRuntime, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterTrainingRuntimeList.
+func (in *ClusterTrainingRuntimeList) DeepCopy() *ClusterTrainingRuntimeList {
+	if in == nil {
+		return nil
+	}
+	out := new(ClusterTrainingRuntimeList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *ClusterTrainingRuntimeList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ContainerOverrides) DeepCopyInto(out *ContainerOverrides) {
+	*out = *in
+	if in.Command != nil {
+		in, out := &in.Command, &out.Command
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
+	if in.Args != nil {
+		in, out := &in.Args, &out.Args
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
+	if in.Env != nil {
+		in, out := &in.Env, &out.Env
+		*out = make([]v1.EnvVar, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.EnvFrom != nil {
+		in, out := &in.EnvFrom, &out.EnvFrom
+		*out = make([]v1.EnvFromSource, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.VolumeMounts != nil {
+		in, out := &in.VolumeMounts, &out.VolumeMounts
+		*out = make([]v1.VolumeMount, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerOverrides.
+func (in *ContainerOverrides) DeepCopy() *ContainerOverrides {
+	if in == nil {
+		return nil
+	}
+	out := new(ContainerOverrides)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) {
+	*out = *in
+	if in.Env != nil {
+		in, out := &in.Env, &out.Env
+		*out = make([]v1.EnvVar, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	out.SecretRef = in.SecretRef
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetConfig.
+func (in *DatasetConfig) DeepCopy() *DatasetConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(DatasetConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *InputModel) DeepCopyInto(out *InputModel) {
+	*out = *in
+	if in.Env != nil {
+		in, out := &in.Env, &out.Env
+		*out = make([]v1.EnvVar, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	out.SecretRef = in.SecretRef
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InputModel.
+func (in *InputModel) DeepCopy() *InputModel {
+	if in == nil {
+		return nil
+	}
+	out := new(InputModel)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *MLSpec) DeepCopyInto(out *MLSpec) {
+	*out = *in
+	if in.TorchSpec != nil {
+		in, out := &in.TorchSpec, &out.TorchSpec
+		*out = new(TorchSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.MPISpec != nil {
+		in, out := &in.MPISpec, &out.MPISpec
+		*out = new(MPISpec)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpec.
+func (in *MLSpec) DeepCopy() *MLSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(MLSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *MPISpec) DeepCopyInto(out *MPISpec) {
+	*out = *in
+	if in.NumProcPerNode != nil {
+		in, out := &in.NumProcPerNode, &out.NumProcPerNode
+		*out = new(int32)
+		**out = **in
+	}
+	if in.MPIImplementation != nil {
+		in, out := &in.MPIImplementation, &out.MPIImplementation
+		*out = new(MPIImplementation)
+		**out = **in
+	}
+	if in.SSHAuthMountPath != nil {
+		in, out := &in.SSHAuthMountPath, &out.SSHAuthMountPath
+		*out = new(string)
+		**out = **in
+	}
+	if in.RunLauncherAsNode != nil {
+		in, out := &in.RunLauncherAsNode, &out.RunLauncherAsNode
+		*out = new(bool)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPISpec.
+func (in *MPISpec) DeepCopy() *MPISpec {
+	if in == nil {
+		return nil
+	}
+	out := new(MPISpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ModelConfig) DeepCopyInto(out *ModelConfig) {
+	*out = *in
+	if in.Input != nil {
+		in, out := &in.Input, &out.Input
+		*out = new(InputModel)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Output != nil {
+		in, out := &in.Output, &out.Output
+		*out = new(OutputModel)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelConfig.
+func (in *ModelConfig) DeepCopy() *ModelConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(ModelConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *OutputModel) DeepCopyInto(out *OutputModel) {
+	*out = *in
+	if in.Env != nil {
+		in, out := &in.Env, &out.Env
+		*out = make([]v1.EnvVar, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	out.SecretRef = in.SecretRef
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OutputModel.
+func (in *OutputModel) DeepCopy() *OutputModel {
+	if in == nil {
+		return nil
+	}
+	out := new(OutputModel)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PodGroupSpec) DeepCopyInto(out *PodGroupSpec) {
+	*out = *in
+	if in.ScheduleTimeoutSeconds != nil {
+		in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds
+		*out = new(string)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupSpec.
+func (in *PodGroupSpec) DeepCopy() *PodGroupSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(PodGroupSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PodSpecOverrides) DeepCopyInto(out *PodSpecOverrides) {
+	*out = *in
+	if in.TargetReplicatedJobs != nil {
+		in, out := &in.TargetReplicatedJobs, &out.TargetReplicatedJobs
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
+	if in.Containers != nil {
+		in, out := &in.Containers, &out.Containers
+		*out = make([]ContainerOverrides, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.InitContainers != nil {
+		in, out := &in.InitContainers, &out.InitContainers
+		*out = make([]ContainerOverrides, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.Volumes != nil {
+		in, out := &in.Volumes, &out.Volumes
+		*out = make([]v1.Volume, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.NodeSelector != nil {
+		in, out := &in.NodeSelector, &out.NodeSelector
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.Tolerations != nil {
+		in, out := &in.Tolerations, &out.Tolerations
+		*out = make([]v1.Toleration, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodSpecOverrides.
+func (in *PodSpecOverrides) DeepCopy() *PodSpecOverrides {
+	if in == nil {
+		return nil
+	}
+	out := new(PodSpecOverrides)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TorchElasticPolicy) DeepCopyInto(out *TorchElasticPolicy) {
+	*out = *in
+	if in.MaxRestarts != nil {
+		in, out := &in.MaxRestarts, &out.MaxRestarts
+		*out = new(int32)
+		**out = **in
+	}
+	if in.MinNodes != nil {
+		in, out := &in.MinNodes, &out.MinNodes
+		*out = new(int32)
+		**out = **in
+	}
+	if in.MaxNodes != nil {
+		in, out := &in.MaxNodes, &out.MaxNodes
+		*out = new(int32)
+		**out = **in
+	}
+	if in.Metrics != nil {
+		in, out := &in.Metrics, &out.Metrics
+		*out = make([]v2.MetricSpec, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchElasticPolicy.
+func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy {
+	if in == nil {
+		return nil
+	}
+	out := new(TorchElasticPolicy)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TorchSpec) DeepCopyInto(out *TorchSpec) {
+	*out = *in
+	if in.NumProcPerNode != nil {
+		in, out := &in.NumProcPerNode, &out.NumProcPerNode
+		*out = new(string)
+		**out = **in
+	}
+	if in.Standalone != nil {
+		in, out := &in.Standalone, &out.Standalone
+		*out = new(bool)
+		**out = **in
+	}
+	if in.ElasticPolicy != nil {
+		in, out := &in.ElasticPolicy, &out.ElasticPolicy
+		*out = new(TorchElasticPolicy)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchSpec.
+func (in *TorchSpec) DeepCopy() *TorchSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(TorchSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainJob) DeepCopyInto(out *TrainJob) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+	in.Status.DeepCopyInto(&out.Status)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJob.
+func (in *TrainJob) DeepCopy() *TrainJob {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainJob)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *TrainJob) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainJobList) DeepCopyInto(out *TrainJobList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]TrainJob, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobList.
+func (in *TrainJobList) DeepCopy() *TrainJobList {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainJobList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *TrainJobList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainJobSpec) DeepCopyInto(out *TrainJobSpec) {
+	*out = *in
+	in.TrainingRuntimeRef.DeepCopyInto(&out.TrainingRuntimeRef)
+	if in.Trainer != nil {
+		in, out := &in.Trainer, &out.Trainer
+		*out = new(Trainer)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.DatasetConfig != nil {
+		in, out := &in.DatasetConfig, &out.DatasetConfig
+		*out = new(DatasetConfig)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.ModelConfig != nil {
+		in, out := &in.ModelConfig, &out.ModelConfig
+		*out = new(ModelConfig)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Labels != nil {
+		in, out := &in.Labels, &out.Labels
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.Annotations != nil {
+		in, out := &in.Annotations, &out.Annotations
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.PodSpecOverrides != nil {
+		in, out := &in.PodSpecOverrides, &out.PodSpecOverrides
+		*out = make([]PodSpecOverrides, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.Suspend != nil {
+		in, out := &in.Suspend, &out.Suspend
+		*out = new(bool)
+		**out = **in
+	}
+	if in.ManagedBy != nil {
+		in, out := &in.ManagedBy, &out.ManagedBy
+		*out = new(string)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobSpec.
+func (in *TrainJobSpec) DeepCopy() *TrainJobSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainJobSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainJobStatus) DeepCopyInto(out *TrainJobStatus) {
+	*out = *in
+	if in.Conditions != nil {
+		in, out := &in.Conditions, &out.Conditions
+		*out = make([]metav1.Condition, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.ReplicatedJobsStatus != nil {
+		in, out := &in.ReplicatedJobsStatus, &out.ReplicatedJobsStatus
+		*out = make([]v1alpha2.ReplicatedJobStatus, len(*in))
+		copy(*out, *in)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobStatus.
+func (in *TrainJobStatus) DeepCopy() *TrainJobStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainJobStatus)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *Trainer) DeepCopyInto(out *Trainer) {
+	*out = *in
+	if in.Command != nil {
+		in, out := &in.Command, &out.Command
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
+	if in.Args != nil {
+		in, out := &in.Args, &out.Args
+		*out = make([]string, len(*in))
+		copy(*out, *in)
+	}
+	if in.Env != nil {
+		in, out := &in.Env, &out.Env
+		*out = make([]v1.EnvVar, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.NumNodes != nil {
+		in, out := &in.NumNodes, &out.NumNodes
+		*out = new(int32)
+		**out = **in
+	}
+	in.ResourcesPerNode.DeepCopyInto(&out.ResourcesPerNode)
+	if in.NumProcPerNode != nil {
+		in, out := &in.NumProcPerNode, &out.NumProcPerNode
+		*out = new(string)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Trainer.
+func (in *Trainer) DeepCopy() *Trainer {
+	if in == nil {
+		return nil
+	}
+	out := new(Trainer)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainingRuntime) DeepCopyInto(out *TrainingRuntime) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntime.
+func (in *TrainingRuntime) DeepCopy() *TrainingRuntime {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainingRuntime)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *TrainingRuntime) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainingRuntimeList) DeepCopyInto(out *TrainingRuntimeList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]TrainingRuntime, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeList.
+func (in *TrainingRuntimeList) DeepCopy() *TrainingRuntimeList {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainingRuntimeList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *TrainingRuntimeList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainingRuntimeRef) DeepCopyInto(out *TrainingRuntimeRef) {
+	*out = *in
+	if in.APIGroup != nil {
+		in, out := &in.APIGroup, &out.APIGroup
+		*out = new(string)
+		**out = **in
+	}
+	if in.Kind != nil {
+		in, out := &in.Kind, &out.Kind
+		*out = new(string)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeRef.
+func (in *TrainingRuntimeRef) DeepCopy() *TrainingRuntimeRef {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainingRuntimeRef)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) {
+	*out = *in
+	if in.MLSpec != nil {
+		in, out := &in.MLSpec, &out.MLSpec
+		*out = new(MLSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.NumNodes != nil {
+		in, out := &in.NumNodes, &out.NumNodes
+		*out = new(int32)
+		**out = **in
+	}
+	if in.JobSetSpec != nil {
+		in, out := &in.JobSetSpec, &out.JobSetSpec
+		*out = new(v1alpha2.JobSetSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.PodGroupSpec != nil {
+		in, out := &in.PodGroupSpec, &out.PodGroupSpec
+		*out = new(PodGroupSpec)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeSpec.
+func (in *TrainingRuntimeSpec) DeepCopy() *TrainingRuntimeSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(TrainingRuntimeSpec)
+	in.DeepCopyInto(out)
+	return out
+}

From bfa1f200911b126dc4cbc0c05e85464a4585ca2f Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Fri, 16 Aug 2024 14:44:57 +0100
Subject: [PATCH 04/12] Add SchemeGroupVersion

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go
index 450a22ad3d..a4cc2fd7d5 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/groupversion_info.go
@@ -31,6 +31,9 @@ var (
 	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
 	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
 
+	// SchemeGroupVersion is an alias for GroupVersion, provided for client-go libraries.
+	SchemeGroupVersion = GroupVersion
+
 	// AddToScheme adds the types in this group-version to the given scheme.
 	AddToScheme = SchemeBuilder.AddToScheme
 )

From 2bf13c9af60a028242d86afeb406b78f94b83751 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Mon, 19 Aug 2024 16:57:42 +0100
Subject: [PATCH 05/12] Fix TrainingRuntimeSpec omitempty

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index c767b1a4bb..dce5aeb421 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -62,7 +62,7 @@ type TrainingRuntime struct {
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
 	// Specification of the desired TrainingRuntime.
-	Spec TrainingRuntimeSpec `json:"spec"`
+	Spec TrainingRuntimeSpec `json:"spec,omitempty"`
 }
 
 // +kubebuilder:object:root=true

From 72a933e500ed04f359d91d9b94c2f6a059bcbcf6 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Mon, 19 Aug 2024 17:52:37 +0100
Subject: [PATCH 06/12] Generate manifests only for v1

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 1727392003..4cb00d8578 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ help: ## Display this help.
 ##@ Development
 
 manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
-	$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." \
+	$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/apis/kubeflow.org/v1/..." \
 		output:crd:artifacts:config=manifests/base/crds \
 		output:rbac:artifacts:config=manifests/base/rbac \
 		output:webhook:artifacts:config=manifests/base/webhook

From 9d0a6865de8b604ad3ed0db67e90817d5f4ed38d Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Tue, 20 Aug 2024 17:22:36 +0100
Subject: [PATCH 07/12] Fix pointers for APIs

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 .../v2alpha1/trainingruntime_types.go            |  3 ++-
 pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go | 16 ++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index dce5aeb421..0695b27567 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -171,7 +171,8 @@ type MPISpec struct {
 	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
 
 	// Implementation name for the MPI to create the appropriate hostfile.
-	MPIImplementation *MPIImplementation `json:"mpiImplementation"`
+	// Defaults to OpenMPI.
+	MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
 
 	// Directory where SSH keys are mounted.
 	SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"`
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
index 12e2da5a5f..99189df4f7 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
@@ -108,7 +108,7 @@ type TrainingRuntimeRef struct {
 // Every training runtime contains `trainer` container which represents Trainer.
 type Trainer struct {
 	// Docker image for the training container.
-	Image string `json:"image,omitempty"`
+	Image *string `json:"image,omitempty"`
 
 	// Entrypoint commands for the training container.
 	Command []string `json:"command,omitempty"`
@@ -125,7 +125,7 @@ type Trainer struct {
 	NumNodes *int32 `json:"numNodes,omitempty"`
 
 	// Compute resources for each training node.
-	ResourcesPerNode corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"`
+	ResourcesPerNode *corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"`
 
 	// Number of processes/workers/slots on every training node.
 	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
@@ -138,14 +138,14 @@ type Trainer struct {
 // the `dataset-initializer` container in the `Initializer` Job.
 type DatasetConfig struct {
 	// Storage uri for the dataset provider.
-	StorageUri string `json:"storageUri"`
+	StorageUri *string `json:"storageUri,omitempty"`
 
 	// List of environment variables to set in the dataset initializer container.
 	// These values will be merged with the TrainingRuntime's dataset initializer environments.
 	Env []corev1.EnvVar `json:"env,omitempty"`
 
 	// Reference to the TrainJob's secrets to download dataset.
-	SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+	SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
 }
 
 // ModelConfig represents the desired model configuration.
@@ -164,27 +164,27 @@ type ModelConfig struct {
 // InputModel represents the desired pre-trained model configuration.
 type InputModel struct {
 	// Storage uri for the model provider.
-	StorageUri string `json:"storageUri"`
+	StorageUri *string `json:"storageUri,omitempty"`
 
 	// List of environment variables to set in the model initializer container.
 	// These values will be merged with the TrainingRuntime's model initializer environments.
 	Env []corev1.EnvVar `json:"env,omitempty"`
 
 	// Reference to the TrainJob's secrets to download model.
-	SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+	SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
 }
 
 // OutputModel represents the desired trained model configuration.
 type OutputModel struct {
 	// Storage uri for the model exporter.
-	StorageUri string `json:"storageUri"`
+	StorageUri *string `json:"storageUri,omitempty"`
 
 	// List of environment variables to set in the model exporter container.
 	// These values will be merged with the TrainingRuntime's model exporter environments.
 	Env []corev1.EnvVar `json:"env,omitempty"`
 
 	// Reference to the TrainJob's secrets to export model.
-	SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+	SecretRef *corev1.SecretReference `json:"secretRef,omitempty"`
 }
 
 // PodSpecOverrides represents the custom overrides that will be applied for the TrainJob's resources.

From 880560c6ebdcd535004c6e13d0e599945ebe0897 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Tue, 20 Aug 2024 18:55:23 +0100
Subject: [PATCH 08/12] Run code-gen

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 .../v2alpha1/zz_generated.deepcopy.go         | 49 +++++++++++++++----
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
index cea5c2bde4..06d6934377 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
@@ -133,6 +133,11 @@ func (in *ContainerOverrides) DeepCopy() *ContainerOverrides {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) {
 	*out = *in
+	if in.StorageUri != nil {
+		in, out := &in.StorageUri, &out.StorageUri
+		*out = new(string)
+		**out = **in
+	}
 	if in.Env != nil {
 		in, out := &in.Env, &out.Env
 		*out = make([]v1.EnvVar, len(*in))
@@ -140,7 +145,11 @@ func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
-	out.SecretRef = in.SecretRef
+	if in.SecretRef != nil {
+		in, out := &in.SecretRef, &out.SecretRef
+		*out = new(v1.SecretReference)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetConfig.
@@ -156,6 +165,11 @@ func (in *DatasetConfig) DeepCopy() *DatasetConfig {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *InputModel) DeepCopyInto(out *InputModel) {
 	*out = *in
+	if in.StorageUri != nil {
+		in, out := &in.StorageUri, &out.StorageUri
+		*out = new(string)
+		**out = **in
+	}
 	if in.Env != nil {
 		in, out := &in.Env, &out.Env
 		*out = make([]v1.EnvVar, len(*in))
@@ -163,7 +177,11 @@ func (in *InputModel) DeepCopyInto(out *InputModel) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
-	out.SecretRef = in.SecretRef
+	if in.SecretRef != nil {
+		in, out := &in.SecretRef, &out.SecretRef
+		*out = new(v1.SecretReference)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InputModel.
@@ -209,11 +227,6 @@ func (in *MPISpec) DeepCopyInto(out *MPISpec) {
 		*out = new(int32)
 		**out = **in
 	}
-	if in.MPIImplementation != nil {
-		in, out := &in.MPIImplementation, &out.MPIImplementation
-		*out = new(MPIImplementation)
-		**out = **in
-	}
 	if in.SSHAuthMountPath != nil {
 		in, out := &in.SSHAuthMountPath, &out.SSHAuthMountPath
 		*out = new(string)
@@ -264,6 +277,11 @@ func (in *ModelConfig) DeepCopy() *ModelConfig {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *OutputModel) DeepCopyInto(out *OutputModel) {
 	*out = *in
+	if in.StorageUri != nil {
+		in, out := &in.StorageUri, &out.StorageUri
+		*out = new(string)
+		**out = **in
+	}
 	if in.Env != nil {
 		in, out := &in.Env, &out.Env
 		*out = make([]v1.EnvVar, len(*in))
@@ -271,7 +289,11 @@ func (in *OutputModel) DeepCopyInto(out *OutputModel) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
-	out.SecretRef = in.SecretRef
+	if in.SecretRef != nil {
+		in, out := &in.SecretRef, &out.SecretRef
+		*out = new(v1.SecretReference)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OutputModel.
@@ -577,6 +599,11 @@ func (in *TrainJobStatus) DeepCopy() *TrainJobStatus {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Trainer) DeepCopyInto(out *Trainer) {
 	*out = *in
+	if in.Image != nil {
+		in, out := &in.Image, &out.Image
+		*out = new(string)
+		**out = **in
+	}
 	if in.Command != nil {
 		in, out := &in.Command, &out.Command
 		*out = make([]string, len(*in))
@@ -599,7 +626,11 @@ func (in *Trainer) DeepCopyInto(out *Trainer) {
 		*out = new(int32)
 		**out = **in
 	}
-	in.ResourcesPerNode.DeepCopyInto(&out.ResourcesPerNode)
+	if in.ResourcesPerNode != nil {
+		in, out := &in.ResourcesPerNode, &out.ResourcesPerNode
+		*out = new(v1.ResourceRequirements)
+		(*in).DeepCopyInto(*out)
+	}
 	if in.NumProcPerNode != nil {
 		in, out := &in.NumProcPerNode, &out.NumProcPerNode
 		*out = new(string)

From 49a004c0c5ccfad9dc48d7fbafa6d35028644cbe Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Wed, 21 Aug 2024 17:39:37 +0100
Subject: [PATCH 09/12] Use pointer for MPIImplementation

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go | 2 +-
 pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index 0695b27567..7a740d11b4 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -172,7 +172,7 @@ type MPISpec struct {
 
 	// Implementation name for the MPI to create the appropriate hostfile.
 	// Defaults to OpenMPI.
-	MPIImplementation MPIImplementation `json:"mpiImplementation,omitempty"`
+	MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`
 
 	// Directory where SSH keys are mounted.
 	SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"`
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
index 06d6934377..5b8870acb6 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
@@ -227,6 +227,11 @@ func (in *MPISpec) DeepCopyInto(out *MPISpec) {
 		*out = new(int32)
 		**out = **in
 	}
+	if in.MPIImplementation != nil {
+		in, out := &in.MPIImplementation, &out.MPIImplementation
+		*out = new(MPIImplementation)
+		**out = **in
+	}
 	if in.SSHAuthMountPath != nil {
 		in, out := &in.SSHAuthMountPath, &out.SSHAuthMountPath
 		*out = new(string)

From c28a1669e7e28647c8f0c796368297447c5e3a56 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Thu, 22 Aug 2024 17:45:13 +0100
Subject: [PATCH 10/12] Update the JobSetTemplate API

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 .../v2alpha1/trainingruntime_types.go         | 57 +++++++-----
 .../v2alpha1/zz_generated.deepcopy.go         | 88 ++++++++++++-------
 2 files changed, 92 insertions(+), 53 deletions(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index 7a740d11b4..ede9920b41 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -80,18 +80,25 @@ type TrainingRuntimeList struct {
 
 // TrainingRuntimeSpec represents a specification of the desired training runtime.
 type TrainingRuntimeSpec struct {
-	// Configuration for the runtime-specific parameters, such as Torch or MPI.
-	MLSpec *MLSpec `json:"mlSpec,omitempty"`
 
-	// Number of training nodes.
-	// Defaults to 1.
-	NumNodes *int32 `json:"numNodes,omitempty"`
-
-	// JobSet configuration which will be used by TrainJob.
-	JobSetSpec *jobsetv1alpha2.JobSetSpec `json:",inline"`
+	// Configuration for the model training with ML-specific parameters.
+	MLSpec *MLSpec `json:"mlSpec,omitempty"`
 
 	// Configuration for the PodGroup to enable gang-scheduling via supported plugins.
 	PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"`
+
+	// JobSet template which will be used by TrainJob.
+	Template JobSetTemplateSpec `json:"template"`
+}
+
+// JobSetTemplateSpec represents a template of the desired JobSet.
+type JobSetTemplateSpec struct {
+	// Metadata for custom JobSet's labels and annotations.
+	// JobSet name and namespace are equal to the TrainJob's name and namespace.
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Specification of the desired JobSet which will be created from TrainJob.
+	Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"`
 }
 
 // PodGroupSpec represents a PodGroup configuration to enable gang-scheduling.
@@ -114,18 +121,31 @@ const (
 	GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling"
 )
 
-// MLSpec represents the runtime-specific configuration for various technologies.
-// One of the following specs can be set.
+// MLSpec represents configuration for the model trining with ML-specific parameters.
 type MLSpec struct {
+
+	// Number of training nodes.
+	// Defaults to 1.
+	NumNodes *int32 `json:"numNodes,omitempty"`
+
+	// Configuration for the runtime-specific parameters, such as Torch or MPI.
+	// One of the following spec sources can be set.
+	MLSpecSource `json:",inline"`
+}
+
+// MLPolicySource represents the runtime-specific configuration for various technologies.
+// One of the following specs can be set.
+type MLSpecSource struct {
+
 	// Configuration for the PyTorch runtime.
-	TorchSpec *TorchSpec `json:"torchSpec,omitempty"`
+	Torch *TorchMLSpecSource `json:"torch,omitempty"`
 
 	// Configuration for the MPI Runtime.
-	MPISpec *MPISpec `json:"mpiSpec,omitempty"`
+	MPI *MPIMLSpecSource `json:"mpi,omitempty"`
 }
 
-// TorchSpec represents a PyTorch runtime configuration.
-type TorchSpec struct {
+// TorchMLSpecSource represents a PyTorch runtime configuration.
+type TorchMLSpecSource struct {
 	// Number of processes per node.
 	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
 	// Supported values: `auto`, `cpu`, `gpu`, or int value.
@@ -133,11 +153,6 @@ type TorchSpec struct {
 	// Defaults to `auto`.
 	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
 
-	// Whether to run single-node multi-worker training.
-	// This value is inserted into the `--standalone` argument of the `torchrun` CLI.
-	// Defaults to false.
-	Standalone *bool `json:"standalone,omitempty"`
-
 	// Elastic policy for the PyTorch training.
 	ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
 }
@@ -164,8 +179,8 @@ type TorchElasticPolicy struct {
 	Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
 }
 
-// MPISpec represents a MPI runtime configuration.
-type MPISpec struct {
+// MPIMLSpecSource represents a MPI runtime configuration.
+type MPIMLSpecSource struct {
 	// Number of processes per node.
 	// This value is equal to the number of slots for each node in the hostfile.
 	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
index 5b8870acb6..20d4ba0ef0 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
@@ -195,18 +195,31 @@ func (in *InputModel) DeepCopy() *InputModel {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *MLSpec) DeepCopyInto(out *MLSpec) {
+func (in *JobSetTemplateSpec) DeepCopyInto(out *JobSetTemplateSpec) {
 	*out = *in
-	if in.TorchSpec != nil {
-		in, out := &in.TorchSpec, &out.TorchSpec
-		*out = new(TorchSpec)
-		(*in).DeepCopyInto(*out)
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobSetTemplateSpec.
+func (in *JobSetTemplateSpec) DeepCopy() *JobSetTemplateSpec {
+	if in == nil {
+		return nil
 	}
-	if in.MPISpec != nil {
-		in, out := &in.MPISpec, &out.MPISpec
-		*out = new(MPISpec)
-		(*in).DeepCopyInto(*out)
+	out := new(JobSetTemplateSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *MLSpec) DeepCopyInto(out *MLSpec) {
+	*out = *in
+	if in.NumNodes != nil {
+		in, out := &in.NumNodes, &out.NumNodes
+		*out = new(int32)
+		**out = **in
 	}
+	in.MLSpecSource.DeepCopyInto(&out.MLSpecSource)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpec.
@@ -220,7 +233,32 @@ func (in *MLSpec) DeepCopy() *MLSpec {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *MPISpec) DeepCopyInto(out *MPISpec) {
+func (in *MLSpecSource) DeepCopyInto(out *MLSpecSource) {
+	*out = *in
+	if in.Torch != nil {
+		in, out := &in.Torch, &out.Torch
+		*out = new(TorchMLSpecSource)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.MPI != nil {
+		in, out := &in.MPI, &out.MPI
+		*out = new(MPIMLSpecSource)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpecSource.
+func (in *MLSpecSource) DeepCopy() *MLSpecSource {
+	if in == nil {
+		return nil
+	}
+	out := new(MLSpecSource)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *MPIMLSpecSource) DeepCopyInto(out *MPIMLSpecSource) {
 	*out = *in
 	if in.NumProcPerNode != nil {
 		in, out := &in.NumProcPerNode, &out.NumProcPerNode
@@ -244,12 +282,12 @@ func (in *MPISpec) DeepCopyInto(out *MPISpec) {
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPISpec.
-func (in *MPISpec) DeepCopy() *MPISpec {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLSpecSource.
+func (in *MPIMLSpecSource) DeepCopy() *MPIMLSpecSource {
 	if in == nil {
 		return nil
 	}
-	out := new(MPISpec)
+	out := new(MPIMLSpecSource)
 	in.DeepCopyInto(out)
 	return out
 }
@@ -424,18 +462,13 @@ func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *TorchSpec) DeepCopyInto(out *TorchSpec) {
+func (in *TorchMLSpecSource) DeepCopyInto(out *TorchMLSpecSource) {
 	*out = *in
 	if in.NumProcPerNode != nil {
 		in, out := &in.NumProcPerNode, &out.NumProcPerNode
 		*out = new(string)
 		**out = **in
 	}
-	if in.Standalone != nil {
-		in, out := &in.Standalone, &out.Standalone
-		*out = new(bool)
-		**out = **in
-	}
 	if in.ElasticPolicy != nil {
 		in, out := &in.ElasticPolicy, &out.ElasticPolicy
 		*out = new(TorchElasticPolicy)
@@ -443,12 +476,12 @@ func (in *TorchSpec) DeepCopyInto(out *TorchSpec) {
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchSpec.
-func (in *TorchSpec) DeepCopy() *TorchSpec {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLSpecSource.
+func (in *TorchMLSpecSource) DeepCopy() *TorchMLSpecSource {
 	if in == nil {
 		return nil
 	}
-	out := new(TorchSpec)
+	out := new(TorchMLSpecSource)
 	in.DeepCopyInto(out)
 	return out
 }
@@ -744,21 +777,12 @@ func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) {
 		*out = new(MLSpec)
 		(*in).DeepCopyInto(*out)
 	}
-	if in.NumNodes != nil {
-		in, out := &in.NumNodes, &out.NumNodes
-		*out = new(int32)
-		**out = **in
-	}
-	if in.JobSetSpec != nil {
-		in, out := &in.JobSetSpec, &out.JobSetSpec
-		*out = new(v1alpha2.JobSetSpec)
-		(*in).DeepCopyInto(*out)
-	}
 	if in.PodGroupSpec != nil {
 		in, out := &in.PodGroupSpec, &out.PodGroupSpec
 		*out = new(PodGroupSpec)
 		(*in).DeepCopyInto(*out)
 	}
+	in.Template.DeepCopyInto(&out.Template)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeSpec.

From 06e7653474bcfa269a7e60d820154d8decf850c8 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Mon, 26 Aug 2024 20:29:39 +0100
Subject: [PATCH 11/12] Rename PodGroupPolicy and MLPolicy APIs

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 .../v2alpha1/trainingruntime_types.go         |  60 ++++++-----
 .../v2alpha1/zz_generated.deepcopy.go         | 102 ++++++++++++------
 2 files changed, 102 insertions(+), 60 deletions(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index ede9920b41..4acb3d2d62 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -82,10 +82,10 @@ type TrainingRuntimeList struct {
 type TrainingRuntimeSpec struct {
 
 	// Configuration for the model training with ML-specific parameters.
-	MLSpec *MLSpec `json:"mlSpec,omitempty"`
+	MLPolicy *MLPolicy `json:"mlPolicy,omitempty"`
 
 	// Configuration for the PodGroup to enable gang-scheduling via supported plugins.
-	PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"`
+	PodGroupPolicy *PodGroupPolicy `json:"podGroupPolicy,omitempty"`
 
 	// JobSet template which will be used by TrainJob.
 	Template JobSetTemplateSpec `json:"template"`
@@ -101,51 +101,57 @@ type JobSetTemplateSpec struct {
 	Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"`
 }
 
-// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling.
-type PodGroupSpec struct {
-	// Plugin for the gang-scheduling.
-	Plugin GangSchedulerPlugin `json:"plugin"`
+// PodGroupPolicy represents a PodGroup configuration for gang-scheduling.
+type PodGroupPolicy struct {
 
-	// Time threshold to schedule PodGroup for gang-scheduling.
-	ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"`
+	// Configuration for gang-scheduling using various plugins.
+	PodGroupPolicySource `json:",inline"`
 }
 
-// GangSchedulerPlugin represents one of the supported gang-scheduling plugins.
-type GangSchedulerPlugin string
-
-const (
-	// Volcano plugin for gang-scheduling.
-	GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano"
+// PodGroupPolicySource represents supported plugins for gang-scheduling.
+// Only one of its members may be specified.
+type PodGroupPolicySource struct {
 
 	// Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling.
-	GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling"
-)
+	Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"`
+
+	// TODO (andreyvelich): Add support for Volcano gang-scheduler.
+}
+
+// CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin.
+type CoschedulingPodGroupPolicySource struct {
+
+	// Time threshold to schedule PodGroup for gang-scheduling.
+	// If the scheduling timeout is equal to 0, the default value is used.
+	// Defaults to 60 seconds.
+	ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
+}
 
-// MLSpec represents configuration for the model trining with ML-specific parameters.
-type MLSpec struct {
+// MLPolicy represents configuration for the model trining with ML-specific parameters.
+type MLPolicy struct {
 
 	// Number of training nodes.
 	// Defaults to 1.
 	NumNodes *int32 `json:"numNodes,omitempty"`
 
 	// Configuration for the runtime-specific parameters, such as Torch or MPI.
-	// One of the following spec sources can be set.
-	MLSpecSource `json:",inline"`
+	// Only one of its members may be specified.
+	MLPolicySource `json:",inline"`
 }
 
 // MLPolicySource represents the runtime-specific configuration for various technologies.
 // One of the following specs can be set.
-type MLSpecSource struct {
+type MLPolicySource struct {
 
 	// Configuration for the PyTorch runtime.
-	Torch *TorchMLSpecSource `json:"torch,omitempty"`
+	Torch *TorchMLPolicySource `json:"torch,omitempty"`
 
 	// Configuration for the MPI Runtime.
-	MPI *MPIMLSpecSource `json:"mpi,omitempty"`
+	MPI *MPIMLPolicySource `json:"mpi,omitempty"`
 }
 
-// TorchMLSpecSource represents a PyTorch runtime configuration.
-type TorchMLSpecSource struct {
+// TorchMLPolicySource represents a PyTorch runtime configuration.
+type TorchMLPolicySource struct {
 	// Number of processes per node.
 	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
 	// Supported values: `auto`, `cpu`, `gpu`, or int value.
@@ -179,8 +185,8 @@ type TorchElasticPolicy struct {
 	Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
 }
 
-// MPIMLSpecSource represents a MPI runtime configuration.
-type MPIMLSpecSource struct {
+// MPIMLPolicySource represents a MPI runtime configuration.
+type MPIMLPolicySource struct {
 	// Number of processes per node.
 	// This value is equal to the number of slots for each node in the hostfile.
 	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
index 20d4ba0ef0..db62e89800 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
@@ -130,6 +130,26 @@ func (in *ContainerOverrides) DeepCopy() *ContainerOverrides {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *CoschedulingPodGroupPolicySource) DeepCopyInto(out *CoschedulingPodGroupPolicySource) {
+	*out = *in
+	if in.ScheduleTimeoutSeconds != nil {
+		in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds
+		*out = new(int32)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CoschedulingPodGroupPolicySource.
+func (in *CoschedulingPodGroupPolicySource) DeepCopy() *CoschedulingPodGroupPolicySource {
+	if in == nil {
+		return nil
+	}
+	out := new(CoschedulingPodGroupPolicySource)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig) {
 	*out = *in
@@ -212,53 +232,53 @@ func (in *JobSetTemplateSpec) DeepCopy() *JobSetTemplateSpec {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *MLSpec) DeepCopyInto(out *MLSpec) {
+func (in *MLPolicy) DeepCopyInto(out *MLPolicy) {
 	*out = *in
 	if in.NumNodes != nil {
 		in, out := &in.NumNodes, &out.NumNodes
 		*out = new(int32)
 		**out = **in
 	}
-	in.MLSpecSource.DeepCopyInto(&out.MLSpecSource)
+	in.MLPolicySource.DeepCopyInto(&out.MLPolicySource)
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpec.
-func (in *MLSpec) DeepCopy() *MLSpec {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLPolicy.
+func (in *MLPolicy) DeepCopy() *MLPolicy {
 	if in == nil {
 		return nil
 	}
-	out := new(MLSpec)
+	out := new(MLPolicy)
 	in.DeepCopyInto(out)
 	return out
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *MLSpecSource) DeepCopyInto(out *MLSpecSource) {
+func (in *MLPolicySource) DeepCopyInto(out *MLPolicySource) {
 	*out = *in
 	if in.Torch != nil {
 		in, out := &in.Torch, &out.Torch
-		*out = new(TorchMLSpecSource)
+		*out = new(TorchMLPolicySource)
 		(*in).DeepCopyInto(*out)
 	}
 	if in.MPI != nil {
 		in, out := &in.MPI, &out.MPI
-		*out = new(MPIMLSpecSource)
+		*out = new(MPIMLPolicySource)
 		(*in).DeepCopyInto(*out)
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLSpecSource.
-func (in *MLSpecSource) DeepCopy() *MLSpecSource {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLPolicySource.
+func (in *MLPolicySource) DeepCopy() *MLPolicySource {
 	if in == nil {
 		return nil
 	}
-	out := new(MLSpecSource)
+	out := new(MLPolicySource)
 	in.DeepCopyInto(out)
 	return out
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *MPIMLSpecSource) DeepCopyInto(out *MPIMLSpecSource) {
+func (in *MPIMLPolicySource) DeepCopyInto(out *MPIMLPolicySource) {
 	*out = *in
 	if in.NumProcPerNode != nil {
 		in, out := &in.NumProcPerNode, &out.NumProcPerNode
@@ -282,12 +302,12 @@ func (in *MPIMLSpecSource) DeepCopyInto(out *MPIMLSpecSource) {
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLSpecSource.
-func (in *MPIMLSpecSource) DeepCopy() *MPIMLSpecSource {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLPolicySource.
+func (in *MPIMLPolicySource) DeepCopy() *MPIMLPolicySource {
 	if in == nil {
 		return nil
 	}
-	out := new(MPIMLSpecSource)
+	out := new(MPIMLPolicySource)
 	in.DeepCopyInto(out)
 	return out
 }
@@ -350,21 +370,37 @@ func (in *OutputModel) DeepCopy() *OutputModel {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *PodGroupSpec) DeepCopyInto(out *PodGroupSpec) {
+func (in *PodGroupPolicy) DeepCopyInto(out *PodGroupPolicy) {
 	*out = *in
-	if in.ScheduleTimeoutSeconds != nil {
-		in, out := &in.ScheduleTimeoutSeconds, &out.ScheduleTimeoutSeconds
-		*out = new(string)
-		**out = **in
+	in.PodGroupPolicySource.DeepCopyInto(&out.PodGroupPolicySource)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupPolicy.
+func (in *PodGroupPolicy) DeepCopy() *PodGroupPolicy {
+	if in == nil {
+		return nil
+	}
+	out := new(PodGroupPolicy)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PodGroupPolicySource) DeepCopyInto(out *PodGroupPolicySource) {
+	*out = *in
+	if in.Coscheduling != nil {
+		in, out := &in.Coscheduling, &out.Coscheduling
+		*out = new(CoschedulingPodGroupPolicySource)
+		(*in).DeepCopyInto(*out)
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupSpec.
-func (in *PodGroupSpec) DeepCopy() *PodGroupSpec {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupPolicySource.
+func (in *PodGroupPolicySource) DeepCopy() *PodGroupPolicySource {
 	if in == nil {
 		return nil
 	}
-	out := new(PodGroupSpec)
+	out := new(PodGroupPolicySource)
 	in.DeepCopyInto(out)
 	return out
 }
@@ -462,7 +498,7 @@ func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy {
 }
 
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
-func (in *TorchMLSpecSource) DeepCopyInto(out *TorchMLSpecSource) {
+func (in *TorchMLPolicySource) DeepCopyInto(out *TorchMLPolicySource) {
 	*out = *in
 	if in.NumProcPerNode != nil {
 		in, out := &in.NumProcPerNode, &out.NumProcPerNode
@@ -476,12 +512,12 @@ func (in *TorchMLSpecSource) DeepCopyInto(out *TorchMLSpecSource) {
 	}
 }
 
-// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLSpecSource.
-func (in *TorchMLSpecSource) DeepCopy() *TorchMLSpecSource {
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLPolicySource.
+func (in *TorchMLPolicySource) DeepCopy() *TorchMLPolicySource {
 	if in == nil {
 		return nil
 	}
-	out := new(TorchMLSpecSource)
+	out := new(TorchMLPolicySource)
 	in.DeepCopyInto(out)
 	return out
 }
@@ -772,14 +808,14 @@ func (in *TrainingRuntimeRef) DeepCopy() *TrainingRuntimeRef {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec) {
 	*out = *in
-	if in.MLSpec != nil {
-		in, out := &in.MLSpec, &out.MLSpec
-		*out = new(MLSpec)
+	if in.MLPolicy != nil {
+		in, out := &in.MLPolicy, &out.MLPolicy
+		*out = new(MLPolicy)
 		(*in).DeepCopyInto(*out)
 	}
-	if in.PodGroupSpec != nil {
-		in, out := &in.PodGroupSpec, &out.PodGroupSpec
-		*out = new(PodGroupSpec)
+	if in.PodGroupPolicy != nil {
+		in, out := &in.PodGroupPolicy, &out.PodGroupPolicy
+		*out = new(PodGroupPolicy)
 		(*in).DeepCopyInto(*out)
 	}
 	in.Template.DeepCopyInto(&out.Template)

From 7aa4094c8df8e7ca0a6157967b06607e92db25cf Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Mon, 26 Aug 2024 21:26:41 +0100
Subject: [PATCH 12/12] Update comments

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index 4acb3d2d62..65346d0f6b 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -80,7 +80,6 @@ type TrainingRuntimeList struct {
 
 // TrainingRuntimeSpec represents a specification of the desired training runtime.
 type TrainingRuntimeSpec struct {
-
 	// Configuration for the model training with ML-specific parameters.
 	MLPolicy *MLPolicy `json:"mlPolicy,omitempty"`
 
@@ -103,7 +102,6 @@ type JobSetTemplateSpec struct {
 
 // PodGroupPolicy represents a PodGroup configuration for gang-scheduling.
 type PodGroupPolicy struct {
-
 	// Configuration for gang-scheduling using various plugins.
 	PodGroupPolicySource `json:",inline"`
 }
@@ -111,7 +109,6 @@ type PodGroupPolicy struct {
 // PodGroupPolicySource represents supported plugins for gang-scheduling.
 // Only one of its members may be specified.
 type PodGroupPolicySource struct {
-
 	// Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling.
 	Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"`
 
@@ -119,8 +116,8 @@ type PodGroupPolicySource struct {
 }
 
 // CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin.
+// The minimum number of members in the PodGroupSpec is always equal to the number of nodes.
 type CoschedulingPodGroupPolicySource struct {
-
 	// Time threshold to schedule PodGroup for gang-scheduling.
 	// If the scheduling timeout is equal to 0, the default value is used.
 	// Defaults to 60 seconds.
@@ -129,7 +126,6 @@ type CoschedulingPodGroupPolicySource struct {
 
 // MLPolicy represents configuration for the model trining with ML-specific parameters.
 type MLPolicy struct {
-
 	// Number of training nodes.
 	// Defaults to 1.
 	NumNodes *int32 `json:"numNodes,omitempty"`
@@ -142,7 +138,6 @@ type MLPolicy struct {
 // MLPolicySource represents the runtime-specific configuration for various technologies.
 // One of the following specs can be set.
 type MLPolicySource struct {
-
 	// Configuration for the PyTorch runtime.
 	Torch *TorchMLPolicySource `json:"torch,omitempty"`