Skip to content

Commit

Permalink
add dependabot config script (kubeflow#403)
Browse files Browse the repository at this point in the history
* add dependabot config script

* replace with new python script

* add main function
  • Loading branch information
davidspek authored and xiaozhouX committed Feb 5, 2021
1 parent b605824 commit e678e84
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 17 deletions.
13 changes: 9 additions & 4 deletions pkg/operators/et-operator/api/v1alpha1/trainingjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ type TrainingJobSpec struct {
}

// ETReplicaSpecs holds the replica spec for each elastic-training role:
// a single launcher and an elastically scalable worker set.
type ETReplicaSpecs struct {
	// Launcher describes the launcher pod that drives the training job.
	Launcher *common.ReplicaSpec `json:"launcher"`
	// Worker describes the worker pods; ETReplicaSpec adds elastic
	// min/max replica semantics on top of the plain template.
	Worker *ETReplicaSpec `json:"worker"`
}

type ETReplicaSpec struct {
// Replicas is the desired number of replicas of the given template.
// If unspecified, defaults to 1.
Replicas *int32 `json:"Replicas,omitempty"`
Replicas *int32 `json:"replicas,omitempty"`

// MaxReplicas is the desired max number of replicas of the given template.
// If unspecified, MaxReplicas defaults to infinite.
Expand Down Expand Up @@ -83,10 +83,15 @@ const (
ETReplicaTypeWorker ETReplicaType = "Worker"
)


// TrainingJobStatus defines the observed state of TrainingJob.
type TrainingJobStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file

// JobStatus inlines the common kubeflow job status fields directly
// into this struct's JSON representation.
common.JobStatus `json:",inline"`

// TargetWorkers lists worker names the job is scaling toward.
// NOTE(review): semantics inferred from the field name — confirm against et-operator.
TargetWorkers []string `json:"targetWorkers,omitempty"`
// CurrentWorkers lists worker names currently attached to the job.
// NOTE(review): semantics inferred from the field name — confirm against et-operator.
CurrentWorkers []string `json:"currentWorkers,omitempty"`
}

// +genclient
Expand All @@ -108,7 +113,7 @@ type TrainingJob struct {

// Most recently observed status of the PyTorchJob.
// Read-only (modified by the system).
Status common.JobStatus `json:"status,omitempty"`
Status TrainingJobStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true
Expand Down
2 changes: 2 additions & 0 deletions pkg/training/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ const (
NVIDIAGPUResourceName = "nvidia.com/gpu"
// GPUShareResourceName is the gpushare resource name
GPUShareResourceName = "aliyun.com/gpu-mem"
// AliyunGPUResourceName is the aliyun gpu resource name
AliyunGPUResourceName = "aliyun.com/gpu"

DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"

Expand Down
18 changes: 14 additions & 4 deletions pkg/training/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,11 @@ func gpuInNodeDeprecated(node v1.Node) int64 {
}

// gpuInPod returns the total GPU count requested by all containers of the
// pod, as declared in their resource limits.
func gpuInPod(pod v1.Pod) (gpuCount int64) {
	// Delegate to gpuInPodSpec so pods and bare pod specs (e.g. templates
	// in replica specs) share a single counting code path.
	return gpuInPodSpec(pod.Spec)
}

// gpuInPodSpec sums the GPU limits of every container in the pod spec,
// accumulating into the named return value.
func gpuInPodSpec(spec v1.PodSpec) (gpuCount int64) {
containers := spec.Containers
for _, container := range containers {
// gpuInContainer resolves the per-container count from resource limits.
gpuCount += gpuInContainer(container)
}
Expand Down Expand Up @@ -113,11 +117,17 @@ func gpuInActivePod(pod v1.Pod) (gpuCount int64) {
func gpuInContainer(container v1.Container) int64 {
val, ok := container.Resources.Limits[NVIDIAGPUResourceName]

if !ok {
return gpuInContainerDeprecated(container)
if ok {
return val.Value()
}

return val.Value()
val, ok = container.Resources.Limits[AliyunGPUResourceName]

if ok {
return val.Value()
}

return gpuInContainerDeprecated(container)
}

func gpuInContainerDeprecated(container v1.Container) int64 {
Expand Down
25 changes: 16 additions & 9 deletions pkg/training/trainer_et.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/kubeflow/arena/pkg/operators/et-operator/api/common"
"strings"
"time"

Expand Down Expand Up @@ -143,17 +144,23 @@ func (ej *ETJob) Duration() time.Duration {

// Requested GPU count of the Job
func (ej *ETJob) RequestedGPU() int64 {
if ej.requestedGPU > 0 {
return ej.requestedGPU
}
requestGPUs := getRequestGPUsOfJobFromPodAnnotation(ej.pods)
if requestGPUs > 0 {
return requestGPUs
var requestedGPU int64 = 0
job := ej.trainingjob
if status, ok := job.Status.ReplicaStatuses[common.ReplicaType(v1alpha1.ETReplicaTypeWorker)]; ok {
if job.Spec.ETReplicaSpecs.Worker != nil {
total := status.Succeeded + status.Failed + status.Active
gpuCountPerWorker := gpuInPodSpec(job.Spec.ETReplicaSpecs.Worker.Template.Spec)
requestedGPU += gpuCountPerWorker * int64(total)
}
}
for _, pod := range ej.pods {
ej.requestedGPU += gpuInPod(*pod)
if status, ok := job.Status.ReplicaStatuses[common.ReplicaType(v1alpha1.ETReplicaTypeLauncher)]; ok {
if job.Spec.ETReplicaSpecs.Launcher != nil {
total := status.Succeeded + status.Failed + status.Active
gpuCountPerWorker := gpuInPodSpec(job.Spec.ETReplicaSpecs.Launcher.Template.Spec)
requestedGPU += gpuCountPerWorker * int64(total)
}
}
return ej.requestedGPU
return requestedGPU
}

// Requested GPU count of the Job
Expand Down

0 comments on commit e678e84

Please sign in to comment.