Skip to content

Commit

Permalink
add dependabot config script (kubeflow#403)
Browse files Browse the repository at this point in the history
* add dependabot config script

* replace with new python script

* add main function
  • Loading branch information
davidspek authored and xiaozhouX committed Feb 5, 2021
1 parent b605824 commit e678e84
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 17 deletions.
13 changes: 9 additions & 4 deletions pkg/operators/et-operator/api/v1alpha1/trainingjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ type TrainingJobSpec struct {
}

// ETReplicaSpecs holds the replica spec for each elastic-training role:
// a single launcher and an elastically scalable worker set.
type ETReplicaSpecs struct {
	// Launcher describes the launcher pod that drives the training job.
	Launcher *common.ReplicaSpec `json:"launcher"`
	// Worker describes the worker pods; ETReplicaSpec adds elastic
	// min/max replica semantics on top of the plain template.
	Worker *ETReplicaSpec `json:"worker"`
}

type ETReplicaSpec struct {
// Replicas is the desired number of replicas of the given template.
// If unspecified, defaults to 1.
Replicas *int32 `json:"Replicas,omitempty"`
Replicas *int32 `json:"replicas,omitempty"`

// MaxReplicas is the desired max number of replicas of the given template.
// If unspecified, MaxReplicas defaults to infinite.
Expand Down Expand Up @@ -83,10 +83,15 @@ const (
ETReplicaTypeWorker ETReplicaType = "Worker"
)


// TrainingJobStatus defines the observed state of TrainingJob.
type TrainingJobStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file

// JobStatus inlines the common kubeflow job status fields directly
// into this struct's JSON representation.
common.JobStatus `json:",inline"`

// TargetWorkers lists worker names the job is scaling toward.
// NOTE(review): semantics inferred from the field name — confirm against et-operator.
TargetWorkers []string `json:"targetWorkers,omitempty"`
// CurrentWorkers lists worker names currently attached to the job.
// NOTE(review): semantics inferred from the field name — confirm against et-operator.
CurrentWorkers []string `json:"currentWorkers,omitempty"`
}

// +genclient
Expand All @@ -108,7 +113,7 @@ type TrainingJob struct {

// Most recently observed status of the PyTorchJob.
// Read-only (modified by the system).
Status common.JobStatus `json:"status,omitempty"`
Status TrainingJobStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true
Expand Down
2 changes: 2 additions & 0 deletions pkg/training/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ const (
NVIDIAGPUResourceName = "nvidia.com/gpu"
// GPUShareResourceName is the gpushare resource name
GPUShareResourceName = "aliyun.com/gpu-mem"
// AliyunGPUResourceName is the aliyun gpu resource name
AliyunGPUResourceName = "aliyun.com/gpu"

DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"

Expand Down
18 changes: 14 additions & 4 deletions pkg/training/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,11 @@ func gpuInNodeDeprecated(node v1.Node) int64 {
}

// gpuInPod returns the total GPU count requested by all containers of the
// pod, as declared in their resource limits.
func gpuInPod(pod v1.Pod) (gpuCount int64) {
	// Delegate to gpuInPodSpec so pods and bare pod specs (e.g. templates
	// in replica specs) share a single counting code path.
	return gpuInPodSpec(pod.Spec)
}

// gpuInPodSpec sums the GPU limits of every container in the pod spec,
// accumulating into the named return value.
func gpuInPodSpec(spec v1.PodSpec) (gpuCount int64) {
containers := spec.Containers
for _, container := range containers {
// gpuInContainer resolves the per-container count from resource limits.
gpuCount += gpuInContainer(container)
}
Expand Down Expand Up @@ -113,11 +117,17 @@ func gpuInActivePod(pod v1.Pod) (gpuCount int64) {
func gpuInContainer(container v1.Container) int64 {
val, ok := container.Resources.Limits[NVIDIAGPUResourceName]

if !ok {
return gpuInContainerDeprecated(container)
if ok {
return val.Value()
}

return val.Value()
val, ok = container.Resources.Limits[AliyunGPUResourceName]

if ok {
return val.Value()
}

return gpuInContainerDeprecated(container)
}

func gpuInContainerDeprecated(container v1.Container) int64 {
Expand Down
25 changes: 16 additions & 9 deletions pkg/training/trainer_et.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"context"
"encoding/json"
"fmt"
"github.com/kubeflow/arena/pkg/operators/et-operator/api/common"
"strings"
"time"

Expand Down Expand Up @@ -143,17 +144,23 @@ func (ej *ETJob) Duration() time.Duration {

// Requested GPU count of the Job
func (ej *ETJob) RequestedGPU() int64 {
if ej.requestedGPU > 0 {
return ej.requestedGPU
}
requestGPUs := getRequestGPUsOfJobFromPodAnnotation(ej.pods)
if requestGPUs > 0 {
return requestGPUs
var requestedGPU int64 = 0
job := ej.trainingjob
if status, ok := job.Status.ReplicaStatuses[common.ReplicaType(v1alpha1.ETReplicaTypeWorker)]; ok {
if job.Spec.ETReplicaSpecs.Worker != nil {
total := status.Succeeded + status.Failed + status.Active
gpuCountPerWorker := gpuInPodSpec(job.Spec.ETReplicaSpecs.Worker.Template.Spec)
requestedGPU += gpuCountPerWorker * int64(total)
}
}
for _, pod := range ej.pods {
ej.requestedGPU += gpuInPod(*pod)
if status, ok := job.Status.ReplicaStatuses[common.ReplicaType(v1alpha1.ETReplicaTypeLauncher)]; ok {
if job.Spec.ETReplicaSpecs.Launcher != nil {
total := status.Succeeded + status.Failed + status.Active
gpuCountPerWorker := gpuInPodSpec(job.Spec.ETReplicaSpecs.Launcher.Template.Spec)
requestedGPU += gpuCountPerWorker * int64(total)
}
}
return ej.requestedGPU
return requestedGPU
}

// Requested GPU count of the Job
Expand Down

0 comments on commit e678e84

Please sign in to comment.