diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..10e7a6d1c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,337 @@ +updates: +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: . + open-pull-requests-limit: 10 + package-ecosystem: docker + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: samples/docker/serve-custom-sample + open-pull-requests-limit: 10 + package-ecosystem: docker + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/golang.org/x/net/http2 + open-pull-requests-limit: 10 + package-ecosystem: docker + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: kubernetes-artifacts/tf-operator + open-pull-requests-limit: 10 + package-ecosystem: docker + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: kubernetes-artifacts/jobmon + open-pull-requests-limit: 10 + package-ecosystem: docker + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: . 
+ open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - dims + - thockin + - justinsb + - tallclair + - piosz + - brancz + - DirectXMan12 + - lavalamp + directory: vendor/k8s.io/klog + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - jayunit100 + - hoegaarden + - andyxning + - neolit123 + - pohly + - yagonobre + - vincepri + - detiber + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/google.golang.org/appengine + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/json-iterator/go + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/hashicorp/golang-lru + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/hashicorp/hcl + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/mitchellh/go-homedir + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/mitchellh/mapstructure + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - 
osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/fsnotify/fsnotify + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/spf13/pflag + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/spf13/viper + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/spf13/afero + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/spf13/cast + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/spf13/jwalterweatherman + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/magiconair/properties + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/google/gofuzz + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - 
GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/konsorten/go-windows-terminal-sequences + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/github.com/sirupsen/logrus + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/golang.org/x/oauth2 + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +- assignees: + - cheyang + - wsxiaozhang + - denverdino + directory: vendor/gopkg.in/yaml.v2 + open-pull-requests-limit: 10 + package-ecosystem: gomod + reviewers: + - GarnettWang + - xiaozhouX + - osswangxining + schedule: + interval: daily +version: 2 diff --git a/Makefile b/Makefile index 0e6c6a04a..fdac052cc 100644 --- a/Makefile +++ b/Makefile @@ -119,3 +119,7 @@ build-pkg: docker run -itd --name=arena-pkg arena-build:${VERSION}-${GIT_SHORT_COMMIT}-${OS_ARCH} /bin/bash docker cp arena-pkg:/arena-installer-${VERSION}-${GIT_SHORT_COMMIT}-${OS_ARCH}.tar.gz . 
docker rm -f arena-pkg + + +build-dependabot: + python3 hack/create_dependabot.py diff --git a/hack/create_dependabot.py b/hack/create_dependabot.py new file mode 100644 index 000000000..a83f04de5 --- /dev/null +++ b/hack/create_dependabot.py @@ -0,0 +1,105 @@ +import yaml +import collections +from pathlib import Path + +dependabot = {} +dependabot['version'] = 2 +dependabot['updates'] = [] +ignored_folders = ['node_modules', 'dist', '.git', 'deprecated'] + +def get_owners(path): + # Return the parsed contents of the nearest OWNERS file at or above path; raises FileNotFoundError when the filesystem root is reached without finding one. + path = Path(path).absolute() + while not (path/'OWNERS').is_file(): + if path.parent == path: + raise FileNotFoundError('no OWNERS file found in any parent directory') + path = path.parent + with open(path/'OWNERS') as owner_file: + # safe_load: OWNERS is plain data, and yaml.load without an explicit Loader is rejected by PyYAML >= 6. + owners = yaml.safe_load(owner_file) + return owners + +def get_docker_paths(): + dockerfile_list = list(repo_path.glob('**/*ockerfile*')) + docker_clean_list = [] + for dockerfile in dockerfile_list: + if all(x not in str(dockerfile) for x in ignored_folders): + if dockerfile.parents[0] not in docker_clean_list: + docker_clean_list.append(dockerfile.parents[0]) + return docker_clean_list + +def get_npm_paths(): + npm_list = list(repo_path.glob('**/package*.json')) + npm_clean_list = [] + for npm_file in npm_list: + if all(x not in str(npm_file) for x in ignored_folders): + if npm_file.parents[0] not in npm_clean_list: + npm_clean_list.append(npm_file.parents[0]) + return npm_clean_list + +def get_pip_paths(): + pip_list = list(repo_path.glob('**/*requirements.txt')) + pip_clean_list = [] + for pip_file in pip_list: + if all(x not in str(pip_file) for x in ignored_folders): + if pip_file.parents[0] not in pip_clean_list: + pip_clean_list.append(pip_file.parents[0]) + return pip_clean_list + +def get_go_paths(): + go_list = list(repo_path.glob('**/go.*')) + go_clean_list = [] + for go_file in go_list: + if all(x not in str(go_file) for x in ignored_folders): + if go_file.parents[0] not in go_clean_list: + go_clean_list.append(go_file.parents[0]) + return go_clean_list + +def append_updates(ecosystem, directory, assignees, reviewers=None): + config = {} +
config['package-ecosystem'] = ecosystem + config['directory'] = directory + config['schedule']= {} + config['schedule']['interval'] = 'daily' + config['open-pull-requests-limit'] = 10 + config['assignees'] = assignees + if reviewers: + config['reviewers'] = reviewers + dependabot['updates'].append(config) + +def main(): + for docker_path in get_docker_paths(): + string_path = str(docker_path) + assignees = get_owners(docker_path).get('approvers') + reviewers = get_owners(docker_path).get('reviewers') + append_updates('docker', string_path, assignees, reviewers) + + for npm_path in get_npm_paths(): + string_path = str(npm_path) + assignees = get_owners(npm_path).get('approvers') + reviewers = get_owners(npm_path).get('reviewers') + append_updates('npm', string_path, assignees, reviewers) + + for pip_path in get_pip_paths(): + string_path = str(pip_path) + assignees = get_owners(pip_path).get('approvers') + reviewers = get_owners(pip_path).get('reviewers') + append_updates('pip', string_path, assignees, reviewers) + + for go_path in get_go_paths(): + string_path = str(go_path) + assignees = get_owners(go_path).get('approvers') + reviewers = get_owners(go_path).get('reviewers') + append_updates('gomod', string_path, assignees, reviewers) + + with open('.github/dependabot.yml', 'w') as outfile: + yaml.dump(dependabot, outfile, default_flow_style=False) + + print(get_docker_paths()) + print(get_npm_paths()) + print(get_pip_paths()) + print(get_go_paths()) + +if __name__ == "__main__": + repo_path = Path(__file__).parents[1] + main() \ No newline at end of file diff --git a/pkg/operators/et-operator/api/v1alpha1/trainingjob_types.go b/pkg/operators/et-operator/api/v1alpha1/trainingjob_types.go index 6c6c1f2d3..17f679ea1 100644 --- a/pkg/operators/et-operator/api/v1alpha1/trainingjob_types.go +++ b/pkg/operators/et-operator/api/v1alpha1/trainingjob_types.go @@ -44,14 +44,14 @@ type TrainingJobSpec struct { } type ETReplicaSpecs struct { - Launcher *common.ReplicaSpec 
`json:"Launcher"` - Worker *ETReplicaSpec `json:"Worker"` + Launcher *common.ReplicaSpec `json:"launcher"` + Worker *ETReplicaSpec `json:"worker"` } type ETReplicaSpec struct { // Replicas is the desired number of replicas of the given template. // If unspecified, defaults to 1. - Replicas *int32 `json:"Replicas,omitempty"` + Replicas *int32 `json:"replicas,omitempty"` // MaxReplicas is the desired max number of replicas of the given template. // If unspecified, MaxReplicas defaults to infinite. @@ -83,10 +83,15 @@ const ( ETReplicaTypeWorker ETReplicaType = "Worker" ) + // TrainingJobStatus defines the observed state of TrainingJob type TrainingJobStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file + common.JobStatus `json:",inline"` + + TargetWorkers []string `json:"targetWorkers,omitempty"` + CurrentWorkers []string `json:"currentWorkers,omitempty"` } // +genclient @@ -108,7 +113,7 @@ type TrainingJob struct { // Most recently observed status of the PyTorchJob. // Read-only (modified by the system). 
- Status common.JobStatus `json:"status,omitempty"` + Status TrainingJobStatus `json:"status,omitempty"` } // +kubebuilder:object:root=true diff --git a/pkg/training/const.go b/pkg/training/const.go index 899a6bd68..734ff3d59 100644 --- a/pkg/training/const.go +++ b/pkg/training/const.go @@ -23,6 +23,8 @@ const ( NVIDIAGPUResourceName = "nvidia.com/gpu" // GPUShareResourceName is the gpushare resource name GPUShareResourceName = "aliyun.com/gpu-mem" + // AliyunGPUResourceName is the Aliyun GPU resource name + AliyunGPUResourceName = "aliyun.com/gpu" DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu" diff --git a/pkg/training/gpu.go b/pkg/training/gpu.go index ed23b7b58..1a2b065ba 100644 --- a/pkg/training/gpu.go +++ b/pkg/training/gpu.go @@ -67,7 +67,11 @@ func gpuInNodeDeprecated(node v1.Node) int64 { } func gpuInPod(pod v1.Pod) (gpuCount int64) { - containers := pod.Spec.Containers + return gpuInPodSpec(pod.Spec) +} + +func gpuInPodSpec(spec v1.PodSpec) (gpuCount int64) { + containers := spec.Containers for _, container := range containers { gpuCount += gpuInContainer(container) } @@ -113,11 +117,17 @@ func gpuInActivePod(pod v1.Pod) (gpuCount int64) { func gpuInContainer(container v1.Container) int64 { val, ok := container.Resources.Limits[NVIDIAGPUResourceName] - if !ok { - return gpuInContainerDeprecated(container) + if ok { + return val.Value() } - return val.Value() + val, ok = container.Resources.Limits[AliyunGPUResourceName] + + if ok { + return val.Value() + } + + return gpuInContainerDeprecated(container) } func gpuInContainerDeprecated(container v1.Container) int64 { diff --git a/pkg/training/trainer_et.go b/pkg/training/trainer_et.go index bb582ab0e..f74651000 100644 --- a/pkg/training/trainer_et.go +++ b/pkg/training/trainer_et.go @@ -18,6 +18,7 @@ import ( "context" "encoding/json" "fmt" + "github.com/kubeflow/arena/pkg/operators/et-operator/api/common" "strings" "time" @@ -143,17 +144,23 @@ func (ej *ETJob) Duration() time.Duration { //
Requested GPU count of the Job func (ej *ETJob) RequestedGPU() int64 { - if ej.requestedGPU > 0 { - return ej.requestedGPU - } - requestGPUs := getRequestGPUsOfJobFromPodAnnotation(ej.pods) - if requestGPUs > 0 { - return requestGPUs + var requestedGPU int64 // accumulated per replica type below + job := ej.trainingjob + if status, ok := job.Status.ReplicaStatuses[common.ReplicaType(v1alpha1.ETReplicaTypeWorker)]; ok { + if job.Spec.ETReplicaSpecs.Worker != nil { + total := status.Succeeded + status.Failed + status.Active // worker replicas counted in any of the three reported states + gpuCountPerWorker := gpuInPodSpec(job.Spec.ETReplicaSpecs.Worker.Template.Spec) + requestedGPU += gpuCountPerWorker * int64(total) + } } - for _, pod := range ej.pods { - ej.requestedGPU += gpuInPod(*pod) + if status, ok := job.Status.ReplicaStatuses[common.ReplicaType(v1alpha1.ETReplicaTypeLauncher)]; ok { + if job.Spec.ETReplicaSpecs.Launcher != nil { + total := status.Succeeded + status.Failed + status.Active // launcher replicas counted in any of the three reported states + gpuCountPerLauncher := gpuInPodSpec(job.Spec.ETReplicaSpecs.Launcher.Template.Spec) + requestedGPU += gpuCountPerLauncher * int64(total) + } } - return ej.requestedGPU + return requestedGPU } // Requested GPU count of the Job