-
Notifications
You must be signed in to change notification settings - Fork 663
[RayJob] Yunikorn Integration #3948
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 59 commits
6c0d860
ebf0cb3
7d7823b
9e12f53
f5b7df1
7d8c4e8
a9b6b95
0e9cbd8
4eb38e5
27ccfa2
7bf11d1
1abda9c
da6efc2
51a2d1e
0def3ae
6c77276
ce99f4e
4a89804
469e4a8
07e1f0d
0bea9a1
e7e7f20
d44b8be
3393d9f
e28caa5
22f2679
dcc6f1c
668d6a2
f1e0c6b
7a2d474
c2f92a1
ce9a513
0551ac1
8c6d088
effd7bb
12a1b0c
c7e88b4
ee9a61b
2b39ede
1a414f0
25eebf2
110ba38
83954d9
adf84e6
afa537f
68e6cbe
8f9aef5
74df79d
0d03a21
2fe091e
cd09b3e
2c6e649
2646d20
ff38824
69a0531
9a6c707
7cc7c2d
0646149
3066c41
906ab83
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| apiVersion: ray.io/v1 | ||
| kind: RayJob | ||
| metadata: | ||
| name: rayjob-yunikorn-0 | ||
| labels: | ||
|
Comment on lines
+1
to
+5
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you write some comments specifically for the yunikorn-scheduler?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought those would be written in the docs, since RayCluster + batchScheduler did not come with those comments. If you think this is needed, I can open a new PR after this one.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated: I will add comments to this YAML and |
||
| ray.io/gang-scheduling-enabled: "true" | ||
| yunikorn.apache.org/app-id: rayjob-yunikorn-0 | ||
| yunikorn.apache.org/queue: root.test | ||
| spec: | ||
| entrypoint: python /home/ray/samples/sample_code.py | ||
| runtimeEnvYAML: | | ||
| pip: | ||
| - requests==2.26.0 | ||
| - pendulum==2.1.2 | ||
| env_vars: | ||
| counter_name: "test_counter" | ||
| rayClusterSpec: | ||
| rayVersion: '2.46.0' | ||
| headGroupSpec: | ||
| rayStartParams: {} | ||
| template: | ||
| spec: | ||
| containers: | ||
| - name: ray-head | ||
| image: rayproject/ray:2.46.0 | ||
| ports: | ||
| - containerPort: 6379 | ||
| name: gcs-server | ||
| - containerPort: 8265 | ||
| name: dashboard | ||
| - containerPort: 10001 | ||
| name: client | ||
| resources: | ||
| limits: | ||
| cpu: "1" | ||
| memory: "2Gi" | ||
| requests: | ||
| cpu: "1" | ||
| memory: "2Gi" | ||
| volumeMounts: | ||
| - mountPath: /home/ray/samples | ||
| name: code-sample | ||
| volumes: | ||
| - name: code-sample | ||
| configMap: | ||
| name: ray-job-code-sample | ||
| items: | ||
| - key: sample_code.py | ||
| path: sample_code.py | ||
| workerGroupSpecs: | ||
| - replicas: 1 | ||
| minReplicas: 1 | ||
| maxReplicas: 5 | ||
| groupName: small-group | ||
| rayStartParams: {} | ||
| template: | ||
| spec: | ||
| containers: | ||
| - name: ray-worker | ||
| image: rayproject/ray:2.46.0 | ||
| resources: | ||
| limits: | ||
| cpu: "1" | ||
| memory: "2Gi" | ||
| requests: | ||
| cpu: "1" | ||
| memory: "2Gi" | ||
|
|
||
| --- | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: ray-job-code-sample | ||
| data: | ||
| sample_code.py: | | ||
| import ray | ||
| import os | ||
| import requests | ||
|
|
||
| ray.init() | ||
|
|
||
| @ray.remote | ||
| class Counter: | ||
| def __init__(self): | ||
| # Used to verify runtimeEnv | ||
| self.name = os.getenv("counter_name") | ||
| assert self.name == "test_counter" | ||
| self.counter = 0 | ||
|
|
||
| def inc(self): | ||
| self.counter += 1 | ||
|
|
||
| def get_counter(self): | ||
| return "{} got {}".format(self.name, self.counter) | ||
|
|
||
| counter = Counter.remote() | ||
|
|
||
| for _ in range(5): | ||
| ray.get(counter.inc.remote()) | ||
| print(ray.get(counter.get_counter.remote())) | ||
|
|
||
| # Verify that the correct runtime env was used for the job. | ||
| assert requests.__version__ == "2.26.0" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ import ( | |
| "context" | ||
|
|
||
| corev1 "k8s.io/api/core/v1" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/runtime" | ||
| "k8s.io/client-go/rest" | ||
| "sigs.k8s.io/controller-runtime/pkg/builder" | ||
|
|
@@ -18,13 +19,18 @@ type BatchScheduler interface { | |
| // https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/ | ||
| Name() string | ||
|
|
||
| // DoBatchSchedulingOnSubmission handles submitting the RayCluster to the batch scheduler on creation / update | ||
| // DoBatchSchedulingOnSubmission handles submitting the RayCluster/RayJob to the batch scheduler on creation / update | ||
| // For most batch schedulers, this results in the creation of a PodGroup. | ||
| DoBatchSchedulingOnSubmission(ctx context.Context, app *rayv1.RayCluster) error | ||
| DoBatchSchedulingOnSubmission(ctx context.Context, object metav1.Object) error | ||
|
|
||
| // AddMetadataToPod enriches Pod specs with metadata necessary to tie them to the scheduler. | ||
| // AddMetadataToPod enriches the pod with metadata necessary to tie it to the scheduler. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a comment saying that we are removing this method?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, added! |
||
| // For example, setting labels for queues / priority, and setting schedulerName. | ||
| AddMetadataToPod(ctx context.Context, app *rayv1.RayCluster, groupName string, pod *corev1.Pod) | ||
| // This function will be removed once RayJob Volcano scheduler integration is completed. | ||
| AddMetadataToPod(ctx context.Context, rayCluster *rayv1.RayCluster, groupName string, pod *corev1.Pod) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will remove |
||
|
|
||
| // AddMetadataToChildResource enriches the child resource (batchv1.Job, rayv1.RayCluster) with metadata necessary to tie it to the scheduler. | ||
| // For example, setting labels for queues / priority, and setting schedulerName. | ||
| AddMetadataToChildResource(ctx context.Context, parent metav1.Object, child metav1.Object, groupName string) | ||
| } | ||
|
|
||
| // BatchSchedulerFactory handles initial setup of the scheduler plugin by registering the | ||
|
|
@@ -53,13 +59,16 @@ func (d *DefaultBatchScheduler) Name() string { | |
| return GetDefaultPluginName() | ||
| } | ||
|
|
||
| func (d *DefaultBatchScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1.RayCluster) error { | ||
| func (d *DefaultBatchScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ metav1.Object) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (d *DefaultBatchScheduler) AddMetadataToPod(_ context.Context, _ *rayv1.RayCluster, _ string, _ *corev1.Pod) { | ||
| } | ||
|
|
||
| func (d *DefaultBatchScheduler) AddMetadataToChildResource(_ context.Context, _ metav1.Object, _ metav1.Object, _ string) { | ||
| } | ||
|
|
||
| func (df *DefaultBatchSchedulerFactory) New(_ context.Context, _ *rest.Config, _ client.Client) (BatchScheduler, error) { | ||
| return &DefaultBatchScheduler{}, nil | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.