-
Notifications
You must be signed in to change notification settings - Fork 7.1k
Updating zero capacity resource semantics #4555
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
1052cce
3f2e2ac
cc292ee
f5c66da
41d0165
153058d
68b91d3
10a452f
35539a2
74f647a
2c30bc9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| ray { | ||
| run-mode = SINGLE_PROCESS | ||
| resources = "CPU:4,GPU:0" | ||
| resources = "CPU:4" | ||
| redis.address = "" | ||
| } |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -34,30 +34,37 @@ public Integer echo(Integer number) { | |||||
| @Test | ||||||
| public void testMethods() { | ||||||
| TestUtils.skipTestUnderSingleProcess(); | ||||||
| CallOptions callOptions1 = new CallOptions(ImmutableMap.of("CPU", 4.0, "GPU", 0.0)); | ||||||
| CallOptions callOptions1 = new CallOptions(ImmutableMap.of("CPU", 4.0)); | ||||||
|
|
||||||
| // This is a case that can satisfy required resources. | ||||||
| // The static resources for test are "CPU:4,RES-A:4". | ||||||
| RayObject<Integer> result1 = Ray.call(ResourcesManagementTest::echo, 100, callOptions1); | ||||||
| Assert.assertEquals(100, (int) result1.get()); | ||||||
|
|
||||||
| CallOptions callOptions2 = new CallOptions(ImmutableMap.of("CPU", 4.0, "GPU", 2.0)); | ||||||
| CallOptions callOptions2 = new CallOptions(ImmutableMap.of("CPU", 4.0)); | ||||||
|
|
||||||
| // This is a case that can't satisfy required resources. | ||||||
| // The static resources for test are "CPU:4,RES-A:4". | ||||||
| final RayObject<Integer> result2 = Ray.call(ResourcesManagementTest::echo, 200, callOptions2); | ||||||
| WaitResult<Integer> waitResult = Ray.wait(ImmutableList.of(result2), 1, 1000); | ||||||
|
|
||||||
| Assert.assertEquals(0, waitResult.getReady().size()); | ||||||
| Assert.assertEquals(1, waitResult.getUnready().size()); | ||||||
| Assert.assertEquals(1, waitResult.getReady().size()); | ||||||
| Assert.assertEquals(0, waitResult.getUnready().size()); | ||||||
|
|
||||||
| try { | ||||||
| CallOptions callOptions3 = new CallOptions(ImmutableMap.of("CPU", 0.0)); | ||||||
| Assert.fail(); | ||||||
| } catch (RuntimeException e) { | ||||||
| // We should receive a RuntimeException indicate that we should pass a zero capacity resource. | ||||||
|
||||||
| // We should receive a RuntimeException indicate that we should pass a zero capacity resource. | |
| // We should receive a RuntimeException indicating that we should not pass a zero capacity resource. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1934,15 +1934,15 @@ def run_lots_of_tasks(): | |
| store_names = [] | ||
| store_names += [ | ||
| client["ObjectStoreSocketName"] for client in client_table | ||
| if client["Resources"]["GPU"] == 0 | ||
| if client["Resources"].get("GPU", 0) == 0 | ||
| ] | ||
| store_names += [ | ||
| client["ObjectStoreSocketName"] for client in client_table | ||
| if client["Resources"]["GPU"] == 5 | ||
| if client["Resources"].get("GPU", 0) == 5 | ||
| ] | ||
| store_names += [ | ||
| client["ObjectStoreSocketName"] for client in client_table | ||
| if client["Resources"]["GPU"] == 1 | ||
| if client["Resources"].get("GPU", 0) == 1 | ||
| ] | ||
| assert len(store_names) == 3 | ||
|
|
||
|
|
@@ -2112,6 +2112,32 @@ def f(): | |
| ray.get(results) | ||
|
|
||
|
|
||
| def test_zero_capacity_deletion_semantics(shutdown_only): | ||
| ray.init(num_cpus=2, num_gpus=1, resources={"test_resource": 1}) | ||
|
|
||
| def test(): | ||
| resources = ray.global_state.available_resources() | ||
| retry_count = 0 | ||
|
|
||
| while resources and retry_count < 5: | ||
| time.sleep(0.1) | ||
|
||
| resources = ray.global_state.available_resources() | ||
| retry_count += 1 | ||
|
|
||
| if retry_count >= 5: | ||
| raise RuntimeError("Resources were available even after retries.") | ||
|
|
||
| return resources | ||
|
|
||
| function = ray.remote( | ||
| num_cpus=2, num_gpus=1, resources={"test_resource": 1})(test) | ||
| cluster_resources = ray.get(function.remote()) | ||
romilbhardwaj marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # All cluster resources should be utilized and | ||
| # cluster_resources must be empty | ||
| assert cluster_resources == {} | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def save_gpu_ids_shutdown_only(): | ||
| # Record the current value of this environment variable so that we can | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,7 @@ | |
| import traceback | ||
|
|
||
| import ray | ||
| from ray.tune.error import TuneError, AbortTrialExecution | ||
| from ray.tune.error import AbortTrialExecution | ||
| from ray.tune.logger import NoopLogger | ||
| from ray.tune.trial import Trial, Resources, Checkpoint | ||
| from ray.tune.trial_executor import TrialExecutor | ||
|
|
@@ -363,17 +363,22 @@ def _update_avail_resources(self, num_retries=5): | |
| resources = ray.services.check_and_update_resources( | ||
| None, None, None) | ||
| if not resources: | ||
|
||
| logger.warning("Cluster resources not detected. Retrying...") | ||
| logger.warning( | ||
| "Cluster resources not detected or are 0. Retrying...") | ||
| time.sleep(0.5) | ||
|
|
||
| if not resources or "CPU" not in resources: | ||
| raise TuneError("Cluster resources cannot be detected. " | ||
| "You can resume this experiment by passing in " | ||
| "`resume=True` to `run`.") | ||
| if not resources: | ||
| # NOTE: This hides the possibility that Ray may be waiting for | ||
| # clients to connect. | ||
| resources.setdefault("CPU", 0) | ||
| resources.setdefault("GPU", 0) | ||
| logger.warning("Cluster resources cannot be detected or are 0. " | ||
| "You can resume this experiment by passing in " | ||
| "`resume=True` to `run`.") | ||
|
|
||
| resources = resources.copy() | ||
| num_cpus = resources.pop("CPU") | ||
| num_gpus = resources.pop("GPU") | ||
| num_cpus = resources.pop("CPU", 0) | ||
| num_gpus = resources.pop("GPU", 0) | ||
| custom_resources = resources | ||
|
|
||
| self._avail_resources = Resources( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's use `IllegalArgumentException` here.