-
Notifications
You must be signed in to change notification settings - Fork 7k
[core] (cgroups 21/n) Do not move dashboard modules into the workers cgroup even if they are drivers #57955
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[core] (cgroups 21/n) Do not move dashboard modules into the workers cgroup even if they are drivers #57955
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,8 @@ | ||
| import os | ||
| import platform | ||
| import subprocess | ||
| import sys | ||
| import textwrap | ||
| from pathlib import Path | ||
| from typing import Set | ||
|
|
||
|
|
@@ -11,6 +13,7 @@ | |
| import ray._common.utils as utils | ||
| import ray._private.ray_constants as ray_constants | ||
| import ray.scripts.scripts as scripts | ||
| from ray._common.test_utils import wait_for_condition | ||
| from ray._private.resource_isolation_config import ResourceIsolationConfig | ||
|
|
||
| # These tests are intended to run in CI inside a container. | ||
|
|
@@ -21,7 +24,6 @@ | |
| # Run these commands locally before running the test suite: | ||
| # | ||
| # sudo mkdir -p /sys/fs/cgroup/resource_isolation_test | ||
| # echo "+cpu +memory" | sudo tee -a /sys/fs/cgroup/resource_isolation_test/cgroup.subtree_control | ||
| # sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup/resource_isolation_test/ | ||
| # sudo chmod -R u+rwx /sys/fs/cgroup/resource_isolation_test/ | ||
| # echo $$ | sudo tee /sys/fs/cgroup/resource_isolation_test/cgroup.procs | ||
|
|
@@ -337,6 +339,35 @@ def assert_cgroup_hierarchy_exists_for_node( | |
| ) | ||
|
|
||
|
|
||
| def assert_process_in_not_moved_into_ray_cgroups( | ||
| node_id: str, | ||
| resource_isolation_config: ResourceIsolationConfig, | ||
| pid: str, | ||
| ): | ||
| """Asserts that the system processes were created in the correct cgroup. | ||
|
|
||
| Args: | ||
| node_id: used to construct the path of the cgroup subtree | ||
| resource_isolation_config: used to construct the path of the cgroup | ||
| subtree | ||
| pid: the pid (as a string) of the process that must not appear in any of the ray-managed cgroups | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing parameter description. |
||
| """ | ||
| base_cgroup_for_node = resource_isolation_config.cgroup_path | ||
| node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}" | ||
| cgroup_procs_file_paths = [ | ||
| node_cgroup / "system" / "leaf" / "cgroup.procs", | ||
| node_cgroup / "user" / "non-ray" / "cgroup.procs", | ||
| node_cgroup / "user" / "workers" / "cgroup.procs", | ||
| ] | ||
| found_pid = False | ||
| for file_path in cgroup_procs_file_paths: | ||
| with open(file_path, "r") as cgroup_procs_file: | ||
| lines = cgroup_procs_file.readlines() | ||
| for line in lines: | ||
| found_pid = found_pid or (line.strip() == pid) | ||
| assert not found_pid | ||
|
|
||
|
|
||
| def assert_system_processes_are_in_system_cgroup( | ||
| node_id: str, | ||
| resource_isolation_config: ResourceIsolationConfig, | ||
|
|
@@ -407,6 +438,30 @@ def assert_cgroup_hierarchy_cleaned_up_for_node( | |
| ), f"Root cgroup node at {node_cgroup} was not deleted. Cgroup cleanup failed. You may have to manually delete the cgroup subtree." | ||
|
|
||
|
|
||
| def create_driver_in_internal_namespace(): | ||
| """ | ||
| Returns a driver process that is a part of the '_ray_internal_' namespace. | ||
| If the driver is part of the '_ray_internal_' namespace, it will NOT | ||
| be moved into the workers cgroup by the raylet when it registers. | ||
| The Dashboard ServeHead and JobHead modules are drivers that are | ||
| technically system processes and use the '_ray_internal_' namespace and therefore | ||
| must not be moved into the workers cgroup on registration. | ||
| """ | ||
|
|
||
| driver_code = textwrap.dedent( | ||
| """ | ||
| import ray | ||
| import time | ||
| ray.init(namespace='_ray_internal_') | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's a constant for it somewhere you could use, but hard-coding is also fine (it shouldn't change). |
||
| time.sleep(3600) | ||
| """ | ||
| ).strip() | ||
|
|
||
| second_driver_proc = subprocess.Popen(["python", "-c", driver_code]) | ||
|
|
||
| return second_driver_proc | ||
|
|
||
|
|
||
| # The following tests check for cgroup setup and cleanup with the | ||
| # ray cli. | ||
| def test_ray_cli_start_invalid_resource_isolation_config(cleanup_ray): | ||
|
|
@@ -465,13 +520,16 @@ def __init__(self): | |
| def get_pid(self): | ||
| return os.getpid() | ||
|
|
||
| second_driver_proc = create_driver_in_internal_namespace() | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I only added the test to |
||
|
|
||
| actor_refs = [] | ||
| for _ in range(num_cpus): | ||
| actor_refs.append(Actor.remote()) | ||
| worker_pids = set() | ||
| worker_pids.add(str(os.getpid())) | ||
| for actor in actor_refs: | ||
| worker_pids.add(str(ray.get(actor.get_pid.remote()))) | ||
|
|
||
| assert_system_processes_are_in_system_cgroup( | ||
| node_id, | ||
| resource_isolation_config, | ||
|
|
@@ -480,8 +538,13 @@ def get_pid(self): | |
| assert_worker_processes_are_in_workers_cgroup( | ||
| node_id, resource_isolation_config, worker_pids | ||
| ) | ||
| runner.invoke(scripts.stop) | ||
| assert_process_in_not_moved_into_ray_cgroups( | ||
| node_id, resource_isolation_config, second_driver_proc.pid | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: test assertion fails due to PID type mismatch. The helper annotates `pid` as a `str` and compares it against stripped lines from `cgroup.procs`, but the call site passes `second_driver_proc.pid`, which is an int, so the string comparison can never match and the assertion passes vacuously. |
||
|
|
||
| second_driver_proc.kill() | ||
| wait_for_condition(lambda: second_driver_proc.wait(), timeout=5) | ||
| runner.invoke(scripts.stop) | ||
| assert_cgroup_hierarchy_cleaned_up_for_node(node_id, resource_isolation_config) | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
did you forget a word in the name?