@@ -1,12 +1,11 @@
+import os
 import sys
 from pathlib import Path
 
 import pytest
 from click.testing import CliRunner
 
 import ray
-import ray._private.ray_constants as ray_constants
-import ray._private.utils as utils
import ray.scripts.scripts as scripts
 from ray._private.resource_isolation_config import ResourceIsolationConfig
 
@@ -21,6 +20,7 @@
 #
 # Run these commands locally before running the test suite:
 # sudo mkdir -p /sys/fs/cgroup/resource_isolation_test
+# echo "+cpu +memory" | sudo tee -a /sys/fs/cgroup/resource_isolation_test/cgroup.subtree_control
 # sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup/resource_isolation_test/
 # sudo chmod -R u+rwx /sys/fs/cgroup/resource_isolation_test/
 # echo $$ | sudo tee /sys/fs/cgroup/resource_isolation_test/cgroup.procs
@@ -32,44 +32,33 @@
 # _BASE_CGROUP_PATH = "/sys/fs/cgroup/resource_isolation_test"
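+
+# Editorial sketch (assumption, not part of this change): the suite could skip
+# itself when the delegated cgroup hierarchy described above has not been prepared.
+requires_cgroup_delegation = pytest.mark.skipif(
+    not os.access(_BASE_CGROUP_PATH, os.W_OK),
+    reason="requires a writable delegated cgroup at _BASE_CGROUP_PATH",
+)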
 
 
-def test_resource_isolation_enabled_creates_cgroup_hierarchy(ray_start_cluster):
-    cluster = ray_start_cluster
-    base_cgroup = _BASE_CGROUP_PATH
-    resource_isolation_config = ResourceIsolationConfig(
-        enable_resource_isolation=True,
-        cgroup_path=base_cgroup,
-        system_reserved_memory=1024**3,
-        system_reserved_cpu=1,
-    )
-    # Need to use a worker node because the driver cannot delete the head node.
-    cluster.add_node(num_cpus=0)
-    ray.init(address=cluster.address)
-
-    worker_node = cluster.add_node(
-        num_cpus=1, resource_isolation_config=resource_isolation_config
-    )
-    worker_node_id = worker_node.node_id
-    cluster.wait_for_nodes()
-
-    # Make sure the worker node is up and running.
-    @ray.remote
-    def task():
-        return "hellodarknessmyoldfriend"
-
-    ray.get(task.remote(), timeout=5)
-
-    # TODO(#54703): This test is deliberately overspecified right now. The test shouldn't
-    # care about the cgroup hierarchy. It should just verify that application and system processes
-    # are started in a cgroup with the correct constraints. This will be updated once cgroup
-    # process management is completed.
-    node_cgroup = Path(base_cgroup) / f"ray_node_{worker_node_id}"
+# TODO(#54703): This test is deliberately overspecified right now. The test shouldn't
+# care about the cgroup hierarchy. It should just verify that application and system processes
+# are started in a cgroup with the correct constraints. This will be updated once cgroup
+# process management is completed.
+def assert_cgroup_hierarchy_exists_for_node(
+    node_id: str, resource_isolation_config: ResourceIsolationConfig
+):
+    base_cgroup_for_node = resource_isolation_config.cgroup_path
+    node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
     system_cgroup = node_cgroup / "system"
+    system_leaf_cgroup = system_cgroup / "leaf"
     application_cgroup = node_cgroup / "application"
+    application_leaf_cgroup = application_cgroup / "leaf"
 
     # 1) Check that the cgroup hierarchy is created correctly for the node.
     assert node_cgroup.is_dir()
     assert system_cgroup.is_dir()
+    assert system_leaf_cgroup.is_dir()
     assert application_cgroup.is_dir()
+    assert application_leaf_cgroup.is_dir()
 
     # 2) Verify the constraints are applied correctly.
     system_cgroup_memory_min = system_cgroup / "memory.min"
@@ -87,14 +69,37 @@ def task():
         10000 - resource_isolation_config.system_reserved_cpu_weight
     )
 
-    # 3) Gracefully shutting down the node cleans up everything. Don't need to check
-    # everything. If the base_cgroup is deleted, then all clean up succeeded.
-    cluster.remove_node(worker_node)
+    # 3) Check that system processes were moved into the system leaf cgroup.
+    system_leaf_cgroup_procs = system_leaf_cgroup / "cgroup.procs"
+    # At least the raylet process is always moved.
+    with open(system_leaf_cgroup_procs, "r") as cgroup_procs_file:
+        lines = cgroup_procs_file.readlines()
+        assert (
+            len(lines) > 0
+        ), f"Expected at least one system process in the system cgroup. Found {lines}"
+
+
+def assert_cgroup_hierarchy_cleaned_up_for_node(
+    node_id: str, resource_isolation_config: ResourceIsolationConfig
+):
+    base_cgroup_for_node = resource_isolation_config.cgroup_path
+    node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
     assert not node_cgroup.is_dir()
 
 
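+# Hedged sketch (assumption, not part of this change): a time-bounded variant of
+# the cleanup check above, in case cgroup removal ever lags node shutdown.
+def wait_for_node_cgroup_removal(node_cgroup: Path, timeout_s: float = 10.0) -> bool:
+    import time
+
+    deadline = time.monotonic() + timeout_s
+    while node_cgroup.is_dir():
+        if time.monotonic() >= deadline:
+            return False
+        time.sleep(0.1)
+    return True
+
+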
-# The following tests will test integration of resource isolation
-# with the 'ray start' command.
 @pytest.fixture
 def cleanup_ray():
     """Shutdown all ray instances"""
@@ -114,19 +106,44 @@ def test_ray_start_invalid_resource_isolation_config(cleanup_ray):
     assert isinstance(result.exception, ValueError)
 
 
-def test_ray_start_resource_isolation_config_default_values(monkeypatch, cleanup_ray):
-    monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 16)
-    # The DEFAULT_CGROUP_PATH override is only relevant when running locally.
-    monkeypatch.setattr(ray_constants, "DEFAULT_CGROUP_PATH", _BASE_CGROUP_PATH)
-
+def test_ray_start_resource_isolation_creates_cgroup_hierarchy_and_cleans_up(
+    monkeypatch, cleanup_ray
+):
+    object_store_memory = 1024**3
+    system_reserved_memory = 1024**3
+    system_reserved_cpu = 1
+    resource_isolation_config = ResourceIsolationConfig(
+        cgroup_path=_BASE_CGROUP_PATH,
+        enable_resource_isolation=True,
+        system_reserved_cpu=system_reserved_cpu,
+        system_reserved_memory=system_reserved_memory,
+    )
+    node_id = ray.NodeID.from_random().hex()
+    os.environ["RAY_OVERRIDE_NODE_ID_FOR_TESTING"] = node_id
     runner = CliRunner()
     result = runner.invoke(
         scripts.start,
-        ["--head", "--enable-resource-isolation"],
+        [
+            "--head",
+            "--enable-resource-isolation",
+            "--cgroup-path",
+            _BASE_CGROUP_PATH,
+            "--system-reserved-cpu",
+            system_reserved_cpu,
+            "--system-reserved-memory",
+            system_reserved_memory,
+            "--object-store-memory",
+            object_store_memory,
+        ],
     )
-    # TODO(#54703): Need to rewrite this test to check for side-effects on the cgroup
-    # hierarchy once the rest of the implementation is complete.
     assert result.exit_code == 0
+    resource_isolation_config.add_object_store_memory(object_store_memory)
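+    # Editorial note (assumption): this mirrors what `ray start` presumably
+    # computes internally, so that the expected memory.min checked below also
+    # accounts for the object store reservation.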
+    assert_cgroup_hierarchy_exists_for_node(node_id, resource_isolation_config)
+    runner.invoke(scripts.stop)
+    assert_cgroup_hierarchy_cleaned_up_for_node(node_id, resource_isolation_config)
 
 
 # The following tests will test integration of resource isolation
@@ -144,50 +158,31 @@ def test_ray_init_resource_isolation_disabled_by_default(ray_shutdown):
     assert not node.resource_isolation_config.is_enabled()
 
 
-def test_ray_init_with_resource_isolation_default_values(monkeypatch, ray_shutdown):
-    total_system_cpu = 10
-    monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu)
-    # The DEFAULT_CGROUP_PATH override is only relevant when running locally.
-    monkeypatch.setattr(ray_constants, "DEFAULT_CGROUP_PATH", _BASE_CGROUP_PATH)
-    ray.init(address="local", enable_resource_isolation=True)
-    node = ray._private.worker._global_node
-    assert node is not None
-    assert node.resource_isolation_config.is_enabled()
-
-
 def test_ray_init_with_resource_isolation_override_defaults(ray_shutdown):
-    cgroup_path = _BASE_CGROUP_PATH
     system_reserved_cpu = 1
-    system_reserved_memory = 1 * 10**9
-    object_store_memory = 1 * 10**9
+    system_reserved_memory = 1024**3
+    object_store_memory = 1024**3
     resource_isolation_config = ResourceIsolationConfig(
         enable_resource_isolation=True,
-        cgroup_path=cgroup_path,
+        cgroup_path=_BASE_CGROUP_PATH,
         system_reserved_cpu=system_reserved_cpu,
         system_reserved_memory=system_reserved_memory,
     )
     resource_isolation_config.add_object_store_memory(object_store_memory)
     ray.init(
         address="local",
         enable_resource_isolation=True,
-        _cgroup_path=cgroup_path,
+        _cgroup_path=_BASE_CGROUP_PATH,
         system_reserved_cpu=system_reserved_cpu,
         system_reserved_memory=system_reserved_memory,
         object_store_memory=object_store_memory,
     )
     node = ray._private.worker._global_node
-    # TODO(#54703): Need to rewrite this test to check for side-effects on the cgroup
-    # hierarchy once the rest of the implementation is complete.
     assert node is not None
-    assert node.resource_isolation_config.is_enabled()
-    assert (
-        node.resource_isolation_config.system_reserved_cpu_weight
-        == resource_isolation_config.system_reserved_cpu_weight
-    )
-    assert (
-        node.resource_isolation_config.system_reserved_memory
-        == resource_isolation_config.system_reserved_memory
-    )
+    node_id = node.node_id
+    assert_cgroup_hierarchy_exists_for_node(node_id, resource_isolation_config)
+    ray.shutdown()
+    assert_cgroup_hierarchy_cleaned_up_for_node(node_id, resource_isolation_config)
 
 
 if __name__ == "__main__":