11import os
22import platform
3+ import subprocess
34import sys
5+ import textwrap
46from pathlib import Path
57from typing import Set
68
1113import ray ._common .utils as utils
1214import ray ._private .ray_constants as ray_constants
1315import ray .scripts .scripts as scripts
16+ from ray ._common .test_utils import wait_for_condition
1417from ray ._private .resource_isolation_config import ResourceIsolationConfig
1518
1619# These tests are intended to run in CI inside a container.
2124# Run these commands locally before running the test suite:
2225#
2326# sudo mkdir -p /sys/fs/cgroup/resource_isolation_test
24- # echo "+cpu +memory" | sudo tee -a /sys/fs/cgroup/resource_isolation_test/cgroup.subtree_control
2527# sudo chown -R $(whoami):$(whoami) /sys/fs/cgroup/resource_isolation_test/
2628# sudo chmod -R u+rwx /sys/fs/cgroup/resource_isolation_test/
2729# echo $$ | sudo tee /sys/fs/cgroup/resource_isolation_test/cgroup.procs
@@ -337,6 +339,35 @@ def assert_cgroup_hierarchy_exists_for_node(
337339 )
338340
339341
def assert_process_in_not_moved_into_ray_cgroups(
    node_id: str,
    resource_isolation_config: ResourceIsolationConfig,
    pid: "int | str",
):
    """Asserts that a process was NOT moved into any of the node's ray cgroups.

    Reads the cgroup.procs file of the system/leaf, user/non-ray and
    user/workers cgroups under the node's cgroup subtree and fails if the
    given pid is listed in any of them.

    Args:
        node_id: used to construct the path of the cgroup subtree
        resource_isolation_config: used to construct the path of the cgroup
            subtree
        pid: pid of the process that must not appear in any of the cgroups.
            May be an int (e.g. subprocess.Popen.pid) or a str.

    Raises:
        AssertionError: if the pid is listed in one of the cgroup.procs files.
    """
    # cgroup.procs is read back as text, so normalize the pid to a string.
    # Comparing an int pid against the lines would never match and the
    # assertion would pass vacuously.
    expected_absent_pid = str(pid)
    base_cgroup_for_node = resource_isolation_config.cgroup_path
    node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
    cgroup_procs_file_paths = [
        node_cgroup / "system" / "leaf" / "cgroup.procs",
        node_cgroup / "user" / "non-ray" / "cgroup.procs",
        node_cgroup / "user" / "workers" / "cgroup.procs",
    ]
    for file_path in cgroup_procs_file_paths:
        with open(file_path, "r") as cgroup_procs_file:
            pids_in_cgroup = {line.strip() for line in cgroup_procs_file}
        assert (
            expected_absent_pid not in pids_in_cgroup
        ), f"Process {expected_absent_pid} was unexpectedly found in {file_path}."
369+
370+
340371def assert_system_processes_are_in_system_cgroup (
341372 node_id : str ,
342373 resource_isolation_config : ResourceIsolationConfig ,
@@ -407,6 +438,30 @@ def assert_cgroup_hierarchy_cleaned_up_for_node(
407438 ), f"Root cgroup node at { node_cgroup } was not deleted. Cgroup cleanup failed. You may have to manually delete the cgroup subtree."
408439
409440
def create_driver_in_internal_namespace():
    """
    Returns a driver process that is a part of the '_ray_internal_' namespace.
    If the driver is part of the '_ray_internal_' namespace, it will NOT
    be moved into the workers cgroup by the raylet when it registers.
    The Dashboard ServeHead and JobHead modules are drivers that are
    technically system processes and use the '_ray_internal_' namespace and therefore
    must not be moved into the workers cgroup on registration.

    The driver sleeps for an hour after connecting, so the caller is
    responsible for terminating the returned process.

    Returns:
        The subprocess.Popen handle of the spawned driver process.
    """

    driver_code = textwrap.dedent(
        """
        import ray
        import time
        ray.init(namespace='_ray_internal_')
        time.sleep(3600)
        """
    ).strip()

    # Use the interpreter that is running the tests rather than whatever
    # "python" resolves to on PATH, so the driver sees the same ray install.
    second_driver_proc = subprocess.Popen([sys.executable, "-c", driver_code])

    return second_driver_proc
463+
464+
410465# The following tests check for cgroup setup and cleanup with the
411466# ray cli.
412467def test_ray_cli_start_invalid_resource_isolation_config (cleanup_ray ):
@@ -465,13 +520,16 @@ def __init__(self):
465520 def get_pid (self ):
466521 return os .getpid ()
467522
523+ second_driver_proc = create_driver_in_internal_namespace ()
524+
468525 actor_refs = []
469526 for _ in range (num_cpus ):
470527 actor_refs .append (Actor .remote ())
471528 worker_pids = set ()
472529 worker_pids .add (str (os .getpid ()))
473530 for actor in actor_refs :
474531 worker_pids .add (str (ray .get (actor .get_pid .remote ())))
532+
475533 assert_system_processes_are_in_system_cgroup (
476534 node_id ,
477535 resource_isolation_config ,
@@ -480,8 +538,13 @@ def get_pid(self):
480538 assert_worker_processes_are_in_workers_cgroup (
481539 node_id , resource_isolation_config , worker_pids
482540 )
483- runner .invoke (scripts .stop )
541+ assert_process_in_not_moved_into_ray_cgroups (
542+ node_id , resource_isolation_config , second_driver_proc .pid
543+ )
484544
545+ second_driver_proc .kill ()
546+ wait_for_condition (lambda : second_driver_proc .wait (), timeout = 5 )
547+ runner .invoke (scripts .stop )
485548 assert_cgroup_hierarchy_cleaned_up_for_node (node_id , resource_isolation_config )
486549
487550
0 commit comments