Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
599f968
[wip] [core] (cgroups 14/n) Clean up bazel targets and expose just the
israbbani Sep 30, 2025
77f6b64
[core] Cleaning up Cgroup related bazel targets. CgroupManagerInteface
israbbani Oct 6, 2025
9fd1160
[core] (cgroups 14/n) Clean up bazel targets and enable cross-platform
israbbani Oct 6, 2025
a5f4b5a
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 6, 2025
762b5cf
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 6, 2025
8d07f6f
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 6, 2025
b92677e
Forgot to use clang locally
israbbani Oct 7, 2025
a870d5a
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 7, 2025
e34f19b
Unused imports
israbbani Oct 7, 2025
55f1ec8
unused includes breaking the build
israbbani Oct 7, 2025
4c7545e
fixing the macos build
israbbani Oct 7, 2025
aef6bd8
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 7, 2025
bac50d3
[core] (cgroups 15/n) Changing the cgroup heirarchy to have three
israbbani Oct 7, 2025
50b2d14
Merge branch 'irabbani/cgroups-14' into irabbani/cgroups-15
israbbani Oct 7, 2025
59366ce
move operators for NoopCgroupManager
israbbani Oct 7, 2025
44ab09e
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 7, 2025
0c8d8e3
Update src/ray/common/cgroup2/cgroup_manager_factory.h
israbbani Oct 7, 2025
6dc39ad
feedback
israbbani Oct 7, 2025
60d77bb
up
israbbani Oct 8, 2025
bfd2482
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 9, 2025
59a0bef
Merge branch 'irabbani/cgroups-14' into irabbani/cgroups-15
israbbani Oct 9, 2025
ee024ea
Different cgroup hierarchy.
israbbani Oct 10, 2025
bf390de
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 10, 2025
028f3d2
Merge branch 'irabbani/cgroups-15' of github.com:ray-project/ray into…
israbbani Oct 10, 2025
cb34c9b
typo
israbbani Oct 10, 2025
fb7d1ac
one more typo
israbbani Oct 10, 2025
8b443f5
one more
israbbani Oct 10, 2025
505c4d5
[core] (cgroups 16/n) Changing default values for the system cgroup to
israbbani Oct 11, 2025
eddb0b2
Cleaning up docs and log lines
israbbani Oct 11, 2025
4550bae
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 11, 2025
30770e0
Merge branch 'irabbani/cgroups-15' of github.com:ray-project/ray into…
israbbani Oct 11, 2025
117248d
[core] (cgroups 16/n) Updating the algorithm for determining the default
israbbani Oct 12, 2025
c4884ee
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 12, 2025
35ba7be
Merge branch 'irabbani/cgroups-15' into irabbani/cgroups-16
israbbani Oct 12, 2025
be2f048
typos
israbbani Oct 12, 2025
d49a1e0
Merge branch 'irabbani/cgroups-16' of github.com:ray-project/ray into…
israbbani Oct 12, 2025
a111fd3
another typo
israbbani Oct 12, 2025
8a43394
another one
israbbani Oct 12, 2025
071286f
Merge branch 'master' into irabbani/cgroups-16
edoakes Oct 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 19 additions & 16 deletions python/ray/_private/ray_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,28 +79,31 @@ def env_set_by_user(key):
)

# The following values are only used when resource isolation is enabled
# ===== The default number of bytes to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
"RAY_DEFAULT_DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES", (25) * (10**9)
)
# The default proportion available memory to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_integer(
"RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10
)
# The default number of cpu cores to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_CPU_CORES = env_float(
DEFAULT_CGROUP_PATH = "/sys/fs/cgroup"
# The default minimum number of cpu cores to reserve for ray system processes.
DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES = env_float(
"RAY_DEFAULT_SYSTEM_RESERVED_CPU_CORES", 1.0
)
# The default maximum number of cpu cores to reserve for ray system processes.
# This is the upper clamp applied when system_reserved_cpu is not specified.
# It reads its own environment variable so that overriding the cpu proportion
# (RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION) does not also change this cap.
DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES = env_float(
    "RAY_DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES", 3.0
)
# The default proportion of cpu cores to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float(
"RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05
)
# The smallest number of cores that ray system processes can be guaranteed
MINIMUM_SYSTEM_RESERVED_CPU_CORES = 0.5
# The smallest number of bytes that ray system processes can be guaranteed
MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES = (100) * (10**6)
# The default path for cgroupv2
DEFAULT_CGROUP_PATH = "/sys/fs/cgroup"
# ===== The default number of bytes to reserve for ray system processes
# The smallest number of bytes that ray system processes can be guaranteed.
# Also used as the lower bound of the default reservation when
# system_reserved_memory is not specified. Default: 500 * 1024**2 bytes.
DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
"RAY_DEFAULT_MIN_DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES", (500) * (1024**2)
)
# The largest number of bytes reserved for ray system processes by default.
# Used as the upper bound of the default reservation when
# system_reserved_memory is not specified. Default: 10 * 1024**3 bytes.
DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
"RAY_DEFAULT_MAX_DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES", (10) * (1024**3)
)
# The default proportion of available memory to reserve for ray system processes.
# The value is a fraction (e.g. 0.10), so it is read as a float, matching
# DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION; an integer parser cannot accept
# fractional overrides.
DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_float(
    "RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10
)

# The smallest cap on the memory used by the object store that we allow.
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES
Expand Down
99 changes: 68 additions & 31 deletions python/ray/_private/resource_isolation_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def __init__(
self._constructed = False

if not enable_resource_isolation:
if self.cgroup_path:
raise ValueError(
"cgroup_path cannot be set when resource isolation is not enabled. "
"Set enable_resource_isolation to True if you're using ray.init or use the "
"--enable-resource-isolation flag if you're using the ray cli."
)
if system_reserved_cpu:
raise ValueError(
"system_reserved_cpu cannot be set when resource isolation is not enabled. "
Expand All @@ -72,12 +78,6 @@ def __init__(
"Set enable_resource_isolation to True if you're using ray.init or use the "
"--enable-resource-isolation flag if you're using the ray cli."
)
if self.cgroup_path:
raise ValueError(
"cgroup_path cannot be set when resource isolation is not enabled. "
"Set enable_resource_isolation to True if you're using ray.init or use the "
"--enable-resource-isolation flag if you're using the ray cli."
)
return

# resource isolation is enabled
Expand All @@ -101,16 +101,16 @@ def add_object_store_memory(self, object_store_memory: int):
"enable_resource_isolation is False."
)
assert not self._constructed, (
"Cannot add object_store_memory to system_reserved_memory when"
"multiple times."
"Cannot call add_object_store_memory more than once with an instance "
"ResourceIsolationConfig. This is a bug in the ray code. "
)
self.system_reserved_memory += object_store_memory
available_system_memory = ray._common.utils.get_system_memory()
if self.system_reserved_memory > available_system_memory:
raise ValueError(
f"The total requested system_reserved_memory={self.system_reserved_memory}, calculated by "
" object_store_bytes + system_reserved_memory, is greater than the total memory "
f" available={available_system_memory}. Pick a smaller number of bytes for object_store_bytes "
"object_store_bytes + system_reserved_memory, is greater than the total memory "
f"available={available_system_memory}. Pick a smaller number of bytes for object_store_bytes "
"or system_reserved_memory."
)
self._constructed = True
Expand All @@ -121,8 +121,7 @@ def add_system_pids(self, system_pids: str):

@staticmethod
def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str:
"""Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not
specified. Checks the type of cgroup_path.
"""Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not specified.

Args:
cgroup_path: The path for the cgroup the raylet should use to enforce
Expand Down Expand Up @@ -150,25 +149,47 @@ def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str:
def _validate_and_get_system_reserved_cpu(
system_reserved_cpu: Optional[float],
) -> int:
"""If system_reserved_cpu is not specified, returns the default value. Otherwise,
checks the type, makes sure that the value is in range, and converts it into cpu.weights
for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights for more information.
"""If system_reserved_cpu is specified, validates it, otherwise returns the default value.

Validation entails checking the type, ensuring that the value is in range, and converting it
into cpu.weights for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights
for more information.

If system_reserved_cpu is not specified, returns a default value between
[DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES, DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES].

# TODO(54703): The errors from this method are user-facing and thus need
to be linked to the user-facing documentation once it's available.

Args:
system_reserved_cpu: The amount of cores reserved for ray system
processes. Must be >= ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES
and < the total number of cores available.

Raises:
ValueError: If system_reserved_cpu is specified, but invalid.
ValueError: If system_reserved_cpu is specified, but invalid or if the system
does not have enough available cpus.

"""
available_system_cpus = utils.get_num_cpus()

if available_system_cpus < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES:
raise ValueError(
f"The available number of cpu cores on this system {available_system_cpus} is less than "
f"the minimum amount that is required for ray's system processes. "
f"Pick a number of cpu cores greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}"
)

if not system_reserved_cpu:
system_reserved_cpu = min(
ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_CORES,
ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
* available_system_cpus,
system_reserved_cpu = float(
min(
max(
ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES,
ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
* available_system_cpus,
),
ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES,
)
)

if not (
Expand All @@ -183,12 +204,12 @@ def _validate_and_get_system_reserved_cpu(

system_reserved_cpu = float(system_reserved_cpu)

if system_reserved_cpu < ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES:
if system_reserved_cpu < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES:
raise ValueError(
f"The requested system_reserved_cpu={system_reserved_cpu} is less than "
f"the minimum number of cpus that can be used for resource isolation. "
"Pick a number of cpu cores to reserve for ray system processes "
f"greater than or equal to {ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES}"
f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}"
)

if system_reserved_cpu > available_system_cpus:
Expand All @@ -200,8 +221,8 @@ def _validate_and_get_system_reserved_cpu(

# Converting the number of cores the user defined into cpu.weights
# This assumes that ray is allowed to use all available CPU
# cores and distribute them between system processes and
# application processes
# cores and distribute them between system, worker and
# user processes
return int(
(system_reserved_cpu / float(available_system_cpus))
* _CGROUP_CPU_MAX_WEIGHT
Expand All @@ -227,28 +248,44 @@ def _validate_and_get_system_reserved_memory(
"""
available_system_memory = ray._common.utils.get_system_memory()

if (
available_system_memory
< ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
):
raise ValueError(
f"The available memory on this system {available_system_memory} is less than "
f"the minimum amount that is required for ray's system processes. "
f"Pick a number of bytes greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}"
)

if not system_reserved_memory:
system_reserved_memory = int(
min(
ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES,
ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
* available_system_memory,
max(
ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES,
ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
* available_system_memory,
),
ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES,
)
)

if not isinstance(system_reserved_memory, int):
raise ValueError(
f"Invalid value={system_reserved_memory} for system_reserved_memory. "
f"Invalid value {system_reserved_memory} for system_reserved_memory. "
"Use an integer to represent the number bytes that need to be reserved for "
"ray system processes to enable resource isolation."
)

if system_reserved_memory < ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES:
if (
system_reserved_memory
< ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
):
raise ValueError(
f"The requested system_reserved_memory={system_reserved_memory} is less than "
f"The requested system_reserved_memory {system_reserved_memory} is less than "
f"the minimum number of bytes that can be used for resource isolation. "
"Pick a number of bytes to reserve for ray system processes "
f"greater than or equal to {ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES}"
f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}"
)

if system_reserved_memory > available_system_memory:
Expand Down
Loading