diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 453987a2adfb..973c5d490042 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -67,6 +67,41 @@ def env_set_by_user(key): ID_SIZE = 28 +# The following constants are used to create default values for +# resource isolation when it is enabled. +# TODO(54703): Link to OSS documentation about the feature once it's available. +DEFAULT_CGROUP_PATH = "/sys/fs/cgroup" +# The default proportion of cpu cores to reserve for ray system processes. +DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float( + "RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05 +) +# The default minimum number of cpu cores to reserve for ray system processes. +# This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION < this value. +DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES = env_float( + "RAY_DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES", 1.0 +) +# The default maximum number of cpu cores to reserve for ray system processes. +# This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION > this value. +DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES = env_float( + "RAY_DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES", 3.0 +) +# The values for SYSTEM_RESERVED_MEMORY do not include the memory reserved +# for the object store. +# The default proportion of available memory to reserve for ray system processes. +DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_float( + "RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10 +) +# The default minimum number of bytes to reserve for ray system processes. +# This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION < this value. +DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES = env_integer( + "RAY_DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES", (500) * (1024**2) +) +# The default maximum number of bytes to reserve for ray system processes. 
+# This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION > this value. +DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES = env_integer( + "RAY_DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES", (10) * (1024**3) +) + # The default maximum number of bytes to allocate to the object store unless # overridden by the user. DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = env_integer( @@ -77,31 +112,6 @@ def env_set_by_user(key): "RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION", 0.3, ) - -# The following values are only used when resource isolation is enabled -# ===== The default number of bytes to reserve for ray system processes -DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES = env_integer( - "RAY_DEFAULT_DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES", (25) * (10**9) -) -# The default proportion available memory to reserve for ray system processes -DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_integer( - "RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10 -) -# The default number of cpu cores to reserve for ray system processes -DEFAULT_SYSTEM_RESERVED_CPU_CORES = env_float( - "RAY_DEFAULT_SYSTEM_RESERVED_CPU_CORES", 1.0 -) -# The default proportion of cpu cores to reserve for ray system processes -DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float( - "RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05 -) -# The smallest number of cores that ray system processes can be guaranteed -MINIMUM_SYSTEM_RESERVED_CPU_CORES = 0.5 -# The smallest number of bytes that ray system processes can be guaranteed -MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES = (100) * (10**6) -# The default path for cgroupv2 -DEFAULT_CGROUP_PATH = "/sys/fs/cgroup" - # The smallest cap on the memory used by the object store that we allow. 
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024 diff --git a/python/ray/_private/resource_isolation_config.py b/python/ray/_private/resource_isolation_config.py index 9d12fa817363..1044e5c63721 100644 --- a/python/ray/_private/resource_isolation_config.py +++ b/python/ray/_private/resource_isolation_config.py @@ -13,12 +13,10 @@ class ResourceIsolationConfig: - """Configuration for enabling resource isolation by reserving memory - and cpu for ray system processes through cgroupv2. - This class validates configuration for resource isolation by - enforcing types, correct combinations of values, applying default values, - and sanity checking cpu and memory reservations. - Also, converts system_reserved_cpu into cpu.weights for cgroupv2. + """Configuration for enabling resource isolation by reserving memory and cpu for ray system processes through cgroupv2. + + Validates configuration for resource isolation by enforcing types, correct combinations of values, applying default values, + and sanity checking cpu and memory reservations. Also, converts system_reserved_cpu into cpu.weights for cgroupv2. Raises: ValueError: On invalid inputs. @@ -34,6 +32,8 @@ class ResourceIsolationConfig: system_reserved_memory: The amount of memory in bytes reserved for ray system processes. Must be >= ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES and system_reserved_cpu + object_store_bytes < the total memory available. + + TODO(54703): Link documentation when it's available. """ def __init__( @@ -47,18 +47,22 @@ def __init__( self.cgroup_path = cgroup_path self.system_reserved_memory = system_reserved_memory self.system_pids = "" - # cgroupv2 cpu.weight calculated from system_reserved_cpu - # assumes ray uses all available cores. + + # cgroupv2 cpu.weight calculated from system_reserved_cpu assumes ray uses all available cores. 
self.system_reserved_cpu_weight: int = None - # TODO(irabbani): this is used to ensure - # that object_store_memory is not added twice - # to self._system_reserved_memory. This should - # be refactored in the future so that ResourceIsolationConfig - # can take object_store_memory as a constructor parameter - # and be constructed fully by the constructor. + + # TODO(irabbani): this is used to ensure that object_store_memory is not added twice + # to self._system_reserved_memory. This should be refactored in the future so that ResourceIsolationConfig + # can take object_store_memory as a constructor parameter and be constructed fully by the constructor. self._constructed = False if not enable_resource_isolation: + if self.cgroup_path: + raise ValueError( + "cgroup_path cannot be set when resource isolation is not enabled. " + "Set enable_resource_isolation to True if you're using ray.init or use the " + "--enable-resource-isolation flag if you're using the ray cli." + ) if system_reserved_cpu: raise ValueError( "system_reserved_cpu cannot be set when resource isolation is not enabled. " @@ -72,45 +76,49 @@ def __init__( "Set enable_resource_isolation to True if you're using ray.init or use the " "--enable-resource-isolation flag if you're using the ray cli." ) - if self.cgroup_path: - raise ValueError( - "cgroup_path cannot be set when resource isolation is not enabled. " - "Set enable_resource_isolation to True if you're using ray.init or use the " - "--enable-resource-isolation flag if you're using the ray cli." 
- ) return - # resource isolation is enabled self.system_reserved_cpu_weight = self._validate_and_get_system_reserved_cpu( system_reserved_cpu ) + self.system_reserved_memory = self._validate_and_get_system_reserved_memory( system_reserved_memory ) + self.cgroup_path = self._validate_and_get_cgroup_path(cgroup_path) def is_enabled(self) -> bool: return self._resource_isolation_enabled - def add_object_store_memory(self, object_store_memory: int): - """This is only supposed to be called once. It also cannot be - called if resouce isolation is not enabled. + def add_object_store_memory(self, object_store_memory_bytes: int): + """Adds object_store_memory to the memory reserved for system processes. + + Args: + object_store_memory_bytes: The number of bytes of object store memory to add to system_reserved_memory. + The resulting total must not exceed the total memory available on the system. + + Raises: + AssertionError: If called with resource isolation not enabled or called more than once for the same instance. + ValueError: If system_reserved_memory + object_store_memory_bytes is greater + than the total memory available on the system. + """ assert self.is_enabled(), ( "Cannot add object_store_memory to system_reserved_memory when " "enable_resource_isolation is False." ) assert not self._constructed, ( - "Cannot add object_store_memory to system_reserved_memory when" - "multiple times." + "Cannot call add_object_store_memory more than once with an instance " + "ResourceIsolationConfig. This is a bug in the ray code. 
" ) - self.system_reserved_memory += object_store_memory + self.system_reserved_memory += object_store_memory_bytes available_system_memory = ray._common.utils.get_system_memory() if self.system_reserved_memory > available_system_memory: raise ValueError( f"The total requested system_reserved_memory={self.system_reserved_memory}, calculated by " - " object_store_bytes + system_reserved_memory, is greater than the total memory " - f" available={available_system_memory}. Pick a smaller number of bytes for object_store_bytes " + "object_store_bytes + system_reserved_memory, is greater than the total memory " + f"available={available_system_memory}. Pick a smaller number of bytes for object_store_bytes " "or system_reserved_memory." ) self._constructed = True @@ -121,8 +129,7 @@ def add_system_pids(self, system_pids: str): @staticmethod def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str: - """Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not - specified. Checks the type of cgroup_path. + """Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not specified. Args: cgroup_path: The path for the cgroup the raylet should use to enforce @@ -150,9 +157,17 @@ def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str: def _validate_and_get_system_reserved_cpu( system_reserved_cpu: Optional[float], ) -> int: - """If system_reserved_cpu is not specified, returns the default value. Otherwise, - checks the type, makes sure that the value is in range, and converts it into cpu.weights - for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights for more information. + """If system_reserved_cpu is specified, validates it, otherwise returns the default value. + + Validation entails checking the type, ensuring that the value is in range, and converts it + into cpu.weights for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights + for more information. 
+ + If system_reserved_cpu is not specified, the default number of cores is clamped to + [DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES, DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES]. + + # TODO(54703): The errors from this method are user-facing and thus need + to be linked to the user-facing documentation once it's available. Args: system_reserved_cpu: The amount of cores reserved for ray system @@ -160,15 +175,29 @@ def _validate_and_get_system_reserved_cpu( and < the total number of cores available. Raises: - ValueError: If system_reserved_cpu is specified, but invalid. + ValueError: If system_reserved_cpu is specified, but invalid or if the system + does not have enough available cpus. + """ available_system_cpus = utils.get_num_cpus() + if available_system_cpus < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES: + raise ValueError( + f"The available number of cpu cores on this system {available_system_cpus} is less than " + f"the minimum amount that is required for ray's system processes. " + f"Pick a number of cpu cores greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}" + ) + if not system_reserved_cpu: - system_reserved_cpu = min( - ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_CORES, - ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION - * available_system_cpus, + system_reserved_cpu = float( + min( + max( + ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES, + ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION + * available_system_cpus, + ), + ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES, + ) ) if not ( @@ -183,12 +212,12 @@ def _validate_and_get_system_reserved_cpu( system_reserved_cpu = float(system_reserved_cpu) - if system_reserved_cpu < ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES: + if system_reserved_cpu < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES: raise ValueError( f"The requested system_reserved_cpu={system_reserved_cpu} is less than " f"the minimum number of cpus that can be used for resource isolation. 
" "Pick a number of cpu cores to reserve for ray system processes " - f"greater than or equal to {ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES}" + f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}" ) if system_reserved_cpu > available_system_cpus: @@ -200,8 +229,8 @@ def _validate_and_get_system_reserved_cpu( # Converting the number of cores the user defined into cpu.weights # This assumes that ray is allowed to use all available CPU - # cores and distribute them between system processes and - # application processes + # cores and distribute them between system, worker and + # user processes return int( (system_reserved_cpu / float(available_system_cpus)) * _CGROUP_CPU_MAX_WEIGHT @@ -227,28 +256,44 @@ def _validate_and_get_system_reserved_memory( """ available_system_memory = ray._common.utils.get_system_memory() + if ( + available_system_memory + < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES + ): + raise ValueError( + f"The available memory on this system {available_system_memory} is less than " + f"the minimum amount that is required for ray's system processes. " + f"Pick a number of bytes greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}" + ) + if not system_reserved_memory: system_reserved_memory = int( min( - ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES, - ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION - * available_system_memory, + max( + ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES, + ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION + * available_system_memory, + ), + ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES, ) ) if not isinstance(system_reserved_memory, int): raise ValueError( - f"Invalid value={system_reserved_memory} for system_reserved_memory. " + f"Invalid value {system_reserved_memory} for system_reserved_memory. 
" "Use an integer to represent the number bytes that need to be reserved for " "ray system processes to enable resource isolation." ) - if system_reserved_memory < ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES: + if ( + system_reserved_memory + < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES + ): raise ValueError( - f"The requested system_reserved_memory={system_reserved_memory} is less than " + f"The requested system_reserved_memory {system_reserved_memory} is less than " f"the minimum number of bytes that can be used for resource isolation. " "Pick a number of bytes to reserve for ray system processes " - f"greater than or equal to {ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES}" + f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}" ) if system_reserved_memory > available_system_memory: diff --git a/python/ray/tests/resource_isolation/test_resource_isolation_config.py b/python/ray/tests/resource_isolation/test_resource_isolation_config.py index f08b4d694b8d..d22c9368fa45 100644 --- a/python/ray/tests/resource_isolation/test_resource_isolation_config.py +++ b/python/ray/tests/resource_isolation/test_resource_isolation_config.py @@ -2,233 +2,308 @@ import pytest +from ray._common import utils as common_utils from ray._private import utils from ray._private.resource_isolation_config import ResourceIsolationConfig -def test_disabled_by_default(): +def test_resource_isolation_is_disabled_by_default(): resource_isolation_config = ResourceIsolationConfig() assert not resource_isolation_config.is_enabled() -def test_disabled_isolation_with_cgroup_path_raises_exception(): - with pytest.raises(ValueError): +def test_disabled_resource_isolation_with_overrides_raises_value_error(): + + with pytest.raises( + ValueError, + match="cgroup_path cannot be set when resource isolation is not enabled", + ): ResourceIsolationConfig( enable_resource_isolation=False, cgroup_path="/some/path" ) - -def 
test_disabled_isolation_with_reserved_cpu_raises_exception(): - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="system_reserved_cpu cannot be set when resource isolation is not enabled", + ): ResourceIsolationConfig(enable_resource_isolation=False, system_reserved_cpu=1) - -def test_disabled_isolation_with_reserved_memory_raises_exception(): - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="system_reserved_memory cannot be set when resource isolation is not enabled", + ): ResourceIsolationConfig( - enable_resource_isolation=False, system_reserved_memory=1 + enable_resource_isolation=False, system_reserved_memory=1024**3 ) -def test_enabled_invalid_cgroup_path_type(): - with pytest.raises(ValueError): +def test_enabled_resource_isolation_with_non_string_cgroup_path_raises_value_error(): + + with pytest.raises(ValueError, match="Invalid value.*for cgroup_path"): ResourceIsolationConfig(enable_resource_isolation=True, cgroup_path=1) + with pytest.raises(ValueError, match="Invalid value.*for cgroup_path"): + ResourceIsolationConfig(enable_resource_isolation=True, cgroup_path=1.0) -def test_enabled_invalid_reserved_cpu_type(): - with pytest.raises(ValueError): - ResourceIsolationConfig(enable_resource_isolation=True, system_reserved_cpu="1") +def test_enabled_resource_isolation_with_non_number_reserved_cpu_raises_value_error(): -def test_enabled_invalid_reserved_memory_type(): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid value.*for system_reserved_cpu."): ResourceIsolationConfig(enable_resource_isolation=True, system_reserved_cpu="1") -def test_enabled_default_config_proportions(monkeypatch): - object_store_memory = 10 * 10**9 - total_system_memory = 128 * 10**9 - total_system_cpu = 32 +def test_enabled_resource_isolation_with_non_number_reserved_memory_raises_value_error(): + + with pytest.raises(ValueError, match="Invalid value.*for system_reserved_memory."): + 
ResourceIsolationConfig( + enable_resource_isolation=True, system_reserved_memory="1" + ) + + +def test_enabled_default_config_with_insufficient_cpu_and_memory_raises_value_error( + monkeypatch, +): + # The following values in ray_constants define the minimum requirements for resource isolation + # 1) DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES + # 2) DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES + # NOTE: if you change the DEFAULT_MIN_SYSTEM_* constants, you may need to modify this test. + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 0.5) + with pytest.raises( + ValueError, match="available number of cpu cores.*less than the minimum" + ): + ResourceIsolationConfig(enable_resource_isolation=True) + + monkeypatch.undo() + monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, - ) - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) - resource_isolation_config = ResourceIsolationConfig(enable_resource_isolation=True) - resource_isolation_config.add_object_store_memory(object_store_memory) - # expect the default to be the min(128 * 0.10, 25G) + object_store_memory - expected_reserved_memory = 22800000000 - # expect the default to be the min(32 * 0.05, 1)/32 * 10000 - expected_reserved_cpu_weight = 312 - assert resource_isolation_config.system_reserved_memory == expected_reserved_memory - assert ( - resource_isolation_config.system_reserved_cpu_weight - == expected_reserved_cpu_weight - ) - - -def test_enabled_default_config_values(monkeypatch): - object_store_memory = 10 * 10**9 - total_system_memory = 500 * 10**9 - total_system_cpu = 64 + common_utils, "get_system_memory", lambda *args, **kwargs: 400 * (1024**2) + ) + with pytest.raises(ValueError, match="available memory.*less than the minimum"): + ResourceIsolationConfig(enable_resource_isolation=True) + + +def test_enabled_resource_isolation_with_default_config_picks_min_values(monkeypatch): + # The following values 
in ray_constants define the minimum requirements for resource isolation + # 1) DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES + # 2) DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES + # NOTE: if you change the DEFAULT_MIN_SYSTEM_* constants, you may need to modify this test. + # if the total number of cpus is between [1,19] the system cgroup will get a weight that is equal to 1 cpu core. + # if the total amount of memory is between [0.5GB, 4.8GB] the system cgroup will get 0.5GB + object store memory. + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 1) monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, - ) - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) - resource_isolation_config = ResourceIsolationConfig(enable_resource_isolation=True) - resource_isolation_config.add_object_store_memory(object_store_memory) - # expect the default to be the min(500 * 0.10, 25G) + object_store_memory - expected_reserved_memory = 35000000000 - # expect the default to be the min(64 * 0.05, 1)/64 * 10000 - expected_reserved_cpu_weight = 156 - assert resource_isolation_config.system_reserved_memory == expected_reserved_memory - assert ( - resource_isolation_config.system_reserved_cpu_weight - == expected_reserved_cpu_weight - ) - - -def test_enabled_reserved_cpu_default_memory(monkeypatch): - object_store_memory = 10 * 10**9 - total_system_memory = 128 * 10**9 - total_system_cpu = 32 - system_reserved_cpu = 5 + common_utils, "get_system_memory", lambda *args, **kwargs: 0.5 * (1024**3) + ) + config = ResourceIsolationConfig(enable_resource_isolation=True) + assert config.system_reserved_cpu_weight == 10000 + assert config.system_reserved_memory == 500 * (1024**2) + + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 19) monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, - ) - monkeypatch.setattr(utils, "get_num_cpus", 
lambda *args, **kwargs: total_system_cpu) - resource_isolation_config = ResourceIsolationConfig( - enable_resource_isolation=True, system_reserved_cpu=system_reserved_cpu - ) - resource_isolation_config.add_object_store_memory(object_store_memory) - # expect the default to be the min(128 * 0.10, 25G) + object_store_memory - expected_reserved_memory = 22800000000 - # expect the default to be the 5/32 * 10000 - expected_reserved_cpu_weight = 1562 - assert resource_isolation_config.system_reserved_memory == expected_reserved_memory - assert ( - resource_isolation_config.system_reserved_cpu_weight - == expected_reserved_cpu_weight - ) - - -def test_enabled_reserved_memory_default_cpu(monkeypatch): - object_store_memory = 10 * 10**9 - total_system_memory = 128 * 10**9 - total_system_cpu = 32 - system_reserved_memory = 15 * 10**9 + common_utils, "get_system_memory", lambda *args, **kwargs: 4.8 * (1024**3) + ) + config = ResourceIsolationConfig(enable_resource_isolation=True) + assert config.system_reserved_cpu_weight == 526 + assert config.system_reserved_memory == 500 * (1024**2) + + +def test_enabled_resource_isolation_with_default_config_values_scale_with_system( + monkeypatch, +): + # The following values in ray_constants define the default proportion for resource isolation + # 1) DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION + # 2) DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION + # NOTE: if you change the DEFAULT_SYSTEM_RESERVED_* constants, you may need to modify this test. + # if the number of cpus on the system is [20,60] the reserved cpu cores will scale proportionately. + # if the amount of memory on the system is [5GB, 100GB] the reserved system memory will scale proportionately. 
+ monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 20) monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, - ) - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) - resource_isolation_config = ResourceIsolationConfig( - enable_resource_isolation=True, system_reserved_memory=system_reserved_memory - ) - resource_isolation_config.add_object_store_memory(object_store_memory) - # expect the default to be the min(128 * 0.10, 25G) + object_store_memory - expected_reserved_memory = system_reserved_memory + object_store_memory - # expect the default to be the min(32 * 0.05, 1)/32 * 1000 - expected_reserved_cpu_weight = 312 - assert resource_isolation_config.system_reserved_memory == expected_reserved_memory - assert ( - resource_isolation_config.system_reserved_cpu_weight - == expected_reserved_cpu_weight - ) - - -def test_enabled_override_all_default_values(monkeypatch): - object_store_memory = 10 * 10**9 - total_system_memory = 128 * 10**9 - system_reserved_memory = 15 * 10**9 - total_system_cpu = 32 - system_reserved_cpu = 5 - cgroup_path = "/sys/fs/cgroup/subcgroup" + common_utils, "get_system_memory", lambda *args, **kwargs: 5 * (1024**3) + ) + config = ResourceIsolationConfig(enable_resource_isolation=True) + assert config.system_reserved_cpu_weight == 500 + assert config.system_reserved_memory == 536870912 + + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 59) monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, + common_utils, "get_system_memory", lambda *args, **kwargs: 99 * (1024**3) ) - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) - resource_isolation_config = ResourceIsolationConfig( - enable_resource_isolation=True, - cgroup_path=cgroup_path, - system_reserved_cpu=system_reserved_cpu, - system_reserved_memory=system_reserved_memory, - ) - 
resource_isolation_config.add_object_store_memory(object_store_memory) - expected_reserved_memory = 25000000000 - expected_reserved_cpu_weight = 1562 - assert resource_isolation_config.system_reserved_memory == expected_reserved_memory - assert ( - resource_isolation_config.system_reserved_cpu_weight - == expected_reserved_cpu_weight - ) - assert resource_isolation_config.cgroup_path == cgroup_path - - -def test_enabled_reserved_cpu_exceeds_available_cpu_raises_exception(monkeypatch): - total_system_cpu = 32 - system_reserved_cpu = 33 - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) - with pytest.raises(ValueError): + config = ResourceIsolationConfig(enable_resource_isolation=True) + assert config.system_reserved_cpu_weight == 500 + assert config.system_reserved_memory == 10630044057 + + +def test_enabled_resource_isolation_with_default_config_picks_max_values(monkeypatch): + # The following values in ray_constants define the max reserved values for resource isolation + # 1) DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES + # 2) DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES + # NOTE: if you change the DEFAULT_MAX_SYSTEM* constants, you may need to modify this test. + # if the number of cpus on the system >= 60 the reserved cpu cores will be DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES. + # if the amount of memory on the system >= 100GB the reserved memory will be DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES. 
+ monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 61) + monkeypatch.setattr( + common_utils, "get_system_memory", lambda *args, **kwargs: 100 * (1024**3) + ) + config = ResourceIsolationConfig(enable_resource_isolation=True) + assert config.system_reserved_cpu_weight == 491 + assert config.system_reserved_memory == 10 * (1024**3) + + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 128) + monkeypatch.setattr( + common_utils, "get_system_memory", lambda *args, **kwargs: 500 * (1024**3) + ) + config = ResourceIsolationConfig(enable_resource_isolation=True) + assert config.system_reserved_cpu_weight == 234 + assert config.system_reserved_memory == 10 * (1024**3) + + +def test_enabled_with_resource_overrides_less_than_minimum_defaults_raise_value_error(): + # The following values in ray_constants define the min values needed to run ray with resource isolation. + # 1) DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES + # 2) DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES + # NOTE: if you change the DEFAULT_MIN_SYSTEM* constants, you may need to modify this test. 
+ with pytest.raises( + ValueError, + match="The requested system_reserved_cpu=0.5 is less than the minimum number of cpus that can be used for resource isolation.", + ): + ResourceIsolationConfig(enable_resource_isolation=True, system_reserved_cpu=0.5) + + with pytest.raises( + ValueError, + match="The requested system_reserved_memory 4194304 is less than the minimum number of bytes that can be used for resource isolation.", + ): ResourceIsolationConfig( - enable_resource_isolation=True, system_reserved_cpu=system_reserved_cpu + enable_resource_isolation=True, system_reserved_memory=4 * (1024**2) ) -def test_enabled_reserved_cpu_less_than_minimum_raises_exception(monkeypatch): - system_reserved_cpu = 0.1 - with pytest.raises(ValueError): +def test_enabled_with_resource_overrides_greater_than_available_resources_raise_value_error( + monkeypatch, +): + # The following values in ray_constants define the maximum reserved values to run ray with resource isolation. + # 1) DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES + # 2) DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES + # NOTE: if you change the DEFAULT_MAX_SYSTEM* constants, you may need to modify this test. 
+ monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 32) + with pytest.raises( + ValueError, + match="The requested system_reserved_cpu=32.1 is greater than the number of cpus available=32", + ): ResourceIsolationConfig( - enable_resource_isolation=True, system_reserved_cpu=system_reserved_cpu + enable_resource_isolation=True, system_reserved_cpu=32.1 ) - -def test_enabled_reserved_memory_exceeds_available_memory_raises_exception(monkeypatch): - total_system_cpu = 32 - total_system_memory = 128 * 10**9 - system_reserved_memory = (128 * 10**9) + 1 - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, + common_utils, "get_system_memory", lambda *args, **kwargs: 10 * (1024**3) ) - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="The total requested system_reserved_memory=11811160064 is greater than the amount of memory available=10737418240", + ): ResourceIsolationConfig( - enable_resource_isolation=True, - system_reserved_memory=system_reserved_memory, + enable_resource_isolation=True, system_reserved_memory=11 * (1024**3) ) -def test_enabled_total_system_reserved_memory_exceeds_available_memory_raises_exception( +def test_add_object_store_memory_called_more_than_once_raises_value_error(monkeypatch): + # Monkeypatch to make sure the underlying system's resources don't cause the test to fail. + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 32) + monkeypatch.setattr( + common_utils, "get_system_memory", lambda *args, **kwargs: 128 * (1024**3) + ) + config: ResourceIsolationConfig = ResourceIsolationConfig( + enable_resource_isolation=True + ) + config.add_object_store_memory(5 * (1024**3)) + with pytest.raises( + AssertionError, + match="Cannot call add_object_store_memory more than once with an instance ResourceIsolationConfig. 
This is a bug in the ray code", + ): + config.add_object_store_memory(5 * (1024**3)) + + +def test_add_object_store_memory_plus_system_reserved_memory_gt_available_memory_raises_value_error( monkeypatch, ): - total_system_cpu = 32 - object_store_memory = 10 * 10**9 - total_system_memory = 128 * 10**9 - # combined with object store, it exceeds available memory - system_reserved_memory = 119 * 10**9 - monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: total_system_cpu) + # Monkeypatch to make sure the underlying system's resources don't cause the test to fail. + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 16) + # 32GB of total memory available on the system. monkeypatch.setattr( - "ray._common.utils.get_system_memory", - lambda *args, **kwargs: total_system_memory, + common_utils, "get_system_memory", lambda *args, **kwargs: 32 * (1024**3) ) - resource_isolation_config = ResourceIsolationConfig( - enable_resource_isolation=True, system_reserved_memory=system_reserved_memory + # 16GB reserved for system processes. + config: ResourceIsolationConfig = ResourceIsolationConfig( + enable_resource_isolation=True, system_reserved_memory=16 * (1024**3) ) - with pytest.raises(ValueError): - resource_isolation_config.add_object_store_memory(object_store_memory) + # 16GB + 1 byte reserved for object store. 
+ with pytest.raises( + ValueError, + match=r"The total requested system_reserved_memory=34359738369.*is greater than the total memory available=34359738368", + ): + config.add_object_store_memory(16 * (1024**3) + 1) -def test_enabled_reserved_memory_less_than_minimum_raises_exception(monkeypatch): - system_reserved_memory = 1 * 10**3 - with pytest.raises(ValueError): - ResourceIsolationConfig( - enable_resource_isolation=True, - system_reserved_memory=system_reserved_memory, - ) +def test_resource_isolation_enabled_with_partial_resource_overrides_and_defaults_happy_path( + monkeypatch, +): + # This is a happy path test where one setting is overridden at a time and the others use their defaults. + # NOTE: if you change DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION or DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION, this test may fail. + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 32) + monkeypatch.setattr( + common_utils, "get_system_memory", lambda *args, **kwargs: 64 * (1024**3) + ) + + # Overriding cgroup_path while using default system_reserved_cpu and system_reserved_memory + override_cgroup_path_config: ResourceIsolationConfig = ResourceIsolationConfig( + enable_resource_isolation=True, cgroup_path="/sys/fs/cgroup/ray" + ) + assert override_cgroup_path_config.cgroup_path == "/sys/fs/cgroup/ray" + # int((32 cpus * 0.05 (default)) / 32 cpus * 10000) = 500 + assert override_cgroup_path_config.system_reserved_cpu_weight == 500 + # 64GB * 0.10 = 6.4GB + assert override_cgroup_path_config.system_reserved_memory == 6871947673 + + # Overriding system_reserved_cpu while using default cgroup_path and system_reserved_memory + override_cpu_config: ResourceIsolationConfig = ResourceIsolationConfig( + enable_resource_isolation=True, system_reserved_cpu=1.5 + ) + assert override_cpu_config.system_reserved_cpu_weight == 468 + # defaults to /sys/fs/cgroup + assert override_cpu_config.cgroup_path == "/sys/fs/cgroup" + # 64GB * 0.10 = 6.4GB + assert override_cpu_config.system_reserved_memory == 6871947673 + + # Overriding system_reserved_memory 
while using default cgroup_path and system_reserved_cpu + override_memory_config: ResourceIsolationConfig = ResourceIsolationConfig( + enable_resource_isolation=True, system_reserved_memory=5 * (1024**3) + ) + assert override_memory_config.system_reserved_memory == 5368709120 + # defaults to /sys/fs/cgroup + assert override_memory_config.cgroup_path == "/sys/fs/cgroup" + # int((32 cpus * 0.05 (default)) / 32 cpus * 10000) = 500 + assert override_memory_config.system_reserved_cpu_weight == 500 + + +def test_resource_isolation_enabled_with_full_overrides_happy_path(monkeypatch): + monkeypatch.setattr(utils, "get_num_cpus", lambda *args, **kwargs: 32) + monkeypatch.setattr( + common_utils, "get_system_memory", lambda *args, **kwargs: 128 * (1024**3) + ) + # The system_reserved_cpu is deliberately > the maximum default. + # The system_reserved_memory is deliberately > the maximum default. + override_config: ResourceIsolationConfig = ResourceIsolationConfig( + enable_resource_isolation=True, + cgroup_path="/sys/fs/cgroup/ray", + system_reserved_cpu=5.0, + system_reserved_memory=15 * 1024**3, + ) + # Add 38G of object store memory on top of the system reserved memory. + override_config.add_object_store_memory(38 * (1024**3)) + + assert override_config.cgroup_path == "/sys/fs/cgroup/ray" + # int(5/32 * 10000) = 1562 + assert override_config.system_reserved_cpu_weight == 1562 + # system_reserved_memory + object_store_memory = 15G + 38G = 53G + assert override_config.system_reserved_memory == 53 * (1024**3) if __name__ == "__main__":