1313
1414
1515class ResourceIsolationConfig :
16- """Configuration for enabling resource isolation by reserving memory
17- and cpu for ray system processes through cgroupv2.
18- This class validates configuration for resource isolation by
19- enforcing types, correct combinations of values, applying default values,
20- and sanity checking cpu and memory reservations.
21- Also, converts system_reserved_cpu into cpu.weights for cgroupv2.
16+ """Configuration for enabling resource isolation by reserving memory and cpu for ray system processes through cgroupv2.
17+
18+ Validates configuration for resource isolation by enforcing types, correct combinations of values, applying default values,
19+ and sanity checking cpu and memory reservations. Also, converts system_reserved_cpu into cpu.weights for cgroupv2.
2220
2321 Raises:
2422 ValueError: On invalid inputs.
@@ -34,6 +32,8 @@ class ResourceIsolationConfig:
3432 system_reserved_memory: The amount of memory in bytes reserved
3533 for ray system processes. Must be >= ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
3634 and system_reserved_memory + object_store_bytes < the total memory available.
35+
36+ TODO(54703): Link documentation when it's available.
3737 """
3838
3939 def __init__ (
@@ -47,18 +47,22 @@ def __init__(
4747 self .cgroup_path = cgroup_path
4848 self .system_reserved_memory = system_reserved_memory
4949 self .system_pids = ""
50- # cgroupv2 cpu.weight calculated from system_reserved_cpu
51- # assumes ray uses all available cores.
50+
51+ # cgroupv2 cpu.weight calculated from system_reserved_cpu assumes ray uses all available cores.
5252 self .system_reserved_cpu_weight : int = None
53- # TODO(irabbani): this is used to ensure
54- # that object_store_memory is not added twice
55- # to self._system_reserved_memory. This should
56- # be refactored in the future so that ResourceIsolationConfig
57- # can take object_store_memory as a constructor parameter
58- # and be constructed fully by the constructor.
53+
54+ # TODO(irabbani): this is used to ensure that object_store_memory is not added twice
55+ # to self._system_reserved_memory. This should be refactored in the future so that ResourceIsolationConfig
56+ # can take object_store_memory as a constructor parameter and be constructed fully by the constructor.
5957 self ._constructed = False
6058
6159 if not enable_resource_isolation :
60+ if self .cgroup_path :
61+ raise ValueError (
62+ "cgroup_path cannot be set when resource isolation is not enabled. "
63+ "Set enable_resource_isolation to True if you're using ray.init or use the "
64+ "--enable-resource-isolation flag if you're using the ray cli."
65+ )
6266 if system_reserved_cpu :
6367 raise ValueError (
6468 "system_reserved_cpu cannot be set when resource isolation is not enabled. "
@@ -72,45 +76,49 @@ def __init__(
7276 "Set enable_resource_isolation to True if you're using ray.init or use the "
7377 "--enable-resource-isolation flag if you're using the ray cli."
7478 )
75- if self .cgroup_path :
76- raise ValueError (
77- "cgroup_path cannot be set when resource isolation is not enabled. "
78- "Set enable_resource_isolation to True if you're using ray.init or use the "
79- "--enable-resource-isolation flag if you're using the ray cli."
80- )
8179 return
8280
83- # resource isolation is enabled
8481 self .system_reserved_cpu_weight = self ._validate_and_get_system_reserved_cpu (
8582 system_reserved_cpu
8683 )
84+
8785 self .system_reserved_memory = self ._validate_and_get_system_reserved_memory (
8886 system_reserved_memory
8987 )
88+
9089 self .cgroup_path = self ._validate_and_get_cgroup_path (cgroup_path )
9190
9291 def is_enabled (self ) -> bool :
9392 return self ._resource_isolation_enabled
9493
95- def add_object_store_memory (self , object_store_memory : int ):
96- """This is only supposed to be called once. It also cannot be
97- called if resouce isolation is not enabled.
94+ def add_object_store_memory (self , object_store_memory_bytes : int ):
95+ """Adds object_store_memory to the memory reserved for system processes.
96+
97+ Args:
98+ object_store_memory_bytes: The number of bytes of object store memory to add to system_reserved_memory.
99+ The resulting sum must not exceed the total memory available on the system.
100+
101+ Raises:
102+ AssertionError: If called with resource isolation not enabled or called more than once for the same instance.
103+ ValueError: If system_reserved_memory + object_store_memory_bytes is greater
104+ than the total memory available on the system.
105+
98106 """
99107 assert self .is_enabled (), (
100108 "Cannot add object_store_memory to system_reserved_memory when "
101109 "enable_resource_isolation is False."
102110 )
103111 assert not self ._constructed , (
104- "Cannot add object_store_memory to system_reserved_memory when "
105- "multiple times. "
112+ "Cannot call add_object_store_memory more than once with an instance "
113+ "ResourceIsolationConfig. This is a bug in the ray code. "
106114 )
107- self .system_reserved_memory += object_store_memory
115+ self .system_reserved_memory += object_store_memory_bytes
108116 available_system_memory = ray ._common .utils .get_system_memory ()
109117 if self .system_reserved_memory > available_system_memory :
110118 raise ValueError (
111119 f"The total requested system_reserved_memory={ self .system_reserved_memory } , calculated by "
112- " object_store_bytes + system_reserved_memory, is greater than the total memory "
113- f" available={ available_system_memory } . Pick a smaller number of bytes for object_store_bytes "
120+ "object_store_bytes + system_reserved_memory, is greater than the total memory "
121+ f"available={ available_system_memory } . Pick a smaller number of bytes for object_store_bytes "
114122 "or system_reserved_memory."
115123 )
116124 self ._constructed = True
@@ -121,8 +129,7 @@ def add_system_pids(self, system_pids: str):
121129
122130 @staticmethod
123131 def _validate_and_get_cgroup_path (cgroup_path : Optional [str ]) -> str :
124- """Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not
125- specified. Checks the type of cgroup_path.
132+ """Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not specified.
126133
127134 Args:
128135 cgroup_path: The path for the cgroup the raylet should use to enforce
@@ -150,25 +157,47 @@ def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str:
150157 def _validate_and_get_system_reserved_cpu (
151158 system_reserved_cpu : Optional [float ],
152159 ) -> int :
153- """If system_reserved_cpu is not specified, returns the default value. Otherwise,
154- checks the type, makes sure that the value is in range, and converts it into cpu.weights
155- for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights for more information.
160+ """If system_reserved_cpu is specified, validates it, otherwise returns the default value.
161+
162+ Validation entails checking the type, ensuring that the value is in range, and converting it
163+ into cpu.weights for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights
164+ for more information.
165+
166+ If system_reserved_cpu is not specified, returns a default value between
167+ [DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES, DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES].
168+
169+ # TODO(54703): The errors from this method are user-facing and thus need
170+ to be linked to the user-facing documentation once it's available.
156171
157172 Args:
158173 system_reserved_cpu: The amount of cores reserved for ray system
159174 processes. Must be >= ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES
160175 and <= the total number of cores available.
161176
162177 Raises:
163- ValueError: If system_reserved_cpu is specified, but invalid.
178+ ValueError: If system_reserved_cpu is specified, but invalid or if the system
179+ does not have enough available cpus.
180+
164181 """
165182 available_system_cpus = utils .get_num_cpus ()
166183
184+ if available_system_cpus < ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES :
185+ raise ValueError (
186+ f"The available number of cpu cores on this system { available_system_cpus } is less than "
187+ f"the minimum amount that is required for ray's system processes. "
188+ f"Pick a number of cpu cores greater than or equal to { ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES } "
189+ )
190+
167191 if not system_reserved_cpu :
168- system_reserved_cpu = min (
169- ray_constants .DEFAULT_SYSTEM_RESERVED_CPU_CORES ,
170- ray_constants .DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
171- * available_system_cpus ,
192+ system_reserved_cpu = float (
193+ min (
194+ max (
195+ ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES ,
196+ ray_constants .DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
197+ * available_system_cpus ,
198+ ),
199+ ray_constants .DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES ,
200+ )
172201 )
173202
174203 if not (
@@ -183,12 +212,12 @@ def _validate_and_get_system_reserved_cpu(
183212
184213 system_reserved_cpu = float (system_reserved_cpu )
185214
186- if system_reserved_cpu < ray_constants .MINIMUM_SYSTEM_RESERVED_CPU_CORES :
215+ if system_reserved_cpu < ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES :
187216 raise ValueError (
188217 f"The requested system_reserved_cpu={ system_reserved_cpu } is less than "
189218 f"the minimum number of cpus that can be used for resource isolation. "
190219 "Pick a number of cpu cores to reserve for ray system processes "
191- f"greater than or equal to { ray_constants .MINIMUM_SYSTEM_RESERVED_CPU_CORES } "
220+ f"greater than or equal to { ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES } "
192221 )
193222
194223 if system_reserved_cpu > available_system_cpus :
@@ -200,8 +229,8 @@ def _validate_and_get_system_reserved_cpu(
200229
201230 # Converting the number of cores the user defined into cpu.weights
202231 # This assumes that ray is allowed to use all available CPU
203- # cores and distribute them between system processes and
204- # application processes
232+ # cores and distribute them between system, worker and
233+ # user processes
205234 return int (
206235 (system_reserved_cpu / float (available_system_cpus ))
207236 * _CGROUP_CPU_MAX_WEIGHT
@@ -227,28 +256,44 @@ def _validate_and_get_system_reserved_memory(
227256 """
228257 available_system_memory = ray ._common .utils .get_system_memory ()
229258
259+ if (
260+ available_system_memory
261+ < ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
262+ ):
263+ raise ValueError (
264+ f"The available memory on this system { available_system_memory } is less than "
265+ f"the minimum amount that is required for ray's system processes. "
266+ f"Pick a number of bytes greater than or equal to { ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES } "
267+ )
268+
230269 if not system_reserved_memory :
231270 system_reserved_memory = int (
232271 min (
233- ray_constants .DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES ,
234- ray_constants .DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
235- * available_system_memory ,
272+ max (
273+ ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES ,
274+ ray_constants .DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
275+ * available_system_memory ,
276+ ),
277+ ray_constants .DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES ,
236278 )
237279 )
238280
239281 if not isinstance (system_reserved_memory , int ):
240282 raise ValueError (
241- f"Invalid value= { system_reserved_memory } for system_reserved_memory. "
283+ f"Invalid value { system_reserved_memory } for system_reserved_memory. "
242284 "Use an integer to represent the number bytes that need to be reserved for "
243285 "ray system processes to enable resource isolation."
244286 )
245287
246- if system_reserved_memory < ray_constants .MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES :
288+ if (
289+ system_reserved_memory
290+ < ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
291+ ):
247292 raise ValueError (
248- f"The requested system_reserved_memory= { system_reserved_memory } is less than "
293+ f"The requested system_reserved_memory { system_reserved_memory } is less than "
249294 f"the minimum number of bytes that can be used for resource isolation. "
250295 "Pick a number of bytes to reserve for ray system processes "
251- f"greater than or equal to { ray_constants .MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES } "
296+ f"greater than or equal to { ray_constants .DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES } "
252297 )
253298
254299 if system_reserved_memory > available_system_memory :
0 commit comments