Skip to content

Commit 31e1638

Browse files
authored
[CLI] Improve ray status for placement groups (#18289)
1 parent 344f2d9 commit 31e1638

File tree

5 files changed

+143
-41
lines changed

5 files changed

+143
-41
lines changed

python/ray/autoscaler/_private/util.py

Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -418,59 +418,81 @@ def parse_placement_group_resource_str(
418418
placement_group_resource_str: str) -> Tuple[str, Optional[str]]:
419419
"""Parse placement group resource in the form of following 3 cases:
420420
{resource_name}_group_{bundle_id}_{group_name};
421+
-> This case is ignored as it is duplicated to the case below.
421422
{resource_name}_group_{group_name};
422423
{resource_name}
423424
424425
Returns:
425-
Tuple of (resource_name, placement_group_name). placement_group_name
426-
could be None if its not a placement group resource.
426+
Tuple of (resource_name, placement_group_name, is_countable_resource).
427+
placement_group_name could be None if its not a placement group
428+
resource. is_countable_resource is True if the resource
429+
doesn't contain bundle index. We shouldn't count resources
430+
with bundle index because it will
431+
have duplicated resource information as
432+
wildcard resources (resource name without bundle index).
427433
"""
428434
result = PLACEMENT_GROUP_RESOURCE_BUNDLED_PATTERN.match(
429435
placement_group_resource_str)
430436
if result:
431-
return (result.group(1), result.group(3))
437+
return (result.group(1), result.group(3), False)
432438
result = PLACEMENT_GROUP_RESOURCE_PATTERN.match(
433439
placement_group_resource_str)
434440
if result:
435-
return (result.group(1), result.group(2))
436-
return (placement_group_resource_str, None)
441+
return (result.group(1), result.group(2), True)
442+
return (placement_group_resource_str, None, True)
437443

438444

439445
def get_usage_report(lm_summary: LoadMetricsSummary) -> str:
440446
# first collect resources used in placement groups
441-
placement_group_resource_usage = collections.defaultdict(float)
447+
placement_group_resource_usage = {}
448+
placement_group_resource_total = collections.defaultdict(float)
442449
for resource, (used, total) in lm_summary.usage.items():
443-
(pg_resource_name,
444-
pg_name) = parse_placement_group_resource_str(resource)
450+
(pg_resource_name, pg_name,
451+
is_countable) = parse_placement_group_resource_str(resource)
445452
if pg_name:
446-
placement_group_resource_usage[pg_resource_name] += used
453+
if pg_resource_name not in placement_group_resource_usage:
454+
placement_group_resource_usage[pg_resource_name] = 0
455+
if is_countable:
456+
placement_group_resource_usage[pg_resource_name] += used
457+
placement_group_resource_total[pg_resource_name] += total
447458
continue
448459

449460
usage_lines = []
450461
for resource, (used, total) in sorted(lm_summary.usage.items()):
451462
if "node:" in resource:
452463
continue # Skip the auto-added per-node "node:<ip>" resource.
453464

454-
(_, pg_name) = parse_placement_group_resource_str(resource)
465+
(_, pg_name, _) = parse_placement_group_resource_str(resource)
455466
if pg_name:
456467
continue # Skip resource used by placement groups
457468

458-
used_in_pg = placement_group_resource_usage[resource]
459-
460-
line = f" {used}/{total} {resource}"
461-
if used_in_pg != 0:
462-
line = line + f" ({used_in_pg} reserved in placement groups)"
469+
pg_used = 0
470+
pg_total = 0
471+
used_in_pg = resource in placement_group_resource_usage
472+
if used_in_pg:
473+
pg_used = placement_group_resource_usage[resource]
474+
pg_total = placement_group_resource_total[resource]
475+
# Used includes pg_total because when pgs are created
476+
# it allocates resources.
477+
# To get the real resource usage, we should subtract the pg
478+
# reserved resources from the usage and add pg used instead.
479+
used = used - pg_total + pg_used
463480

464481
if resource in ["memory", "object_store_memory"]:
465482
to_GiB = 1 / 2**30
466-
used *= to_GiB
467-
total *= to_GiB
468-
used_in_pg *= to_GiB
469-
line = f" {used:.2f}/{total:.3f} GiB {resource}"
470-
if used_in_pg != 0:
471-
line = line + f" ({used_in_pg:.2f} GiB reserved" \
472-
+ " in placement groups)"
473-
usage_lines.append(line)
483+
line = (f" {(used * to_GiB):.2f}/"
484+
f"{(total * to_GiB):.3f} GiB {resource}")
485+
if used_in_pg:
486+
line = line + (f" ({(pg_used * to_GiB):.2f} used of "
487+
f"{(pg_total * to_GiB):.2f} GiB " +
488+
"reserved in placement groups)")
489+
usage_lines.append(line)
490+
else:
491+
line = f" {used}/{total} {resource}"
492+
if used_in_pg:
493+
line += (f" ({pg_used} used of "
494+
f"{pg_total} reserved in placement groups)")
495+
usage_lines.append(line)
474496
usage_report = "\n".join(usage_lines)
475497
return usage_report
476498

@@ -488,8 +510,8 @@ def filter_placement_group_from_bundle(bundle: ResourceBundle):
488510
using_placement_group = False
489511
result_bundle = dict()
490512
for pg_resource_str, resource_count in bundle.items():
491-
(resource_name,
492-
pg_name) = parse_placement_group_resource_str(pg_resource_str)
513+
(resource_name, pg_name,
514+
_) = parse_placement_group_resource_str(pg_resource_str)
493515
result_bundle[resource_name] = resource_count
494516
if pg_name:
495517
using_placement_group = True
@@ -600,6 +622,32 @@ def format_info_string(lm_summary, autoscaler_summary, time=None):
600622
return formatted_output
601623

602624

625+
def format_no_node_type_string(node_type: dict):
626+
placement_group_resource_usage = {}
627+
regular_resource_usage = collections.defaultdict(float)
628+
for resource, total in node_type.items():
629+
(pg_resource_name, pg_name,
630+
is_countable) = parse_placement_group_resource_str(resource)
631+
if pg_name:
632+
if not is_countable:
633+
continue
634+
if pg_resource_name not in placement_group_resource_usage:
635+
placement_group_resource_usage[pg_resource_name] = 0
636+
placement_group_resource_usage[pg_resource_name] += total
637+
else:
638+
regular_resource_usage[resource] += total
639+
640+
output_lines = [""]
641+
for resource, total in regular_resource_usage.items():
642+
output_line = f"{resource}: {total}"
643+
if resource in placement_group_resource_usage:
644+
pg_resource = placement_group_resource_usage[resource]
645+
output_line += f" ({pg_resource} reserved in placement groups)"
646+
output_lines.append(output_line)
647+
648+
return "\n ".join(output_lines)
649+
650+
603651
def format_info_string_no_node_types(lm_summary, time=None):
604652
if time is None:
605653
time = datetime.now()
@@ -608,7 +656,8 @@ def format_info_string_no_node_types(lm_summary, time=None):
608656

609657
node_lines = []
610658
for node_type, count in lm_summary.node_types:
611-
line = f" {count} node(s) with resources: {node_type}"
659+
line = (f" {count} node(s) with resources:"
660+
f"{format_no_node_type_string(node_type)}")
612661
node_lines.append(line)
613662
node_report = "\n".join(node_lines)
614663

python/ray/tests/test_cli.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,8 @@ def _check_output_via_pattern(name, result):
204204
expected_lines = _load_output_pattern(name)
205205

206206
if result.exception is not None:
207-
print(result.output)
208207
raise result.exception from None
209-
208+
print(result.output)
210209
expected = r" *\n".join(expected_lines) + "\n?"
211210
if re.fullmatch(expected, result.output) is None:
212211
_debug_check_line_by_line(result, expected_lines)

python/ray/tests/test_cli_patterns/test_ray_status.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
======== Cluster status: .+
22
Node status
33
------------------------------------------------------------
4-
1 node\(s\) with resources: .+
4+
1 node\(s\) with resources:
5+
.+
6+
.+
7+
.+
8+
.+
59

610
Resources
711
------------------------------------------------------------

python/ray/tests/test_placement_group.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1980,5 +1980,44 @@ def is_usage_updated():
19801980
assert demand_output["demand"] == "(no resource demands)"
19811981

19821982

1983+
def test_placement_group_status(ray_start_cluster):
1984+
cluster = ray_start_cluster
1985+
cluster.add_node(num_cpus=4)
1986+
ray.init(address=cluster.address)
1987+
1988+
@ray.remote(num_cpus=1)
1989+
class A:
1990+
def ready(self):
1991+
pass
1992+
1993+
pg = ray.util.placement_group([{"CPU": 1}])
1994+
ray.get(pg.ready())
1995+
1996+
# Wait until the usage is updated, which is
1997+
# when the demand is also updated.
1998+
def is_usage_updated():
1999+
demand_output = get_ray_status_output(cluster.address)
2000+
return demand_output["usage"] != ""
2001+
2002+
wait_for_condition(is_usage_updated)
2003+
demand_output = get_ray_status_output(cluster.address)
2004+
cpu_usage = demand_output["usage"].split("\n")[0]
2005+
expected = "0.0/4.0 CPU (0.0 used of 1.0 reserved in placement groups)"
2006+
assert cpu_usage == expected
2007+
2008+
# 2 CPU + 1 PG CPU == 3.0/4.0 CPU (1 used by pg)
2009+
actors = [A.remote() for _ in range(2)]
2010+
actors_in_pg = [A.options(placement_group=pg).remote() for _ in range(1)]
2011+
2012+
ray.get([actor.ready.remote() for actor in actors])
2013+
ray.get([actor.ready.remote() for actor in actors_in_pg])
2014+
# Wait long enough until the usage is propagated to GCS.
2015+
time.sleep(5)
2016+
demand_output = get_ray_status_output(cluster.address)
2017+
cpu_usage = demand_output["usage"].split("\n")[0]
2018+
expected = "3.0/4.0 CPU (1.0 used of 1.0 reserved in placement groups)"
2019+
assert cpu_usage == expected
2020+
2021+
19832022
if __name__ == "__main__":
19842023
sys.exit(pytest.main(["-sv", __file__]))

python/ray/tests/test_resource_demand_scheduler.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2391,7 +2391,7 @@ def test_info_string():
23912391
lm_summary = LoadMetricsSummary(
23922392
head_ip="0.0.0.0",
23932393
usage={
2394-
"CPU": (530, 544),
2394+
"CPU": (530.0, 544.0),
23952395
"GPU": (2, 2),
23962396
"AcceleratorType:V100": (0, 2),
23972397
"memory": (2 * 2**30, 2**33),
@@ -2439,7 +2439,7 @@ def test_info_string():
24392439
24402440
Usage:
24412441
0/2 AcceleratorType:V100
2442-
530/544 CPU
2442+
530.0/544.0 CPU
24432443
2/2 GPU
24442444
2.00/8.000 GiB memory
24452445
3.14/16.000 GiB object_store_memory
@@ -2461,7 +2461,7 @@ def test_info_string_failed_node_cap():
24612461
lm_summary = LoadMetricsSummary(
24622462
head_ip="0.0.0.0",
24632463
usage={
2464-
"CPU": (530, 544),
2464+
"CPU": (530.0, 544.0),
24652465
"GPU": (2, 2),
24662466
"AcceleratorType:V100": (0, 2),
24672467
"memory": (2 * 2**30, 2**33),
@@ -2532,7 +2532,7 @@ def test_info_string_failed_node_cap():
25322532
25332533
Usage:
25342534
0/2 AcceleratorType:V100
2535-
530/544 CPU (2.0 reserved in placement groups)
2535+
530.0/544.0 CPU (2.0 used of 2.0 reserved in placement groups)
25362536
2/2 GPU
25372537
2.00/8.000 GiB memory
25382538
3.14/16.000 GiB object_store_memory
@@ -2556,13 +2556,16 @@ def test_info_string_no_node_type():
25562556
lm_summary = LoadMetricsSummary(
25572557
head_ip="0.0.0.0",
25582558
usage={
2559-
"CPU": (530, 544),
2559+
"CPU": (530.0, 544.0),
25602560
"GPU": (2, 2),
25612561
"AcceleratorType:V100": (0, 2),
2562-
"memory": (2 * 2**30, 2**33),
2562+
"memory": (6 * 2**30, 2**33),
25632563
"object_store_memory": (3.14 * 2**30, 2**34),
2564-
"CPU_group_4a82a217aadd8326a3a49f02700ac5c2": (2.0, 2.0),
2565-
"memory_group_4a82a217aadd8326a3a49f02700ac5c2": (2**32, 2.0)
2564+
"CPU_group_4a82a217aadd8326a3a49f02700ac5c2": (1.0, 2.0),
2565+
"CPU_group_1_4a82a217aadd8326a3a49f02700ac5c2": (0.0, 1.0),
2566+
"CPU_group_2_4a82a217aadd8326a3a49f02700ac5c2": (1.0, 1.0),
2567+
"memory_group_4a82a217aadd8326a3a49f02700ac5c2": (2**32, 2**32),
2568+
"memory_group_0_4a82a217aadd8326a3a49f02700ac5c2": (2**32, 2**32)
25662569
},
25672570
resource_demand=[({
25682571
"GPU": 0.5,
@@ -2585,22 +2588,30 @@ def test_info_string_no_node_type():
25852588
"CPU": 16
25862589
}, 100)],
25872590
node_types=[({
2588-
"CPU": 16
2591+
"CPU": 16,
2592+
"CPU_group_4a82a217aadd8326a3a49f02700ac5c2": 2.0,
2593+
"CPU_group_1_4a82a217aadd8326a3a49f02700ac5c2": 1.0,
2594+
"CPU_group_2_4a82a217aadd8326a3a49f02700ac5c2": 1.0,
2595+
"memory": 2**33,
2596+
"memory_group_4a82a217aadd8326a3a49f02700ac5c2": 4 * 2**30,
2597+
"memory_group_0_4a82a217aadd8326a3a49f02700ac5c2": 4 * 2**30,
25892598
}, 1)])
25902599

25912600
expected = """
25922601
======== Cluster status: 2020-12-28 01:02:03 ========
25932602
Node status
25942603
-----------------------------------------------------
2595-
1 node(s) with resources: {'CPU': 16}
2604+
1 node(s) with resources:
2605+
CPU: 16.0 (2.0 reserved in placement groups)
2606+
memory: 8589934592.0 (4294967296 reserved in placement groups)
25962607
25972608
Resources
25982609
-----------------------------------------------------
25992610
Usage:
26002611
0/2 AcceleratorType:V100
2601-
530/544 CPU (2.0 reserved in placement groups)
2612+
529.0/544.0 CPU (1.0 used of 2.0 reserved in placement groups)
26022613
2/2 GPU
2603-
2.00/8.000 GiB memory (4.00 GiB reserved in placement groups)
2614+
6.00/8.000 GiB memory (4.00 used of 4.00 GiB reserved in placement groups)
26042615
3.14/16.000 GiB object_store_memory
26052616
26062617
Demands:

0 commit comments

Comments
 (0)