@@ -59,6 +59,26 @@ def test_internal_config(ray_start_cluster_head):
5959 assert ray .cluster_resources ()["CPU" ] == 1
6060
6161
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=10):
    """Poll ``monitor`` until its load metrics satisfy a condition.

    Each iteration pumps pending monitor messages and reads
    ``monitor.load_metrics.get_resource_usage()``.

    Args:
        monitor: Monitor-like object exposing ``process_messages()`` and
            ``load_metrics.get_resource_usage()``.
        expected_resource_usage: Exact resource-usage tuple to wait for,
            compared element-wise. If None, wait until every entry after
            the first is truthy (i.e. some usage/total has been reported).
        timeout: Number of retries (roughly seconds, one ``sleep(1)`` per
            retry) before giving up.

    Returns:
        The resource usage tuple that satisfied the condition.

    Raises:
        ValueError: If the condition is not met within ``timeout`` retries.
    """
    while True:
        monitor.process_messages()
        resource_usage = monitor.load_metrics.get_resource_usage()

        if expected_resource_usage is None:
            # Any truthy usage/total entries mean metrics have arrived.
            if all(x for x in resource_usage[1:]):
                return resource_usage
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            return resource_usage

        # No match yet: spend one unit of the timeout budget. This must
        # happen on BOTH miss paths; decrementing only on the elif-miss
        # path would let the expected_resource_usage=None case spin
        # forever without ever tripping the timeout guard.
        timeout -= 1
        if timeout <= 0:
            raise ValueError("Should not be here.")
        time.sleep(1)
80+
81+
def test_heartbeats(ray_start_cluster_head):
    """Unit test for `Cluster.wait_for_nodes`.

    Drives a Monitor against a live cluster and checks, via
    verify_load_metrics, that heartbeat-driven load metrics track both
    in-flight actor work and the cluster's total CPU count.
    """
    cluster = ray_start_cluster_head
    monitor = Monitor(cluster.redis_address, None)

    @ray.remote
    class Actor():
        def work(self, timeout=10):
            # Occupy this actor's CPU slot for `timeout` seconds.
            time.sleep(timeout)
            return True

    actors = [Actor.remote()]

    monitor.subscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_CHANNEL)
    monitor.subscribe(ray.gcs_utils.XRAY_JOB_CHANNEL)

    monitor.update_raylet_map()
    monitor._maybe_flush_gcs()

    timeout = 5

    # Single idle head node: zero load/usage, one CPU total.
    verify_load_metrics(monitor, (0.0, {'CPU': 0.0}, {'CPU': 1.0}))

    pending = []
    pending.append(actors[0].work.remote(timeout=timeout * 2))

    # One busy actor: the head node's CPU is fully in use.
    verify_load_metrics(monitor, (1.0, {'CPU': 1.0}, {'CPU': 1.0}))

    ray.get(pending)

    num_workers = 4
    num_nodes_total = float(num_workers + 1)
    new_nodes = [cluster.add_node() for _ in range(num_workers)]

    cluster.wait_for_nodes()
    monitor.update_raylet_map()
    monitor._maybe_flush_gcs()

    # Idle again, but the CPU total now counts every node.
    verify_load_metrics(monitor, (0.0, {'CPU': 0.0}, {'CPU': num_nodes_total}))

    # Saturate the cluster: re-task the original actor and start one new
    # busy actor per worker node.
    pending = [actors[0].work.remote(timeout=timeout * 2)]
    for _ in range(num_workers):
        extra = Actor.remote()
        pending.append(extra.work.remote(timeout=timeout * 2))
        actors.append(extra)

    verify_load_metrics(
        monitor,
        (num_nodes_total, {'CPU': num_nodes_total}, {'CPU': num_nodes_total}))

    ray.get(pending)

    # All work drained: load returns to zero with the same CPU total.
    verify_load_metrics(monitor, (0.0, {'CPU': 0.0}, {'CPU': num_nodes_total}))
92139
93140
94141def test_wait_for_nodes (ray_start_cluster_head ):
0 commit comments