From 89cdda88f51ca62773cd708806d8cbc0c5fd5032 Mon Sep 17 00:00:00 2001 From: Rueian Date: Tue, 27 May 2025 14:05:07 -0700 Subject: [PATCH 1/2] [Test][Autoscaler] deflaky unexpected dead actors in tests by setting max_restarts=-1 Signed-off-by: Rueian --- ray-operator/test/e2eautoscaler/create_detached_actor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray-operator/test/e2eautoscaler/create_detached_actor.py b/ray-operator/test/e2eautoscaler/create_detached_actor.py index 9a5ab968798..7a1a23b854b 100644 --- a/ray-operator/test/e2eautoscaler/create_detached_actor.py +++ b/ray-operator/test/e2eautoscaler/create_detached_actor.py @@ -9,7 +9,8 @@ parser.add_argument('--num-custom-resources', type=float, default=0) args = parser.parse_args() -@ray.remote(num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={"CustomResource": args.num_custom_resources}) +# set max_restarts=-1 to restart unexpected death in tests. +@ray.remote(max_restarts=-1, num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={"CustomResource": args.num_custom_resources}) class Actor: pass From 632275832fa3d3d09d4fdf4a6af78c01a2253c70 Mon Sep 17 00:00:00 2001 From: Rueian Date: Wed, 4 Jun 2025 12:02:31 -0700 Subject: [PATCH 2/2] [Test][Autoscaler] deflaky unexpected dead actors in tests by setting max_restarts=-1 Signed-off-by: Rueian --- ray-operator/test/e2eautoscaler/create_detached_actor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray-operator/test/e2eautoscaler/create_detached_actor.py b/ray-operator/test/e2eautoscaler/create_detached_actor.py index 7a1a23b854b..8fa7b7ba327 100644 --- a/ray-operator/test/e2eautoscaler/create_detached_actor.py +++ b/ray-operator/test/e2eautoscaler/create_detached_actor.py @@ -9,7 +9,8 @@ parser.add_argument('--num-custom-resources', type=float, default=0) args = parser.parse_args() -# set max_restarts=-1 to restart unexpected death in tests. 
+# Set max_restarts=-1 as a workaround so that actors that die unexpectedly in tests are restarted. +# TODO (rueian): Remove the max_restarts workaround when https://github.com/ray-project/ray/issues/40864 is fixed. @ray.remote(max_restarts=-1, num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={"CustomResource": args.num_custom_resources}) class Actor: pass