From f4c665022a73a3861bf5b6db89e35cca720f0c48 Mon Sep 17 00:00:00 2001 From: abrar Date: Tue, 21 Oct 2025 22:00:27 +0000 Subject: [PATCH 1/2] deflake app level autoscaling test Signed-off-by: abrar --- .../serve/tests/test_autoscaling_policy.py | 47 +++++++++---------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py index 310781c5d8d1..df92aa8a8dbf 100644 --- a/python/ray/serve/tests/test_autoscaling_policy.py +++ b/python/ray/serve/tests/test_autoscaling_policy.py @@ -1633,31 +1633,31 @@ def verify_scaling_decisions(self, signal_A, signal_B): # ---- Deployment A ---- ray.get(signal_A.send.remote(clear=True)) - [hA.remote() for _ in range(40)] + results = [hA.remote() for _ in range(40)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 40) wait_for_condition(check_num_replicas_eq, name="A", target=2) ray.get(signal_A.send.remote(clear=True)) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) - [hA.remote() for _ in range(70)] + assert all(result.result(timeout_s=10) for result in results) + results = [hA.remote() for _ in range(70)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 70) wait_for_condition(check_num_replicas_eq, name="A", target=4) ray.get(signal_A.send.remote()) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) # ---- Deployment B ---- ray.get(signal_B.send.remote(clear=True)) - [hB.remote() for _ in range(50)] + results = [hB.remote() for _ in range(50)] wait_for_condition(lambda: ray.get(signal_B.cur_num_waiters.remote()) == 50) wait_for_condition(check_num_replicas_eq, name="B", target=3) ray.get(signal_B.send.remote(clear=True)) - wait_for_condition(lambda: ray.get(signal_B.cur_num_waiters.remote()) == 0) - [hB.remote() for _ in range(120)] + assert all(result.result(timeout_s=10) for result in results) + results = [hB.remote() for _ in range(120)] wait_for_condition(lambda: ray.get(signal_B.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="B", target=5) ray.get(signal_B.send.remote()) - wait_for_condition(lambda: ray.get(signal_B.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) @pytest.mark.parametrize( "policy", @@ -1748,11 +1748,11 @@ def test_autoscaling_policy_switchback(self, serve_instance_with_two_signal): wait_for_condition(check_running, timeout=15) hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) - [hA.remote() for _ in range(60)] + results = [hA.remote() for _ in range(60)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 60) wait_for_condition(check_num_replicas_eq, name="A", target=3) ray.get(signal_A.send.remote()) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) ray.get(signal_A.send.remote(clear=True)) # Switch to app-level policy @@ -1797,19 +1797,19 @@ def test_autoscaling_policy_switchback(self, serve_instance_with_two_signal): wait_for_condition(check_running, timeout=15) hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) - [hA.remote() for _ in range(120)] + results = [hA.remote() for _ in range(120)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="A", target=4) ray.get(signal_A.send.remote()) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) ray.get(signal_A.send.remote(clear=True)) hB = serve.get_deployment_handle("B", app_name=SERVE_DEFAULT_APP_NAME) - [hB.remote() for _ in range(120)] + results = [hB.remote() for _ in range(120)] wait_for_condition(lambda: ray.get(signal_B.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="B", target=5) ray.get(signal_B.send.remote()) - wait_for_condition(lambda: ray.get(signal_B.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) ray.get(signal_B.send.remote(clear=True)) # switch back to deployment-level policy @@ -1841,14 +1841,13 @@ def test_autoscaling_policy_switchback(self, serve_instance_with_two_signal): wait_for_condition(check_running, timeout=15) hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) - [hA.remote() for _ in range(120)] - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 120) + results = [hA.remote() for _ in range(120)] wait_for_condition(check_num_replicas_eq, name="A", target=3) ray.get(signal_A.send.remote()) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) def test_autoscaling_policy_enable_disable(self, serve_instance_with_two_signal): - client, signal_A, signal_B = serve_instance_with_two_signal + client, signal_A, _ = serve_instance_with_two_signal config_template = { "import_path": "ray.serve.tests.test_config_files.get_multi_deployment_signal_app.app", @@ -1866,11 +1865,11 @@ def test_autoscaling_policy_enable_disable(self, serve_instance_with_two_signal) wait_for_condition(check_running, timeout=15) hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) - [hA.remote() for _ in range(120)] + results = [hA.remote() for _ in range(120)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="A", target=1) ray.get(signal_A.send.remote(clear=True)) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) config_template = { "import_path": "ray.serve.tests.test_config_files.get_multi_deployment_signal_app.app", @@ -1899,11 +1898,11 @@ def test_autoscaling_policy_enable_disable(self, serve_instance_with_two_signal) wait_for_condition(check_running, timeout=15) hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) - [hA.remote() for _ in range(120)] + results = [hA.remote() for _ in range(120)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="A", target=4) ray.get(signal_A.send.remote(clear=True)) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) # turn off app-level autoscaling policy config_template = { @@ -1922,11 +1921,11 @@ def test_autoscaling_policy_enable_disable(self, serve_instance_with_two_signal) wait_for_condition(check_num_replicas_eq, name="A", target=1) hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) - [hA.remote() for _ in range(120)] + results = [hA.remote() for _ in range(120)] wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="A", target=1) ray.get(signal_A.send.remote(clear=True)) - wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 0) + assert all(result.result(timeout_s=10) for result in results) if __name__ == "__main__": From 90ce26d1bb61fdc762f62c764801b302c5797c27 Mon Sep 17 00:00:00 2001 From: abrar Date: Tue, 21 Oct 2025 22:05:41 +0000 Subject: [PATCH 2/2] add back check Signed-off-by: abrar --- python/ray/serve/tests/test_autoscaling_policy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py index df92aa8a8dbf..883b0fc772d4 100644 --- a/python/ray/serve/tests/test_autoscaling_policy.py +++ b/python/ray/serve/tests/test_autoscaling_policy.py @@ -1842,6 +1842,7 @@ def test_autoscaling_policy_switchback(self, serve_instance_with_two_signal): hA = serve.get_deployment_handle("A", app_name=SERVE_DEFAULT_APP_NAME) results = [hA.remote() for _ in range(120)] + wait_for_condition(lambda: ray.get(signal_A.cur_num_waiters.remote()) == 120) wait_for_condition(check_num_replicas_eq, name="A", target=3) ray.get(signal_A.send.remote()) assert all(result.result(timeout_s=10) for result in results)