Skip to content

Commit

Permalink
Split python/ray/tests/test_actor_retry over two files (ray-project#4…
Browse files Browse the repository at this point in the history
…7188)

The `test_actor_retry` tests are failing/flaky on windows. They pass locally. I have not been able to access the CI logs to see what is going wrong. In order to shrink the problem (is it a overall timeout? Is one of the tests failing?) we can start by splitting the tests into two files.

Toward solving ray-project#43845.

Signed-off-by: ujjawal-khare <[email protected]>
  • Loading branch information
mattip authored and ujjawal-khare committed Oct 12, 2024
1 parent 4f7ff39 commit 3557f98
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 86 deletions.
3 changes: 2 additions & 1 deletion ci/ray_ci/core.tests.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
flaky_tests:
- windows://:metric_exporter_grpc_test
- windows://python/ray/tests:test_actor_retry
- windows://python/ray/tests:test_actor_retry1
- windows://python/ray/tests:test_actor_retry2
- windows://python/ray/tests:test_object_spilling
- windows://python/ray/tests:test_object_spilling_asan
- windows://python/ray/tests:test_object_spilling_debug_mode
Expand Down
3 changes: 2 additions & 1 deletion python/ray/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,8 @@ py_test_module_list(
py_test_module_list(
files = [
"test_actor.py",
"test_actor_retry.py",
"test_actor_retry1.py",
"test_actor_retry2.py",
"test_actor_failures.py",
"test_actor_resources.py",
"test_failure.py",
Expand Down
155 changes: 155 additions & 0 deletions python/ray/tests/test_actor_retry1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import os
import sys

import pytest

import ray


class MyError(Exception):
pass


@ray.remote
class Counter:
def __init__(self) -> None:
self.count = 0

def increment(self) -> int:
c = self.count
self.count += 1
return c

def get_count(self) -> int:
return self.count


# TODO: also do work for async and threaded actors
@ray.remote(max_task_retries=3)
class TroubleMaker:
@ray.method(max_task_retries=5, retry_exceptions=[MyError])
def may_raise_n_times(self, counter, n):
"""
Raises if there were n calls before this call.
Returns the number of calls before this call, if it's > n.
"""
c = ray.get(counter.increment.remote())
print(f"may_raise_n_times, n = {n}, count = {c}")
if c < n:
print(f"method raises in {c} th call, want {n} times")
raise MyError()
return c

@ray.method(retry_exceptions=[MyError])
def raise_or_exit(self, counter, actions):
"""
Increments the counter and performs an action based on the param actions[count].
If count >= len(actions), return the count.
Note: This method doesn't set `max_task_retries`. Ray expects it to inherit
max_task_retries = 3.
@param actions: List["raise" | "exit"]
"""

c = ray.get(counter.increment.remote())
action = "return" if c >= len(actions) else actions[c]
print(f"raise_or_exit, action = {action}, count = {c}")

if action == "raise":
raise MyError()
elif action == "exit":
sys.exit(1)
else:
return c


def test_method_raise_5_times(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
assert ray.get(trouble_maker.may_raise_n_times.remote(counter, 5)) == 5
assert ray.get(counter.get_count.remote()) == 6


def test_method_raise_no_over_retry(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
with pytest.raises(MyError):
ray.get(trouble_maker.may_raise_n_times.remote(counter, 6))
assert ray.get(counter.get_count.remote()) == 6


def test_method_no_retry_without_retry_exceptions(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
with pytest.raises(MyError):
ray.get(
trouble_maker.may_raise_n_times.options(retry_exceptions=False).remote(
counter, 5
)
)
assert ray.get(counter.get_count.remote()) == 1


def test_options_takes_precedence(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
assert (
ray.get(
trouble_maker.may_raise_n_times.options(max_task_retries=10).remote(
counter, 10
)
)
== 10
)
assert ray.get(counter.get_count.remote()) == 11


def test_options_takes_precedence_no_over_retry(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()

with pytest.raises(MyError):
ray.get(
trouble_maker.may_raise_n_times.options(max_task_retries=10).remote(
counter, 11
)
)
assert ray.get(counter.get_count.remote()) == 11


def test_method_exit_no_over_retry_max_restarts(shutdown_only):
"""
Even if we have enough max_task_retries, we may still raise due to max_restarts.
"""
counter = Counter.remote()
trouble_maker = TroubleMaker.options(max_restarts=1).remote()
with pytest.raises(ray.exceptions.RayActorError):
assert ray.get(
trouble_maker.raise_or_exit.options(max_task_retries=4).remote(
counter, ["raise", "exit", "exit"]
)
)
# 2 calls: 1 initial + 1 exception-retry + 1 exit-retry (then no more)
assert ray.get(counter.get_count.remote()) == 3


def test_exit_only_no_over_retry(shutdown_only):
"""
Sanity testing: only do exit-retry works
"""
counter = Counter.remote()
trouble_maker = TroubleMaker.options(max_restarts=2).remote()
ray.get(
trouble_maker.raise_or_exit.options(max_task_retries=2).remote(
counter, ["exit", "exit"]
)
) == 2
assert ray.get(counter.get_count.remote()) == 3


if __name__ == "__main__":
if os.environ.get("PARALLEL_CI"):
sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
else:
sys.exit(pytest.main(["-sv", __file__]))
Original file line number Diff line number Diff line change
Expand Up @@ -112,33 +112,6 @@ async def yield_or_raise(self, counter, actions):
return


def test_method_raise_5_times(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
assert ray.get(trouble_maker.may_raise_n_times.remote(counter, 5)) == 5
assert ray.get(counter.get_count.remote()) == 6


def test_method_raise_no_over_retry(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
with pytest.raises(MyError):
ray.get(trouble_maker.may_raise_n_times.remote(counter, 6))
assert ray.get(counter.get_count.remote()) == 6


def test_method_no_retry_without_retry_exceptions(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
with pytest.raises(MyError):
ray.get(
trouble_maker.may_raise_n_times.options(retry_exceptions=False).remote(
counter, 5
)
)
assert ray.get(counter.get_count.remote()) == 1


def test_generator_method_no_retry_without_retry_exceptions(shutdown_only):
counter = Counter.remote()
trouble_maker = AsyncTroubleMaker.remote()
Expand Down Expand Up @@ -226,33 +199,6 @@ def test_generator_method_does_not_over_retry(shutdown_only):
assert ray.get(counter.get_count.remote()) == 8


def test_options_takes_precedence(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()
assert (
ray.get(
trouble_maker.may_raise_n_times.options(max_task_retries=10).remote(
counter, 10
)
)
== 10
)
assert ray.get(counter.get_count.remote()) == 11


def test_options_takes_precedence_no_over_retry(shutdown_only):
counter = Counter.remote()
trouble_maker = TroubleMaker.remote()

with pytest.raises(MyError):
ray.get(
trouble_maker.may_raise_n_times.options(max_task_retries=10).remote(
counter, 11
)
)
assert ray.get(counter.get_count.remote()) == 11


@pytest.mark.parametrize(
"actions",
[
Expand Down Expand Up @@ -353,22 +299,6 @@ def test_method_raise_and_exit_no_over_retry(
assert ray.get(counter.get_count.remote()) == 3


def test_method_exit_no_over_retry_max_restarts(shutdown_only):
"""
Even if we have enough max_task_retries, we may still raise due to max_restarts.
"""
counter = Counter.remote()
trouble_maker = TroubleMaker.options(max_restarts=1).remote()
with pytest.raises(ray.exceptions.RayActorError):
assert ray.get(
trouble_maker.raise_or_exit.options(max_task_retries=4).remote(
counter, ["raise", "exit", "exit"]
)
)
# 2 calls: 1 initial + 1 exception-retry + 1 exit-retry (then no more)
assert ray.get(counter.get_count.remote()) == 3


@pytest.mark.parametrize(
"is_async", [False, True], ids=lambda a: "async" if a else "sync"
)
Expand All @@ -390,20 +320,6 @@ def test_exit_only(is_async, shutdown_only):
assert ray.get(counter.get_count.remote()) == 3


def test_exit_only_no_over_retry(shutdown_only):
"""
Sanity testing: only do exit-retry works
"""
counter = Counter.remote()
trouble_maker = TroubleMaker.options(max_restarts=2).remote()
ray.get(
trouble_maker.raise_or_exit.options(max_task_retries=2).remote(
counter, ["exit", "exit"]
)
) == 2
assert ray.get(counter.get_count.remote()) == 3


if __name__ == "__main__":
if os.environ.get("PARALLEL_CI"):
sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
Expand Down

0 comments on commit 3557f98

Please sign in to comment.