-
Notifications
You must be signed in to change notification settings - Fork 7k
[FIX] raise error if job does not terminate in tail_job_logs() #57037
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
efa79d6
aea0717
c011722
d4b73e6
83ad810
21e1ecc
0dcb9aa
f1bfdd5
e62dac9
fc81dee
d1c0c29
899c05d
a21145f
a2e76fd
2df9d50
b51feb2
8035a6b
681a536
5ed2be4
1406cd3
c1e93cd
905d742
238e0fb
5186f4c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -482,8 +482,9 @@ async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]: | |
| The iterator. | ||
|
|
||
| Raises: | ||
| RuntimeError: If the job does not exist or if the request to the | ||
| job server fails. | ||
| RuntimeError: If the job does not exist, if the request to the | ||
| job server fails, or if the connection closes unexpectedly | ||
| before the job reaches a terminal state. | ||
| """ | ||
| async with aiohttp.ClientSession( | ||
| cookies=self._cookies, headers=self._headers | ||
|
|
@@ -498,6 +499,17 @@ async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]: | |
| if msg.type == aiohttp.WSMsgType.TEXT: | ||
| yield msg.data | ||
| elif msg.type == aiohttp.WSMsgType.CLOSED: | ||
| logger.debug( | ||
| f"WebSocket closed for job {job_id} with close code {ws.close_code}" | ||
| ) | ||
| if ws.close_code == aiohttp.WSCloseCode.ABNORMAL_CLOSURE: | ||
| raise RuntimeError( | ||
| f"WebSocket connection closed unexpectedly with close code {ws.close_code}" | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: WebSocket Closure Triggers Unintended Job ErrorsThe |
||
| break | ||
| elif msg.type == aiohttp.WSMsgType.ERROR: | ||
| pass | ||
| # Old Ray versions may send ERROR on connection close | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when did this behavior change?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I encounter the error in test |
||
| logger.debug( | ||
| f"WebSocket error for job {job_id}, treating as normal close. Err: {ws.exception()}" | ||
| ) | ||
| break | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -268,8 +268,7 @@ def f(): | |||||
| yield { | ||||||
| "runtime_env": {"py_modules": [str(Path(tmp_dir) / "test_module")]}, | ||||||
| "entrypoint": ( | ||||||
| "python -c 'import test_module;" | ||||||
| "print(test_module.run_test())'" | ||||||
| "python -c 'import test_module;print(test_module.run_test())'" | ||||||
| ), | ||||||
| "expected_logs": "Hello from test_module!\n", | ||||||
| } | ||||||
|
|
@@ -711,6 +710,52 @@ async def test_tail_job_logs(job_sdk_client): | |||||
| wait_for_condition(_check_job_succeeded, client=client, job_id=job_id) | ||||||
|
|
||||||
|
|
||||||
| @pytest.mark.asyncio | ||||||
| async def test_tail_job_logs_websocket_abnormal_closure(): | ||||||
| """ | ||||||
| Test that ABNORMAL_CLOSURE raises RuntimeError when tailing logs. | ||||||
|
|
||||||
| This test starts its own Ray cluster using default ports, | ||||||
| then forcefully stops Ray while tailing logs to simulate an abnormal WebSocket closure. | ||||||
| """ | ||||||
| subprocess.check_output(["ray", "start", "--head"]) | ||||||
| address = "127.0.0.1:8265" | ||||||
|
|
||||||
| try: | ||||||
| assert wait_until_server_available(address) | ||||||
| client = JobSubmissionClient(format_web_url(address)) | ||||||
|
|
||||||
| # Submit a long-running job | ||||||
| driver_script = """ | ||||||
| import time | ||||||
| for i in range(100): | ||||||
| print("Hello", i) | ||||||
| time.sleep(0.5) | ||||||
| """ | ||||||
| entrypoint = f"python -c '{driver_script}'" | ||||||
| job_id = client.submit_job(entrypoint=entrypoint) | ||||||
|
|
||||||
| # Start tailing logs and stop Ray while tailing | ||||||
| # Expect RuntimeError when WebSocket closes abnormally | ||||||
| with pytest.raises( | ||||||
| RuntimeError, | ||||||
| match="WebSocket connection closed unexpectedly with close code", | ||||||
| ): | ||||||
| i = 0 | ||||||
| async for lines in client.tail_job_logs(job_id): | ||||||
| print(lines, end="") | ||||||
| i += 1 | ||||||
|
|
||||||
| # Run ray stop to terminate Ray after receiving a few log lines | ||||||
| if i == 3: | ||||||
| print("\nStopping Ray cluster forcefully...") | ||||||
| subprocess.check_output(["ray", "stop", "--force"]) | ||||||
|
|
||||||
| finally: | ||||||
| # Ensure Ray is stopped even if test fails | ||||||
| subprocess.check_output(["ray", "stop", "--force"]) | ||||||
|
||||||
| def test_dashboard_requests_fail_on_missing_deps(ray_start_regular): |
you might need to import it on this line:
| from ray.tests.conftest import _ray_start |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated in 5ed2be4
Thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it easy to write a test for it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me have a try!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added in 899c05d