-
Notifications
You must be signed in to change notification settings - Fork 7k
[FIX] raise error if job does not terminate in tail_job_logs() #57037
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 21 commits
efa79d6
aea0717
c011722
d4b73e6
83ad810
21e1ecc
0dcb9aa
f1bfdd5
e62dac9
fc81dee
d1c0c29
899c05d
a21145f
a2e76fd
2df9d50
b51feb2
8035a6b
681a536
5ed2be4
1406cd3
c1e93cd
905d742
238e0fb
5186f4c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -482,8 +482,9 @@ async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]: | |
| The iterator. | ||
|
|
||
| Raises: | ||
| RuntimeError: If the job does not exist or if the request to the | ||
| job server fails. | ||
| RuntimeError: If the job does not exist, if the request to the | ||
| job server fails, or if the connection closes unexpectedly | ||
| before the job reaches a terminal state. | ||
| """ | ||
| async with aiohttp.ClientSession( | ||
| cookies=self._cookies, headers=self._headers | ||
|
|
@@ -498,6 +499,17 @@ async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]: | |
| if msg.type == aiohttp.WSMsgType.TEXT: | ||
| yield msg.data | ||
| elif msg.type == aiohttp.WSMsgType.CLOSED: | ||
| logger.debug( | ||
| f"WebSocket closed for job {job_id} with close code {ws.close_code}" | ||
| ) | ||
| if ws.close_code == aiohttp.WSCloseCode.ABNORMAL_CLOSURE: | ||
| raise RuntimeError( | ||
| f"WebSocket connection closed unexpectedly with close code {ws.close_code}" | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: WebSocket Closure Triggers Unintended Job ErrorsThe |
||
| break | ||
| elif msg.type == aiohttp.WSMsgType.ERROR: | ||
| pass | ||
| # Old Ray versions may send ERROR on connection close | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when did this behavior change?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I encounter the error in test |
||
| logger.debug( | ||
| f"WebSocket error for job {job_id}, treating as normal close. Err: {ws.exception()}" | ||
| ) | ||
| break | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,6 +36,8 @@ | |
| from ray.runtime_env.runtime_env import RuntimeEnv, RuntimeEnvConfig | ||
| from ray.tests.conftest import _ray_start | ||
|
|
||
| import psutil | ||
|
||
|
|
||
| # This test requires you have AWS credentials set up (any AWS credentials will | ||
| # do, this test only accesses a public bucket). | ||
|
|
||
|
|
@@ -268,8 +270,7 @@ def f(): | |
| yield { | ||
| "runtime_env": {"py_modules": [str(Path(tmp_dir) / "test_module")]}, | ||
| "entrypoint": ( | ||
| "python -c 'import test_module;" | ||
| "print(test_module.run_test())'" | ||
| "python -c 'import test_module;print(test_module.run_test())'" | ||
| ), | ||
| "expected_logs": "Hello from test_module!\n", | ||
| } | ||
|
|
@@ -711,6 +712,49 @@ async def test_tail_job_logs(job_sdk_client): | |
| wait_for_condition(_check_job_succeeded, client=client, job_id=job_id) | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_tail_job_logs_websocket_abnormal_closure(ray_start_regular): | ||
| """ | ||
| Test that ABNORMAL_CLOSURE raises RuntimeError when tailing logs. | ||
|
|
||
| This test starts its own Ray cluster using default ports, | ||
| then forcefully stops Ray while tailing logs to simulate an abnormal WebSocket closure. | ||
| """ | ||
| dashboard_url = ray_start_regular.dashboard_url | ||
| client = JobSubmissionClient(format_web_url(dashboard_url)) | ||
|
|
||
| # Submit a long-running job | ||
| driver_script = """ | ||
| import time | ||
| for i in range(100): | ||
| print("Hello", i) | ||
| time.sleep(0.5) | ||
| """ | ||
| entrypoint = f"python -c '{driver_script}'" | ||
| job_id = client.submit_job(entrypoint=entrypoint) | ||
|
|
||
| # Start tailing logs and stop Ray while tailing | ||
| # Expect RuntimeError when WebSocket closes abnormally | ||
| with pytest.raises( | ||
| RuntimeError, | ||
| match="WebSocket connection closed unexpectedly with close code", | ||
| ): | ||
| i = 0 | ||
| async for lines in client.tail_job_logs(job_id): | ||
| print(lines, end="") | ||
| i += 1 | ||
|
|
||
| # Kill the dashboard after receiving a few log lines | ||
| if i == 3: | ||
| from ray._private import ray_constants | ||
|
||
|
|
||
| print("\nKilling the dashboard to close websocket abnormally...") | ||
| dash_info = ray._private.worker._global_node.all_processes[ | ||
| ray_constants.PROCESS_TYPE_DASHBOARD | ||
| ][0] | ||
| psutil.Process(dash_info.process.pid).kill() | ||
|
|
||
|
|
||
| def _hook(env): | ||
| with open(env["env_vars"]["TEMPPATH"], "w+") as f: | ||
| f.write(env["env_vars"]["TOKEN"]) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it easy to write a test for it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me have a try!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added in 899c05d