-
Notifications
You must be signed in to change notification settings - Fork 7k
[release test] remove JobRunner #58720
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
| import os | ||
| import re | ||
| import shlex | ||
| import shutil | ||
| import tempfile | ||
| from typing import TYPE_CHECKING, Any, Dict, List, Optional | ||
|
|
||
|
|
@@ -11,14 +12,15 @@ | |
| upload_working_dir_to_azure, | ||
| ) | ||
| from ray_release.cluster_manager.cluster_manager import ClusterManager | ||
| from ray_release.command_runner.job_runner import JobRunner | ||
| from ray_release.command_runner.command_runner import CommandRunner | ||
| from ray_release.exception import ( | ||
| FetchResultError, | ||
| JobBrokenError, | ||
| JobNoLogsError, | ||
| JobOutOfRetriesError, | ||
| JobTerminatedBeforeStartError, | ||
| JobTerminatedError, | ||
| LogsError, | ||
| PrepareCommandError, | ||
| PrepareCommandTimeout, | ||
| TestCommandError, | ||
|
|
@@ -60,7 +62,7 @@ def _get_env_str(env: Dict[str, str]) -> str: | |
| return env_str | ||
|
|
||
|
|
||
| class AnyscaleJobRunner(JobRunner): | ||
| class AnyscaleJobRunner(CommandRunner): | ||
| def __init__( | ||
| self, | ||
| cluster_manager: ClusterManager, | ||
|
|
@@ -114,9 +116,14 @@ def __init__( | |
| self._artifact_path = artifact_path | ||
| self._artifact_uploaded = artifact_path is not None | ||
|
|
||
| def _copy_script_to_working_dir(self, script_name): | ||
| script = os.path.join(os.path.dirname(__file__), f"_{script_name}") | ||
| shutil.copy(script, script_name) | ||
|
|
||
| def prepare_remote_env(self): | ||
| self._copy_script_to_working_dir("anyscale_job_wrapper.py") | ||
| super().prepare_remote_env() | ||
| self._copy_script_to_working_dir("wait_cluster.py") | ||
| self._copy_script_to_working_dir("prometheus_metrics.py") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bug: Cluster Node Waiting Logic Missing. The |
||
|
|
||
| def run_prepare_command( | ||
| self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0 | ||
|
|
@@ -337,6 +344,12 @@ def run_command( | |
|
|
||
| return time_taken | ||
|
|
||
| def get_last_logs_ex(self) -> Optional[str]: | ||
| try: | ||
| return self.job_manager.get_last_logs() | ||
| except Exception as e: | ||
| raise LogsError(f"Could not get last logs: {e}") from e | ||
|
|
||
| def _fetch_json(self, path: str) -> Dict[str, Any]: | ||
| try: | ||
| tmpfile = tempfile.mkstemp(suffix=".json")[1] | ||
|
|
||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,6 @@ | |
| from ray_release.cluster_manager.minimal import MinimalClusterManager | ||
| from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner | ||
| from ray_release.command_runner.command_runner import CommandRunner | ||
| from ray_release.command_runner.job_runner import JobRunner | ||
| from ray_release.config import ( | ||
| DEFAULT_AUTOSUSPEND_MINS, | ||
| DEFAULT_BUILD_TIMEOUT, | ||
|
|
@@ -56,12 +55,11 @@ | |
| from ray_release.test import Test | ||
|
|
||
| type_str_to_command_runner = { | ||
| "job": JobRunner, | ||
| "job": AnyscaleJobRunner, | ||
| "anyscale_job": AnyscaleJobRunner, | ||
|
Comment on lines
+58
to
59
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This change makes |
||
| } | ||
|
|
||
| command_runner_to_cluster_manager = { | ||
| JobRunner: FullClusterManager, | ||
| AnyscaleJobRunner: MinimalClusterManager, | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changing the base class from `JobRunner` to `CommandRunner` introduces a critical issue. The `wait_for_nodes` method in this class calls `super().wait_for_nodes()`. Previously, this resolved to `JobRunner.wait_for_nodes()`. Now, it will resolve to `CommandRunner.wait_for_nodes()`, which raises a `NotImplementedError`.
This will break tests that require waiting for nodes. The implementation of `wait_for_nodes` from `JobRunner` should be merged into this class. Specifically, the `super()` call in `AnyscaleJobRunner.wait_for_nodes` should be replaced with the logic to schedule the `wait_cluster.py` script, like this: