-
Notifications
You must be signed in to change notification settings - Fork 47
fix(trainer): Fix empty image for Runtime trainer #143
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
f6ab049
ab87526
4ae75dc
432289c
1323e00
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -126,7 +126,7 @@ def container_status_to_trainjob_status(status: str, exit_code: int) -> str: | |||
| if status == "exited": | ||||
| # Exit code 0 -> complete, else failed | ||||
| return constants.TRAINJOB_COMPLETE if exit_code == 0 else constants.TRAINJOB_FAILED | ||||
| return constants.UNKNOWN | ||||
| return UNKNOWN | ||||
|
|
||||
|
|
||||
| def aggregate_status_from_containers(container_statuses: list[str]) -> str: | ||||
|
|
@@ -150,38 +150,6 @@ def aggregate_status_from_containers(container_statuses: list[str]) -> str: | |||
| return UNKNOWN | ||||
|
|
||||
|
|
||||
| def resolve_image(runtime: types.Runtime) -> str: | ||||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Fiona-Waters It looks like we don't need image resolver, since we fallback to the default runtime in case we can't get it online:
Is that correct ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes if image is not optional then we don't need this function. |
||||
| """ | ||||
| Resolve the container image for a runtime. | ||||
|
|
||||
| Priority: | ||||
| 1. Use runtime.image if specified in the ClusterTrainingRuntime | ||||
| 2. Fall back to DEFAULT_FRAMEWORK_IMAGES based on framework | ||||
|
|
||||
| Args: | ||||
| runtime: Runtime object. | ||||
|
|
||||
| Returns: | ||||
| Container image name. | ||||
|
|
||||
| Raises: | ||||
| ValueError: If no image is found for the runtime's framework. | ||||
| """ | ||||
| # Use image from runtime if specified | ||||
| if runtime.image: | ||||
| return runtime.image | ||||
|
|
||||
| # Fall back to default framework images | ||||
| framework = runtime.trainer.framework | ||||
| if framework in constants.DEFAULT_FRAMEWORK_IMAGES: | ||||
| return constants.DEFAULT_FRAMEWORK_IMAGES[framework] | ||||
|
|
||||
| raise ValueError( | ||||
| f"No default image found for framework '{framework}'. " | ||||
| f"Supported frameworks: {list(constants.DEFAULT_FRAMEWORK_IMAGES.keys())}" | ||||
| ) | ||||
|
|
||||
|
|
||||
| def maybe_pull_image(adapter, image: str, pull_policy: str): | ||||
| """ | ||||
| Pull image based on pull policy. | ||||
|
|
@@ -227,7 +195,7 @@ def get_container_status(adapter, container_id: str) -> str: | |||
| status, exit_code = adapter.container_status(container_id) | ||||
| return container_status_to_trainjob_status(status, exit_code) | ||||
| except Exception: | ||||
| return constants.UNKNOWN | ||||
| return UNKNOWN | ||||
|
|
||||
|
|
||||
| def aggregate_container_statuses(adapter, containers: list[dict]) -> str: | ||||
|
|
||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -232,6 +232,7 @@ class TrainerType(Enum): | |
| class RuntimeTrainer: | ||
| trainer_type: TrainerType | ||
| framework: str | ||
| image: str | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @astefanutti @Fiona-Waters I made
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it makes sense for the image to be mandatory. Adding a dummy const to local process backend seems fine to me as long as we make sure it's purpose is mentioned clearly in a comment. |
||
| num_nodes: int = 1 # The default value is set in the APIs. | ||
| device: str = common_constants.UNKNOWN | ||
| device_count: str = common_constants.UNKNOWN | ||
|
|
@@ -251,7 +252,6 @@ class Runtime: | |
| name: str | ||
| trainer: RuntimeTrainer | ||
| pretrained_model: Optional[str] = None | ||
| image: Optional[str] = None | ||
|
|
||
|
|
||
| # Representation for the TrainJob steps. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should fix E2Es in this PR: kubeflow/trainer#2907
cc @Fiona-Waters @astefanutti @kramaranya