Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions python/ray/train/v2/_internal/execution/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,17 @@ class StorageContext:
"""Shared context that holds the source of truth for all paths and
storage utilities, passed along from the driver to workers.

Args:
storage_path: Path where all results and checkpoints are persisted.
Can be a local directory or a remote URI (e.g., s3://bucket/path).
experiment_dir_name: Name of the experiment directory within the storage path.
storage_filesystem: Optional custom PyArrow filesystem to use for storage.
If not provided, it is auto-resolved from the storage_path URI.
skip_validation: If True, skips creating and checking the storage validation
marker file. This should be set to True for read-only operations (e.g.,
restoring from an existing experiment directory) to avoid unnecessary
file system writes. Defaults to False.

This object defines a few types of paths:
1. *_fs_path: A path on the `storage_filesystem`. This is a regular path
which has been prefix-stripped by pyarrow.fs.FileSystem.from_uri and
Expand Down Expand Up @@ -383,6 +394,7 @@ def __init__(
storage_path: Union[str, os.PathLike],
experiment_dir_name: str,
storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
skip_validation: bool = False,
):
self.custom_fs_provided = storage_filesystem is not None

Expand All @@ -395,8 +407,9 @@ def __init__(
)
self.storage_fs_path = Path(self.storage_fs_path).as_posix()

self._create_validation_file()
self._check_validation_file()
if not skip_validation:
self._create_validation_file()
self._check_validation_file()

def __str__(self):
return (
Expand Down
1 change: 1 addition & 0 deletions python/ray/train/v2/api/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def from_path(
storage_path=storage_path,
experiment_dir_name=experiment_dir_name,
storage_filesystem=fs,
skip_validation=True,
)

# Validate that the checkpoint manager snapshot file exists
Expand Down
6 changes: 3 additions & 3 deletions python/ray/train/v2/tests/test_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def test_result_restore(
storage_context = StorageContext(
storage_path=storage_path,
experiment_dir_name=exp_name,
skip_validation=True,
)

trial_dir = storage_context.experiment_fs_path
Expand Down Expand Up @@ -187,12 +188,11 @@ def test_result_restore(
assert len(result.best_checkpoints) == num_checkpoints

"""
Top-3 checkpoints with metrics:
Top-2 checkpoints with metrics:

| iter | metric_a metric_b
checkpoint_000004 4 4 -4
checkpoint_000003 3 3 -3
checkpoint_000002 2 2 -2
checkpoint_000001 1 1 -1
"""
# Check that the checkpoints are bound to the correct metrics
best_ckpt_a = result.get_best_checkpoint(metric="metric_a", mode="max")
Expand Down