Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/lerobot/configs/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ class DatasetConfig:
# "dataset_index" into the returned item. The index mapping is made according to the order in which the
# datasets are provided.
repo_id: str
# Root directory where the dataset will be stored (e.g. 'dataset/path'). If None, defaults to $HF_LEROBOT_HOME/repo_id.
# Root directory for a concrete local dataset tree (e.g. 'dataset/path'). If None, local datasets are
# looked up under $HF_LEROBOT_HOME/repo_id and Hub downloads use a revision-safe cache under $HF_LEROBOT_HOME/hub.
root: str | None = None
episodes: list[int] | None = None
image_transforms: ImageTransformsConfig = field(default_factory=ImageTransformsConfig)
Expand Down
40 changes: 33 additions & 7 deletions src/lerobot/datasets/dataset_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
update_chunk_file_indices,
)
from lerobot.datasets.video_utils import get_video_info
from lerobot.utils.constants import HF_LEROBOT_HOME
from lerobot.utils.constants import HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE

CODEBASE_VERSION = "v3.0"

Expand Down Expand Up @@ -77,8 +77,12 @@ def __init__(

Args:
repo_id: Repository identifier (e.g. ``'lerobot/aloha_sim'``).
root: Local directory for the dataset. Defaults to
``$HF_LEROBOT_HOME/{repo_id}``.
root: Local directory for the dataset. When provided, Hub downloads
are materialized directly into this directory. When omitted,
existing local datasets are still looked up under
``$HF_LEROBOT_HOME/{repo_id}``, but Hub downloads use a
revision-safe snapshot cache under
``$HF_LEROBOT_HOME/hub``.
revision: Git revision (branch, tag, or commit hash). Defaults to
the current codebase version.
force_cache_sync: If ``True``, re-download metadata from the Hub
Expand All @@ -88,25 +92,32 @@ def __init__(
"""
self.repo_id = repo_id
self.revision = revision if revision else CODEBASE_VERSION
self.root = Path(root) if root is not None else HF_LEROBOT_HOME / repo_id
self._requested_root = Path(root) if root is not None else None
self.root = self._requested_root if self._requested_root is not None else HF_LEROBOT_HOME / repo_id
self._pq_writer = None
self.latest_episode = None
self._metadata_buffer: list[dict] = []
self._metadata_buffer_size = metadata_buffer_size
self._finalized = False

try:
if force_cache_sync:
if force_cache_sync or (
self._requested_root is None and self._has_legacy_hub_download_metadata(self.root)
):
raise FileNotFoundError
self._load_metadata()
except (FileNotFoundError, NotADirectoryError):
if is_valid_version(self.revision):
self.revision = get_safe_version(self.repo_id, self.revision)

(self.root / "meta").mkdir(exist_ok=True, parents=True)
self._pull_from_repo(allow_patterns="meta/")
self._load_metadata()

@staticmethod
def _has_legacy_hub_download_metadata(root: Path) -> bool:
    """Tell whether ``root`` carries the marker of a legacy Hub ``local_dir`` mirror.

    The marker checked is the ``.cache/huggingface/download`` entry located
    directly under ``root``.

    Args:
        root: Dataset directory to inspect.

    Returns:
        ``True`` if the marker entry exists, ``False`` otherwise.
    """
    legacy_marker = root.joinpath(".cache", "huggingface", "download")
    return legacy_marker.exists()

def _flush_metadata_buffer(self) -> None:
"""Write all buffered episode metadata to parquet file."""
if not hasattr(self, "_metadata_buffer") or len(self._metadata_buffer) == 0:
Expand Down Expand Up @@ -178,14 +189,29 @@ def _pull_from_repo(
allow_patterns: list[str] | str | None = None,
ignore_patterns: list[str] | str | None = None,
) -> None:
if self._requested_root is None:
self.root = Path(
snapshot_download(
self.repo_id,
repo_type="dataset",
revision=self.revision,
cache_dir=HF_LEROBOT_HUB_CACHE,
allow_patterns=allow_patterns,
ignore_patterns=ignore_patterns,
)
)
return

self._requested_root.mkdir(exist_ok=True, parents=True)
snapshot_download(
self.repo_id,
repo_type="dataset",
revision=self.revision,
local_dir=self.root,
local_dir=self._requested_root,
allow_patterns=allow_patterns,
ignore_patterns=ignore_patterns,
)
self.root = self._requested_root

@property
def url_root(self) -> str:
Expand Down
61 changes: 43 additions & 18 deletions src/lerobot/datasets/lerobot_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
get_safe_default_codec,
resolve_vcodec,
)
from lerobot.utils.constants import HF_LEROBOT_HOME
from lerobot.utils.constants import HF_LEROBOT_HOME, HF_LEROBOT_HUB_CACHE

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -144,10 +144,11 @@ def __init__(

Args:
repo_id (str): This is the repo id that will be used to fetch the dataset.
root (Path | None, optional): Local directory where the dataset will be downloaded and
stored. If set, all dataset files will be stored directly under this path. If not set, the
dataset files will be stored under $HF_LEROBOT_HOME/repo_id (configurable via the
HF_LEROBOT_HOME environment variable).
root (Path | None, optional): Local directory where the dataset will be read from or downloaded
into. If set, all dataset files are materialized directly under this path. If not set,
existing local datasets are still looked up under ``$HF_LEROBOT_HOME/{repo_id}``, but Hub
downloads use a revision-safe snapshot cache under
``$HF_LEROBOT_HOME/hub``.
episodes (list[int] | None, optional): If specified, this will only load episodes specified by
their episode_index in this list. Defaults to None.
image_transforms (Callable | None, optional): You can pass standard v2 image transforms from
Expand Down Expand Up @@ -190,7 +191,8 @@ def __init__(
"""
super().__init__()
self.repo_id = repo_id
self.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
self._requested_root = Path(root) if root else None
self.root = self._requested_root if self._requested_root is not None else HF_LEROBOT_HOME / repo_id
self.image_transforms = image_transforms
self.delta_timestamps = delta_timestamps
self.episodes = episodes
Expand All @@ -201,12 +203,15 @@ def __init__(
self._vcodec = resolve_vcodec(vcodec)
self._encoder_threads = encoder_threads

self.root.mkdir(exist_ok=True, parents=True)
if self._requested_root is not None:
self.root.mkdir(exist_ok=True, parents=True)

# Load metadata
self.meta = LeRobotDatasetMetadata(
self.repo_id, self.root, self.revision, force_cache_sync=force_cache_sync
self.repo_id, self._requested_root, self.revision, force_cache_sync=force_cache_sync
)
self.root = self.meta.root
self.revision = self.meta.revision

# Create reader (hf_dataset loaded below)
self.reader = DatasetReader(
Expand Down Expand Up @@ -556,14 +561,32 @@ def _download(self, download_videos: bool = True) -> None:
if self.episodes is not None:
# Reader is guaranteed to exist here (created in __init__ before _download)
files = self.reader.get_episodes_file_paths()
snapshot_download(
self.repo_id,
repo_type="dataset",
revision=self.revision,
local_dir=self.root,
allow_patterns=files,
ignore_patterns=ignore_patterns,
)

if self._requested_root is None:
self.root = Path(
snapshot_download(
self.repo_id,
repo_type="dataset",
revision=self.revision,
cache_dir=HF_LEROBOT_HUB_CACHE,
allow_patterns=files,
ignore_patterns=ignore_patterns,
)
)
else:
self._requested_root.mkdir(exist_ok=True, parents=True)
snapshot_download(
self.repo_id,
repo_type="dataset",
revision=self.revision,
local_dir=self._requested_root,
allow_patterns=files,
ignore_patterns=ignore_patterns,
)
self.root = self._requested_root

self.meta.root = self.root
self.reader._root = self.root

# ── Class constructors ────────────────────────────────────────────

Expand Down Expand Up @@ -635,6 +658,7 @@ def create(
metadata_buffer_size=metadata_buffer_size,
)
obj.repo_id = obj.meta.repo_id
obj._requested_root = obj.meta.root
obj.root = obj.meta.root
obj.revision = None
obj.tolerance_s = tolerance_s
Expand Down Expand Up @@ -719,7 +743,8 @@ def resume(
vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls)
obj.repo_id = repo_id
obj.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
obj._requested_root = Path(root) if root else HF_LEROBOT_HOME / repo_id
obj.root = obj._requested_root
obj.root.mkdir(exist_ok=True, parents=True)
obj.revision = revision if revision else CODEBASE_VERSION
obj.tolerance_s = tolerance_s
Expand All @@ -733,7 +758,7 @@ def resume(

# Load metadata
obj.meta = LeRobotDatasetMetadata(
obj.repo_id, obj.root, obj.revision, force_cache_sync=force_cache_sync
obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync
)

# Reader is lazily created on first access (write-only mode)
Expand Down
14 changes: 10 additions & 4 deletions src/lerobot/datasets/streaming_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,9 @@ def __init__(

Args:
repo_id (str): This is the repo id that will be used to fetch the dataset.
root (Path | None, optional): Local directory to use for downloading/writing files.
root (Path | None, optional): Local directory to use for local datasets. When omitted, Hub
metadata is resolved through a revision-safe snapshot cache under
``$HF_LEROBOT_HOME/hub``.
episodes (list[int] | None, optional): If specified, this will only load episodes specified by
their episode_index in this list.
image_transforms (Callable | None, optional): Transform to apply to image data.
Expand All @@ -271,7 +273,8 @@ def __init__(
"""
super().__init__()
self.repo_id = repo_id
self.root = Path(root) if root else HF_LEROBOT_HOME / repo_id
self._requested_root = Path(root) if root else None
self.root = self._requested_root if self._requested_root is not None else HF_LEROBOT_HOME / repo_id
self.streaming_from_local = root is not None

self.image_transforms = image_transforms
Expand All @@ -288,12 +291,15 @@ def __init__(
# We cache the video decoders to avoid re-initializing them at each frame (avoiding a ~10x slowdown)
self.video_decoder_cache = None

self.root.mkdir(exist_ok=True, parents=True)
if self._requested_root is not None:
self.root.mkdir(exist_ok=True, parents=True)

# Load metadata
self.meta = LeRobotDatasetMetadata(
self.repo_id, self.root, self.revision, force_cache_sync=force_cache_sync
self.repo_id, self._requested_root, self.revision, force_cache_sync=force_cache_sync
)
self.root = self.meta.root
self.revision = self.meta.revision
# Check version
check_version_compatibility(self.repo_id, self.meta._version, CODEBASE_VERSION)

Expand Down
1 change: 1 addition & 0 deletions src/lerobot/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
# cache dir
# Fallback location used when the HF_LEROBOT_HOME environment variable is not set.
default_cache_path = Path(HF_HOME) / "lerobot"
# Resolved from the HF_LEROBOT_HOME environment variable (with `~` expanded), else the fallback above.
HF_LEROBOT_HOME = Path(os.getenv("HF_LEROBOT_HOME", default_cache_path)).expanduser()
# Passed as `cache_dir` to `snapshot_download` by the dataset loaders when no explicit `root` is requested.
HF_LEROBOT_HUB_CACHE = HF_LEROBOT_HOME / "hub"

# calibration dir
default_calibration_path = HF_LEROBOT_HOME / "calibration"
Expand Down
Loading
Loading