76 changes: 38 additions & 38 deletions lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions lerobot/common/policies/act/modeling_act.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 """Action Chunking Transformer Policy
 
-As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705).
+As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://huggingface.co/papers/2304.13705).
 The majority of changes here involve removing unused code, unifying naming, and adding helpful comments.
 """

@@ -41,7 +41,7 @@
 class ACTPolicy(PreTrainedPolicy):
     """
     Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost
-    Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act)
+    Hardware (paper: https://huggingface.co/papers/2304.13705, code: https://github.com/tonyzhaozh/act)
     """
 
     config_class = ACTConfig
@@ -161,7 +161,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
         # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for
         # each dimension independently, we sum over the latent dimension to get the total
         # KL-divergence per batch element, then take the mean over the batch.
-        # (See App. B of https://arxiv.org/abs/1312.6114 for more details).
+        # (See App. B of https://huggingface.co/papers/1312.6114 for more details).
         mean_kld = (
             (-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean()
         )
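For reference, the closed form in `mean_kld` is the per-dimension KL divergence between a diagonal Gaussian and a standard normal, D_KL(N(μ, σ²) ‖ N(0, 1)) = -0.5 * (1 + log σ² - μ² - σ²). A minimal sanity check against torch.distributions (the shapes below are hypothetical, not from this diff):

import torch
from torch.distributions import Normal, kl_divergence

mu = torch.randn(4, 32)            # latent means, (batch, latent_dim)
log_sigma_x2 = torch.randn(4, 32)  # log(sigma^2)

closed_form = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - log_sigma_x2.exp())).sum(-1).mean()
reference = kl_divergence(
    Normal(mu, (0.5 * log_sigma_x2).exp()),  # sigma = exp(log(sigma^2) / 2)
    Normal(torch.zeros_like(mu), torch.ones_like(mu)),
).sum(-1).mean()
assert torch.allclose(closed_form, reference, atol=1e-5)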
@@ -175,7 +175,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
 
 class ACTTemporalEnsembler:
     def __init__(self, temporal_ensemble_coeff: float, chunk_size: int) -> None:
-        """Temporal ensembling as described in Algorithm 2 of https://arxiv.org/abs/2304.13705.
+        """Temporal ensembling as described in Algorithm 2 of https://huggingface.co/papers/2304.13705.
 
         The weights are calculated as wᵢ = exp(-temporal_ensemble_coeff * i) where w₀ is the oldest action.
         They are then normalized to sum to 1 by dividing by Σwᵢ. Here's some intuition around how the
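For reference, a minimal sketch (not from this diff, function name hypothetical) of the weight schedule the docstring describes:

import torch

def temporal_ensemble_weights(temporal_ensemble_coeff: float, n: int) -> torch.Tensor:
    # w_i = exp(-temporal_ensemble_coeff * i), where i = 0 is the oldest prediction.
    i = torch.arange(n, dtype=torch.float32)
    w = torch.exp(-temporal_ensemble_coeff * i)
    return w / w.sum()  # normalize so the weights sum to 1

# coeff = 0 gives uniform averaging; coeff > 0 weights older predictions more
# (the ACT paper uses 0.01); coeff < 0 favors the newest predictions instead.
print(temporal_ensemble_weights(0.01, 4))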
lerobot/common/policies/diffusion/configuration_diffusion.py
@@ -81,7 +81,7 @@ class DiffusionConfig(PreTrainedConfig):
         n_groups: Number of groups used in the group norm of the Unet's convolutional blocks.
         diffusion_step_embed_dim: The Unet is conditioned on the diffusion timestep via a small non-linear
             network. This is the output dimension of that network, i.e., the embedding dimension.
-        use_film_scale_modulation: FiLM (https://arxiv.org/abs/1709.07871) is used for the Unet conditioning.
+        use_film_scale_modulation: FiLM (https://huggingface.co/papers/1709.07871) is used for the Unet conditioning.
             Bias modulation is used by default, while this parameter indicates whether to also use scale
             modulation.
         noise_scheduler_type: Name of the noise scheduler to use. Supported options: ["DDPM", "DDIM"].
6 changes: 3 additions & 3 deletions lerobot/common/policies/diffusion/modeling_diffusion.py
@@ -48,7 +48,7 @@
 class DiffusionPolicy(PreTrainedPolicy):
     """
     Diffusion Policy as per "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion"
-    (paper: https://arxiv.org/abs/2303.04137, code: https://github.com/real-stanford/diffusion_policy).
+    (paper: https://huggingface.co/papers/2303.04137, code: https://github.com/real-stanford/diffusion_policy).
     """
 
     config_class = DiffusionConfig
@@ -370,7 +370,7 @@ def compute_loss(self, batch: dict[str, Tensor]) -> Tensor:
 class SpatialSoftmax(nn.Module):
     """
     Spatial Soft Argmax operation described in "Deep Spatial Autoencoders for Visuomotor Learning" by Finn et al.
-    (https://arxiv.org/pdf/1509.06113). A minimal port of the robomimic implementation.
+    (https://huggingface.co/papers/1509.06113). A minimal port of the robomimic implementation.
 
     At a high level, this takes 2D feature maps (from a convnet/ViT) and returns the "center of mass"
     of activations of each channel, i.e., keypoints in the image space for the policy to focus on.
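For reference, a minimal functional sketch of the operation this docstring describes (not the robomimic port being edited here):

import torch
import torch.nn.functional as F

def spatial_soft_argmax(features: torch.Tensor) -> torch.Tensor:
    """features: (B, C, H, W) -> keypoints: (B, C, 2) in [-1, 1] image coordinates."""
    b, c, h, w = features.shape
    # Softmax over flattened spatial locations gives a per-channel attention map.
    attention = F.softmax(features.view(b, c, h * w), dim=-1).view(b, c, h, w)
    # Normalized coordinate grids in [-1, 1].
    ys = torch.linspace(-1.0, 1.0, h)
    xs = torch.linspace(-1.0, 1.0, w)
    grid_y, grid_x = torch.meshgrid(ys, xs, indexing="ij")
    # Expected coordinates, i.e. the "center of mass" of each channel's attention.
    kp_x = (attention * grid_x).sum(dim=(-2, -1))
    kp_y = (attention * grid_y).sum(dim=(-2, -1))
    return torch.stack([kp_x, kp_y], dim=-1)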
@@ -728,7 +728,7 @@ def __init__(
 
         self.conv1 = DiffusionConv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups)
 
-        # FiLM modulation (https://arxiv.org/abs/1709.07871) outputs per-channel bias and (maybe) scale.
+        # FiLM modulation (https://huggingface.co/papers/1709.07871) outputs per-channel bias and (maybe) scale.
         cond_channels = out_channels * 2 if use_film_scale_modulation else out_channels
         self.cond_encoder = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, cond_channels))
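For reference, a minimal sketch (not from this diff, class and argument names hypothetical) of how a FiLM conditioner built this way is typically applied:

import torch
import torch.nn as nn

class FiLM(nn.Module):
    """Minimal FiLM layer: a conditioning vector produces per-channel (scale, bias)."""

    def __init__(self, cond_dim: int, channels: int, use_scale: bool = True):
        super().__init__()
        self.use_scale = use_scale
        out = channels * 2 if use_scale else channels
        self.proj = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, out))

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # x: (B, C, T) feature map; cond: (B, cond_dim) conditioning vector.
        params = self.proj(cond).unsqueeze(-1)  # (B, C, 1) or (B, 2C, 1)
        if self.use_scale:
            scale, bias = params.chunk(2, dim=1)
            return scale * x + bias
        return x + params  # bias-only modulation (the default described above)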
2 changes: 1 addition & 1 deletion lerobot/common/policies/pi0fast/modeling_pi0fast.py
@@ -17,7 +17,7 @@
"""
Ο€0+FAST: Efficient Action Tokenization for Vision-Language-Action Models

[Paper](https://arxiv.org/abs/2501.09747)
[Paper](https://huggingface.co/papers/2501.09747)
[Jax code](https://github.com/Physical-Intelligence/openpi)

Designed by Physical Intelligence. Ported from Jax by Hugging Face.
4 changes: 2 additions & 2 deletions lerobot/common/policies/tdmpc/modeling_tdmpc.py
@@ -17,8 +17,8 @@
"""Implementation of Finetuning Offline World Models in the Real World.

The comments in this code may sometimes refer to these references:
TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://arxiv.org/abs/2203.04955)
FOWM paper: Finetuning Offline World Models in the Real World (https://arxiv.org/abs/2310.16029)
TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://huggingface.co/papers/2203.04955)
FOWM paper: Finetuning Offline World Models in the Real World (https://huggingface.co/papers/2310.16029)
"""

# ruff: noqa: N806
12 changes: 6 additions & 6 deletions lerobot/common/policies/vqbet/modeling_vqbet.py
@@ -162,7 +162,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
         batch = dict(batch)  # shallow copy so that adding a key doesn't modify the original
         batch["observation.images"] = torch.stack([batch[key] for key in self.config.image_features], dim=-4)
         batch = self.normalize_targets(batch)
-        # VQ-BeT discretizes actions using a VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://arxiv.org/pdf/2403.03181)
+        # VQ-BeT discretizes actions using a VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://huggingface.co/papers/2403.03181)
         if not self.vqbet.action_head.vqvae_model.discretized.item():
             # loss: total loss of training the RVQ
             # n_different_codes: how many of the possible VQ codes are used in a single batch (i.e., how many have at least one encoder embedding as a nearest neighbor). This can be at most `vqvae_n_embed * number of layers of RVQ (=2)`.
@@ -185,7 +185,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]:
 class SpatialSoftmax(nn.Module):
     """
     Spatial Soft Argmax operation described in "Deep Spatial Autoencoders for Visuomotor Learning" by Finn et al.
-    (https://arxiv.org/pdf/1509.06113). A minimal port of the robomimic implementation.
+    (https://huggingface.co/papers/1509.06113). A minimal port of the robomimic implementation.
 
     At a high level, this takes 2D feature maps (from a convnet/ViT) and returns the "center of mass"
     of activations of each channel, i.e., keypoints in the image space for the policy to focus on.
@@ -387,7 +387,7 @@ def forward(self, batch: dict[str, Tensor], rollout: bool) -> tuple[dict, dict]:
 
         # only extract the output tokens at the position of the action query:
         # Behavior Transformer (BeT) and VQ-BeT are both sequence-to-sequence prediction models,
-        # mapping sequential observations to sequential actions (please refer to section 2.2 in the BeT paper https://arxiv.org/pdf/2206.11251).
+        # mapping sequential observations to sequential actions (please refer to section 2.2 in the BeT paper https://huggingface.co/papers/2206.11251).
         # Thus, they predict a historical action sequence in addition to current and future actions (predicting future actions is optional).
         if len_additional_action_token > 0:
             features = torch.cat(
@@ -824,8 +824,8 @@ def get_action_from_latent(self, latent):
         return einops.rearrange(output, "N (T A) -> N T A", A=self.config.action_feature.shape[0])
 
     def get_code(self, state):
-        # In phase 2 of VQ-BeT training, we need ground-truth labels of the action data to calculate the focal loss for the code prediction head (please refer to section 3.3 in the paper https://arxiv.org/pdf/2403.03181).
-        # This function outputs the "GT code" of a given action using the frozen encoder and quantization layers (please refer to Figure 2 in the paper https://arxiv.org/pdf/2403.03181).
+        # In phase 2 of VQ-BeT training, we need ground-truth labels of the action data to calculate the focal loss for the code prediction head (please refer to section 3.3 in the paper https://huggingface.co/papers/2403.03181).
+        # This function outputs the "GT code" of a given action using the frozen encoder and quantization layers (please refer to Figure 2 in the paper https://huggingface.co/papers/2403.03181).
         state = einops.rearrange(state, "N T A -> N (T A)")
         with torch.no_grad():
             state_rep = self.encoder(state)
@@ -838,7 +838,7 @@ def get_code(self, state):
         return state_vq, vq_code
 
     def vqvae_forward(self, state):
-        # This function passes the given data through the Residual VQ with the encoder and decoder (please refer to section 3.2 in the paper https://arxiv.org/pdf/2403.03181).
+        # This function passes the given data through the Residual VQ with the encoder and decoder (please refer to section 3.2 in the paper https://huggingface.co/papers/2403.03181).
         state = einops.rearrange(state, "N T A -> N (T A)")
         # We start by passing the action (or action chunk) a_{t:t+n} through the encoder φ.
         state_rep = self.encoder(state)
6 changes: 3 additions & 3 deletions lerobot/common/policies/vqbet/vqbet_utils.py
@@ -336,7 +336,7 @@ class ResidualVQ(nn.Module):
"""
Residual VQ is composed of multiple VectorQuantize layers.

Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
Follows Algorithm 1. in https://huggingface.co/papers/2107.03312
"Residual Vector Quantizer (a.k.a. multi-stage vector quantizer [36]) cascades Nq layers of VQ as follows. The unquantized input vector is
passed through a first VQ and quantization residuals are computed. The residuals are then iteratively quantized by a sequence of additional
Nq -1 vector quantizers, as described in Algorithm 1."
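For reference, a minimal sketch of the cascade Algorithm 1 describes (not the ResidualVQ class being edited here):

import torch

def residual_vq_encode(
    x: torch.Tensor, codebooks: list[torch.Tensor]
) -> tuple[torch.Tensor, list[torch.Tensor]]:
    """x: (B, D); codebooks: Nq tensors of shape (K, D). Returns the summed
    quantization and the per-layer code indices."""
    residual = x
    quantized = torch.zeros_like(x)
    indices = []
    for codebook in codebooks:
        # Quantize the current residual to its nearest codebook entry.
        dists = torch.cdist(residual, codebook)  # (B, K)
        idx = dists.argmin(dim=-1)
        q = codebook[idx]
        quantized = quantized + q
        residual = residual - q  # the next layer quantizes what is left over
        indices.append(idx)
    return quantized, indices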
@@ -1006,7 +1006,7 @@ def gumbel_sample(
     if not straight_through or temperature <= 0.0 or not training:
         return ind, one_hot
 
-    # use reinmax for better second-order accuracy - https://arxiv.org/abs/2304.08612
+    # use reinmax for better second-order accuracy - https://huggingface.co/papers/2304.08612
     # algorithm 2
 
     if reinmax:
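For contrast, a minimal first-order straight-through Gumbel sampler (not this file's gumbel_sample); reinmax, per Algorithm 2 of the linked paper, refines the gradient estimate of exactly this kind of sampler to second-order accuracy:

import torch
import torch.nn.functional as F

def gumbel_sample_st(logits: torch.Tensor, temperature: float = 1.0):
    # Gumbel-max sampling: add Gumbel(0, 1) noise, take the argmax.
    gumbels = -torch.empty_like(logits).exponential_().log()
    ind = ((logits + gumbels) / max(temperature, 1e-10)).argmax(dim=-1)
    one_hot = F.one_hot(ind, logits.shape[-1]).float()
    # Straight-through: hard one-hot forward, softmax gradients backward.
    soft = F.softmax(logits / max(temperature, 1e-10), dim=-1)
    st_one_hot = soft + (one_hot - soft).detach()
    return ind, st_one_hot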
@@ -1156,7 +1156,7 @@ def batched_embedding(indices, embeds):
 
 
 def orthogonal_loss_fn(t):
-    # eq (2) from https://arxiv.org/abs/2112.00384
+    # eq (2) from https://huggingface.co/papers/2112.00384
     h, n = t.shape[:2]
     normed_codes = F.normalize(t, p=2, dim=-1)
     cosine_sim = einsum("h i d, h j d -> h i j", normed_codes, normed_codes)
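For reference, a minimal single-codebook sketch of eq (2); the function above appears to compute the batched, per-head version of the same quantity:

import torch
import torch.nn.functional as F

def orthogonal_loss(codebook: torch.Tensor) -> torch.Tensor:
    # codebook: (K, D). Pushes distinct l2-normalized codes toward orthogonality.
    k = codebook.shape[0]
    normed = F.normalize(codebook, p=2, dim=-1)
    cosine_sim = normed @ normed.t()  # (K, K) pairwise cosine similarities
    identity = torch.eye(k, device=codebook.device)
    return ((cosine_sim - identity) ** 2).sum() / (k**2)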