diff --git a/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py b/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py index 99ab2cbf6d..f7d46b29ec 100644 --- a/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py +++ b/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py @@ -36,7 +36,7 @@ "robot_config": AlohaRobotConfig(), "license": "mit", "url": "https://mobile-aloha.github.io/", - "paper": "https://arxiv.org/abs/2401.02117", + "paper": "https://huggingface.co/papers/2401.02117", "citation_bibtex": dedent(r""" @inproceedings{fu2024mobile, author = {Fu, Zipeng and Zhao, Tony Z. and Finn, Chelsea}, @@ -49,7 +49,7 @@ "robot_config": AlohaRobotConfig(), "license": "mit", "url": "https://tonyzhaozh.github.io/aloha/", - "paper": "https://arxiv.org/abs/2304.13705", + "paper": "https://huggingface.co/papers/2304.13705", "citation_bibtex": dedent(r""" @article{Zhao2023LearningFB, title={Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware}, @@ -57,13 +57,13 @@ journal={RSS}, year={2023}, volume={abs/2304.13705}, - url={https://arxiv.org/abs/2304.13705} + url={https://huggingface.co/papers/2304.13705} }""").lstrip(), } PUSHT_INFO = { "license": "mit", "url": "https://diffusion-policy.cs.columbia.edu/", - "paper": "https://arxiv.org/abs/2303.04137v5", + "paper": "https://huggingface.co/papers/2303.04137", "citation_bibtex": dedent(r""" @article{chi2024diffusionpolicy, author = {Cheng Chi and Zhenjia Xu and Siyuan Feng and Eric Cousineau and Yilun Du and Benjamin Burchfiel and Russ Tedrake and Shuran Song}, @@ -75,7 +75,7 @@ XARM_INFO = { "license": "mit", "url": "https://www.nicklashansen.com/td-mpc/", - "paper": "https://arxiv.org/abs/2203.04955", + "paper": "https://huggingface.co/papers/2203.04955", "citation_bibtex": dedent(r""" @inproceedings{Hansen2022tdmpc, title={Temporal Difference Learning for Model Predictive Control}, @@ -244,7 +244,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://ut-austin-rpl.github.io/BUDS-website/", - "paper": "https://arxiv.org/abs/2109.13841", + "paper": "https://huggingface.co/papers/2109.13841", "citation_bibtex": dedent(r""" @article{zhu2022bottom, title={Bottom-Up Skill Discovery From Unsegmented Demonstrations for Long-Horizon Robot Manipulation}, @@ -261,7 +261,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://ut-austin-rpl.github.io/sailor/", - "paper": "https://arxiv.org/abs/2210.11435", + "paper": "https://huggingface.co/papers/2210.11435", "citation_bibtex": dedent(r""" @inproceedings{nasiriany2022sailor, title={Learning and Retrieval from Prior Data for Skill-based Imitation Learning}, @@ -274,7 +274,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://ut-austin-rpl.github.io/sirius/", - "paper": "https://arxiv.org/abs/2211.08416", + "paper": "https://huggingface.co/papers/2211.08416", "citation_bibtex": dedent(r""" @inproceedings{liu2022robot, title = {Robot Learning on the Job: Human-in-the-Loop Autonomy and Learning During Deployment}, @@ -298,14 +298,14 @@ "tasks_col": "language_instruction", "license": "cc-by-4.0", "url": "https://sites.google.com/view/cablerouting/home", - "paper": "https://arxiv.org/abs/2307.08927", + "paper": "https://huggingface.co/papers/2307.08927", "citation_bibtex": dedent(r""" @article{luo2023multistage, author = {Jianlan Luo and Charles Xu and Xinyang Geng and Gilbert Feng and Kuan Fang and Liam Tan and Stefan Schaal and Sergey Levine}, title = {Multi-Stage Cable Routing through Hierarchical 
Imitation Learning}, journal = {arXiv pre-print}, year = {2023}, - url = {https://arxiv.org/abs/2307.08927}, + url = {https://huggingface.co/papers/2307.08927}, }""").lstrip(), }, "berkeley_fanuc_manipulation": { @@ -322,7 +322,7 @@ "berkeley_gnm_cory_hall": { "tasks_col": "language_instruction", "license": "mit", - "paper": "https://arxiv.org/abs/1709.10489", + "paper": "https://huggingface.co/papers/1709.10489", "citation_bibtex": dedent(r""" @inproceedings{kahn2018self, title={Self-supervised deep reinforcement learning with generalized computation graphs for robot navigation}, @@ -337,7 +337,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://sites.google.com/view/recon-robot", - "paper": "https://arxiv.org/abs/2104.05859", + "paper": "https://huggingface.co/papers/2104.05859", "citation_bibtex": dedent(r""" @inproceedings{shah2021rapid, title={Rapid Exploration for Open-World Navigation with Latent Goal Models}, @@ -351,7 +351,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://sites.google.com/view/SACSoN-review", - "paper": "https://arxiv.org/abs/2306.01874", + "paper": "https://huggingface.co/papers/2306.01874", "citation_bibtex": dedent(r""" @article{hirose2023sacson, title={SACSoN: Scalable Autonomous Data Collection for Social Navigation}, @@ -363,7 +363,7 @@ "berkeley_mvp": { "tasks_col": "language_instruction", "license": "mit", - "paper": "https://arxiv.org/abs/2203.06173", + "paper": "https://huggingface.co/papers/2203.06173", "citation_bibtex": dedent(r""" @InProceedings{Radosavovic2022, title = {Real-World Robot Learning with Masked Visual Pre-training}, @@ -375,7 +375,7 @@ "berkeley_rpt": { "tasks_col": "language_instruction", "license": "mit", - "paper": "https://arxiv.org/abs/2306.10007", + "paper": "https://huggingface.co/papers/2306.10007", "citation_bibtex": dedent(r""" @article{Radosavovic2023, title={Robot Learning with Sensorimotor Pre-training}, @@ -388,7 +388,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://human-world-model.github.io/", - "paper": "https://arxiv.org/abs/2308.10901", + "paper": "https://huggingface.co/papers/2308.10901", "citation_bibtex": dedent(r""" @inproceedings{mendonca2023structured, title={Structured World Models from Human Videos}, @@ -401,7 +401,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://play-fusion.github.io/", - "paper": "https://arxiv.org/abs/2312.04549", + "paper": "https://huggingface.co/papers/2312.04549", "citation_bibtex": dedent(r""" @inproceedings{chen2023playfusion, title={PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play}, @@ -414,7 +414,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://robo-affordances.github.io/", - "paper": "https://arxiv.org/abs/2304.08488", + "paper": "https://huggingface.co/papers/2304.08488", "citation_bibtex": dedent(r""" @inproceedings{bahl2023affordances, title={Affordances from Human Videos as a Versatile Representation for Robotics}, @@ -433,7 +433,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://diffusion-policy.cs.columbia.edu/", - "paper": "https://arxiv.org/abs/2303.04137v5", + "paper": "https://huggingface.co/papers/2303.04137", "citation_bibtex": dedent(r""" @inproceedings{chi2023diffusionpolicy, title={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion}, @@ -505,7 +505,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://droid-dataset.github.io/", - "paper": 
"https://arxiv.org/abs/2403.12945", + "paper": "https://huggingface.co/papers/2403.12945", "citation_bibtex": dedent(r""" @article{khazatsky2024droid, title = {DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset}, @@ -517,7 +517,7 @@ "tasks_col": "language_instruction", "license": "cc-by-4.0", "url": "https://functional-manipulation-benchmark.github.io/", - "paper": "https://arxiv.org/abs/2401.08553", + "paper": "https://huggingface.co/papers/2401.08553", "citation_bibtex": dedent(r""" @article{luo2024fmb, title={FMB: a Functional Manipulation Benchmark for Generalizable Robotic Learning}, @@ -530,7 +530,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://openreview.net/forum?id=WuBv9-IGDUA", - "paper": "https://arxiv.org/abs/2401.14502", + "paper": "https://huggingface.co/papers/2401.14502", "citation_bibtex": dedent(r""" @inproceedings{saxena2023multiresolution, title={Multi-Resolution Sensing for Real-Time Control with Vision-Language Models}, @@ -575,7 +575,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://jyopari.github.io/VINN/", - "paper": "https://arxiv.org/abs/2112.01511", + "paper": "https://huggingface.co/papers/2112.01511", "citation_bibtex": dedent(r""" @misc{pari2021surprising, title={The Surprising Effectiveness of Representation Learning for Visual Imitation}, @@ -590,7 +590,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://play-to-policy.github.io/", - "paper": "https://arxiv.org/abs/2210.10047", + "paper": "https://huggingface.co/papers/2210.10047", "citation_bibtex": dedent(r""" @article{cui2022play, title = {From Play to Policy: Conditional Behavior Generation from Uncurated Robot Data}, @@ -603,7 +603,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://rot-robot.github.io/", - "paper": "https://arxiv.org/abs/2206.15469", + "paper": "https://huggingface.co/papers/2206.15469", "citation_bibtex": dedent(r""" @inproceedings{haldar2023watch, title={Watch and match: Supercharging imitation with regularized optimal transport}, @@ -633,7 +633,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://sites.google.com/view/hydra-il-2023", - "paper": "https://arxiv.org/abs/2306.17237", + "paper": "https://huggingface.co/papers/2306.17237", "citation_bibtex": dedent(r""" @article{belkhale2023hydra, title={HYDRA: Hybrid Robot Actions for Imitation Learning}, @@ -646,21 +646,21 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://sites.google.com/view/visionandtouch", - "paper": "https://arxiv.org/abs/1810.10191", + "paper": "https://huggingface.co/papers/1810.10191", "citation_bibtex": dedent(r""" @inproceedings{lee2019icra, title={Making sense of vision and touch: Self-supervised learning of multimodal representations for contact-rich tasks}, author={Lee, Michelle A and Zhu, Yuke and Srinivasan, Krishnan and Shah, Parth and Savarese, Silvio and Fei-Fei, Li and Garg, Animesh and Bohg, Jeannette}, booktitle={2019 IEEE International Conference on Robotics and Automation (ICRA)}, year={2019}, - url={https://arxiv.org/abs/1810.10191} + url={https://huggingface.co/papers/1810.10191} }""").lstrip(), }, "stanford_robocook": { "tasks_col": "language_instruction", "license": "mit", "url": "https://hshi74.github.io/robocook/", - "paper": "https://arxiv.org/abs/2306.14447", + "paper": "https://huggingface.co/papers/2306.14447", "citation_bibtex": dedent(r""" @article{shi2023robocook, title={RoboCook: Long-Horizon Elasto-Plastic Object Manipulation 
with Diverse Tools}, @@ -673,7 +673,7 @@ "tasks_col": "language_instruction", "license": "cc-by-4.0", "url": "https://www.kaggle.com/datasets/oiermees/taco-robot", - "paper": "https://arxiv.org/abs/2209.08959, https://arxiv.org/abs/2210.01911", + "paper": "https://huggingface.co/papers/2209.08959, https://huggingface.co/papers/2210.01911", "citation_bibtex": dedent(r""" @inproceedings{rosete2022tacorl, author = {Erick Rosete-Beas and Oier Mees and Gabriel Kalweit and Joschka Boedecker and Wolfram Burgard}, @@ -693,7 +693,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "URL", - "paper": "https://arxiv.org/abs/2107.05842", + "paper": "https://huggingface.co/papers/2107.05842", "citation_bibtex": dedent(r""" @Article{Osa22, author = {Takayuki Osa}, @@ -709,7 +709,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://toto-benchmark.org/", - "paper": "https://arxiv.org/abs/2306.00942", + "paper": "https://huggingface.co/papers/2306.00942", "citation_bibtex": dedent(r""" @inproceedings{zhou2023train, author={Zhou, Gaoyue and Dean, Victoria and Srirama, Mohan Kumar and Rajeswaran, Aravind and Pari, Jyothish and Hatch, Kyle and Jain, Aryan and Yu, Tianhe and Abbeel, Pieter and Pinto, Lerrel and Finn, Chelsea and Gupta, Abhinav}, @@ -733,7 +733,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://owmcorl.github.io/#", - "paper": "https://arxiv.org/abs/2310.16029", + "paper": "https://huggingface.co/papers/2310.16029", "citation_bibtex": dedent(r""" @preprint{Feng2023Finetuning, title={Finetuning Offline World Models in the Real World}, @@ -745,7 +745,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://robopil.github.io/d3fields/", - "paper": "https://arxiv.org/abs/2309.16118", + "paper": "https://huggingface.co/papers/2309.16118", "citation_bibtex": dedent(r""" @article{wang2023d3field, title={D^3Field: Dynamic 3D Descriptor Fields for Generalizable Robotic Manipulation}, @@ -758,7 +758,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://uscresl.github.io/dmfd/", - "paper": "https://arxiv.org/abs/2207.10148", + "paper": "https://huggingface.co/papers/2207.10148", "citation_bibtex": dedent(r""" @article{salhotra2022dmfd, author={Salhotra, Gautam and Liu, I-Chun Arthur and Dominguez-Kuhne, Marcus and Sukhatme, Gaurav S.}, @@ -775,7 +775,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://ut-austin-rpl.github.io/MUTEX/", - "paper": "https://arxiv.org/abs/2309.14320", + "paper": "https://huggingface.co/papers/2309.14320", "citation_bibtex": dedent(r""" @inproceedings{shah2023mutex, title={{MUTEX}: Learning Unified Policies from Multimodal Task Specifications}, @@ -811,7 +811,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://saytap.github.io/", - "paper": "https://arxiv.org/abs/2306.07580", + "paper": "https://huggingface.co/papers/2306.07580", "citation_bibtex": dedent(r""" @article{saytap2023, author = {Yujin Tang and Wenhao Yu and Jie Tan and Heiga Zen and Aleksandra Faust and @@ -847,7 +847,7 @@ "tasks_col": "language_instruction", "license": "mit", "url": "https://ut-austin-rpl.github.io/VIOLA/", - "paper": "https://arxiv.org/abs/2210.11339", + "paper": "https://huggingface.co/papers/2210.11339", "citation_bibtex": dedent(r""" @article{zhu2022viola, title={VIOLA: Imitation Learning for Vision-Based Manipulation with Object Proposal Priors}, diff --git a/lerobot/common/policies/act/modeling_act.py 
b/lerobot/common/policies/act/modeling_act.py index 72d4df03a2..e7e74bf380 100644 --- a/lerobot/common/policies/act/modeling_act.py +++ b/lerobot/common/policies/act/modeling_act.py @@ -15,7 +15,7 @@ # limitations under the License. """Action Chunking Transformer Policy -As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). +As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://huggingface.co/papers/2304.13705). The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. """ @@ -41,7 +41,7 @@ class ACTPolicy(PreTrainedPolicy): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost - Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) + Hardware (paper: https://huggingface.co/papers/2304.13705, code: https://github.com/tonyzhaozh/act) """ config_class = ACTConfig @@ -161,7 +161,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]: # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for # each dimension independently, we sum over the latent dimension to get the total # KL-divergence per batch element, then take the mean over the batch. - # (See App. B of https://arxiv.org/abs/1312.6114 for more details). + # (See App. B of https://huggingface.co/papers/1312.6114 for more details). mean_kld = ( (-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean() ) @@ -175,7 +175,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]: class ACTTemporalEnsembler: def __init__(self, temporal_ensemble_coeff: float, chunk_size: int) -> None: - """Temporal ensembling as described in Algorithm 2 of https://arxiv.org/abs/2304.13705. + """Temporal ensembling as described in Algorithm 2 of https://huggingface.co/papers/2304.13705. The weights are calculated as wᵢ = exp(-temporal_ensemble_coeff * i) where w₀ is the oldest action. They are then normalized to sum to 1 by dividing by Σwᵢ. Here's some intuition around how the diff --git a/lerobot/common/policies/diffusion/configuration_diffusion.py b/lerobot/common/policies/diffusion/configuration_diffusion.py index e73c65fe9a..c8841f06b9 100644 --- a/lerobot/common/policies/diffusion/configuration_diffusion.py +++ b/lerobot/common/policies/diffusion/configuration_diffusion.py @@ -81,7 +81,7 @@ class DiffusionConfig(PreTrainedConfig): n_groups: Number of groups used in the group norm of the Unet's convolutional blocks. diffusion_step_embed_dim: The Unet is conditioned on the diffusion timestep via a small non-linear network. This is the output dimension of that network, i.e., the embedding dimension. - use_film_scale_modulation: FiLM (https://arxiv.org/abs/1709.07871) is used for the Unet conditioning. + use_film_scale_modulation: FiLM (https://huggingface.co/papers/1709.07871) is used for the Unet conditioning. Bias modulation is used be default, while this parameter indicates whether to also use scale modulation. noise_scheduler_type: Name of the noise scheduler to use. Supported options: ["DDPM", "DDIM"]. 
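A quick note on the ACTTemporalEnsembler docstring touched above: the weighting scheme it describes is small enough to sanity-check in isolation. Below is a minimal sketch, assuming a standalone NumPy reimplementation; the function name and shapes are illustrative and not part of the class's actual interface.

# Illustrative sketch of the exponential weighting described in
# ACTTemporalEnsembler's docstring (Algorithm 2 of the ACT paper);
# not the class's actual code.
import numpy as np

def ensemble_weights(n: int, coeff: float) -> np.ndarray:
    # w_i = exp(-coeff * i), with w_0 the weight of the oldest prediction.
    w = np.exp(-coeff * np.arange(n))
    return w / w.sum()  # normalize so the weights sum to 1

# A small coeff such as 0.01 gives near-uniform weights; larger values
# favor older predictions, since w_0 is the largest before normalization.
print(ensemble_weights(4, 0.01))  # ~[0.254, 0.251, 0.249, 0.246]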
diff --git a/lerobot/common/policies/diffusion/modeling_diffusion.py b/lerobot/common/policies/diffusion/modeling_diffusion.py index 9ecadcb05b..d3f57775f2 100644 --- a/lerobot/common/policies/diffusion/modeling_diffusion.py +++ b/lerobot/common/policies/diffusion/modeling_diffusion.py @@ -48,7 +48,7 @@ class DiffusionPolicy(PreTrainedPolicy): """ Diffusion Policy as per "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion" - (paper: https://arxiv.org/abs/2303.04137, code: https://github.com/real-stanford/diffusion_policy). + (paper: https://huggingface.co/papers/2303.04137, code: https://github.com/real-stanford/diffusion_policy). """ config_class = DiffusionConfig @@ -370,7 +370,7 @@ def compute_loss(self, batch: dict[str, Tensor]) -> Tensor: class SpatialSoftmax(nn.Module): """ Spatial Soft Argmax operation described in "Deep Spatial Autoencoders for Visuomotor Learning" by Finn et al. - (https://arxiv.org/pdf/1509.06113). A minimal port of the robomimic implementation. + (https://huggingface.co/papers/1509.06113). A minimal port of the robomimic implementation. At a high level, this takes 2D feature maps (from a convnet/ViT) and returns the "center of mass" of activations of each channel, i.e., keypoints in the image space for the policy to focus on. @@ -728,7 +728,7 @@ def __init__( self.conv1 = DiffusionConv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups) - # FiLM modulation (https://arxiv.org/abs/1709.07871) outputs per-channel bias and (maybe) scale. + # FiLM modulation (https://huggingface.co/papers/1709.07871) outputs per-channel bias and (maybe) scale. cond_channels = out_channels * 2 if use_film_scale_modulation else out_channels self.cond_encoder = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, cond_channels)) diff --git a/lerobot/common/policies/pi0fast/modeling_pi0fast.py b/lerobot/common/policies/pi0fast/modeling_pi0fast.py index 36aafce94b..7a6dfd7b65 100644 --- a/lerobot/common/policies/pi0fast/modeling_pi0fast.py +++ b/lerobot/common/policies/pi0fast/modeling_pi0fast.py @@ -17,7 +17,7 @@ """ π0+FAST: Efficient Action Tokenization for Vision-Language-Action Models -[Paper](https://arxiv.org/abs/2501.09747) +[Paper](https://huggingface.co/papers/2501.09747) [Jax code](https://github.com/Physical-Intelligence/openpi) Designed by Physical Intelligence. Ported from Jax by Hugging Face. diff --git a/lerobot/common/policies/tdmpc/modeling_tdmpc.py b/lerobot/common/policies/tdmpc/modeling_tdmpc.py index b46ae9030b..a43857a82b 100644 --- a/lerobot/common/policies/tdmpc/modeling_tdmpc.py +++ b/lerobot/common/policies/tdmpc/modeling_tdmpc.py @@ -17,8 +17,8 @@ """Implementation of Finetuning Offline World Models in the Real World. 
The comments in this code may sometimes refer to these references: - TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://arxiv.org/abs/2203.04955) - FOWM paper: Finetuning Offline World Models in the Real World (https://arxiv.org/abs/2310.16029) + TD-MPC paper: Temporal Difference Learning for Model Predictive Control (https://huggingface.co/papers/2203.04955) + FOWM paper: Finetuning Offline World Models in the Real World (https://huggingface.co/papers/2310.16029) """ # ruff: noqa: N806 diff --git a/lerobot/common/policies/vqbet/modeling_vqbet.py b/lerobot/common/policies/vqbet/modeling_vqbet.py index 97a08e2f4f..44006a5b21 100644 --- a/lerobot/common/policies/vqbet/modeling_vqbet.py +++ b/lerobot/common/policies/vqbet/modeling_vqbet.py @@ -162,7 +162,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]: batch = dict(batch) # shallow copy so that adding a key doesn't modify the original batch["observation.images"] = torch.stack([batch[key] for key in self.config.image_features], dim=-4) batch = self.normalize_targets(batch) - # VQ-BeT discretizes action using VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://arxiv.org/pdf/2403.03181) + # VQ-BeT discretizes action using VQ-VAE before training BeT (please refer to section 3.2 in the VQ-BeT paper https://huggingface.co/papers/2403.03181) if not self.vqbet.action_head.vqvae_model.discretized.item(): # loss: total loss of training RVQ # n_different_codes: how many of the total possible VQ codes are being used in single batch (how many of them have at least one encoder embedding as a nearest neighbor). This can be at most `vqvae_n_embed * number of layers of RVQ (=2)`. @@ -185,7 +185,7 @@ def forward(self, batch: dict[str, Tensor]) -> tuple[Tensor, dict]: class SpatialSoftmax(nn.Module): """ Spatial Soft Argmax operation described in "Deep Spatial Autoencoders for Visuomotor Learning" by Finn et al. - (https://arxiv.org/pdf/1509.06113). A minimal port of the robomimic implementation. + (https://huggingface.co/papers/1509.06113). A minimal port of the robomimic implementation. At a high level, this takes 2D feature maps (from a convnet/ViT) and returns the "center of mass" of activations of each channel, i.e., keypoints in the image space for the policy to focus on. @@ -387,7 +387,7 @@ def forward(self, batch: dict[str, Tensor], rollout: bool) -> tuple[dict, dict]: # only extract the output tokens at the position of action query: # Behavior Transformer (BeT), and VQ-BeT are both sequence-to-sequence prediction models, - # mapping sequential observation to sequential action (please refer to section 2.2 in BeT paper https://arxiv.org/pdf/2206.11251). + # mapping sequential observation to sequential action (please refer to section 2.2 in BeT paper https://huggingface.co/papers/2206.11251). # Thus, it predicts a historical action sequence, in addition to current and future actions (predicting future actions : optional). if len_additional_action_token > 0: features = torch.cat( @@ -824,8 +824,8 @@ def get_action_from_latent(self, latent): return einops.rearrange(output, "N (T A) -> N T A", A=self.config.action_feature.shape[0]) def get_code(self, state): - # in phase 2 of VQ-BeT training, we need a `ground truth labels of action data` to calculate the Focal loss for code prediction head. (please refer to section 3.3 in the paper https://arxiv.org/pdf/2403.03181) - # this function outputs the `GT code` of given action using frozen encoder and quantization layers. 
(please refer to Figure 2. in the paper https://arxiv.org/pdf/2403.03181) + # in phase 2 of VQ-BeT training, we need a `ground truth labels of action data` to calculate the Focal loss for code prediction head. (please refer to section 3.3 in the paper https://huggingface.co/papers/2403.03181) + # this function outputs the `GT code` of given action using frozen encoder and quantization layers. (please refer to Figure 2. in the paper https://huggingface.co/papers/2403.03181) state = einops.rearrange(state, "N T A -> N (T A)") with torch.no_grad(): state_rep = self.encoder(state) @@ -838,7 +838,7 @@ def get_code(self, state): return state_vq, vq_code def vqvae_forward(self, state): - # This function passes the given data through Residual VQ with Encoder and Decoder. Please refer to section 3.2 in the paper https://arxiv.org/pdf/2403.03181). + # This function passes the given data through Residual VQ with Encoder and Decoder. Please refer to section 3.2 in the paper https://huggingface.co/papers/2403.03181. state = einops.rearrange(state, "N T A -> N (T A)") # We start with passing action (or action chunk) at:t+n through the encoder ϕ. state_rep = self.encoder(state) diff --git a/lerobot/common/policies/vqbet/vqbet_utils.py b/lerobot/common/policies/vqbet/vqbet_utils.py index 139d119edc..09a86c07ba 100644 --- a/lerobot/common/policies/vqbet/vqbet_utils.py +++ b/lerobot/common/policies/vqbet/vqbet_utils.py @@ -336,7 +336,7 @@ class ResidualVQ(nn.Module): """ Residual VQ is composed of multiple VectorQuantize layers. - Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf + Follows Algorithm 1. in https://huggingface.co/papers/2107.03312 "Residual Vector Quantizer (a.k.a. multi-stage vector quantizer [36]) cascades Nq layers of VQ as follows. The unquantized input vector is passed through a first VQ and quantization residuals are computed. The residuals are then iteratively quantized by a sequence of additional Nq -1 vector quantizers, as described in Algorithm 1." @@ -1006,7 +1006,7 @@ def gumbel_sample( if not straight_through or temperature <= 0.0 or not training: return ind, one_hot - # use reinmax for better second-order accuracy - https://arxiv.org/abs/2304.08612 + # use reinmax for better second-order accuracy - https://huggingface.co/papers/2304.08612 # algorithm 2 if reinmax: @@ -1156,7 +1156,7 @@ def batched_embedding(indices, embeds): def orthogonal_loss_fn(t): - # eq (2) from https://arxiv.org/abs/2112.00384 + # eq (2) from https://huggingface.co/papers/2112.00384 h, n = t.shape[:2] normed_codes = F.normalize(t, p=2, dim=-1) cosine_sim = einsum("h i d, h j d -> h i j", normed_codes, normed_codes)
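A closing note on the ResidualVQ hunk: the SoundStream passage quoted in its docstring (cascaded quantizers, each operating on the previous stage's residual) reduces to a short loop. Below is a minimal sketch under the assumption of plain NumPy codebooks; `residual_vq` and `codebooks` are illustrative names, not the module's learned VectorQuantize layers.

# Illustrative sketch of Algorithm 1 (residual vector quantization) from
# the SoundStream paper, as quoted in the ResidualVQ docstring.
# `codebooks` stands in for the learned VectorQuantize layers.
import numpy as np

def residual_vq(x: np.ndarray, codebooks: list[np.ndarray]):
    residual, quantized, codes = x, np.zeros_like(x), []
    for codebook in codebooks:  # codebook: (n_codes, dim), x: (dim,)
        # pick the codebook entry nearest to what is still unexplained
        idx = int(np.argmin(((codebook - residual) ** 2).sum(axis=-1)))
        quantized = quantized + codebook[idx]
        residual = residual - codebook[idx]  # the next layer quantizes the rest
        codes.append(idx)
    return quantized, codes  # summing the per-layer picks approximates x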