206 changes: 206 additions & 0 deletions examples/tutorial/smolvla/train_smolvla_example.py
@@ -0,0 +1,206 @@
from pathlib import Path
Copilot AI Dec 14, 2025

Missing module-level docstring. Other training examples in this repository (act_training_example.py, diffusion_training_example.py) include a docstring at the top that describes what the script demonstrates. Consider adding a similar docstring such as: """This script demonstrates how to train SmolVLA Policy on a real-world dataset."""
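
A minimal sketch of the suggested fix, placed before the first import (docstring wording taken verbatim from the comment above):

"""This script demonstrates how to train SmolVLA Policy on a real-world dataset."""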

import torch

from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.policies.factory import make_pre_post_processors
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy


# Output directory for saving the trained model
output_directory = Path("outputs/train/my_smolvla")
output_directory.mkdir(parents=True, exist_ok=True)

device = torch.device("cuda") # or "cuda" or "cpu"
Copilot AI Dec 14, 2025

The comment says 'or "cuda" or "cpu"' but the device is already set to "cuda", making this redundant. This should either say 'or "mps" or "cpu"' to match the other examples, or the device should be set to "mps" to be consistent with act_training_example.py and diffusion_training_example.py, which use torch.device("mps").

Suggested change
- device = torch.device("cuda")  # or "cuda" or "cpu"
+ device = torch.device("mps")  # or "cuda" or "cpu"

# Replace with your custom dataset repo_id (e.g., "${HF_USER}/mydataset")
dataset_id = "lerobot/svla_so100_pickplace"

# Model configuration
# Option 1: Load from a pretrained SmolVLA checkpoint (recommended for fine-tuning)
pretrained_model_id = "lerobot/smolvla_base"
load_from_pretrained = True # Set to False to train from scratch

# Option 2: Train from scratch (only if you have a large dataset and computational resources)
# This will initialize the model with a pretrained VLM backbone but random action expert
# load_from_pretrained = False

# Load dataset metadata to get features and statistics
print(f"Loading dataset metadata from {dataset_id}...")
dataset_metadata = LeRobotDatasetMetadata(dataset_id)

if load_from_pretrained:
    print(f"Loading pretrained model from {pretrained_model_id}...")
    policy = SmolVLAPolicy.from_pretrained(pretrained_model_id)

    # Create rename map to match dataset keys to model's expected keys
    rename_map = {
        "observation.images.top": "observation.images.camera1",
        "observation.images.wrist": "observation.images.camera2",
    }
Comment on lines +38 to +41
Copilot AI Dec 14, 2025

Hardcoded rename_map is specific to the svla_so100_pickplace dataset and will not work with other datasets that have different camera keys. Consider adding a comment explaining this mapping is dataset-specific and may need adjustment, or checking if the pretrained model's camera keys match the dataset's keys before applying the rename.
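
One way to implement the check this comment proposes, sketched under the assumption that policy.config.input_features lists the keys the pretrained model expects (the script relies on it the same way further down):

# Sketch only: warn when the pretrained model's image keys differ from the dataset's.
model_image_keys = {k for k in policy.config.input_features if "image" in k}
dataset_image_keys = {k for k in dataset_metadata.features if "image" in k}
if model_image_keys != dataset_image_keys:
    print(
        "Note: this rename_map is specific to svla_so100_pickplace; adjust it so "
        f"dataset keys {sorted(dataset_image_keys)} map onto model keys {sorted(model_image_keys)}."
    )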

    # Create preprocessor and postprocessor with dataset statistics
    # This is important for normalizing inputs/outputs to match your dataset
    preprocessor, postprocessor = make_pre_post_processors(
        policy.config,
        pretrained_path=pretrained_model_id,
        preprocessor_overrides={
            "device_processor": {"device": str(device)},
            "rename_observations_processor": {"rename_map": rename_map},
            "normalizer_processor": {
                "stats": dataset_metadata.stats,
                "features": {**policy.config.input_features, **policy.config.output_features},
                "norm_map": policy.config.normalization_mapping,
            },
        },
        postprocessor_overrides={
            "unnormalizer_processor": {
                "stats": dataset_metadata.stats,
                "features": policy.config.output_features,
                "norm_map": policy.config.normalization_mapping,
            },
        },
    )
else:
    print("Initializing new SmolVLA model from scratch...")
Copilot AI Dec 14, 2025

This statement is unreachable: load_from_pretrained is hardcoded to True above, so the else branch never runs.
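
One way to make both branches reachable, sketched with a made-up environment variable name (not part of the PR):

import os

# Sketch only: drive the flag from the environment instead of hardcoding it.
load_from_pretrained = os.environ.get("SMOLVLA_FROM_PRETRAINED", "1") == "1"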
    # Note: Training from scratch requires careful configuration
    # The input/output features must match your dataset structure
    from lerobot.configs.types import FeatureType
    from lerobot.datasets.utils import dataset_to_policy_features

    features = dataset_to_policy_features(dataset_metadata.features)
    output_features = {key: ft for key, ft in features.items() if ft.type is FeatureType.ACTION}
    input_features = {key: ft for key, ft in features.items() if key not in output_features}

    cfg = SmolVLAConfig(input_features=input_features, output_features=output_features)
    cfg.load_vlm_weights = True  # Load pretrained VLM backbone
    policy = SmolVLAPolicy(cfg)

    preprocessor, postprocessor = make_pre_post_processors(
        cfg, dataset_stats=dataset_metadata.stats
    )

# Set the policy to training mode and move it to the selected device
policy.train()
policy.to(device)


def make_delta_timestamps(delta_indices: list[int] | None, fps: int) -> list[float]:
"""Convert delta indices to delta timestamps based on dataset FPS."""
if delta_indices is None:
return [0]
return [i / fps for i in delta_indices]


# SmolVLA expects action sequences of length chunk_size (default 50)
# and single observation frames (n_obs_steps=1)
delta_timestamps = {
    "action": make_delta_timestamps(policy.config.action_delta_indices, dataset_metadata.fps),
}

# Add delta timestamps for image features that actually exist in the dataset
dataset_image_keys = [k for k in dataset_metadata.features.keys() if "image" in k.lower()]
delta_timestamps |= {
    k: make_delta_timestamps(policy.config.observation_delta_indices, dataset_metadata.fps)
    for k in dataset_image_keys
}

# Add delta timestamp for state if present
if "observation.state" in dataset_metadata.features:
delta_timestamps["observation.state"] = make_delta_timestamps(
policy.config.observation_delta_indices, dataset_metadata.fps
)

# Load the dataset with appropriate delta timestamps
print(f"Loading dataset {dataset_id}...")
dataset = LeRobotDataset(dataset_id, delta_timestamps=delta_timestamps)
print(f"Dataset loaded: {dataset.num_episodes} episodes, {dataset.num_frames} frames")

# Training configuration
batch_size = 64 # Adjust based on your GPU memory (64 works well for most GPUs)
training_steps = 20000 # Number of training steps (increase for better performance)
log_freq = 100 # Log every N steps

# Create optimizer and scheduler using SmolVLA's preset configurations
optimizer = policy.config.get_optimizer_preset().build(policy.parameters())
lr_scheduler = policy.config.get_scheduler_preset().build(optimizer, num_training_steps=training_steps)

# Create dataloader for offline training
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=device.type == "cuda",
Copilot AI Dec 14, 2025

Inconsistent pin_memory condition. This script uses device.type == "cuda" but other training examples (act_training_example.py:64, diffusion_training_example.py:65) use device.type != "cpu". The latter is more inclusive, as it also covers MPS devices. Consider changing to device.type != "cpu" for consistency.

Suggested change
-     pin_memory=device.type == "cuda",
+     pin_memory=device.type != "cpu",
    drop_last=True,
    num_workers=4,  # Adjust based on your system
)

print(f"\nStarting training for {training_steps} steps...")
print(f"Batch size: {batch_size}")
print(f"Device: {device}")
print(f"Output directory: {output_directory}")
print("-" * 80)

# Training loop
step = 0
done = False
while not done:
    for batch in dataloader:
        # Preprocess the batch (normalization, tokenization, etc.)
        batch = preprocessor(batch)

        # Forward pass: compute loss
        loss, output_dict = policy.forward(batch)

        # Backward pass and optimization
        loss.backward()
Copilot AI Dec 14, 2025

Missing gradient clipping before optimizer step. The SmolVLA config defines grad_clip_norm=10 (via get_optimizer_preset), but gradient clipping must be applied manually. Add torch.nn.utils.clip_grad_norm_(policy.parameters(), optimizer_config.grad_clip_norm) after loss.backward() and before optimizer.step().

Suggested change
-         loss.backward()
+         loss.backward()
+         torch.nn.utils.clip_grad_norm_(policy.parameters(), optimizer_config.grad_clip_norm)
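
As written, the script never defines optimizer_config, so the suggested line would raise a NameError. A sketch of one way to make it self-contained, assuming the preset object returned by get_optimizer_preset() exposes grad_clip_norm (as the comment above states):

# Sketch only: keep a handle on the optimizer preset so grad_clip_norm is available later.
optimizer_config = policy.config.get_optimizer_preset()
optimizer = optimizer_config.build(policy.parameters())

# ... then inside the training loop, after loss.backward():
torch.nn.utils.clip_grad_norm_(policy.parameters(), optimizer_config.grad_clip_norm)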
        optimizer.step()
        optimizer.zero_grad()

        # Update learning rate
        if lr_scheduler is not None:
            lr_scheduler.step()

        # Logging
        if step % log_freq == 0:
            current_lr = optimizer.param_groups[0]["lr"]
            print(
                f"Step: {step}/{training_steps} | "
                f"Loss: {loss.item():.4f} | "
                f"LR: {current_lr:.2e}"
            )

        step += 1
        if step >= training_steps:
            done = True
            break

print("-" * 80)
print("Training completed!")

# Save the trained model
print(f"\nSaving model to {output_directory}...")
policy.save_pretrained(output_directory)
preprocessor.save_pretrained(output_directory)
postprocessor.save_pretrained(output_directory)
print("Model saved successfully!")

# Optional: Push to Hugging Face Hub
# Uncomment and update with your Hugging Face username
push_to_hub = False # Set to True to push to Hub
hub_repo_id = "YOUR_HF_USERNAME/my_smolvla_so101" # Replace with your repo ID
Copilot AI Dec 14, 2025

Inconsistent dataset naming. The dataset ID uses 'svla_so100_pickplace' but line 192's comment refers to 'my_smolvla_so101' and line 205 mentions 'SO101 robot'. The documentation at docs/source/smolvla.mdx:40 confirms the dataset is 'svla_so100_pickplace'. The comments should consistently use SO100 to match the dataset, or clarify if SO101 is intentionally different.

if push_to_hub:
print(f"\nPushing model to Hugging Face Hub: {hub_repo_id}...")
Copilot AI Dec 14, 2025

This statement is unreachable: push_to_hub is hardcoded to False above, so this branch never runs.
    policy.push_to_hub(hub_repo_id)
    preprocessor.push_to_hub(hub_repo_id)
    postprocessor.push_to_hub(hub_repo_id)
    print(f"Model pushed to: https://huggingface.co/{hub_repo_id}")

print("\n" + "=" * 80)
print("Training complete! Next steps:")
print("1. Test the model with: examples/tutorial/smolvla/using_smolvla_example.py")
print(f"2. Update model_id in the script to: {output_directory}")
print("3. Deploy on your SO101 robot!")
Copilot AI Dec 14, 2025

Inconsistent reference to robot type. The comment mentions 'SO101 robot' but the dataset being used is 'svla_so100_pickplace' (line 18). This should be 'SO100 robot' to match the dataset, or clarified if SO101 is a different robot model.

Suggested change
- print("3. Deploy on your SO101 robot!")
+ print("3. Deploy on your SO100 robot!")
print("=" * 80)