diff --git a/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh
new file mode 100644
index 0000000000..014ee567fe
--- /dev/null
+++ b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b.sh
@@ -0,0 +1,120 @@
+set -x  # trace every command (with expanded values) to stderr
+
+# Colocated DAPO training+generation for Qwen3-4B-Base on DAPO training data with Megatron.
+# Usage: bash examples/algorithms/dapo/prepare_dapo_data.sh   (run first; presumably writes the parquet files under $DATA_DIR)
+#        bash examples/megatron/run_megatron_dapo_qwen3_4b.sh [extra key=value overrides, appended to the launch command]
+
+MODEL_NAME="Qwen/Qwen3-4B-Base"
+DATA_DIR="$HOME/data/dapo"
+TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet"
+TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet"  # passed as data.val_data
+NUM_NODES=1
+NUM_GPUS_PER_NODE=8
+NUM_INFERENCE_ENGINES=4
+INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=2  # 4 engines x TP 2 = 8 GPUs = NUM_NODES * NUM_GPUS_PER_NODE
+LOGGER="wandb"  # change to "console" to print to stdout
+
+CLIP_RATIO_LOW=0.2  # -> trainer.algorithm.eps_clip_low
+CLIP_RATIO_HIGH=0.28  # -> trainer.algorithm.eps_clip_high
+# use token mean loss reduction
+LOSS_REDUCTION="token_mean"
+# applies overlong filtering (but not soft overlong punishment)
+APPLY_OVERLONG_FILTERING=true
+# apply soft overlong punishment with custom trainer impl in main_dapo.py
+OVERLONG_BUFFER_LEN=$((1024 * 4))  # 4096
+OVERLONG_BUFFER_PENALTY_FACTOR=1.0
+
+# other DAPO parameters
+USE_KL_LOSS=false
+TEMPERATURE=1.0  # used for both training and eval sampling params
+TOP_P=1.0
+EVAL_TOP_P=0.7  # top_p for eval sampling only
+CLIP_RATIO_C=10.0  # -> trainer.algorithm.clip_ratio_c
+MAX_PROMPT_LENGTH=$((1024 * 2))  # 2048
+MAX_RESPONSE_LENGTH=$((1024 * 8))  # 8192 -> generator.sampling_params.max_generate_length
+
+# repro run parameters
+TRAIN_BATCH_SIZE=512
+MINI_BATCH_SIZE=32  # -> trainer.policy_mini_batch_size
+N_SAMPLES_PER_PROMPT=16
+EVAL_N_SAMPLES_PER_PROMPT=32
+ENFORCE_EAGER=true # cuda graphs can cause some instability
+LR=1e-6
+
+# megatron config
+MEGATRON_TP=4
+MEGATRON_PP=2  # TP 4 x PP 2 = 8 GPUs, i.e. all of NUM_GPUS_PER_NODE
+MEGATRON_CP=1
+MEGATRON_EP=1
+MEGATRON_ETP=null  # forwarded as the literal text "null" for expert_tensor_parallel_size
+
+# TIS parameters
+TIS_IMP_RATIO_CAP=2.0
+USE_TIS=true
+
+uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \
+  data.train_data="['$TRAIN_FILE']" \
+  data.val_data="['$TEST_FILE']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.algorithm.policy_loss_type="dual_clip" \
+  +trainer.algorithm.overlong_buffer.len=$OVERLONG_BUFFER_LEN \
+  +trainer.algorithm.overlong_buffer.penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \
+  trainer.algorithm.loss_reduction=$LOSS_REDUCTION \
+  generator.enforce_eager=$ENFORCE_EAGER \
+  generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \
+  generator.sampling_params.temperature=$TEMPERATURE \
+  generator.sampling_params.top_p=$TOP_P \
+  generator.eval_sampling_params.top_p=$EVAL_TOP_P \
+  generator.eval_sampling_params.temperature=$TEMPERATURE \
+  trainer.algorithm.use_kl_loss=$USE_KL_LOSS \
+  trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \
+  trainer.policy.model.path="$MODEL_NAME" \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \
+  generator.num_inference_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.algorithm.use_tis=$USE_TIS \
+  trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \
+  trainer.epochs=20 \
+  trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \
+  trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=false \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=$TRAIN_BATCH_SIZE \
+  trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \
+  trainer.micro_forward_batch_size_per_gpu=8 \
+  trainer.micro_train_batch_size_per_gpu=8 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
+  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.optimizer_config.lr=$LR \
+  trainer.policy.optimizer_config.num_warmup_steps=160 \
+  trainer.policy.optimizer_config.weight_decay=0.1 \
+  trainer.policy.optimizer_config.max_grad_norm=1.0 \
+  generator.backend=vllm \
+  generator.run_engines_locally=true \
+  generator.weight_sync_backend=nccl \
+  generator.async_engine=false \
+  generator.batched=true \
+  environment.env_class=aime \
+  generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
+  generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \
+  generator.gpu_memory_utilization=0.8 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="dapo_aime" \
+  trainer.run_name="dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}" \
+  trainer.export_path="$HOME/exports/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}" \
+  trainer.hf_save_interval=25 \
+  trainer.resume_mode=latest \
+  trainer.max_ckpts_to_keep=3 \
+  trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}" \
+  "$@"
\ No newline at end of file
diff --git a/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh
new file mode 100644
index 0000000000..25dc8e6d30
--- /dev/null
+++ b/skyrl-train/examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh
@@ -0,0 +1,128 @@
+set -x  # trace every command (with expanded values) to stderr
+
+# Colocated DAPO training+generation for Qwen3-4B-Base on DAPO training data with Megatron and LoRA.
+# Usage: bash examples/algorithms/dapo/prepare_dapo_data.sh   (run first; presumably writes the parquet files under $DATA_DIR)
+#        bash examples/megatron/run_megatron_dapo_qwen3_4b_lora.sh [extra key=value overrides, appended to the launch command]
+
+MODEL_NAME="Qwen/Qwen3-4B-Base"
+DATA_DIR="$HOME/data/dapo"
+TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet"
+TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet"  # passed as data.val_data
+NUM_NODES=1
+NUM_GPUS_PER_NODE=8
+NUM_INFERENCE_ENGINES=4
+INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=2  # 4 engines x TP 2 = 8 GPUs = NUM_NODES * NUM_GPUS_PER_NODE
+LOGGER="wandb"  # change to "console" to print to stdout
+
+CLIP_RATIO_LOW=0.2  # -> trainer.algorithm.eps_clip_low
+CLIP_RATIO_HIGH=0.28  # -> trainer.algorithm.eps_clip_high
+# use token mean loss reduction
+LOSS_REDUCTION="token_mean"
+# applies overlong filtering (but not soft overlong punishment)
+APPLY_OVERLONG_FILTERING=true
+# apply soft overlong punishment with custom trainer impl in main_dapo.py
+OVERLONG_BUFFER_LEN=$((1024 * 4))  # 4096
+OVERLONG_BUFFER_PENALTY_FACTOR=1.0
+
+# other DAPO parameters
+USE_KL_LOSS=false
+TEMPERATURE=1.0  # used for both training and eval sampling params
+TOP_P=1.0
+EVAL_TOP_P=0.7  # top_p for eval sampling only
+CLIP_RATIO_C=10.0  # -> trainer.algorithm.clip_ratio_c
+MAX_PROMPT_LENGTH=$((1024 * 2))  # 2048
+MAX_RESPONSE_LENGTH=$((1024 * 8))  # 8192 -> generator.sampling_params.max_generate_length
+
+# repro run parameters
+TRAIN_BATCH_SIZE=512
+MINI_BATCH_SIZE=32  # -> trainer.policy_mini_batch_size
+N_SAMPLES_PER_PROMPT=16
+EVAL_N_SAMPLES_PER_PROMPT=32
+ENFORCE_EAGER=true # cuda graphs can cause some instability
+LR=3e-5
+
+# megatron config
+MEGATRON_TP=4
+MEGATRON_PP=1  # TP 4 x PP 1 = 4 of the 8 GPUs; presumably the remaining factor of 2 is data parallel - TODO confirm
+MEGATRON_CP=1
+MEGATRON_EP=1
+MEGATRON_ETP=null  # forwarded as the literal text "null" for expert_tensor_parallel_size
+
+# lora config
+LORA_RANK=32  # -> trainer.policy.model.lora.rank
+LORA_ALPHA=64  # -> trainer.policy.model.lora.alpha
+LORA_A_INIT_METHOD="kaiming"  # -> trainer.policy.model.lora.init_method
+
+# TIS parameters
+TIS_IMP_RATIO_CAP=2.0
+USE_TIS=true
+
+uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \
+  data.train_data="['$TRAIN_FILE']" \
+  data.val_data="['$TEST_FILE']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.algorithm.policy_loss_type="dual_clip" \
+  +trainer.algorithm.overlong_buffer.len=$OVERLONG_BUFFER_LEN \
+  +trainer.algorithm.overlong_buffer.penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \
+  trainer.algorithm.loss_reduction=$LOSS_REDUCTION \
+  generator.enforce_eager=$ENFORCE_EAGER \
+  generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \
+  generator.sampling_params.temperature=$TEMPERATURE \
+  generator.sampling_params.top_p=$TOP_P \
+  generator.eval_sampling_params.top_p=$EVAL_TOP_P \
+  generator.eval_sampling_params.temperature=$TEMPERATURE \
+  trainer.algorithm.use_kl_loss=$USE_KL_LOSS \
+  trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \
+  trainer.policy.model.path="$MODEL_NAME" \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \
+  generator.num_inference_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.algorithm.use_tis=$USE_TIS \
+  trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \
+  trainer.policy.model.lora.rank=$LORA_RANK \
+  trainer.policy.model.lora.alpha=$LORA_ALPHA \
+  trainer.policy.model.lora.init_method=$LORA_A_INIT_METHOD \
+  trainer.epochs=20 \
+  trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \
+  trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=true \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=$TRAIN_BATCH_SIZE \
+  trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \
+  trainer.micro_forward_batch_size_per_gpu=8 \
+  trainer.micro_train_batch_size_per_gpu=8 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
+  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
+  trainer.policy.optimizer_config.lr=$LR \
+  trainer.policy.optimizer_config.num_warmup_steps=160 \
+  trainer.policy.optimizer_config.weight_decay=0.1 \
+  trainer.policy.optimizer_config.max_grad_norm=1.0 \
+  generator.backend=vllm \
+  generator.run_engines_locally=true \
+  generator.weight_sync_backend=nccl \
+  generator.async_engine=false \
+  generator.batched=true \
+  environment.env_class=aime \
+  generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
+  generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \
+  generator.gpu_memory_utilization=0.8 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="dapo_aime" \
+  trainer.run_name="dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \
+  trainer.export_path="$HOME/exports/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \
+  trainer.hf_save_interval=300 \
+  trainer.resume_mode=latest \
+  trainer.max_ckpts_to_keep=3 \
+  trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_4b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \
+  "$@"
\ No newline at end of file
diff --git a/skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh
new file mode 100644
index 0000000000..02cf41c70d
--- /dev/null
+++ b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-0.6b.sh
@@ -0,0 +1,76 @@
+set -x  # trace every command (with expanded values) to stderr
+
+# Colocated GRPO training+generation for Qwen3-0.6B on GSM8K with Megatron and LoRA.
+
+# uv run examples/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k
+# export WANDB_API_KEY=<your_key_here>
+# bash examples/megatron/run_megatron_lora_qwen3-0.6b.sh [extra key=value overrides, appended to the launch command]
+
+DATA_DIR="$HOME/data/gsm8k"  # train.parquet and validation.parquet are read from here
+NUM_GPUS=8  # used for policy/ref GPUs per node and as the number of inference engines (TP 1 each)
+LOGGER="wandb"  # change to "console" to print to stdout
+MODEL_NAME="Qwen/Qwen3-0.6B"  # also embedded in trainer.run_name
+
+INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron
+
+MEGATRON_TP=1  # TP/PP/CP are applied to both the policy and ref megatron_config
+MEGATRON_PP=1
+MEGATRON_CP=1
+
+# LoRA configuration
+LORA_RANK=32  # -> trainer.policy.model.lora.rank
+LORA_ALPHA=64  # -> trainer.policy.model.lora.alpha
+LORA_A_INIT_METHOD="kaiming"  # -> trainer.policy.model.lora.init_method
+
+
+uv run --isolated --extra mcore -m skyrl_train.entrypoints.main_base \
+  data.train_data="['$DATA_DIR/train.parquet']" \
+  data.val_data="['$DATA_DIR/validation.parquet']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.policy.model.path=$MODEL_NAME \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \
+  trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \
+  generator.num_inference_engines=$NUM_GPUS \
+  generator.inference_engine_tensor_parallel_size=1 \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.ref.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.ref.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.ref.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.model.lora.rank=$LORA_RANK \
+  trainer.policy.model.lora.alpha=$LORA_ALPHA \
+  trainer.policy.model.lora.init_method=$LORA_A_INIT_METHOD \
+  trainer.gradient_checkpointing=true \
+  trainer.policy.model.lora.target_modules="all-linear" \
+  trainer.use_sample_packing=true \
+  trainer.epochs=20 \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=false \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=128 \
+  trainer.policy_mini_batch_size=64 \
+  trainer.micro_forward_batch_size_per_gpu=4 \
+  trainer.micro_train_batch_size_per_gpu=4 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=512 \
+  generator.sampling_params.max_generate_length=1024 \
+  trainer.policy.optimizer_config.lr=1.0e-5 \
+  trainer.algorithm.use_kl_loss=true \
+  generator.backend=$INFERENCE_BACKEND \
+  generator.run_engines_locally=true \
+  generator.weight_sync_backend=nccl \
+  generator.async_engine=true \
+  generator.batched=true \
+  environment.env_class=gsm8k \
+  generator.n_samples_per_prompt=5 \
+  generator.gpu_memory_utilization=0.6 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="gsm8k_megatron" \
+  trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_${MODEL_NAME}_lora_r${LORA_RANK}_a${LORA_ALPHA}" \
+  trainer.resume_mode=null \
+  trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \
+  "$@"
\ No newline at end of file
diff --git a/skyrl-train/examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh
new file mode 100644
index 0000000000..6c2f3a899b
--- /dev/null
+++ b/skyrl-train/examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh
@@ -0,0 +1,103 @@
+set -x  # trace every command (with expanded values) to stderr
+
+# Colocated GRPO training+generation for Qwen3-30B-A3B on GSM8K with Megatron and LoRA.
+# Should run on 1 node of 8xH100s
+
+# uv run examples/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k
+# export WANDB_API_KEY=<your_key_here>
+# bash examples/megatron/run_megatron_lora_qwen3-30b-a3b.sh [extra key=value overrides, appended to the launch command]
+
+DATA_DIR="$HOME/data/gsm8k"  # train.parquet and validation.parquet are read from here
+LOGGER="wandb"  # change to "console" to print to stdout
+MODEL_NAME="Qwen/Qwen3-30B-A3B"
+
+INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron
+
+NUM_NODES=1  # applied to both policy and ref placement
+NUM_GPUS=8  # GPUs per node, applied to both policy and ref placement
+
+MEGATRON_TP=4  # TP/PP/CP/EP/ETP are applied to both the policy and ref megatron_config
+MEGATRON_PP=1
+MEGATRON_CP=1
+MEGATRON_EP=8
+MEGATRON_ETP=1
+
+NUM_INFERENCE_ENGINES=1
+INFERENCE_ENGINE_TP=8  # 1 engine x TP 8 = 8 GPUs = NUM_NODES * NUM_GPUS
+FLASH_ATTN=true  # -> trainer.flash_attn
+
+# Megatron gradient checkpointing config
+RECOMPUTE_GRANULARITY="full"  # the three RECOMPUTE_* values go to trainer.policy.megatron_config.transformer_config_kwargs
+RECOMPUTE_METHOD="uniform"
+RECOMPUTE_NUM_LAYERS=1
+
+# LoRA configuration
+LORA_RANK=64  # -> trainer.policy.model.lora.rank (policy only; no trainer.ref.* LoRA setting is passed)
+LORA_ALPHA=64  # -> trainer.policy.model.lora.alpha
+LORA_A_INIT_METHOD="kaiming"  # -> trainer.policy.model.lora.init_method
+
+# TIS parameters
+TIS_IMP_RATIO_CAP=2.0
+USE_TIS=true
+
+uv run --isolated --extra mcore -m skyrl_train.entrypoints.main_base \
+  data.train_data="['$DATA_DIR/train.parquet']" \
+  data.val_data="['$DATA_DIR/validation.parquet']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.policy.model.path=$MODEL_NAME \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.ref_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \
+  trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \
+  generator.num_inference_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TP \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.ref.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.ref.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.ref.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.ref.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
+  trainer.ref.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
+  trainer.algorithm.use_tis=$USE_TIS \
+  trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \
+  trainer.policy.model.lora.rank=$LORA_RANK \
+  trainer.policy.model.lora.alpha=$LORA_ALPHA \
+  trainer.policy.model.lora.init_method=$LORA_A_INIT_METHOD \
+  trainer.policy.megatron_config.transformer_config_kwargs.recompute_granularity=$RECOMPUTE_GRANULARITY \
+  trainer.policy.megatron_config.transformer_config_kwargs.recompute_method=$RECOMPUTE_METHOD \
+  trainer.policy.megatron_config.transformer_config_kwargs.recompute_num_layers=$RECOMPUTE_NUM_LAYERS \
+  trainer.use_sample_packing=true \
+  trainer.flash_attn=$FLASH_ATTN \
+  trainer.epochs=20 \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=false \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=128 \
+  trainer.policy_mini_batch_size=64 \
+  trainer.micro_forward_batch_size_per_gpu=8 \
+  trainer.micro_train_batch_size_per_gpu=8 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=512 \
+  generator.sampling_params.max_generate_length=1024 \
+  trainer.policy.optimizer_config.lr=1.0e-5 \
+  trainer.algorithm.use_kl_loss=true \
+  generator.backend=$INFERENCE_BACKEND \
+  generator.run_engines_locally=true \
+  generator.weight_sync_backend=nccl \
+  generator.async_engine=true \
+  generator.batched=true \
+  environment.env_class=gsm8k \
+  generator.n_samples_per_prompt=5 \
+  generator.gpu_memory_utilization=0.6 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="gsm8k_megatron" \
+  trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_qwen3_30b_a3b" \
+  trainer.resume_mode=null \
+  trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \
+  "$@"
\ No newline at end of file
diff --git a/skyrl-train/pyproject.toml b/skyrl-train/pyproject.toml
index 93618f8bf6..56f8aee4bb 100644
--- a/skyrl-train/pyproject.toml
+++ b/skyrl-train/pyproject.toml
@@ -100,7 +100,7 @@ flashinfer-jit-cache = { index = "flashinfer-cu128", marker = "extra == 'vllm' o
 flashinfer-python = [
     { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl", marker = "extra == 'sglang' and extra != 'mcore' and extra != 'vllm'" }
 ]
-megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "22ef9ff9f9684ba2f2dbea14db974f5c31bbd683"}
+megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "953aabf75c0500180dc14a6a76cf9e7e7c4baec7"}
 
 
 [project.optional-dependencies]
diff --git a/skyrl-train/skyrl_train/config/megatron_config/policy.yaml b/skyrl-train/skyrl_train/config/megatron_config/policy.yaml
index fe436cbb4e..3ba0dc8f95 100644
--- a/skyrl-train/skyrl_train/config/megatron_config/policy.yaml
+++ b/skyrl-train/skyrl_train/config/megatron_config/policy.yaml
@@ -21,6 +21,10 @@ torch_profiler_config:
   ranks: []
   save_path: null
 
+lora_config:
+  # see: https://docs.nvidia.com/nemo/megatron-bridge/0.2.0/apidocs/bridge/bridge.peft.lora.html for details - currently "lora" and "canonical_lora" are supported
+  lora_type: "lora"
+
 # pass-through kwargs to Megatron's `OptimizerConfig` object
 # any overlapping arguments with those we attempt to resolve in trainer.policy.optimizer_config will be overridden by the values here
 # https://github.com/NVIDIA/Megatron-LM/blob/core_r0.13.0/megatron/core/optimizer/optimizer_config.py#L12
diff --git a/skyrl-train/skyrl_train/config/ppo_base_config.yaml b/skyrl-train/skyrl_train/config/ppo_base_config.yaml
index 012b4e43da..b88d2c708c 100644
--- a/skyrl-train/skyrl_train/config/ppo_base_config.yaml
+++ b/skyrl-train/skyrl_train/config/ppo_base_config.yaml
@@ -27,12 +27,15 @@ trainer:
     model:
       path: "Qwen/Qwen2.5-1.5B-Instruct"
       lora:
-          rank: 0
-          alpha: 16
-          dropout: 0
-          lora_sync_path: "/tmp/skyrl_lora_sync"
-          target_modules: "all-linear"
-          exclude_modules: null
+        rank: 0
+        alpha: 16
+        dropout: 0
+        lora_sync_path: "/tmp/skyrl_lora_sync"
+        target_modules: "all-linear"
+        exclude_modules: null
+        # see https://huggingface.co/docs/peft/v0.18.0/en/package_reference/lora#peft.LoraConfig.init_lora_weights for supported initialization methods for FSDP
+        # for megatron, this is used for `lora_A_init_method`, and "xavier", "normal", "kaiming", and "zero" are supported
+        init_method: "kaiming"
     deepspeed_config: ${deepspeed_config.train}
     optimizer_config:
       lr: 1.0e-6
diff --git a/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py b/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py
index 5aa11b1e07..6884feb2ca 100644
--- a/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py
+++ b/skyrl-train/skyrl_train/distributed/megatron/megatron_strategy.py
@@ -50,12 +50,14 @@ def __init__(
         megatron_config,
         optimizer_config=None,
         seed: int = 42,
+        is_lora: bool = False,
     ) -> None:
         super().__init__()
         self.megatron_config = megatron_config
         self.optimizer_config = optimizer_config
         self.seed = seed
         self.hf_config = None  # Set by the megatron worker once configs are initialized.
+        self.is_lora = is_lora
 
         # NOTE: Set Megatron dist checkpoint async backend to persistent to avoid `os.fork()`-ing
         # short-lived background workers, which does not work well with Ray.
@@ -145,9 +147,9 @@ def save_checkpoint(
         # Extract base model.
         model: List[nn.Module] = model.actor_module
         assert len(model) == 1, "Megatron virtual pipeline parallel is not yet supported"
-        model = model[0]
-        if hasattr(model, "module"):
-            model = model.module
+        unwrapped_model = model[0]
+        while hasattr(unwrapped_model, "module"):
+            unwrapped_model = unwrapped_model.module
 
         # Create checkpoint directory if it doesn't exist.
         if node_local_rank == 0:
@@ -158,8 +160,9 @@ def save_checkpoint(
 
         # Collect the sharded state dicts for model and optimizer, and full state dict for the scheduler.
         sharded_state_dict = {}
-        model_sharded_state_dict = model.sharded_state_dict()
-        sharded_state_dict["model"] = model_sharded_state_dict
+        model_sharded_state_dict = unwrapped_model.sharded_state_dict()
+        if not self.is_lora:
+            sharded_state_dict["model"] = model_sharded_state_dict
         if optimizer:
             sharded_state_dict["optimizer"] = optimizer.sharded_state_dict(model_sharded_state_dict)
         if scheduler:
@@ -190,11 +193,43 @@ def save_checkpoint(
                 hf_dir = os.path.join(work_dir, "huggingface")
                 self.save_hf_configs(self.hf_config, hf_dir, tokenizer)
 
+        if self.is_lora:
+            self._save_lora_adapters(unwrapped_model, ckpt_dir)
+
         dist.barrier()
         ckpt_base.async_calls.close()
         ckpt_base.async_calls = AsyncCallsQueue(persistent=True)
         self.print(f"Checkpoint successfully saved to {ckpt_dir}")
 
+    def _get_rank_path(self, ckpt_dir):
+        """Return the per-rank LoRA adapter file path inside `ckpt_dir`, keyed by all parallel ranks."""
+        rank_tags = (
+            f"tp{mpu.get_tensor_model_parallel_rank()}",
+            f"pp{mpu.get_pipeline_model_parallel_rank()}",
+            f"cp{mpu.get_context_parallel_rank()}",
+            f"dp{mpu.get_data_parallel_rank()}",
+            f"ep{mpu.get_expert_model_parallel_rank()}",
+            f"etp{mpu.get_expert_tensor_parallel_rank()}",
+        )
+        return os.path.join(ckpt_dir, "adapter_" + "_".join(rank_tags) + ".pt")
+
+    def _save_lora_adapters(self, model, ckpt_dir):
+        """Write this rank's LoRA adapter parameters to its per-rank file under `ckpt_dir`."""
+        if not self.is_lora:
+            return
+        assert isinstance(model, nn.Module), "Model must be a nn.Module"
+
+        # Keep only parameters whose lower-cased name contains ".adapter".
+        adapter_tensors = {
+            param_name: tensor.data
+            for param_name, tensor in model.named_parameters()
+            if ".adapter" in param_name.lower()
+        }
+        with io.local_work_dir(ckpt_dir) as work_dir:
+            out_file = self._get_rank_path(work_dir)
+            torch.save({"model_state_dict": adapter_tensors}, out_file)
+            self.print(f"Saved {len(adapter_tensors)} LoRA adapter parameters to {out_file}")
+
     def load_checkpoint(
         self,
         model: MegatronModelWrapper,
@@ -212,13 +247,14 @@ def load_checkpoint(
         model: List[nn.Module] = model.actor_module
         assert len(model) == 1, "Megatron virtual pipeline parallel is not yet supported"
         unwrapped_model = model[0]
-        if hasattr(unwrapped_model, "module"):
+        while hasattr(unwrapped_model, "module"):
             unwrapped_model = unwrapped_model.module
 
         # Extract sharded state dicts.
         sharded_state_dict = {}
         model_sharded_state_dict = unwrapped_model.sharded_state_dict()
-        sharded_state_dict["model"] = model_sharded_state_dict
+        if not self.is_lora:
+            sharded_state_dict["model"] = model_sharded_state_dict
         if optimizer and load_optimizer_states:
             sharded_state_dict["optimizer"] = optimizer.sharded_state_dict(model_sharded_state_dict)
         if scheduler and load_lr_scheduler_states:
@@ -233,13 +269,15 @@ def load_checkpoint(
             state_dict = dist_checkpointing.load(
                 sharded_state_dict=sharded_state_dict, checkpoint_dir=read_dir, sharded_strategy=load_strategy
             )
-
-        # Load the model, optimizer, and scheduler state dicts.
-        assert (
-            "model" in state_dict
-        ), f"Model state dict not found in checkpoint loaded from {ckpt_dir}. Available keys: {state_dict.keys()}"
-        model[0].load_state_dict(state_dict["model"], strict=load_module_strict)
-        self.print("Loaded model state dict.")
+        if not self.is_lora:
+            # Load the model, optimizer, and scheduler state dicts.
+            assert (
+                "model" in state_dict
+            ), f"Model state dict not found in checkpoint loaded from {ckpt_dir}. Available keys: {state_dict.keys()}"
+            model[0].load_state_dict(state_dict["model"], strict=load_module_strict)
+            self.print("Loaded model state dict.")
+        else:
+            self._load_lora_adapters(unwrapped_model, ckpt_dir)
 
         if optimizer and load_optimizer_states:
             assert (
@@ -261,6 +299,22 @@ def load_checkpoint(
 
         return ckpt_dir, {}
 
+    def _load_lora_adapters(self, model, ckpt_dir):
+        """Restore this rank's LoRA adapter parameters from its per-rank file under `ckpt_dir`."""
+        # TODO (erictang000): Update this logic once LoRA checkpointing is upstreamed to Megatron-Bridge
+        if not self.is_lora:
+            return
+        assert isinstance(model, nn.Module), "Model must be a nn.Module"
+
+        with io.local_read_dir(ckpt_dir) as read_dir:
+            src_file = self._get_rank_path(read_dir)
+            adapter_tensors = torch.load(src_file, map_location="cpu")["model_state_dict"]
+            # strict=False: the file holds adapter weights only, so missing (base-model) keys are ignored.
+            load_result = model.load_state_dict(adapter_tensors, strict=False)
+            if load_result.unexpected_keys:
+                raise ValueError(f"Unexpected keys in LoRA adapter state dict: {load_result.unexpected_keys}")
+            self.print(f"Loaded {len(adapter_tensors)} LoRA adapters from {src_file}.")
+
     def save_hf_model(self, bridge, model: MegatronModelWrapper, output_dir: str, tokenizer=None, **kwargs) -> None:
         # Create checkpoint directory if it doesn't exist.
         if self.is_rank_0():
diff --git a/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py b/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py
index f69929368c..f09504ecaa 100644
--- a/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py
+++ b/skyrl-train/skyrl_train/distributed/megatron/megatron_utils.py
@@ -158,13 +158,33 @@ def offload_megatron_model_to_cpu(models):
             model_chunk_all_buffers = [model_chunk.buffers, model_chunk.expert_parallel_buffers]
             for buffers in model_chunk_all_buffers:
                 for buffer in buffers:
-                    # offload parameters
+                    # offload parameters from fused Megatron buffers
                     if buffer.param_data.storage().size() > 0:
                         buffer.param_data.cpu_data = buffer.param_data.data.cpu().pin_memory()
                         buffer.param_data_size = buffer.param_data.storage().size()
                         buffer.param_data.storage().resize_(0)
 
                     assert buffer.param_data_size == buffer.param_data.cpu_data.storage().size()
+
+            # LoRA-aware offloading: when using LoRA, also offload the frozen (non-LoRA) base weights, since Megatron's fused buffers do not include the HF/bridge "to_wrap" weights
+            for name, param in model_chunk.named_parameters():
+                if (
+                    param.is_cuda
+                    and not param.requires_grad
+                    and "adapter" not in name
+                    and param.data.storage().size() > 0
+                ):
+                    # Always refresh the CPU copy and release GPU storage.
+                    cpu_tensor = param.data.detach().cpu().pin_memory()
+                    param._offload_cpu_data = cpu_tensor
+                    param._offload_cuda_numel = param.data.numel()
+                    # Release GPU storage while keeping dtype/device metadata.
+                    empty_cuda = torch.empty(
+                        0,
+                        dtype=param.data.dtype,
+                        device=param.data.device,
+                    )
+                    param.data = empty_cuda
         else:
             # we need this for ref module
             for _, param in model_chunk.named_parameters():
@@ -184,6 +204,13 @@ def load_megatron_model_to_gpu(models):
                         buffer.param_data.storage().resize_(buffer.param_data_size)
                         # copy data from cpu to cuda
                         buffer.param_data.copy_(buffer.param_data.cpu_data, non_blocking=True)
+
+            # Restore any LoRA-frozen base weights that were offloaded above.
+            device_id = torch.cuda.current_device()
+            for name, param in model_chunk.named_parameters():
+                if hasattr(param, "_offload_cpu_data") and param.data.storage().size() == 0:
+                    restored = param._offload_cpu_data.to(device_id, non_blocking=True)
+                    param.data = restored
         else:
             # we need this for ref module
             device_id = torch.cuda.current_device()
diff --git a/skyrl-train/skyrl_train/entrypoints/main_base.py b/skyrl-train/skyrl_train/entrypoints/main_base.py
index 1604c61c85..cebc28dd65 100644
--- a/skyrl-train/skyrl_train/entrypoints/main_base.py
+++ b/skyrl-train/skyrl_train/entrypoints/main_base.py
@@ -61,7 +61,7 @@ def create_ray_wrapped_inference_engines_from_config(cfg: DictConfig, colocate_p
     }
 
     # Conditionally add LoRA parameters if LoRA is enabled
-    if cfg.trainer.policy.model.lora.rank > 0:
+    if cfg.trainer.policy.model.lora.rank > 0 and cfg.trainer.strategy != "megatron":
         engine_kwargs["enable_lora"] = True
         engine_kwargs["max_lora_rank"] = cfg.trainer.policy.model.lora.rank
         engine_kwargs["sleep_level"] = 1
diff --git a/skyrl-train/skyrl_train/model_wrapper.py b/skyrl-train/skyrl_train/model_wrapper.py
index 028cb86ac9..8767e1201b 100644
--- a/skyrl-train/skyrl_train/model_wrapper.py
+++ b/skyrl-train/skyrl_train/model_wrapper.py
@@ -35,6 +35,7 @@ class HFModelWrapper(nn.Module):
         lora_rank (int, optional): Rank for LoRA adaptation. Defaults to 0.
         lora_alpha (int, optional): Alpha parameter for LoRA. Defaults to 16.
         lora_dropout (float, optional): Dropout rate for LoRA layers. Defaults to 0.
+        lora_init_method (str, optional): Initialization method for LoRA layers. Defaults to "kaiming".
         target_modules (list, optional): List of target modules for applying LoRA. Defaults to None.
         exclude_modules (list, optional): List of modules to exclude from applying LoRA. Defaults to None.
         ds_config (dict, optional): Configuration for DeepSpeed, enabling model partitioning across multiple GPUs. Defaults to None.
@@ -54,6 +55,7 @@ def __init__(
         lora_rank=0,
         lora_alpha=16,
         lora_dropout=0,
+        lora_init_method="kaiming",
         target_modules=None,
         exclude_modules=None,
         ds_config=None,
@@ -161,6 +163,7 @@ def __init__(
                     exclude_modules=exclude_modules,
                     lora_dropout=lora_dropout,
                     bias="none",
+                    init_lora_weights=True if lora_init_method == "kaiming" else lora_init_method,
                 )
                 self.model = get_peft_model(self.model, lora_config)
 
diff --git a/skyrl-train/skyrl_train/utils/utils.py b/skyrl-train/skyrl_train/utils/utils.py
index 32b15e3fb4..6285696fb5 100644
--- a/skyrl-train/skyrl_train/utils/utils.py
+++ b/skyrl-train/skyrl_train/utils/utils.py
@@ -297,7 +297,11 @@ def validate_cfg(cfg: DictConfig):
         # LoRA enabled
         # Right now: assert generator backend must be vllm, training backend must be fsdp/fsdp2
         assert cfg.generator.backend == "vllm", "LoRA enabled requires vLLM backend"
-        assert cfg.trainer.strategy in ("fsdp", "fsdp2"), "LoRA enabled requires fsdp/fsdp2 training backend"
+        assert cfg.trainer.strategy in (
+            "fsdp",
+            "fsdp2",
+            "megatron",
+        ), "LoRA enabled requires fsdp/fsdp2/megatron training backend"
 
         if cfg.trainer.target_modules is not None:
             logger.warning(
diff --git a/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py b/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py
index 9aea864ac9..7c5853c38a 100644
--- a/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py
+++ b/skyrl-train/skyrl_train/workers/fsdp/fsdp_worker.py
@@ -133,6 +133,7 @@ def init_model(self, model_path, num_training_steps: int = None):
                 lora_rank=self.cfg.trainer.policy.model.lora.rank,
                 lora_alpha=self.cfg.trainer.policy.model.lora.alpha,
                 lora_dropout=self.cfg.trainer.policy.model.lora.dropout,
+                lora_init_method=self.cfg.trainer.policy.model.lora.init_method,
                 target_modules=self.cfg.trainer.policy.model.lora.target_modules,
                 exclude_modules=self.cfg.trainer.policy.model.lora.exclude_modules,
                 sequence_parallel_size=self.cfg.trainer.policy.sequence_parallel_size,
diff --git a/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py b/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py
index 4b6424c5c4..cc566ef4f4 100644
--- a/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py
+++ b/skyrl-train/skyrl_train/workers/megatron/megatron_worker.py
@@ -14,6 +14,8 @@
 from omegaconf import OmegaConf
 
 from megatron.bridge import AutoBridge
+from megatron.bridge.peft.lora import LoRA
+from megatron.bridge.peft.canonical_lora import CanonicalLoRA
 import megatron.core.parallel_state as mpu
 from megatron.core.optimizer import DistributedOptimizer
 from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
@@ -184,7 +186,14 @@ def extract_weights(self, dtype: torch.dtype):
 
 class MegatronWorker:
     def init_configs(
-        self, model_path, megatron_config, model_config_kwargs, transformer_config_kwargs, bf16=True, flash_attn=False
+        self,
+        model_path,
+        megatron_config,
+        model_config_kwargs,
+        transformer_config_kwargs,
+        bf16=True,
+        flash_attn=False,
+        lora_config=None,
     ):
         """
         Initialize the Megatron-Bridge bridge and provider objects + hf_config and tokenizer
@@ -232,10 +241,51 @@ def init_configs(
         self.strategy.hf_config = hf_config
         self.tokenizer = tokenizer
 
+    def configure_lora(self, lora_config, lora_type: Optional[str] = "lora"):
+        """Build the PEFT transform selected by `lora_type` and store it on `self.lora_cls`.
+
+        A `target_modules` value of "all-linear" is replaced by an explicit per-variant module list.
+        Raises ValueError for an unknown `lora_type` instead of leaving `self.lora_cls` unset.
+        """
+        # Values shared by both variants.
+        all_linear = lora_config.target_modules == "all-linear"
+        exclude_modules = [] if lora_config.exclude_modules is None else lora_config.exclude_modules
+        if lora_type == "lora":
+            # Module names used for LoRA when "all-linear" is requested.
+            default_targets = ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"]
+            self.lora_cls = LoRA(
+                target_modules=default_targets if all_linear else lora_config.target_modules,
+                dim=lora_config.rank,
+                alpha=lora_config.alpha,
+                dropout=lora_config.dropout,
+                lora_A_init_method=lora_config.init_method,
+                lora_B_init_method="zero",
+                exclude_modules=exclude_modules,
+                lora_dtype=torch.bfloat16 if self.cfg.trainer.bf16 else torch.float32,
+            )
+        elif lora_type == "canonical_lora":
+            # Module names used for CanonicalLoRA when "all-linear" is requested.
+            default_targets = ["linear_q", "linear_k", "linear_v", "linear_proj"]
+            default_targets += ["linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
+            self.lora_cls = CanonicalLoRA(
+                target_modules=default_targets if all_linear else lora_config.target_modules,
+                dim=lora_config.rank,
+                alpha=lora_config.alpha,
+                dropout=lora_config.dropout,
+                lora_A_init_method=lora_config.init_method,
+                lora_B_init_method="zero",
+                exclude_modules=exclude_modules,
+            )
+        else:
+            # Fail fast: the pre-wrap hook in make_megatron_module calls self.lora_cls unconditionally.
+            raise ValueError(f"Unsupported lora_type {lora_type!r}; expected 'lora' or 'canonical_lora'")
+
     def make_megatron_module(
         self,
         wrap_with_ddp: bool = True,
         ddp_config: Optional[Dict[str, Any]] = None,
+        lora_config: Optional[Dict[str, Any]] = None,
+        lora_type: Optional[str] = "lora",
         bf16: bool = True,
     ) -> List[nn.Module]:
         """
@@ -243,6 +293,17 @@ def make_megatron_module(
         """
         from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig
 
+        if lora_config is not None:
+            self.configure_lora(lora_config, lora_type)
+
+            def lora_pre_wrap_hook(model):
+                lora_model = self.lora_cls(model, training=True)
+                self.lora_cls.set_params_to_save(lora_model)
+                # Hand back the LoRA-transformed model; this hook is registered via register_pre_wrap_hook below.
+                return lora_model
+
+            self.provider.register_pre_wrap_hook(lora_pre_wrap_hook)
+
         default_ddp_config = DistributedDataParallelConfig()
         if wrap_with_ddp:
             default_ddp_config.use_distributed_optimizer = True
@@ -315,6 +376,7 @@ def __init__(self, **kwargs):
         self.scheduler: OptimizerParamScheduler = None
         self.optimizer: DistributedOptimizer = None
         self.profiler: Profiler = None
+        self._is_lora = self.cfg.trainer.policy.model.lora.rank > 0
 
     def offload_to_cpu(self, pin_memory=True, non_blocking=True, offload_optimizer=True, offload_model=True):
         self._set_numa_affinity(torch.distributed.get_rank() % torch.cuda.device_count())
@@ -355,6 +417,7 @@ def _broadcast_no_grad(*args, **kwargs):
             megatron_config=self.cfg.trainer.policy.megatron_config,
             optimizer_config=self.cfg.trainer.policy.optimizer_config,
             seed=self.cfg.trainer.seed,
+            is_lora=self._is_lora,
         )
         self.strategy.setup_distributed()
 
@@ -386,6 +449,8 @@ def init_model(self, model_path, num_training_steps: int = 1e9):
         self.actor_module = self.make_megatron_module(
             wrap_with_ddp=True,
             ddp_config=self.cfg.trainer.policy.megatron_config.ddp_config,
+            lora_config=self.cfg.trainer.policy.model.lora if self._is_lora else None,
+            lora_type=self.cfg.trainer.policy.megatron_config.lora_config.lora_type,
             bf16=self.cfg.trainer.bf16,
         )
 
diff --git a/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py b/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py
index 4c52b17aa4..9c8afbcac1 100644
--- a/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py
+++ b/skyrl-train/tests/gpu/gpu_ci/test_save_load_checkpoint.py
@@ -64,15 +64,16 @@ def get_test_actor_config(strategy: str) -> DictConfig:
 
 
 @pytest.mark.parametrize(
-    "strategy",
+    ("strategy", "lora"),
     [
-        "deepspeed",
-        "fsdp",
-        "fsdp2",
-        pytest.param("megatron", marks=pytest.mark.megatron),
+        ("deepspeed", False),
+        ("fsdp", False),
+        ("fsdp2", False),
+        pytest.param("megatron", False, marks=pytest.mark.megatron),
+        pytest.param("megatron", True, marks=[pytest.mark.megatron, pytest.mark.lora]),
     ],
 )
-def test_save_load_checkpoint(ray_init_fixture, strategy):
+def test_save_load_checkpoint(ray_init_fixture, strategy, lora):
     """
     Test checkpointing logic by:
     1. Creating model and doing one training step
@@ -82,6 +83,9 @@ def test_save_load_checkpoint(ray_init_fixture, strategy):
     5. Repeating second training step and comparing logits
     """
     cfg = get_test_actor_config(strategy)
+    if lora:
+        cfg.trainer.policy.model.lora.rank = 32
+        cfg.trainer.policy.model.lora.alpha = 32
 
     try:
         actor_group = init_worker_with_type(
diff --git a/skyrl-train/tests/gpu/test_megatron_worker.py b/skyrl-train/tests/gpu/test_megatron_worker.py
index fe7907918b..a5434feaa7 100644
--- a/skyrl-train/tests/gpu/test_megatron_worker.py
+++ b/skyrl-train/tests/gpu/test_megatron_worker.py
@@ -9,7 +9,7 @@
 from omegaconf import DictConfig
 import torch
 import asyncio
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from omegaconf import OmegaConf
 from tests.gpu.utils import (
     init_worker_with_type,
@@ -110,16 +110,21 @@ def get_test_training_batch(batch_size=4) -> TrainingInputBatch:
 
 
 @pytest.mark.parametrize(
-    ("colocate_all", "inference_tp", "megatron_tp", "megatron_pp", "megatron_ep", "megatron_etp"),
-    [(True, 4, 2, 2, 1, None), (False, 2, 2, 1, 1, None)],
-    ids=["colocate_all", "non_colocated"],
+    ("colocate_all", "inference_tp", "megatron_tp", "megatron_pp", "megatron_ep", "megatron_etp", "lora"),
+    [(True, 4, 2, 2, 1, None, False), (False, 2, 2, 1, 1, None, False), (True, 4, 2, 2, 1, None, True)],
+    ids=["colocate_all", "non_colocated", "colocate_all_lora"],
 )
-def test_megatron_policy_weight_sync(colocate_all, inference_tp, megatron_tp, megatron_pp, megatron_ep, megatron_etp):
+def test_megatron_policy_weight_sync(
+    colocate_all, inference_tp, megatron_tp, megatron_pp, megatron_ep, megatron_etp, lora
+):
     """
     Test that we can sync weights between policy and inference for megatron then run inference
     """
     try:
         cfg = get_test_actor_config(model_name=MODEL_NAME)
+        if lora:
+            cfg.trainer.policy.model.lora.rank = 16
+            cfg.trainer.policy.model.lora.alpha = 16
         cfg.trainer.placement.colocate_all = colocate_all
         cfg.generator.weight_sync_backend = "nccl"
         cfg.trainer.strategy = "megatron"
@@ -174,17 +179,18 @@ def test_megatron_policy_weight_sync(colocate_all, inference_tp, megatron_tp, me
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing"),
+    ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing", "lora"),
     [
-        ("policy", 2, 1, 1, 1, None, 2, False),
+        ("policy", 2, 1, 1, 1, None, 2, False, False),
         # ref has same forward pass as policy - just duplicate one test to test setup
-        ("ref", 2, 1, 1, 1, None, 2, False),
-        ("policy", 1, 2, 1, 1, None, 2, False),
-        ("policy", 2, 2, 1, 1, None, 4, False),
-        ("policy", 2, 2, 1, 1, None, 4, True),
-        ("policy", 1, 1, 2, 1, None, 2, True),
-        ("policy", 2, 2, 2, 1, None, 8, True),
-        ("policy", 4, 2, 1, 4, 1, 8, True),
+        ("ref", 2, 1, 1, 1, None, 2, False, False),
+        ("policy", 1, 2, 1, 1, None, 2, False, False),
+        ("policy", 2, 2, 1, 1, None, 4, False, False),
+        ("policy", 2, 2, 1, 1, None, 4, True, False),
+        ("policy", 2, 2, 1, 1, None, 4, True, True),
+        ("policy", 1, 1, 2, 1, None, 2, True, False),
+        ("policy", 2, 2, 2, 1, None, 8, True, False),
+        ("policy", 4, 2, 1, 4, 1, 8, True, False),
     ],
     ids=[
         "tp2_pp1_policy",
@@ -192,12 +198,15 @@ def test_megatron_policy_weight_sync(colocate_all, inference_tp, megatron_tp, me
         "tp1_pp2_policy",
         "tp2_pp2_policy_unpacked",
         "tp2_pp2_policy_seq_packing",
+        "tp2_pp2_lora",
         "cp_2_policy_seq_packing",
         "tp_2_pp_2_cp_2_policy_seq_packing",
         "tp4_pp2_cp1_ep4_etp1_policy_seq_packing",
     ],
 )
-async def test_megatron_forward(ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing):
+async def test_megatron_forward(
+    ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing, lora
+):
     """
     Test that the Megatron forward pass is numerically equivalent to just running a huggingface model forward.
     """
@@ -213,6 +222,17 @@ async def test_megatron_forward(ray_init_fixture, worker_type, tp, pp, cp, ep, e
     cfg.trainer.use_sample_packing = use_sample_packing
     batch = get_test_training_batch(max(4, gpus_per_node))
 
+    if ep > 1:
+        transformer_config_kwargs = OmegaConf.to_container(
+            cfg.trainer.policy.megatron_config.transformer_config_kwargs, resolve=True
+        )
+        transformer_config_kwargs["num_layers"] = 4
+        cfg.trainer.policy.megatron_config.transformer_config_kwargs = transformer_config_kwargs
+
+    if lora:
+        cfg.trainer.policy.model.lora.rank = 16
+        cfg.trainer.policy.model.lora.alpha = 16
+
     actor_group = init_worker_with_type(
         worker_type,
         shared_pg=None,
@@ -234,7 +254,10 @@ async def test_megatron_forward(ray_init_fixture, worker_type, tp, pp, cp, ep, e
     # now run the huggingface model forward
     @ray.remote(num_gpus=1)
     def run_hf_forward(batch, model_name):
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, dtype=torch.bfloat16)
+        if ep > 1:
+            config.num_hidden_layers = 4
+        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, dtype=torch.bfloat16)
         model.eval()
         model.to("cuda")
         sequences_fwd = batch["sequences"]
@@ -283,7 +306,8 @@ def run_hf_forward(batch, model_name):
     avg_diff = torch.mean(torch.abs(action_log_probs_masked - action_log_probs_megatron_masked))
     print(f"Avg diff: {avg_diff}")
 
-    assert max_diff < 4.5e-1, f"Max diff {max_diff} is too large"
+    if ep == 1:
+        assert max_diff < 4e-1, f"Max diff {max_diff} is too large"
 
     if ep == 1:
         assert avg_diff < 7e-2, f"Avg diff {avg_diff} is too large"
@@ -294,24 +318,127 @@ def run_hf_forward(batch, model_name):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing", "use_entropy_loss"),
+    ("tp", "pp", "cp", "ep", "etp", "gpus_per_node"),
+    [
+        (2, 2, 1, 1, None, 4),
+        (4, 1, 1, 4, 1, 4),
+    ],
+    ids=[
+        "tp2_pp2_policy",
+        "tp4_pp1_cp1_ep4_etp1_policy",
+    ],
+)
+async def test_megatron_lora_forward(ray_init_fixture, tp, pp, cp, ep, etp, gpus_per_node):
+    """
+    Run the Megatron policy forward pass before and after enabling LoRA and print the max log-prob difference.
+    """
+    cfg = get_test_actor_config(model_name=MOE_MODEL_NAME if ep > 1 else MODEL_NAME)
+    #### Megatron forward pass (before lora.rank is set) ####
+    cfg.trainer.strategy = "megatron"
+    cfg.trainer.placement.policy_num_gpus_per_node = gpus_per_node
+    cfg.trainer.policy.megatron_config.tensor_model_parallel_size = tp
+    cfg.trainer.policy.megatron_config.pipeline_model_parallel_size = pp
+    cfg.trainer.policy.megatron_config.context_parallel_size = cp
+    cfg.trainer.policy.megatron_config.expert_model_parallel_size = ep
+    cfg.trainer.policy.megatron_config.expert_tensor_parallel_size = etp
+    cfg.trainer.use_sample_packing = True
+    batch = get_test_training_batch(max(4, gpus_per_node))
+
+    if ep > 1:
+        transformer_config_kwargs = OmegaConf.to_container(
+            cfg.trainer.policy.megatron_config.transformer_config_kwargs, resolve=True
+        )
+        transformer_config_kwargs["num_layers"] = 4
+        cfg.trainer.policy.megatron_config.transformer_config_kwargs = transformer_config_kwargs
+
+    actor_group = init_worker_with_type(
+        "policy",
+        shared_pg=None,
+        colocate_all=False,
+        num_gpus_per_node=cfg.trainer.placement.policy_num_gpus_per_node,
+        cfg=cfg,
+    )
+
+    action_log_probs_refs = actor_group.async_run_ray_method("mesh", "forward", data=batch)
+    all_rank_action_log_probs = ray.get(action_log_probs_refs)
+    action_log_probs_full = concatenate_outputs_after_mesh_dispatch(actor_group.actor_infos, all_rank_action_log_probs)[
+        "output"
+    ]
+
+    ray.shutdown()
+    ray_init_for_tests()
+
+    #### Megatron + LoRA forward pass (same settings re-applied, then lora.rank/alpha set) ####
+    cfg.trainer.strategy = "megatron"
+    cfg.trainer.placement.policy_num_gpus_per_node = gpus_per_node
+    cfg.trainer.policy.megatron_config.tensor_model_parallel_size = tp
+    cfg.trainer.policy.megatron_config.pipeline_model_parallel_size = pp
+    cfg.trainer.policy.megatron_config.context_parallel_size = cp
+    cfg.trainer.policy.megatron_config.expert_model_parallel_size = ep
+    cfg.trainer.policy.megatron_config.expert_tensor_parallel_size = etp
+    cfg.trainer.use_sample_packing = True
+    batch = get_test_training_batch(max(4, gpus_per_node))
+
+    # set lora this time
+    cfg.trainer.policy.model.lora.rank = 16
+    cfg.trainer.policy.model.lora.alpha = 16
+
+    if ep > 1:
+        transformer_config_kwargs = OmegaConf.to_container(
+            cfg.trainer.policy.megatron_config.transformer_config_kwargs, resolve=True
+        )
+        transformer_config_kwargs["num_layers"] = 4
+        cfg.trainer.policy.megatron_config.transformer_config_kwargs = transformer_config_kwargs
+
+    actor_group = init_worker_with_type(
+        "policy",
+        shared_pg=None,
+        colocate_all=False,
+        num_gpus_per_node=cfg.trainer.placement.policy_num_gpus_per_node,
+        cfg=cfg,
+    )
+
+    action_log_probs_refs = actor_group.async_run_ray_method("mesh", "forward", data=batch)
+    all_rank_action_log_probs = ray.get(action_log_probs_refs)
+    action_log_probs_lora = concatenate_outputs_after_mesh_dispatch(actor_group.actor_infos, all_rank_action_log_probs)[
+        "output"
+    ]
+
+    #### Compare results ####
+    # NOTE(review): no padding mask is applied here and max_diff is only printed, never asserted - confirm intent.
+    print(f"Comparing {action_log_probs_full.numel()} valid response tokens")
+    print(f"Full sample: {action_log_probs_full[:5]}")
+    print(f"Lora sample: {action_log_probs_lora[:5]}")
+
+    # max diff
+    max_diff = torch.max(torch.abs(action_log_probs_full - action_log_probs_lora))
+    print(f"Max diff: {max_diff}")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("worker_type", "tp", "pp", "cp", "ep", "etp", "gpus_per_node", "use_sample_packing", "use_entropy_loss", "lora"),
     [
-        ("policy", 2, 2, 1, 1, 1, 4, True, False),
-        ("policy", 2, 2, 1, 1, 1, 4, True, True),
-        ("policy", 2, 2, 1, 1, 1, 4, False, False),
-        ("policy", 2, 2, 2, 1, 1, 8, True, False),
-        ("policy", 2, 1, 1, 8, 1, 8, True, False),
+        ("policy", 2, 2, 1, 1, 1, 4, True, False, False),
+        ("policy", 2, 2, 1, 1, 1, 4, True, True, False),
+        ("policy", 1, 1, 1, 1, 1, 1, True, False, True),
+        ("policy", 2, 2, 1, 1, 1, 4, False, False, False),
+        ("policy", 2, 2, 2, 1, 1, 8, True, False, False),
+        ("policy", 4, 1, 1, 8, 1, 8, True, False, False),
+        ("policy", 4, 1, 1, 8, 1, 8, True, False, True),
     ],
     ids=[
         "tp2_pp2_policy_seq_packing",
         "tp2_pp2_policy_seq_packing_with_entropy_loss",
+        "tp1_pp1_policy_lora",
         "tp2_pp2_policy_unpacked",
         "tp2_pp2_cp2_policy_seq_packing",
-        "tp4_pp2_cp1_ep8_etp1_policy_seq_packing",
+        "tp4_pp1_cp1_ep8_etp1_policy_seq_packing",
+        "tp4_pp1_cp1_ep8_etp1_policy_seq_packing_lora",
     ],
 )
 async def test_megatron_train(
-    ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing, use_entropy_loss
+    ray_init_fixture, worker_type, tp, pp, cp, ep, etp, gpus_per_node, use_sample_packing, use_entropy_loss, lora
 ):
     """
     Full test: initialize actor group, send dummy experience to training_step, validate output.
@@ -330,6 +457,9 @@ async def test_megatron_train(
     if use_entropy_loss:
         cfg.trainer.algorithm.use_entropy_loss = True
         cfg.trainer.algorithm.entropy_loss_coef = 0.01
+    if lora:
+        cfg.trainer.policy.model.lora.rank = 16
+        cfg.trainer.policy.model.lora.alpha = 16
 
     # set batch sizes correctly
     cfg.trainer.train_batch_size = gpus_per_node
diff --git a/skyrl-train/uv.lock b/skyrl-train/uv.lock
index 272900b343..062f8206aa 100644
--- a/skyrl-train/uv.lock
+++ b/skyrl-train/uv.lock
@@ -865,7 +865,7 @@ wheels = [
 
 [[package]]
 name = "fastapi"
-version = "0.125.0"
+version = "0.126.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "annotated-doc" },
@@ -873,9 +873,9 @@ dependencies = [
     { name = "starlette" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/17/71/2df15009fb4bdd522a069d2fbca6007c6c5487fce5cb965be00fc335f1d1/fastapi-0.125.0.tar.gz", hash = "sha256:16b532691a33e2c5dee1dac32feb31dc6eb41a3dd4ff29a95f9487cb21c054c0", size = 370550, upload-time = "2025-12-17T21:41:44.15Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/03/6c/28773e19bc203a2f3cf1d54a8e96ca7d05b58157a350aa4d8d37f2a5ba07/fastapi-0.126.0.tar.gz", hash = "sha256:f099fceb2a6d56dd21c59c4543d00be123dedacff869e76ae31ba3c0f963e2cd", size = 367455, upload-time = "2025-12-20T16:16:44.484Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/34/2f/ff2fcc98f500713368d8b650e1bbc4a0b3ebcdd3e050dcdaad5f5a13fd7e/fastapi-0.125.0-py3-none-any.whl", hash = "sha256:2570ec4f3aecf5cca8f0428aed2398b774fcdfee6c2116f86e80513f2f86a7a1", size = 112888, upload-time = "2025-12-17T21:41:41.286Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/0b/d5f999f27cb90152a6aadf094205b4d0eeab6a6b03e3e60346cde988c1bd/fastapi-0.126.0-py3-none-any.whl", hash = "sha256:c9330b9731e3bd2caae0a00e76353f86adbf592c5a25649a1682f3a92aeaff41", size = 111758, upload-time = "2025-12-20T16:16:42.349Z" },
 ]
 
 [package.optional-dependencies]
@@ -884,6 +884,8 @@ standard = [
     { name = "fastapi-cli", extra = ["standard"], marker = "extra == 'extra-11-skyrl-train-mcore' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-miniswe') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-miniswe' and extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "httpx" },
     { name = "jinja2" },
+    { name = "pydantic-extra-types" },
+    { name = "pydantic-settings" },
     { name = "python-multipart" },
     { name = "uvicorn", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, extra = ["standard"], marker = "extra == 'extra-11-skyrl-train-mcore' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-miniswe') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-miniswe' and extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
 ]
@@ -1792,7 +1794,7 @@ proxy = [
     { name = "gunicorn", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "litellm-enterprise", version = "0.1.25", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "litellm-proxy-extras", version = "0.4.14", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
-    { name = "mcp", version = "1.24.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
+    { name = "mcp", version = "1.25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "orjson", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "polars", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "pyjwt", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
@@ -2081,7 +2083,7 @@ wheels = [
 
 [[package]]
 name = "mcp"
-version = "1.24.0"
+version = "1.25.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "sys_platform == 'linux' and extra != 'extra-11-skyrl-train-flashrl' and extra != 'extra-11-skyrl-train-mcore' and extra == 'extra-11-skyrl-train-miniswe' and extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm'",
@@ -2120,9 +2122,9 @@ dependencies = [
     { name = "typing-inspection", marker = "extra == 'extra-11-skyrl-train-flashrl' or extra == 'extra-11-skyrl-train-mcore' or extra != 'extra-11-skyrl-train-sglang' or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "uvicorn", version = "0.31.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform != 'emscripten' and extra == 'extra-11-skyrl-train-flashrl') or (sys_platform != 'emscripten' and extra == 'extra-11-skyrl-train-mcore') or (sys_platform != 'emscripten' and extra != 'extra-11-skyrl-train-sglang') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-mcore') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-miniswe') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm') or (extra == 'extra-11-skyrl-train-mcore' and extra == 'extra-11-skyrl-train-sglang') or (extra == 'extra-11-skyrl-train-mcore' and extra == 'extra-11-skyrl-train-vllm') or (extra == 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d6/2c/db9ae5ab1fcdd9cd2bcc7ca3b7361b712e30590b64d5151a31563af8f82d/mcp-1.24.0.tar.gz", hash = "sha256:aeaad134664ce56f2721d1abf300666a1e8348563f4d3baff361c3b652448efc", size = 604375, upload-time = "2025-12-12T14:19:38.205Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d5/2d/649d80a0ecf6a1f82632ca44bec21c0461a9d9fc8934d38cb5b319f2db5e/mcp-1.25.0.tar.gz", hash = "sha256:56310361ebf0364e2d438e5b45f7668cbb124e158bb358333cd06e49e83a6802", size = 605387, upload-time = "2025-12-19T10:19:56.985Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/61/0d/5cf14e177c8ae655a2fd9324a6ef657ca4cafd3fc2201c87716055e29641/mcp-1.24.0-py3-none-any.whl", hash = "sha256:db130e103cc50ddc3dffc928382f33ba3eaef0b711f7a87c05e7ded65b1ca062", size = 232896, upload-time = "2025-12-12T14:19:36.14Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/fc/6dc7659c2ae5ddf280477011f4213a74f806862856b796ef08f028e664bf/mcp-1.25.0-py3-none-any.whl", hash = "sha256:b37c38144a666add0862614cc79ec276e97d72aa8ca26d622818d4e278b9721a", size = 233076, upload-time = "2025-12-19T10:19:55.416Z" },
 ]
 
 [[package]]
@@ -2149,7 +2151,7 @@ wheels = [
 [[package]]
 name = "megatron-bridge"
 version = "0.3.0rc0"
-source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=22ef9ff9f9684ba2f2dbea14db974f5c31bbd683#22ef9ff9f9684ba2f2dbea14db974f5c31bbd683" }
+source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=953aabf75c0500180dc14a6a76cf9e7e7c4baec7#953aabf75c0500180dc14a6a76cf9e7e7c4baec7" }
 dependencies = [
     { name = "accelerate" },
     { name = "causal-conv1d", marker = "sys_platform == 'never'" },
@@ -2504,11 +2506,11 @@ wheels = [
 
 [[package]]
 name = "nodeenv"
-version = "1.9.1"
+version = "1.10.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" },
+    { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" },
 ]
 
 [[package]]
@@ -2781,12 +2783,12 @@ wheels = [
 
 [[package]]
 name = "nvidia-cudnn-frontend"
-version = "1.16.0"
+version = "1.17.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" },
-    { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" },
+    { url = "https://files.pythonhosted.org/packages/42/d9/f58ed6292c9396f7422812a0a2d9f80cc5a623ea6c758bcb3d34d4795bb8/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de0c473f32d705abcf14f351615f7ffbeed7320e3499cf2195ae5689652a2592", size = 1917620, upload-time = "2025-12-20T00:27:46.179Z" },
+    { url = "https://files.pythonhosted.org/packages/db/eb/c641135632bd2afc21339aadee96af4c5db1460dfa07ca74836de75a590f/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c913c87fca691a91385287f2587575531933acfebc85c33dbcecb191886c7a53", size = 2038994, upload-time = "2025-12-20T00:25:18.9Z" },
+    { url = "https://files.pythonhosted.org/packages/82/49/a92da03eb43bde90be770a43666c5ab26b4f8b15f6e46c4b0b0e84f37994/nvidia_cudnn_frontend-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0d4cfd03961592108abd1ba246e43c8bb7540aed984df860256d0bff181de98", size = 1441271, upload-time = "2025-12-20T00:29:52.056Z" },
 ]
 
 [[package]]
@@ -3247,7 +3249,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "2.13.0"
+version = "2.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -3259,9 +3261,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/0f/39/8e347e9fda125324d253084bb1b82407e5e3c7777a03dc398f79b2d95626/openai-2.13.0.tar.gz", hash = "sha256:9ff633b07a19469ec476b1e2b5b26c5ef700886524a7a72f65e6f0b5203142d5", size = 626583, upload-time = "2025-12-16T18:19:44.387Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/b1/12fe1c196bea326261718eb037307c1c1fe1dedc2d2d4de777df822e6238/openai-2.14.0.tar.gz", hash = "sha256:419357bedde9402d23bf8f2ee372fca1985a73348debba94bddff06f19459952", size = 626938, upload-time = "2025-12-19T03:28:45.742Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/bb/d5/eb52edff49d3d5ea116e225538c118699ddeb7c29fa17ec28af14bc10033/openai-2.13.0-py3-none-any.whl", hash = "sha256:746521065fed68df2f9c2d85613bb50844343ea81f60009b60e6a600c9352c79", size = 1066837, upload-time = "2025-12-16T18:19:43.124Z" },
+    { url = "https://files.pythonhosted.org/packages/27/4b/7c1a00c2c3fbd004253937f7520f692a9650767aa73894d7a34f0d65d3f4/openai-2.14.0-py3-none-any.whl", hash = "sha256:7ea40aca4ffc4c4a776e77679021b47eec1160e341f42ae086ba949c9dcc9183", size = 1067558, upload-time = "2025-12-19T03:28:43.727Z" },
 ]
 
 [[package]]
@@ -4714,7 +4716,7 @@ requires-dist = [
     { name = "litellm", marker = "extra == 'miniswe'" },
     { name = "litellm", extras = ["proxy"], marker = "extra == 'sandboxes'", specifier = ">=1.67.5" },
     { name = "loguru" },
-    { name = "megatron-bridge", marker = "extra == 'mcore'", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=22ef9ff9f9684ba2f2dbea14db974f5c31bbd683" },
+    { name = "megatron-bridge", marker = "extra == 'mcore'", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge?rev=953aabf75c0500180dc14a6a76cf9e7e7c4baec7" },
     { name = "megatron-core", marker = "extra == 'mcore'", specifier = "==0.15.0" },
     { name = "mini-swe-agent", marker = "extra == 'miniswe'", specifier = ">=1.12.0" },
     { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=2.0.0" },
@@ -5789,7 +5791,7 @@ wheels = [
 
 [[package]]
 name = "typer"
-version = "0.20.0"
+version = "0.20.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click", marker = "extra == 'extra-11-skyrl-train-mcore' or extra == 'extra-11-skyrl-train-miniswe' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm')" },
@@ -5797,9 +5799,9 @@ dependencies = [
     { name = "shellingham", marker = "extra == 'extra-11-skyrl-train-mcore' or extra == 'extra-11-skyrl-train-miniswe' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm')" },
     { name = "typing-extensions", marker = "extra == 'extra-11-skyrl-train-mcore' or extra == 'extra-11-skyrl-train-miniswe' or (extra == 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-sglang') or (extra != 'extra-11-skyrl-train-sglang' and extra == 'extra-11-skyrl-train-vllm') or (extra != 'extra-11-skyrl-train-flashrl' and extra == 'extra-11-skyrl-train-vllm')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8f/28/7c85c8032b91dbe79725b6f17d2fffc595dff06a35c7a30a37bef73a1ab4/typer-0.20.0.tar.gz", hash = "sha256:1aaf6494031793e4876fb0bacfa6a912b551cf43c1e63c800df8b1a866720c37", size = 106492, upload-time = "2025-10-20T17:03:49.445Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/6d/c1/933d30fd7a123ed981e2a1eedafceab63cb379db0402e438a13bc51bbb15/typer-0.20.1.tar.gz", hash = "sha256:68585eb1b01203689c4199bc440d6be616f0851e9f0eb41e4a778845c5a0fd5b", size = 105968, upload-time = "2025-12-19T16:48:56.302Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/78/64/7713ffe4b5983314e9d436a90d5bd4f63b6054e2aca783a3cfc44cb95bbf/typer-0.20.0-py3-none-any.whl", hash = "sha256:5b463df6793ec1dca6213a3cf4c0f03bc6e322ac5e16e13ddd622a889489784a", size = 47028, upload-time = "2025-10-20T17:03:47.617Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/52/1f2df7e7d1be3d65ddc2936d820d4a3d9777a54f4204f5ca46b8513eff77/typer-0.20.1-py3-none-any.whl", hash = "sha256:4b3bde918a67c8e03d861aa02deca90a95bbac572e71b1b9be56ff49affdb5a8", size = 47381, upload-time = "2025-12-19T16:48:53.679Z" },
 ]
 
 [[package]]