Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
a478470
feat: dataset and rollout vlm single turn ready
nanjiangwill Nov 12, 2025
316cf44
misc: remove code
nanjiangwill Nov 12, 2025
9299459
add geo3k reward utils
coding-famer Nov 13, 2025
366f694
multimodal inputs
coding-famer Nov 14, 2025
86511b6
do apply_chat_template in inference side
coding-famer Nov 15, 2025
a31fb56
apply precommit
coding-famer Nov 15, 2025
16e83da
fix
coding-famer Nov 15, 2025
42271a3
add script
coding-famer Nov 15, 2025
8acccdd
Merge remote-tracking branch 'origin/geo3k_utils' into feat/vlm
nanjiangwill Nov 17, 2025
122ef7c
feat: vlm single turn all components ready, start testing
nanjiangwill Nov 17, 2025
030af3f
merge main
nanjiangwill Nov 17, 2025
906ae47
update
nanjiangwill Nov 17, 2025
52ce7fa
lint
nanjiangwill Nov 17, 2025
f5844a3
lint
nanjiangwill Nov 17, 2025
7b0876d
rename
nanjiangwill Nov 17, 2025
7f0f773
remove useless code
coding-famer Nov 18, 2025
a13b061
make math grader robust
coding-famer Nov 22, 2025
a0640a3
add script
nanjiangwill Nov 24, 2025
2ae60e8
fix name
nanjiangwill Nov 24, 2025
7cdd062
fix name
nanjiangwill Nov 24, 2025
5046add
merge main
nanjiangwill Nov 24, 2025
9d097ca
add
nanjiangwill Nov 24, 2025
de0b7d0
add
nanjiangwill Nov 24, 2025
6f562e8
use spawn for multiprocessing
coding-famer Nov 24, 2025
f3ed679
update script
nanjiangwill Nov 28, 2025
c5cdbb0
update requirements
nanjiangwill Nov 28, 2025
77b60f2
update requirements
nanjiangwill Nov 28, 2025
fdf9da1
update requirements
nanjiangwill Nov 28, 2025
16b83e1
update scripts
nanjiangwill Nov 29, 2025
2a324b7
update scripts
nanjiangwill Nov 29, 2025
5bf5228
Change eval-top-k value from 0.7 to 1
nanjiangwill Nov 29, 2025
ff2cc8a
fix: hack
nanjiangwill Dec 1, 2025
6634abe
fix: hack
nanjiangwill Dec 1, 2025
72eed6d
merge main
nanjiangwill Dec 2, 2025
b3c3192
update script
nanjiangwill Dec 2, 2025
9ccd5e2
update script
nanjiangwill Dec 2, 2025
2c5f45b
update name
nanjiangwill Dec 2, 2025
fd528c3
update script
nanjiangwill Dec 2, 2025
e065a84
update readme
nanjiangwill Dec 2, 2025
8b3ac97
update readme
nanjiangwill Dec 2, 2025
118de3a
update script
nanjiangwill Dec 2, 2025
c30f2eb
better naming
nanjiangwill Dec 2, 2025
dfc0990
better naming
nanjiangwill Dec 2, 2025
d9bd14d
better naming
nanjiangwill Dec 2, 2025
d6152b6
better import
nanjiangwill Dec 2, 2025
33536db
update vlm readme
jhinpan Dec 2, 2025
20cabd5
Remove geo3k reward model & tol and use default math RM
jhinpan Dec 3, 2025
0f10239
Resolve conflicts
jhinpan Dec 3, 2025
d042915
Merge branch 'main' into feat/vlm
jhinpan Dec 3, 2025
80cebdf
Add new exp figs
jhinpan Dec 3, 2025
0f9d7a0
pre-commit and tiny fix
jhinpan Dec 3, 2025
115ce76
remove unused script
jhinpan Dec 4, 2025
dac6703
update cleaner notes about numerical precision issue
jhinpan Dec 4, 2025
ef10af1
revert tol in math utils
jhinpan Dec 4, 2025
7965a26
resolve 1st comment
jhinpan Dec 4, 2025
d8f5320
solve 2nd comments
jhinpan Dec 4, 2025
9f07e85
merge two multimodal blocks
jhinpan Dec 4, 2025
2a4f58d
fix ci
jhinpan Dec 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions examples/geo3k_vlm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# FSDP + VLM Single-Turn RL

Training VLMs with FSDP on a single-turn reasoning task using GRPO on the [GEO3K dataset](https://huggingface.co/datasets/hiyouga/geometry3k). We used the processed version available [here](https://huggingface.co/datasets/chenhegu/geo3k_imgurl).

<p align="center">
<img src="rewards.png" alt="Reward Plot" width="800">
</p>

## Reproduce

```bash
export WANDB_API_KEY=your_wandb_api_key

SLIME_SCRIPT_MODEL_NAME=Qwen3-VL-2B-Instruct SLIME_SCRIPT_NUM_GPUS=8 python examples/geo3k_vlm/run_geo3k_vlm.py 2>&1 | tee run_simple.log
```

## Notes

### Reward Model Configuration

We experimented with three reward model configurations:
1. A geo3k-specific RM with tolerance=0.05 (to handle rounding in ground truth labels)
2. A geo3k-specific RM with tolerance=0.0 (strict matching)
3. The default math RM

All three performed similarly, so we use the default math RM for simplicity.

### Numerical Precision with Non-Binary Rewards

Our initial geo3k-specific verifier produced "format scores" (**0 and 0.9**) instead of clean binary rewards. Under **fp32**, fractional values like 0.9 can't be exactly represented, so when all samples in a group have the same reward, `reward - mean` doesn't equal zero—creating spurious gradient signal.

We fixed this by switching to the default math RM with clean **binary 0/1 rewards**. If you encounter similar precision issues with non-binary rewards, you can change the reward tensor dtype from `torch.float` to `torch.float16` in `slime/ray/rollout.py` (`_post_process_rewards` method) to truncate precision artifacts.
Binary file added examples/geo3k_vlm/rewards.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
193 changes: 193 additions & 0 deletions examples/geo3k_vlm/run_geo3k_vlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import json
import os
import subprocess

import slime.utils.misc as U

# Model to train; overridable via env. Must be one of the Qwen VL checkpoints below.
MODEL_NAME = os.environ.get("SLIME_SCRIPT_MODEL_NAME", "Qwen3-VL-2B-Instruct")
assert MODEL_NAME in {"Qwen2.5-VL-3B-Instruct", "Qwen3-VL-2B-Instruct", "Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct"}

# Number of GPUs given to the actor (and to the local Ray head, if started here).
NUM_GPUS = int(os.environ.get("SLIME_SCRIPT_NUM_GPUS", "1"))
# Non-zero: assume a Ray cluster is already running and skip ray start/stop.
EXTERNAL_RAY = int(os.environ.get("SLIME_SCRIPT_EXTERNAL_RAY", "0"))
# Node IP used for `ray start --node-ip-address`.
MASTER_ADDR = os.environ.get("MASTER_ADDR", "127.0.0.1")


def detect_nvlink():
    """Return 1 if NVLink appears present on this machine, else 0.

    Runs ``nvidia-smi`` and counts occurrences of the literal string
    "NVLink" in its stdout. Any failure (binary missing, non-zero exit)
    is treated as "no NVLink" and reported.
    """
    try:
        proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True)
    except Exception as e:
        print(f"Failed to detect NVLink: {e}")
        return 0
    nvlink_count = proc.stdout.count("NVLink")
    has_nvlink = int(nvlink_count > 0)
    print(f"HAS_NVLINK: {has_nvlink} (detected {nvlink_count} NVLink references)")
    return has_nvlink


def prepare():
    """Download the model checkpoint and the geo3k dataset under /root."""
    U.exec_command("mkdir -p /root/models /root/datasets")
    # Pull the HF checkpoint for the configured model.
    U.exec_command(f"hf download Qwen/{MODEL_NAME} --local-dir /root/models/{MODEL_NAME}")
    dataset_name = "chenhegu/geo3k_imgurl"
    # Local directory name is the repo name without the org prefix.
    repo_dir = dataset_name.split("/")[1]
    U.exec_command(f"hf download --repo-type dataset {dataset_name} --local-dir /root/datasets/{repo_dir}")


def execute():
    """Assemble the training CLI and submit the Ray training job.

    Builds the argument string for ``/root/slime/train.py`` from grouped
    sections (checkpoint, rollout, eval, GRPO, optimizer, sglang, FSDP,
    wandb, misc), kills stale processes, optionally starts a local Ray
    head node, and submits the job via ``ray job submit``.

    Fix: the original exported ``PYTHONBUFFERED=16`` — Python only honors
    ``PYTHONUNBUFFERED`` (any non-empty value), so the intended unbuffered
    stdout/stderr never took effect. Corrected in both shell commands.
    """
    # Detect NVLink for optimized NCCL settings
    has_nvlink = detect_nvlink()

    ckpt_args = f"--hf-checkpoint /root/models/{MODEL_NAME} "

    rollout_args = (
        "--prompt-data /root/datasets/geo3k_imgurl/train.parquet "
        "--input-key problem "
        "--label-key answer "
        '--multimodal-keys \'{"image": "images"}\' '
        "--apply-chat-template "
        "--rollout-shuffle "
        "--rm-type math "
        "--num-rollout 3000 "
        "--rollout-batch-size 64 "
        "--n-samples-per-prompt 8 "
        "--rollout-max-response-len 4096 "
        "--rollout-temperature 0.8 "
        "--global-batch-size 512 "
    )

    eval_args = (
        # "--eval-interval 20 "
        "--eval-prompt-data geo3k-test /root/datasets/geo3k_imgurl/test.parquet "
        "--n-samples-per-eval-prompt 1 "
        "--eval-max-response-len 4096 "
        "--eval-top-k 1 "
    )

    grpo_args = (
        "--advantage-estimator grpo "
        # "--use-kl-loss "
        "--kl-loss-coef 0.00 "
        "--kl-loss-type low_var_kl "
        "--kl-coef 0.00 "
        "--entropy-coef 0.00 "
        "--eps-clip 0.2 "
        "--eps-clip-high 0.28 "
    )

    optimizer_args = (
        "--optimizer adam "
        "--lr 1e-6 "
        "--lr-decay-style constant "
        "--weight-decay 0.1 "
        "--adam-beta1 0.9 "
        "--adam-beta2 0.98 "
    )

    sglang_args = (
        "--rollout-num-gpus-per-engine 1 "
        "--sglang-mem-fraction-static 0.6 "
        # CUDA-graph capture batch sizes: small powers of two, then every 8 up to 256.
        f"--sglang-cuda-graph-bs {' '.join(map(str, [1, 2, 4, 8] + list(range(16, 257, 8))))} "
    )

    fsdp_args = (
        # Set to true for FULL_STATE_DICT mode, false for SHARDED_STATE_DICT mode (default)
        # "--fsdp-full-params "  # Uncomment this line to enable full params mode
        # Set the bucket size for weight update
        "--update-weight-buffer-size 536870912 "  # 512MB
        "--train-backend fsdp "
        "--gradient-checkpointing "
        "--sglang-attention-backend fa3 "
        "--attn-implementation flash_attention_3 "
    )

    wandb_args = (
        "--use-wandb "
        "--wandb-project geo3k-vlm "
        "--wandb-group geo3k-vlm "
        # Expanded by the shell that runs the command, not by Python.
        "--wandb-key ${WANDB_API_KEY} "
        "--disable-wandb-random-suffix "
    )

    misc_args = "--actor-num-nodes 1 " f"--actor-num-gpus-per-node {NUM_GPUS} " "--colocate "

    # misc_args += (
    #     "--use-dynamic-batch-size "
    #     # TODO pick a good value
    #     "--max-tokens-per-gpu 2048 "
    # )

    # true_on_policy_args = (
    #     "--sglang-enable-deterministic-inference "
    #     "--sglang-rl-on-policy-target fsdp "
    #     "--deterministic-mode "
    #     "--true-on-policy-mode "
    # )
    # true_on_policy_envs = {
    #     # TODO note: "Ring" in original RL PR, "allreduce:tree" in SGLang
    #     # "NCCL_ALGO": "Ring",
    #     "NCCL_ALGO": "allreduce:tree",
    #     "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
    #     "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
    # }

    train_args = (
        f"{ckpt_args} "
        f"{rollout_args} "
        f"{optimizer_args} "
        f"{grpo_args} "
        f"{sglang_args} "
        f"{fsdp_args} "
        f"{eval_args} "
        f"{misc_args} "
        f"{wandb_args} "
        # f"{true_on_policy_args} "
    )

    # Kill existing processes (best-effort; trailing `true` keeps exit code 0).
    U.exec_command(
        "pkill -9 sglang; "
        "sleep 3; "
        f"{'' if EXTERNAL_RAY else 'ray stop --force; '}"
        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
        "pkill -9 slime; "
        "sleep 3; "
        f"{'' if EXTERNAL_RAY else 'pkill -9 ray; '}"
        "pkill -9 slime; "
        "pkill -9 redis; "
        "true; "
    )

    if not EXTERNAL_RAY:
        # Start Ray
        U.exec_command(
            # NOTE: was `PYTHONBUFFERED=16`, which Python ignores.
            "export PYTHONUNBUFFERED=1 && "
            f"ray start --head --node-ip-address {MASTER_ADDR} --num-gpus {NUM_GPUS} "
            f"--disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265"
        )

    # Prepare runtime environment (env vars injected into every Ray worker).
    runtime_env_json = json.dumps(
        {
            "env_vars": {
                "CUDA_DEVICE_MAX_CONNECTIONS": "1",
                "NCCL_NVLS_ENABLE": str(has_nvlink),
                # **true_on_policy_envs,
                # "SGLANG_DUMPER_ENABLE": "0",
                # "SGLANG_TEMP_UTILS_ENABLE_DEBUG_PRINT": "0",
            }
        }
    )

    # Submit Ray job
    U.exec_command(
        # NOTE: was `PYTHONBUFFERED=16`, which Python ignores.
        "export no_proxy=127.0.0.1 && export PYTHONUNBUFFERED=1 && "
        f'ray job submit --address="http://127.0.0.1:8265" '
        f"--runtime-env-json='{runtime_env_json}' "
        f"-- python3 /root/slime/train.py "
        f"{train_args}"
    )


if __name__ == "__main__":
    # Download assets first, then launch the Ray training job.
    prepare()
    execute()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ omegaconf
pillow
pylatexenc
pyyaml
qwen_vl_utils # for VLM
ray[default]
ring_flash_attn
sglang-router>=0.2.3
Expand Down
35 changes: 23 additions & 12 deletions slime/backends/fsdp_utils/actor.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import logging
import os
from argparse import Namespace
from itertools import accumulate


import ray
import torch
import torch.distributed as dist
import torch.nn.functional as F
from ring_flash_attn import substitute_hf_flash_attn, update_ring_flash_attn_params
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer
from transformers import AutoConfig

from slime.ray.train_actor import TrainRayActor
from slime.utils import train_dump_utils, train_metric_utils
Expand All @@ -19,6 +19,7 @@
from slime.utils.memory_utils import clear_memory, print_memory
from slime.utils.metric_utils import compute_rollout_step
from slime.utils.ppo_utils import compute_approx_kl, compute_gspo_kl, compute_opsm_mask, compute_policy_loss
from slime.utils.processing_utils import load_processor, load_tokenizer
from slime.utils.ray_utils import Box
from slime.utils.timer import Timer, inverse_timer, timer
from slime.utils.tracking_utils import init_tracking
Expand Down Expand Up @@ -73,16 +74,15 @@ def init(self, args: Namespace, role: str, with_ref: bool = False) -> int: # ty
for i in range(dist.get_world_size()):
if i == dist.get_rank():
self.hf_config = AutoConfig.from_pretrained(self.args.hf_checkpoint, trust_remote_code=True)
self.tokenizer = AutoTokenizer.from_pretrained(self.args.hf_checkpoint, trust_remote_code=True)
self.tokenizer = load_tokenizer(self.args.hf_checkpoint, trust_remote_code=True)
if self.args.multimodal_keys:
self.processor = load_processor(self.args.hf_checkpoint, trust_remote_code=True)
dist.barrier(group=get_gloo_group())

if self.args.multimodal_keys:
self.vlm_processor = AutoProcessor.from_pretrained(self.args.hf_checkpoint, trust_remote_code=True)

init_context = self._get_init_weight_context_manager()

with init_context():
model = AutoModelForCausalLM.from_pretrained(
model = self.get_model_cls().from_pretrained(
self.args.hf_checkpoint,
trust_remote_code=True,
attn_implementation=self.args.attn_implementation,
Expand Down Expand Up @@ -142,6 +142,16 @@ def init(self, args: Namespace, role: str, with_ref: bool = False) -> int: # ty

return int(getattr(self.args, "start_rollout_id", 0))

def get_model_cls(self):
if self.args.multimodal_keys:
from transformers import AutoModelForVision2Seq

return AutoModelForVision2Seq
else:
from transformers import AutoModelForCausalLM

return AutoModelForCausalLM

def _enable_true_on_policy_optimizations(self, args):
if args.true_on_policy_mode:
from sglang.srt.batch_invariant_ops import enable_batch_invariant_mode
Expand Down Expand Up @@ -347,8 +357,6 @@ def _compute_log_prob(
tqdm(packed_batches, desc=f"{store_prefix}log_probs", disable=dist.get_rank() != 0)
):
model_args = self._get_model_inputs_args(batch)
if "pixel_values" in batch:
model_args["pixel_values"] = batch["pixel_values"]
logits = active_model(**model_args).logits.squeeze(0).float()
log_probs_result, entropy_result = get_logprob_and_entropy_with_cp(
logits=logits,
Expand Down Expand Up @@ -436,6 +444,9 @@ def _packed_data(
rollout_log_probs=(
rollout_data["rollout_log_probs"][start:end] if "rollout_log_probs" in rollout_data else None
),
multimodal_inputs=(
rollout_data["multimodal_inputs"][start:end] if "multimodal_inputs" in rollout_data else None
),
num_packs=mbs_size,
)
)
Expand Down Expand Up @@ -783,15 +794,13 @@ def _create_ref_model(self, ref_load_path: str | None):
if ref_load_path is None:
raise ValueError("ref_load_path must be provided when loading reference model")

import os

if os.path.isdir(ref_load_path):
logger.info(f"[Rank {dist.get_rank()}] Creating separate ref model from {ref_load_path}")

init_context = self._get_init_weight_context_manager()

with init_context():
ref_model = AutoModelForCausalLM.from_pretrained(
ref_model = self.get_model_cls().from_pretrained(
ref_load_path,
trust_remote_code=True,
attn_implementation=self.args.attn_implementation,
Expand Down Expand Up @@ -828,6 +837,8 @@ def _get_model_inputs_args(self, packed_sequence: dict) -> dict:
"position_ids": position_ids,
"attention_mask": None,
}
if packed_sequence.get("multimodal_inputs"):
model_args.update(packed_sequence["multimodal_inputs"])
return model_args


Expand Down
Loading