vllm-project · shanjiaz · May 26, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/docs/cli/prepare_data.md b/docs/cli/prepare_data.md
@@ -26,6 +26,8 @@ python scripts/prepare_data.py \
 
   Example: `meta-llama/Llama-3.1-8B-Instruct`
 
+- **`--trust-remote-code`** (flag) Allow executing code from HF Hub when loading the target model's processor.
+
 ### Data Arguments
 
 - **`--data`** (str, required, repeatable) Path to training data. Can be a HuggingFace dataset name or local path. Use multiple times to specify multiple datasets.

diff --git a/docs/cli/train.md b/docs/cli/train.md
@@ -32,6 +32,8 @@ torchrun --standalone --nproc_per_node=4 scripts/train.py \
 
 - **`--verifier-name-or-path`** (str, required) HuggingFace model ID or local path for the verifier/target model.
 
+- **`--trust-remote-code`** (flag) Allow executing code from HF Hub when loading the verifier's tokenizer.
+
 - **`--speculator-type`** (str, default: `"eagle3"`) Type of speculator model to train. Options: `eagle3`, `dflash`
 
 - **`--from-pretrained`** (str, default: `""`) Path to a pretrained draft model to finetune.

diff --git a/pyproject.toml b/pyproject.toml
@@ -50,6 +50,8 @@ dependencies = [
     "safetensors",
     "setuptools",
     "torch>=2.9.0,<=2.11.0",
+    "torchaudio",
+    "torchvision",
     "tqdm>=4.66.3,<=4.67.3",
     "transformers>=4.56.1,<5.9.0",
     "typer>=0.12.0",
@@ -249,6 +251,7 @@ select = [
     "PTH", # os.path is acceptable in scripts
     "T201", # print statements are acceptable in scripts
     "SLF001", # allow private member access for model configuration
+    "PLR0915", # allow long parse_args functions
 ]
 
 "examples/**/*.py" = [

diff --git a/scripts/data_generation_offline.py b/scripts/data_generation_offline.py
@@ -31,6 +31,7 @@
     DEFAULT_REQUEST_TIMEOUT,
     generate_hidden_states_async,
 )
+from speculators.train.data import build_client_item
 from speculators.train.logger import setup_root_logger
 
 logger = logging.getLogger(__name__)
@@ -66,8 +67,8 @@ def parse_args():
         type=str,
         default=None,
         help=(
-            "HuggingFace model ID or local path for target model (default auto select)."
-            "For verification purposes only."
+            "HuggingFace model ID or local path for target model "
+            "(default auto select). For verification purposes only."
         ),
     )
     parser.add_argument(
@@ -113,16 +114,16 @@ def parse_args():
         type=int,
         default=32,
         help=(
-            "Number of active vLLM requests at a time."
+            "Number of active vLLM requests at a time. "
             "Note: number of async workers set to 2*concurrency"
         ),
     )
     parser.add_argument(
         "--validate-outputs",
         action="store_true",
         help=(
-            "Load generated safetensor files and check output token ids match prompt"
-            " tokens and hidden states seq_len matches num tokens"
+            "Load generated safetensor files and check output token ids match "
+            "prompt tokens and hidden states seq_len matches num tokens"
         ),
     )
     parser.add_argument(
@@ -276,16 +277,14 @@ async def worker(
             queue.task_done()
             continue
 
-        input_ids = item["input_ids"].tolist()
-
         target_hidden_states_path = hidden_states_output_dir / f"hs_{idx}.safetensors"
 
         try:
             async with vllm_semaphore:  # Limit number of active generate calls
                 hidden_states_path = await generate_hidden_states_async(
                     client,
                     model,
-                    input_ids,
+                    item,
                     timeout=request_timeout,
                     max_retries=max_retries,
                 )
@@ -295,7 +294,9 @@ async def worker(
                 )
                 if validate_outputs:
                     await asyncio.to_thread(
-                        check_safetensors_file, target_hidden_states_path, input_ids
+                        check_safetensors_file,
+                        target_hidden_states_path,
+                        item["input_ids"],
                     )
         except Exception as e:
             if fail_on_error:
@@ -325,12 +326,15 @@ async def _feed_queue(to_process, dataset, queue, cancel_event):
     for i in to_process:
         if cancel_event.is_set():
             break
-        item = dataset[i]
+
+        dataset_item = dataset[i]
+        client_item = build_client_item(dataset_item) | {"idx": i}
+
         # Check cancel_event while waiting for queue space to avoid
         # deadlocking when all workers have died.
         while not cancel_event.is_set():
             try:
-                queue.put_nowait({"idx": i, "input_ids": item["input_ids"]})
+                queue.put_nowait(client_item)
                 break
             except asyncio.QueueFull:
                 await asyncio.sleep(0.1)
@@ -397,7 +401,7 @@ async def generate_and_save_hidden_states(args, dataset):
         if args.model and args.model != model_id:
             raise ValueError(
                 f"An explicit model name was passed ({args.model}) which doesn't match"
-                "found model_id {model_id}."
+                f" found model_id {model_id}. "
                 "Please make sure --endpoint is set to the correct vllm instance."
             )
 

diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py
@@ -49,6 +49,14 @@ def parse_args():
         required=True,
         help="HuggingFace model ID or local path for target model",
     )
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help=(
+            "Allow executing code from HF Hub when loading the target model's "
+            "processor."
+        ),
+    )
 
     # Data arguments
     parser.add_argument(
@@ -75,7 +83,7 @@ def parse_args():
         type=str,
         default=None,
         help=(
-            "Path to save token frequency distribution"
+            "Path to save token frequency distribution "
             "(default: args.output / 'token_freq.pt')"
         ),
     )
@@ -177,6 +185,7 @@ def main():
         assistant_pattern=args.assistant_pattern,
         turn_dropout=args.turn_dropout,
         minimum_valid_tokens=args.minimum_valid_tokens,
+        trust_remote_code=args.trust_remote_code,
     )
 
     log.info("Done preparing data")

diff --git a/scripts/train.py b/scripts/train.py
@@ -259,6 +259,7 @@ def main(args: argparse.Namespace):
         args.verifier_name_or_path,
         transformer_layer_config.vocab_size,
         args.mask_token_id,
+        trust_remote_code=args.trust_remote_code,
     )
 
     registry = SpeculatorModel.registry
@@ -398,6 +399,11 @@ def _checkpoint_freq(value: str) -> float:
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--verifier-name-or-path", type=str, required=True)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Allow executing code from HF Hub when loading the verifier's tokenizer.",
+    )
     parser.add_argument(
         "--speculator-type",
         type=str,

diff --git a/src/speculators/data_generation/configs.py b/src/speculators/data_generation/configs.py
@@ -1,5 +1,6 @@
 """Configuration registries for data generation pipeline."""
 
+import os
 from collections.abc import Callable
 from dataclasses import dataclass
 
@@ -9,14 +10,15 @@
 ]
 
 
-@dataclass
+@dataclass(kw_only=True)
 class DatasetConfig:
     """Configuration for loading a dataset"""
 
     name: str
     hf_path: str
-    split: str
     subset: str | None = None
+    split: str
+    filter_fn: Callable[[dict], bool] | None = None
     normalize_fn: Callable[[dict], dict] | None = None
 
 
@@ -35,6 +37,60 @@ def _normalize_gsm8k(example: dict) -> dict:
     }
 
 
+def get_coco_dir() -> str:
+    return os.getenv("COCO_DIR") or "coco/"  # unset or empty COCO_DIR -> "coco/"
+
+
+def _parse_sharegpt4v_part(part: str, image_path: str) -> dict:
+    if part == "<image>":  # the literal placeholder becomes an image part
+        return {"type": "image", "path": image_path}
+
+    return {"type": "text", "text": part}
+
+
+def _parse_sharegpt4v_user_content(content: str, image_path: str) -> list[dict]:
+    return [_parse_sharegpt4v_part(part, image_path) for part in content.split("\n")]
+
+
+def _parse_sharegpt4v_assistant_content(content: str) -> list[dict]:
+    return [{"type": "text", "text": content}]
+
+
+def _filter_sharegpt4v_coco(example: dict) -> bool:
+    return example["image"].startswith("coco/")
+
+
+def _normalize_sharegpt4v_coco(example: dict) -> dict:
+    coco_dir = get_coco_dir()
+    image_path = os.path.join(coco_dir, example["image"].removeprefix("coco/"))
+
+    if not os.path.exists(image_path):
+        state_str = "set to" if os.getenv("COCO_DIR") else "default"
+
+        raise ValueError(
+            f"No image found at <{image_path}>. "
+            f"Please download COCO 2017 Train Images from "
+            f"<http://images.cocodataset.org/zips/train2017.zip> and place the "
+            f"extracted folder under `COCO_DIR` ({state_str}: `{coco_dir}`)."
+        )
+
+    messages = [
+        (
+            turn
+            | {
+                "value": (
+                    _parse_sharegpt4v_user_content(turn["value"], image_path)
+                    if turn["from"] in ("human", "user")
+                    else _parse_sharegpt4v_assistant_content(turn["value"])
+                )
+            }
+        )
+        for turn in example["conversations"]
+    ]
+
+    return {"conversations": messages}
+
+
 DATASET_CONFIGS: dict[str, DatasetConfig] = {
     "sharegpt": DatasetConfig(
         name="sharegpt",
@@ -50,8 +106,17 @@ def _normalize_gsm8k(example: dict) -> dict:
     "gsm8k": DatasetConfig(
         name="gsm8k",
         hf_path="openai/gsm8k",
-        split="train",
         subset="main",
+        split="train",
         normalize_fn=_normalize_gsm8k,
     ),
+    # NOTE: You need to serve vLLM with `--allowed-local-media-path /path/to/coco`
+    "sharegpt4v_coco": DatasetConfig(
+        name="sharegpt4v_coco",
+        hf_path="Lin-Chen/ShareGPT4V",
+        subset="ShareGPT4V",
+        split="train",
+        filter_fn=_filter_sharegpt4v_coco,
+        normalize_fn=_normalize_sharegpt4v_coco,
+    ),
 }