2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -79,7 +79,7 @@ class LlmArgs(BaseLlmArgs):
repr=False,
)

- mixed_sampler: bool = Field(
+ enable_mixed_sampler: bool = Field(
default=False,
description="If true, will iterate over sampling_params of each request and use the corresponding "
"sampling strategy, e.g. top-k, top-p, etc.",
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -293,7 +293,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
scheduler = SimpleScheduler(capacitor_scheduler, mb_scheduler)

# search sampler with speculative decoding
- # TODO (lucaslie, fridah-nv): some models require mixed_sampler=True to have good outputs, see
+ # TODO (lucaslie, fridah-nv): some models require enable_mixed_sampler=True to have good outputs, see
# https://github.com/NVIDIA/TensorRT-LLM/issues/5254
# We should expose mixed_sample to our build_and_run_ad script so we can configure this
# correctly for models as needed.
@@ -302,7 +302,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
max_draft_tokens=max_draft_tokens,
max_num_sequences=max_num_sequences,
max_beam_width=executor_config.max_beam_width,
- mixed_sampler=ad_config.mixed_sampler,
+ enable_mixed_sampler=ad_config.enable_mixed_sampler,
)
sampler = TorchSampler(sampler_args)

6 changes: 3 additions & 3 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -546,7 +546,7 @@ def create_py_executor_instance(


def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
- *, max_seq_len: int, mixed_sampler: bool):
+ *, max_seq_len: int, enable_mixed_sampler: bool):
max_num_sequences = executor_config.max_batch_size * mapping.pp_size
max_draft_tokens = (0 if executor_config.speculative_config is None else
executor_config.speculative_config.max_draft_tokens)
@@ -555,7 +555,7 @@ def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
max_draft_tokens=max_draft_tokens,
max_num_sequences=max_num_sequences,
max_beam_width=executor_config.max_beam_width,
- mixed_sampler=mixed_sampler,
+ enable_mixed_sampler=enable_mixed_sampler,
)


@@ -567,7 +567,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
executor_config,
mapping,
max_seq_len=engine.max_seq_len,
- mixed_sampler=pytorch_backend_config.mixed_sampler)
+ enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
if mapping.cp_config.get('cp_type') == 'star_attention':
assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
return TorchSampler(sampler_args)
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/pyexecutor/config.py
@@ -48,7 +48,7 @@ class PyTorchConfig:
attn_backend: str = 'TRTLLM'
moe_backend: str = 'CUTLASS'

- mixed_sampler: bool = False
+ enable_mixed_sampler: bool = False
"""
If true, will iterate over sampling_params of each request and use the
corresponding sampling strategy, e.g. top-k, top-p, etc.
8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -225,11 +225,11 @@ class Args:
max_draft_tokens: int
max_num_sequences: int
max_beam_width: int
- mixed_sampler: bool
+ enable_mixed_sampler: bool

def __init__(self, args: Args):
self.max_seq_len = args.max_seq_len
- self.mixed_sampler = args.mixed_sampler
+ self.enable_mixed_sampler = args.enable_mixed_sampler
self.max_tokens = args.max_draft_tokens + 1
assert args.max_beam_width == self.MAX_BEAM_WIDTH, "TorchSampler only supports beam_width = 1"
self.num_seq_slots = args.max_num_sequences
@@ -406,7 +406,7 @@ def _process_requests(self,
num_steps = [1 + len(req.py_draft_tokens) for req in requests]
sum_steps = sum(num_steps)
no_draft_tokens = len(requests) == sum_steps
- fast_path = not self.mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
+ fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None

seq_slots = torch.as_tensor([r.seq_slot for r in requests])
seq_slots = seq_slots.to(device="cuda", non_blocking=True)
@@ -423,7 +423,7 @@ def _process_requests(self,
strategies = sampling_strategies(requests)
batched_next_tokens, batched_softmax = None, None
batched_strategy: Strategy | None = GREEDY
- if self.mixed_sampler:
+ if self.enable_mixed_sampler:
assert "d2t" not in model_outputs, "eagle3 does not yet support non-greedy sampling"
if len(set(strategies)) == 1:
batched_strategy = strategies[0]
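To illustrate the dispatch this flag gates, here is a simplified, self-contained sketch (the `_Params` stand-in and helper names are hypothetical; the actual logic lives in `TorchSampler._process_requests`): with the flag off, every request takes the single batched greedy path, while with it on, each request's own sampling params pick the strategy.

```python
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

@dataclass
class _Params:
    """Hypothetical stand-in for a request's SamplingParams."""
    top_k: Optional[int] = None
    top_p: Optional[float] = None

Strategy = Union[str, Tuple[str, float]]

def pick_strategies(params_per_request: List[_Params],
                    enable_mixed_sampler: bool) -> List[Strategy]:
    # Flag off: one batched greedy step for the whole batch (the "fast path"
    # also requires no draft tokens and no gen-logits/log-probs hosts).
    if not enable_mixed_sampler:
        return ["greedy"] * len(params_per_request)
    # Flag on: honor each request's own sampling params.
    strategies: List[Strategy] = []
    for p in params_per_request:
        if p.top_k is not None:
            strategies.append(("top_k", p.top_k))
        elif p.top_p is not None:
            strategies.append(("top_p", p.top_p))
        else:
            strategies.append("greedy")
    return strategies

# One greedy request and one top-p request in the same batch.
print(pick_strategies([_Params(), _Params(top_p=0.9)], enable_mixed_sampler=True))
# -> ['greedy', ('top_p', 0.9)]
```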
4 changes: 2 additions & 2 deletions tensorrt_llm/llmapi/llm_args.py
@@ -1702,7 +1702,7 @@ class TorchLlmArgs(BaseLlmArgs):
moe_backend: str = Field(default='CUTLASS',
description="MoE backend to use.")

- mixed_sampler: bool = Field(
+ enable_mixed_sampler: bool = Field(
default=False,
description=
"If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1918,7 +1918,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
moe_load_balancer=self.moe_load_balancer,
attn_backend=self.attn_backend,
moe_backend=self.moe_backend,
- mixed_sampler=self.mixed_sampler,
+ enable_mixed_sampler=self.enable_mixed_sampler,
enable_trtllm_sampler=self.enable_trtllm_sampler,
kv_cache_dtype=self.kv_cache_dtype,
enable_iter_perf_stats=self.enable_iter_perf_stats,
2 changes: 1 addition & 1 deletion tensorrt_llm/scaffolding/worker.py
@@ -167,7 +167,7 @@ def init_with_new_llm(

llm = LLM(model_dir,
tokenizer=tokenizer,
- mixed_sampler=True,
+ enable_mixed_sampler=True,
disable_overlap_scheduler=disable_overlap_scheduler,
kv_cache_config=kv_cache_config,
max_batch_size=max_batch_size,
2 changes: 1 addition & 1 deletion tests/unittest/api_stability/references/llm.yaml
@@ -81,7 +81,7 @@ methods:
moe_backend:
annotation: str
default: CUTLASS
- mixed_sampler:
+ enable_mixed_sampler:
annotation: bool
default: False
enable_trtllm_sampler: