Commit b36f07d

change mixed_sampler
Signed-off-by: Superjomn <[email protected]>
1 parent c434147 commit b36f07d

File tree

8 files changed, +15 -15 lines changed


tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ class LlmArgs(BaseLlmArgs):
         repr=False,
     )
 
-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description="If true, will iterate over sampling_params of each request and use the corresponding "
         "sampling strategy, e.g. top-k, top-p, etc.",

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 2 additions & 2 deletions
@@ -291,7 +291,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
     scheduler = SimpleScheduler(capacitor_scheduler, mb_scheduler)
 
     # search sampler with speculative decoding
-    # TODO (lucaslie, fridah-nv): some models require mixed_sampler=True to have good outputs, see
+    # TODO (lucaslie, fridah-nv): some models require enable_mixed_sampler=True to have good outputs, see
     # https://github.com/NVIDIA/TensorRT-LLM/issues/5254
     # We should expose mixed_sample to our build_and_run_ad script so we can configure this
     # correctly for models as needed.
@@ -300,7 +300,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
         max_draft_tokens=max_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
-        mixed_sampler=ad_config.mixed_sampler,
+        enable_mixed_sampler=ad_config.enable_mixed_sampler,
     )
     sampler = TorchSampler(sampler_args)

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 3 additions & 3 deletions
@@ -534,7 +534,7 @@ def create_py_executor_instance(
 
 
 def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
-                              *, max_seq_len: int, mixed_sampler: bool):
+                              *, max_seq_len: int, enable_mixed_sampler: bool):
     max_num_sequences = executor_config.max_batch_size * mapping.pp_size
     max_draft_tokens = (0 if executor_config.speculative_config is None else
                         executor_config.speculative_config.max_draft_tokens)
@@ -543,7 +543,7 @@ def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
         max_draft_tokens=max_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
-        mixed_sampler=mixed_sampler,
+        enable_mixed_sampler=enable_mixed_sampler,
     )
 
 
@@ -555,7 +555,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
         executor_config,
         mapping,
         max_seq_len=engine.max_seq_len,
-        mixed_sampler=pytorch_backend_config.mixed_sampler)
+        enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
     if mapping.cp_config.get('cp_type') == 'star_attention':
         assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
         return TorchSampler(sampler_args)
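
Taken together with the config change below, the renamed flag threads from PyTorchConfig through create_torch_sampler_args into TorchSampler.Args. A rough sketch of that flow, assuming executor_config, mapping, and engine objects already exist (placeholder setup, not code from this commit):

    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

    # Sketch only: other PyTorchConfig fields are left at their defaults.
    pytorch_backend_config = PyTorchConfig(enable_mixed_sampler=True)

    sampler_args = create_torch_sampler_args(
        executor_config,                 # assumed to exist
        mapping,                         # assumed to exist
        max_seq_len=engine.max_seq_len,  # assumed to exist
        enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
    sampler = TorchSampler(sampler_args)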

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ class PyTorchConfig:
     attn_backend: str = 'TRTLLM'
     moe_backend: str = 'CUTLASS'
 
-    mixed_sampler: bool = False
+    enable_mixed_sampler: bool = False
     """
     If true, will iterate over sampling_params of each request and use the
    corresponding sampling strategy, e.g. top-k, top-p, etc.

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 4 additions & 4 deletions
@@ -221,11 +221,11 @@ class Args:
         max_draft_tokens: int
         max_num_sequences: int
         max_beam_width: int
-        mixed_sampler: bool
+        enable_mixed_sampler: bool
 
     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
-        self.mixed_sampler = args.mixed_sampler
+        self.enable_mixed_sampler = args.enable_mixed_sampler
         self.max_tokens = args.max_draft_tokens + 1
         assert args.max_beam_width == self.MAX_BEAM_WIDTH, "TorchSampler only supports beam_width = 1"
         self.num_seq_slots = args.max_num_sequences
@@ -402,7 +402,7 @@ def _process_requests(self,
         num_steps = [1 + len(req.py_draft_tokens) for req in requests]
         sum_steps = sum(num_steps)
         no_draft_tokens = len(requests) == sum_steps
-        fast_path = not self.mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
+        fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
 
         seq_slots = torch.as_tensor([r.seq_slot for r in requests])
         seq_slots = seq_slots.to(device="cuda", non_blocking=True)
@@ -419,7 +419,7 @@
         strategies = sampling_strategies(requests)
         batched_next_tokens, batched_softmax = None, None
         batched_strategy: Strategy | None = GREEDY
-        if self.mixed_sampler:
+        if self.enable_mixed_sampler:
             assert "d2t" not in model_outputs, "eagle3 does not yet support non-greedy sampling"
             if len(set(strategies)) == 1:
                 batched_strategy = strategies[0]
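
Conceptually, the flag chooses between one batched greedy pass (the fast path above) and per-request strategy dispatch driven by each request's sampling_params. A simplified, self-contained sketch of that decision, using stand-in types rather than the actual TorchSampler internals:

    from dataclasses import dataclass

    @dataclass
    class _FakeParams:
        # Stand-in for a request's sampling_params.
        temperature: float = 0.0
        top_k: int = 0

    def _strategy_for(params: _FakeParams) -> str:
        # Greedy at temperature 0, otherwise a top-k strategy.
        return "greedy" if params.temperature == 0.0 else f"top_k({params.top_k})"

    def pick_strategies(requests, enable_mixed_sampler: bool):
        if not enable_mixed_sampler:
            # Fast path: a single strategy for the whole batch.
            return ["greedy"] * len(requests)
        # Mixed path: honor each request's own sampling parameters.
        return [_strategy_for(r) for r in requests]

    batch = [_FakeParams(), _FakeParams(temperature=0.8, top_k=50)]
    print(pick_strategies(batch, enable_mixed_sampler=False))  # ['greedy', 'greedy']
    print(pick_strategies(batch, enable_mixed_sampler=True))   # ['greedy', 'top_k(50)']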

tensorrt_llm/llmapi/llm_args.py

Lines changed: 2 additions & 2 deletions
@@ -1702,7 +1702,7 @@ class TorchLlmArgs(BaseLlmArgs):
     moe_backend: str = Field(default='CUTLASS',
                              description="MoE backend to use.")
 
-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description=
         "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1918,7 +1918,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             moe_load_balancer=self.moe_load_balancer,
             attn_backend=self.attn_backend,
             moe_backend=self.moe_backend,
-            mixed_sampler=self.mixed_sampler,
+            enable_mixed_sampler=self.enable_mixed_sampler,
             enable_trtllm_sampler=self.enable_trtllm_sampler,
             kv_cache_dtype=self.kv_cache_dtype,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
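
At the user-facing LLM API level the argument is simply renamed. A minimal usage sketch with a placeholder model path, assuming the PyTorch backend and that llm.generate accepts a list of per-request SamplingParams:

    from tensorrt_llm import LLM, SamplingParams

    # Placeholder model path; not part of this commit.
    llm = LLM(model="/path/to/model",
              enable_mixed_sampler=True)   # was: mixed_sampler=True

    # With mixed sampling enabled, each request's SamplingParams is honored.
    params_a = SamplingParams(max_tokens=32)                              # defaults
    params_b = SamplingParams(max_tokens=32, temperature=0.8, top_p=0.9)  # stochastic

    outputs = llm.generate(["Hello,", "Write a haiku about GPUs."],
                           sampling_params=[params_a, params_b])
    for out in outputs:
        print(out.outputs[0].text)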

tensorrt_llm/scaffolding/worker.py

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ def init_with_new_llm(
 
         llm = LLM(model_dir,
                   tokenizer=tokenizer,
-                  mixed_sampler=True,
+                  enable_mixed_sampler=True,
                   disable_overlap_scheduler=disable_overlap_scheduler,
                   kv_cache_config=kv_cache_config,
                   max_batch_size=max_batch_size,

tests/unittest/api_stability/references/llm.yaml

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ methods:
     moe_backend:
       annotation: str
       default: CUTLASS
-    mixed_sampler:
+    enable_mixed_sampler:
       annotation: bool
       default: False
     enable_trtllm_sampler:
