From 1a30cdd9149919493c0a6bf04766322088bdda59 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Fri, 4 Jul 2025 14:40:37 +0800
Subject: [PATCH] rename mixed_sampler to enable_mixed_sampler

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/_torch/auto_deploy/llm_args.py         | 2 +-
 tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py | 4 ++--
 tensorrt_llm/_torch/pyexecutor/_util.py             | 6 +++---
 tensorrt_llm/_torch/pyexecutor/config.py            | 2 +-
 tensorrt_llm/_torch/pyexecutor/sampler.py           | 8 ++++----
 tensorrt_llm/llmapi/llm_args.py                     | 4 ++--
 tensorrt_llm/scaffolding/worker.py                  | 2 +-
 tests/unittest/api_stability/references/llm.yaml    | 2 +-
 8 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py
index a056bcdd30b..ba6ad81595b 100644
--- a/tensorrt_llm/_torch/auto_deploy/llm_args.py
+++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -79,7 +79,7 @@ class LlmArgs(BaseLlmArgs):
         repr=False,
     )
 
-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description="If true, will iterate over sampling_params of each request and use the corresponding "
         "sampling strategy, e.g. top-k, top-p, etc.",
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
index 8b0437b15ed..6aca50ba844 100644
--- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
+++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -293,7 +293,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
     scheduler = SimpleScheduler(capacitor_scheduler, mb_scheduler)
 
     # search sampler with speculative decoding
-    # TODO (lucaslie, fridah-nv): some models require mixed_sampler=True to have good outputs, see
+    # TODO (lucaslie, fridah-nv): some models require enable_mixed_sampler=True to have good outputs, see
     # https://github.com/NVIDIA/TensorRT-LLM/issues/5254
     # We should expose mixed_sample to our build_and_run_ad script so we can configure this
     # correctly for models as needed.
@@ -302,7 +302,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
         max_draft_tokens=max_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
-        mixed_sampler=ad_config.mixed_sampler,
+        enable_mixed_sampler=ad_config.enable_mixed_sampler,
     )
     sampler = TorchSampler(sampler_args)
 
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 28e2fec867a..5494c53b56d 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -546,7 +546,7 @@ def create_py_executor_instance(
 
 
 def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
-                              *, max_seq_len: int, mixed_sampler: bool):
+                              *, max_seq_len: int, enable_mixed_sampler: bool):
     max_num_sequences = executor_config.max_batch_size * mapping.pp_size
     max_draft_tokens = (0 if executor_config.speculative_config is None else
                         executor_config.speculative_config.max_draft_tokens)
@@ -555,7 +555,7 @@ def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
         max_draft_tokens=max_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
-        mixed_sampler=mixed_sampler,
+        enable_mixed_sampler=enable_mixed_sampler,
     )
 
 
@@ -567,7 +567,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
         executor_config,
         mapping,
         max_seq_len=engine.max_seq_len,
-        mixed_sampler=pytorch_backend_config.mixed_sampler)
+        enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
     if mapping.cp_config.get('cp_type') == 'star_attention':
         assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
         return TorchSampler(sampler_args)
diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py
index e8fe9de34a0..52d760c36c6 100644
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -48,7 +48,7 @@ class PyTorchConfig:
     attn_backend: str = 'TRTLLM'
     moe_backend: str = 'CUTLASS'
 
-    mixed_sampler: bool = False
+    enable_mixed_sampler: bool = False
     """
     If true, will iterate over sampling_params of each request and use the corresponding
     sampling strategy, e.g. top-k, top-p, etc.
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
index 4ce421ea031..c7c3e3357d1 100644
--- a/tensorrt_llm/_torch/pyexecutor/sampler.py
+++ b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -225,11 +225,11 @@ class Args:
         max_draft_tokens: int
         max_num_sequences: int
         max_beam_width: int
-        mixed_sampler: bool
+        enable_mixed_sampler: bool
 
     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
-        self.mixed_sampler = args.mixed_sampler
+        self.enable_mixed_sampler = args.enable_mixed_sampler
         self.max_tokens = args.max_draft_tokens + 1
         assert args.max_beam_width == self.MAX_BEAM_WIDTH, "TorchSampler only supports beam_width = 1"
         self.num_seq_slots = args.max_num_sequences
@@ -406,7 +406,7 @@ def _process_requests(self,
         num_steps = [1 + len(req.py_draft_tokens) for req in requests]
         sum_steps = sum(num_steps)
         no_draft_tokens = len(requests) == sum_steps
-        fast_path = not self.mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
+        fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
 
         seq_slots = torch.as_tensor([r.seq_slot for r in requests])
         seq_slots = seq_slots.to(device="cuda", non_blocking=True)
@@ -423,7 +423,7 @@ def _process_requests(self,
         strategies = sampling_strategies(requests)
         batched_next_tokens, batched_softmax = None, None
         batched_strategy: Strategy | None = GREEDY
-        if self.mixed_sampler:
+        if self.enable_mixed_sampler:
             assert "d2t" not in model_outputs, "eagle3 does not yet support non-greedy sampling"
             if len(set(strategies)) == 1:
                 batched_strategy = strategies[0]
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index dbb4de7ded7..0cfd6e95eaf 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1702,7 +1702,7 @@ class TorchLlmArgs(BaseLlmArgs):
     moe_backend: str = Field(default='CUTLASS',
                              description="MoE backend to use.")
 
-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description=
         "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1918,7 +1918,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             moe_load_balancer=self.moe_load_balancer,
             attn_backend=self.attn_backend,
             moe_backend=self.moe_backend,
-            mixed_sampler=self.mixed_sampler,
+            enable_mixed_sampler=self.enable_mixed_sampler,
             enable_trtllm_sampler=self.enable_trtllm_sampler,
             kv_cache_dtype=self.kv_cache_dtype,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
diff --git a/tensorrt_llm/scaffolding/worker.py b/tensorrt_llm/scaffolding/worker.py
index 1aa59ce5113..c48af14753e 100644
--- a/tensorrt_llm/scaffolding/worker.py
+++ b/tensorrt_llm/scaffolding/worker.py
@@ -167,7 +167,7 @@ def init_with_new_llm(
 
         llm = LLM(model_dir,
                   tokenizer=tokenizer,
-                  mixed_sampler=True,
+                  enable_mixed_sampler=True,
                   disable_overlap_scheduler=disable_overlap_scheduler,
                   kv_cache_config=kv_cache_config,
                   max_batch_size=max_batch_size,
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
index 95d1f58147c..665b09950d0 100644
--- a/tests/unittest/api_stability/references/llm.yaml
+++ b/tests/unittest/api_stability/references/llm.yaml
@@ -81,7 +81,7 @@ methods:
       moe_backend:
        annotation: str
        default: CUTLASS
-      mixed_sampler:
+      enable_mixed_sampler:
        annotation: bool
        default: False
      enable_trtllm_sampler:
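For context, `enable_mixed_sampler` is an ordinary keyword argument of the PyTorch-backend LLM API, as the scaffolding worker change above shows. Below is a minimal usage sketch, not part of the patch; the model path, prompt, and sampling values are placeholders:

    from tensorrt_llm import LLM, SamplingParams

    # Assumed usage after this rename: `enable_mixed_sampler` replaces the old
    # `mixed_sampler` keyword. With it enabled, TorchSampler follows each
    # request's SamplingParams (top-k, top-p, temperature, ...) instead of the
    # batched greedy strategy it otherwise applies.
    llm = LLM(
        model="/path/to/model",  # placeholder checkpoint directory
        enable_mixed_sampler=True,
    )

    outputs = llm.generate(
        ["Hello, my name is"],
        sampling_params=SamplingParams(top_p=0.9, temperature=0.8),
    )
    print(outputs[0].outputs[0].text)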