Commit b36f07d

change mixed_sampler
Signed-off-by: Superjomn <[email protected]>
1 parent c434147 commit b36f07d

File tree

8 files changed, +15 -15 lines changed


tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ class LlmArgs(BaseLlmArgs):
         repr=False,
     )
 
-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description="If true, will iterate over sampling_params of each request and use the corresponding "
         "sampling strategy, e.g. top-k, top-p, etc.",

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 2 additions & 2 deletions
@@ -291,7 +291,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
     scheduler = SimpleScheduler(capacitor_scheduler, mb_scheduler)
 
     # search sampler with speculative decoding
-    # TODO (lucaslie, fridah-nv): some models require mixed_sampler=True to have good outputs, see
+    # TODO (lucaslie, fridah-nv): some models require enable_mixed_sampler=True to have good outputs, see
     # https://github.com/NVIDIA/TensorRT-LLM/issues/5254
     # We should expose mixed_sample to our build_and_run_ad script so we can configure this
     # correctly for models as needed.
@@ -300,7 +300,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
         max_draft_tokens=max_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
-        mixed_sampler=ad_config.mixed_sampler,
+        enable_mixed_sampler=ad_config.enable_mixed_sampler,
     )
     sampler = TorchSampler(sampler_args)

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 3 additions & 3 deletions
@@ -534,7 +534,7 @@ def create_py_executor_instance(
 
 
 def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
-                              *, max_seq_len: int, mixed_sampler: bool):
+                              *, max_seq_len: int, enable_mixed_sampler: bool):
     max_num_sequences = executor_config.max_batch_size * mapping.pp_size
     max_draft_tokens = (0 if executor_config.speculative_config is None else
                         executor_config.speculative_config.max_draft_tokens)
@@ -543,7 +543,7 @@ def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
         max_draft_tokens=max_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
-        mixed_sampler=mixed_sampler,
+        enable_mixed_sampler=enable_mixed_sampler,
     )
 
 
@@ -555,7 +555,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
         executor_config,
         mapping,
         max_seq_len=engine.max_seq_len,
-        mixed_sampler=pytorch_backend_config.mixed_sampler)
+        enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
     if mapping.cp_config.get('cp_type') == 'star_attention':
         assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
         return TorchSampler(sampler_args)
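
Taken together with the config change below, the renamed flag threads from PyTorchConfig through create_torch_sampler_args into TorchSampler.Args. A rough sketch of that flow, assuming executor_config, mapping, and engine objects already exist (placeholder setup, not code from this commit):

    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

    # Sketch only: other PyTorchConfig fields are left at their defaults.
    pytorch_backend_config = PyTorchConfig(enable_mixed_sampler=True)

    sampler_args = create_torch_sampler_args(
        executor_config,                 # assumed to exist
        mapping,                         # assumed to exist
        max_seq_len=engine.max_seq_len,  # assumed to exist
        enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
    sampler = TorchSampler(sampler_args)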

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ class PyTorchConfig:
     attn_backend: str = 'TRTLLM'
     moe_backend: str = 'CUTLASS'
 
-    mixed_sampler: bool = False
+    enable_mixed_sampler: bool = False
     """
     If true, will iterate over sampling_params of each request and use the
    corresponding sampling strategy, e.g. top-k, top-p, etc.

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 4 additions & 4 deletions
@@ -221,11 +221,11 @@ class Args:
         max_draft_tokens: int
         max_num_sequences: int
         max_beam_width: int
-        mixed_sampler: bool
+        enable_mixed_sampler: bool
 
     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
-        self.mixed_sampler = args.mixed_sampler
+        self.enable_mixed_sampler = args.enable_mixed_sampler
         self.max_tokens = args.max_draft_tokens + 1
         assert args.max_beam_width == self.MAX_BEAM_WIDTH, "TorchSampler only supports beam_width = 1"
         self.num_seq_slots = args.max_num_sequences
@@ -402,7 +402,7 @@ def _process_requests(self,
         num_steps = [1 + len(req.py_draft_tokens) for req in requests]
         sum_steps = sum(num_steps)
         no_draft_tokens = len(requests) == sum_steps
-        fast_path = not self.mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
+        fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
 
         seq_slots = torch.as_tensor([r.seq_slot for r in requests])
         seq_slots = seq_slots.to(device="cuda", non_blocking=True)
@@ -419,7 +419,7 @@
         strategies = sampling_strategies(requests)
         batched_next_tokens, batched_softmax = None, None
         batched_strategy: Strategy | None = GREEDY
-        if self.mixed_sampler:
+        if self.enable_mixed_sampler:
             assert "d2t" not in model_outputs, "eagle3 does not yet support non-greedy sampling"
             if len(set(strategies)) == 1:
                 batched_strategy = strategies[0]
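
Conceptually, the flag chooses between one batched greedy pass (the fast path above) and per-request strategy dispatch driven by each request's sampling_params. A simplified, self-contained sketch of that decision, using stand-in types rather than the actual TorchSampler internals:

    from dataclasses import dataclass

    @dataclass
    class _FakeParams:
        # Stand-in for a request's sampling_params.
        temperature: float = 0.0
        top_k: int = 0

    def _strategy_for(params: _FakeParams) -> str:
        # Greedy at temperature 0, otherwise a top-k strategy.
        return "greedy" if params.temperature == 0.0 else f"top_k({params.top_k})"

    def pick_strategies(requests, enable_mixed_sampler: bool):
        if not enable_mixed_sampler:
            # Fast path: a single strategy for the whole batch.
            return ["greedy"] * len(requests)
        # Mixed path: honor each request's own sampling parameters.
        return [_strategy_for(r) for r in requests]

    batch = [_FakeParams(), _FakeParams(temperature=0.8, top_k=50)]
    print(pick_strategies(batch, enable_mixed_sampler=False))  # ['greedy', 'greedy']
    print(pick_strategies(batch, enable_mixed_sampler=True))   # ['greedy', 'top_k(50)']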

tensorrt_llm/llmapi/llm_args.py

Lines changed: 2 additions & 2 deletions
@@ -1702,7 +1702,7 @@ class TorchLlmArgs(BaseLlmArgs):
     moe_backend: str = Field(default='CUTLASS',
                              description="MoE backend to use.")
 
-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description=
         "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1918,7 +1918,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             moe_load_balancer=self.moe_load_balancer,
             attn_backend=self.attn_backend,
             moe_backend=self.moe_backend,
-            mixed_sampler=self.mixed_sampler,
+            enable_mixed_sampler=self.enable_mixed_sampler,
             enable_trtllm_sampler=self.enable_trtllm_sampler,
             kv_cache_dtype=self.kv_cache_dtype,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
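
At the user-facing LLM API level the argument is simply renamed. A minimal usage sketch with a placeholder model path, assuming the PyTorch backend and that llm.generate accepts a list of per-request SamplingParams:

    from tensorrt_llm import LLM, SamplingParams

    # Placeholder model path; not part of this commit.
    llm = LLM(model="/path/to/model",
              enable_mixed_sampler=True)   # was: mixed_sampler=True

    # With mixed sampling enabled, each request's SamplingParams is honored.
    params_a = SamplingParams(max_tokens=32)                              # defaults
    params_b = SamplingParams(max_tokens=32, temperature=0.8, top_p=0.9)  # stochastic

    outputs = llm.generate(["Hello,", "Write a haiku about GPUs."],
                           sampling_params=[params_a, params_b])
    for out in outputs:
        print(out.outputs[0].text)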

tensorrt_llm/scaffolding/worker.py

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ def init_with_new_llm(
 
         llm = LLM(model_dir,
                   tokenizer=tokenizer,
-                  mixed_sampler=True,
+                  enable_mixed_sampler=True,
                   disable_overlap_scheduler=disable_overlap_scheduler,
                   kv_cache_config=kv_cache_config,
                   max_batch_size=max_batch_size,

tests/unittest/api_stability/references/llm.yaml

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ methods:
     moe_backend:
       annotation: str
       default: CUTLASS
-    mixed_sampler:
+    enable_mixed_sampler:
       annotation: bool
       default: False
     enable_trtllm_sampler:
