2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -79,7 +79,7 @@ class LlmArgs(BaseLlmArgs):
repr=False,
)

- mixed_sampler: bool = Field(
+ enable_mixed_sampler: bool = Field(
default=False,
description="If true, will iterate over sampling_params of each request and use the corresponding "
"sampling strategy, e.g. top-k, top-p, etc.",
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -293,7 +293,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
scheduler = SimpleScheduler(capacitor_scheduler, mb_scheduler)

# search sampler with speculative decoding
- # TODO (lucaslie, fridah-nv): some models require mixed_sampler=True to have good outputs, see
+ # TODO (lucaslie, fridah-nv): some models require enable_mixed_sampler=True to have good outputs, see
# https://github.com/NVIDIA/TensorRT-LLM/issues/5254
# We should expose mixed_sample to our build_and_run_ad script so we can configure this
# correctly for models as needed.
@@ -302,7 +302,7 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir:
max_draft_tokens=max_draft_tokens,
max_num_sequences=max_num_sequences,
max_beam_width=executor_config.max_beam_width,
- mixed_sampler=ad_config.mixed_sampler,
+ enable_mixed_sampler=ad_config.enable_mixed_sampler,
)
sampler = TorchSampler(sampler_args)

6 changes: 3 additions & 3 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -546,7 +546,7 @@ def create_py_executor_instance(


def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
- *, max_seq_len: int, mixed_sampler: bool):
+ *, max_seq_len: int, enable_mixed_sampler: bool):
max_num_sequences = executor_config.max_batch_size * mapping.pp_size
max_draft_tokens = (0 if executor_config.speculative_config is None else
executor_config.speculative_config.max_draft_tokens)
@@ -555,7 +555,7 @@ def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
max_draft_tokens=max_draft_tokens,
max_num_sequences=max_num_sequences,
max_beam_width=executor_config.max_beam_width,
- mixed_sampler=mixed_sampler,
+ enable_mixed_sampler=enable_mixed_sampler,
)


@@ -567,7 +567,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
executor_config,
mapping,
max_seq_len=engine.max_seq_len,
- mixed_sampler=pytorch_backend_config.mixed_sampler)
+ enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
if mapping.cp_config.get('cp_type') == 'star_attention':
assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
return TorchSampler(sampler_args)
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/pyexecutor/config.py
@@ -48,7 +48,7 @@ class PyTorchConfig:
attn_backend: str = 'TRTLLM'
moe_backend: str = 'CUTLASS'

- mixed_sampler: bool = False
+ enable_mixed_sampler: bool = False
"""
If true, will iterate over sampling_params of each request and use the
corresponding sampling strategy, e.g. top-k, top-p, etc.
8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -225,11 +225,11 @@ class Args:
max_draft_tokens: int
max_num_sequences: int
max_beam_width: int
- mixed_sampler: bool
+ enable_mixed_sampler: bool

def __init__(self, args: Args):
self.max_seq_len = args.max_seq_len
- self.mixed_sampler = args.mixed_sampler
+ self.enable_mixed_sampler = args.enable_mixed_sampler
self.max_tokens = args.max_draft_tokens + 1
assert args.max_beam_width == self.MAX_BEAM_WIDTH, "TorchSampler only supports beam_width = 1"
self.num_seq_slots = args.max_num_sequences
@@ -406,7 +406,7 @@ def _process_requests(self,
num_steps = [1 + len(req.py_draft_tokens) for req in requests]
sum_steps = sum(num_steps)
no_draft_tokens = len(requests) == sum_steps
- fast_path = not self.mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
+ fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None

seq_slots = torch.as_tensor([r.seq_slot for r in requests])
seq_slots = seq_slots.to(device="cuda", non_blocking=True)
@@ -423,7 +423,7 @@ def _process_requests(self,
strategies = sampling_strategies(requests)
batched_next_tokens, batched_softmax = None, None
batched_strategy: Strategy | None = GREEDY
- if self.mixed_sampler:
+ if self.enable_mixed_sampler:
assert "d2t" not in model_outputs, "eagle3 does not yet support non-greedy sampling"
if len(set(strategies)) == 1:
batched_strategy = strategies[0]
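To illustrate the dispatch this flag gates, here is a simplified, self-contained sketch (the `_Params` stand-in and helper names are hypothetical; the actual logic lives in `TorchSampler._process_requests`): with the flag off, every request takes the single batched greedy path, while with it on, each request's own sampling params pick the strategy.

```python
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

@dataclass
class _Params:
    """Hypothetical stand-in for a request's SamplingParams."""
    top_k: Optional[int] = None
    top_p: Optional[float] = None

Strategy = Union[str, Tuple[str, float]]

def pick_strategies(params_per_request: List[_Params],
                    enable_mixed_sampler: bool) -> List[Strategy]:
    # Flag off: one batched greedy step for the whole batch (the "fast path"
    # also requires no draft tokens and no gen-logits/log-probs hosts).
    if not enable_mixed_sampler:
        return ["greedy"] * len(params_per_request)
    # Flag on: honor each request's own sampling params.
    strategies: List[Strategy] = []
    for p in params_per_request:
        if p.top_k is not None:
            strategies.append(("top_k", p.top_k))
        elif p.top_p is not None:
            strategies.append(("top_p", p.top_p))
        else:
            strategies.append("greedy")
    return strategies

# One greedy request and one top-p request in the same batch.
print(pick_strategies([_Params(), _Params(top_p=0.9)], enable_mixed_sampler=True))
# -> ['greedy', ('top_p', 0.9)]
```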
4 changes: 2 additions & 2 deletions tensorrt_llm/llmapi/llm_args.py
@@ -1702,7 +1702,7 @@ class TorchLlmArgs(BaseLlmArgs):
moe_backend: str = Field(default='CUTLASS',
description="MoE backend to use.")

- mixed_sampler: bool = Field(
+ enable_mixed_sampler: bool = Field(
default=False,
description=
"If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1918,7 +1918,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
moe_load_balancer=self.moe_load_balancer,
attn_backend=self.attn_backend,
moe_backend=self.moe_backend,
- mixed_sampler=self.mixed_sampler,
+ enable_mixed_sampler=self.enable_mixed_sampler,
enable_trtllm_sampler=self.enable_trtllm_sampler,
kv_cache_dtype=self.kv_cache_dtype,
enable_iter_perf_stats=self.enable_iter_perf_stats,
2 changes: 1 addition & 1 deletion tensorrt_llm/scaffolding/worker.py
@@ -167,7 +167,7 @@ def init_with_new_llm(

llm = LLM(model_dir,
tokenizer=tokenizer,
- mixed_sampler=True,
+ enable_mixed_sampler=True,
disable_overlap_scheduler=disable_overlap_scheduler,
kv_cache_config=kv_cache_config,
max_batch_size=max_batch_size,
2 changes: 1 addition & 1 deletion tests/unittest/api_stability/references/llm.yaml
@@ -81,7 +81,7 @@ methods:
moe_backend:
annotation: str
default: CUTLASS
- mixed_sampler:
+ enable_mixed_sampler:
annotation: bool
default: False
enable_trtllm_sampler: