8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -828,7 +828,7 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
max_batch_size: int,
speculative_config: SpeculativeConfig,
max_beam_width: int,
-                              disable_flash_infer_sampling: bool):
+                              disable_flashinfer_sampling: bool):
max_num_sequences = max_batch_size * mapping.pp_size
max_draft_len = (0 if speculative_config is None else
speculative_config.max_draft_len)
@@ -841,7 +841,7 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
max_total_draft_tokens=max_total_draft_tokens,
max_num_sequences=max_num_sequences,
max_beam_width=max_beam_width,
-        disable_flash_infer_sampling=disable_flash_infer_sampling,
+        disable_flashinfer_sampling=disable_flashinfer_sampling,
)


@@ -857,15 +857,15 @@ def instantiate_sampler(
speculative_config: SpeculativeConfig,
decoding_config: trtllm.DecodingConfig,
kv_cache_config: KvCacheConfig,
-    disable_flash_infer_sampling: bool,
+    disable_flashinfer_sampling: bool,
):
sampler_args = create_torch_sampler_args(
mapping,
max_seq_len=engine.max_seq_len,
max_batch_size=max_batch_size,
speculative_config=speculative_config,
max_beam_width=max_beam_width,
-        disable_flash_infer_sampling=disable_flash_infer_sampling,
+        disable_flashinfer_sampling=disable_flashinfer_sampling,
)
decoding_mode = get_decoding_mode(decoding_config=decoding_config,
max_beam_width=max_beam_width)
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -533,7 +533,7 @@ def drafting_loop_wrapper(model):
speculative_config=spec_config,
decoding_config=decoding_config,
kv_cache_config=kv_cache_config,
-        disable_flash_infer_sampling=llm_args._disable_flash_infer_sampling,
+        disable_flashinfer_sampling=llm_args.disable_flashinfer_sampling,
)
logger.info(f"Using Sampler: {type(sampler).__name__}")

4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -616,7 +616,7 @@ class Args:
max_num_sequences: int
max_beam_width: int
max_total_draft_tokens: int
-        disable_flash_infer_sampling: bool = False
+        disable_flashinfer_sampling: bool = False

def __init__(self, args: Args):
self.max_seq_len = args.max_seq_len
@@ -652,7 +652,7 @@ def __init__(self, args: Args):
}

self._grouped_sampler_cls: Type[GroupedStrategySampler]
-        if IS_FLASHINFER_AVAILABLE and not args.disable_flash_infer_sampling:
+        if IS_FLASHINFER_AVAILABLE and not args.disable_flashinfer_sampling:
from .sampling_utils_flashinfer import FlashInferGroupedStrategySampler

self._grouped_sampler_cls = FlashInferGroupedStrategySampler
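The sampler.py hunk only shows the FlashInfer branch of the selection logic. As a reading aid, the sketch below illustrates the pattern in isolation: the optional dependency is gated behind both an import-time availability check and the renamed opt-out flag. `IS_FLASHINFER_AVAILABLE` and `FlashInferGroupedStrategySampler` appear in the diff; the try/except probe, the `TorchGroupedStrategySampler` fallback, and the `select_grouped_sampler_cls` helper are assumptions added for illustration and are not part of this PR.

```python
from typing import Type


# Probe the optional dependency once at import time (a sketch; the real
# IS_FLASHINFER_AVAILABLE flag is defined elsewhere in the sampler code).
try:
    import flashinfer  # noqa: F401
    IS_FLASHINFER_AVAILABLE = True
except ImportError:
    IS_FLASHINFER_AVAILABLE = False


class GroupedStrategySampler:
    """Placeholder for the real GroupedStrategySampler interface."""


class TorchGroupedStrategySampler(GroupedStrategySampler):
    """Hypothetical pure-torch fallback (name assumed, not taken from the diff)."""


class FlashInferGroupedStrategySampler(GroupedStrategySampler):
    """Stand-in; the real class is imported lazily from sampling_utils_flashinfer."""


def select_grouped_sampler_cls(
        disable_flashinfer_sampling: bool) -> Type[GroupedStrategySampler]:
    # FlashInfer-based sampling is used only when the package imports
    # successfully AND the caller has not opted out via the new flag.
    if IS_FLASHINFER_AVAILABLE and not disable_flashinfer_sampling:
        return FlashInferGroupedStrategySampler
    return TorchGroupedStrategySampler
```

Keeping the FlashInfer import lazy (inside the enabled branch, as the real code does) means environments without the package never pay the import cost or risk an `ImportError` on the fallback path.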
8 changes: 6 additions & 2 deletions tensorrt_llm/llmapi/llm_args.py
@@ -2707,8 +2707,12 @@ class TorchLlmArgs(BaseLlmArgs):
# PrivateVars
_quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

-    _disable_flash_infer_sampling: bool = PrivateAttr(default=True)
-    """Unless this is set to False, FlashInfer.sampling is not used, even if available."""
+    disable_flashinfer_sampling: bool = Field(
+        default=True,
+        description=
+        "Disable the use of FlashInfer.sampling. This option is likely to be removed in the future.",
+        status="prototype",
+    )

@property
def quant_config(self) -> QuantConfig:
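Because the flag is now a public prototype field on `TorchLlmArgs` rather than a private attribute, it can be passed straight to the `LLM` constructor (the llm.yaml API-stability reference further down records it as an `__init__` parameter). A minimal usage sketch, assuming the PyTorch backend; the model path is a placeholder:

```python
from tensorrt_llm import LLM

# Explicitly opt out of FlashInfer-based sampling, even if the flashinfer
# package is installed; TorchSampler then uses its non-FlashInfer path.
llm = LLM(
    model="/path/to/model",  # placeholder path
    disable_flashinfer_sampling=True,
)
```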
2 changes: 1 addition & 1 deletion tests/unittest/_torch/sampler/test_torch_sampler.py
@@ -1077,7 +1077,7 @@ def _build_sampler(
max_beam_width=1, # currently the only supported value
max_num_sequences=num_seq_slots,
max_total_draft_tokens=max_draft_len,
-                disable_flash_infer_sampling=(not use_flashinfer),
+                disable_flashinfer_sampling=(not use_flashinfer),
)
)

4 changes: 4 additions & 0 deletions tests/unittest/api_stability/references/llm.yaml
@@ -107,6 +107,10 @@ methods:
annotation: bool
default: False
status: beta
+      disable_flashinfer_sampling:
+        annotation: bool
+        default: False
+        status: prototype
moe_config:
annotation: tensorrt_llm.llmapi.llm_args.MoeConfig
status: beta