Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,7 @@ def __init__(
# This way it can also be used for CUDA graphs.
if self.use_beam_search:
self.cache_indirection_attention = torch.zeros(
(self.batch_size, self.max_beam_width, self.max_seq_len +
(0 if self._disable_overlap_scheduler else 1)),
(self.batch_size, self.max_beam_width, self.max_seq_len),
device="cuda",
dtype=torch.int32)
else:
Expand Down
4 changes: 0 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,10 +403,6 @@ def drafting_loop_wrapper(model):
# PyTorchModelEngine modifies these fields, update them
model_engine_max_seq_len = model_engine.max_seq_len
net_max_seq_len = model_engine_max_seq_len
if not llm_args.disable_overlap_scheduler:
model_engine_max_seq_len = model_engine.max_seq_len + 1
if spec_config is not None:
model_engine_max_seq_len += spec_config.max_total_draft_tokens

if spec_config is not None:
model_engine_max_seq_len += get_num_extra_kv_tokens(spec_config)
Expand Down
Loading