
Commit 60982d5

fix
Signed-off-by: Enwei Zhu <[email protected]>
Parent: 767879e

2 files changed: 13 additions, 2 deletions


tensorrt_llm/_torch/pyexecutor/guided_decoder.py

Lines changed: 7 additions & 2 deletions
@@ -83,7 +83,11 @@ def _require_matcher_advance(self, llm_req: LlmRequest) -> bool:
         if llm_req.guided_decoding_params is None:
             return False
         if llm_req.py_is_draft:
-            return True
+            if llm_req.is_context_init_state and llm_req.is_last_context_chunk:
+                return True
+            if llm_req.is_generation_in_progress_state:
+                return True
+            return False
         # The request is in a generation forward step.
         return llm_req.is_generation_in_progress_state
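For context, here is a minimal standalone sketch of the decision rule introduced above. It is not the actual GuidedDecoder code, and the _Req stub is hypothetical; it only illustrates that, with chunked prefill, a draft request advances the grammar matcher on its last context chunk or while generation is in progress, and skips intermediate chunks.

# Illustrative sketch only; _Req is a hypothetical stand-in for LlmRequest.
from dataclasses import dataclass

@dataclass
class _Req:
    guided_decoding_params: object
    py_is_draft: bool
    is_context_init_state: bool
    is_last_context_chunk: bool
    is_generation_in_progress_state: bool

def require_matcher_advance(req: _Req) -> bool:
    if req.guided_decoding_params is None:
        return False
    if req.py_is_draft:
        # Chunked prefill: only the last context chunk advances the matcher.
        if req.is_context_init_state and req.is_last_context_chunk:
            return True
        if req.is_generation_in_progress_state:
            return True
        return False
    # Non-draft request: advance only during a generation forward step.
    return req.is_generation_in_progress_state

# A draft request on an intermediate prefill chunk does not advance:
assert not require_matcher_advance(_Req(object(), True, True, False, False))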

@@ -189,7 +193,8 @@ def execute(self,
                 batched_bitmask.append(self.bitmask[slot, i])
             offset += len(llm_req.py_draft_tokens) + 1
 
-        assert offset == logits.size(0)
+        # Dummy logits may exist for CUDA graph dummy requests.
+        assert offset <= logits.size(0)
 
         if len(batched_logits) > 0:
             torch.ops.trtllm.logits_bitmask(batched_logits, batched_bitmask)
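The relaxed assertion accounts for CUDA graph padding: dummy requests can contribute logits rows that no real request consumes, so the rows accounted for by offset may be fewer than the batch dimension. A small illustrative sketch, with made-up sizes:

# Illustrative sketch; the counts and vocab size below are hypothetical.
import torch

real_token_counts = [1 + 3, 1 + 3]        # 1 new token + 3 draft tokens per real request
padded_rows = 12                          # padded batch size chosen by CUDA graph padding

logits = torch.zeros(padded_rows, 128)    # dummy rows included at the end
offset = sum(real_token_counts)

# The old check `offset == logits.size(0)` would fail here; the relaxed
# check tolerates the unused dummy rows.
assert offset <= logits.size(0)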

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 6 additions & 0 deletions
@@ -325,6 +325,7 @@ def test_guided_decoding_4gpus(self, backend: str, mocker):
     def test_guided_decoding_with_eagle3(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True)
         spec_config = EagleDecodingConfig(
             max_draft_len=3,
             speculative_model_dir=
@@ -333,6 +334,8 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
+                  cuda_graph_config=cuda_graph_config,
+                  enable_chunked_prefill=True,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
@@ -344,11 +347,14 @@
     def test_guided_decoding_with_ngram(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True)
         spec_config = NGramDecodingConfig(max_draft_len=3,
                                           max_matching_ngram_size=3)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
+                  cuda_graph_config=cuda_graph_config,
+                  enable_chunked_prefill=True,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
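A hedged sketch of how the updated tests combine these options outside the test harness: the import paths, model path, backend string, and speculative model directory below are assumptions, and only the constructor arguments mirror the diff.

# Sketch only; import paths and the specific values are assumptions.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

llm = LLM("meta-llama/Llama-3.1-8B-Instruct",            # hypothetical model path
          guided_decoding_backend="xgrammar",             # hypothetical backend choice
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
          cuda_graph_config=CudaGraphConfig(enable_padding=True),
          enable_chunked_prefill=True,
          speculative_config=EagleDecodingConfig(
              max_draft_len=3,
              speculative_model_dir="/path/to/eagle3"),   # hypothetical path
          disable_overlap_scheduler=True)

Enabling CUDA graph padding and chunked prefill alongside speculative decoding is what exercises the dummy-request and last-context-chunk paths fixed in guided_decoder.py.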
