Commit 75798c4

fix non-support
Signed-off-by: Enwei Zhu <[email protected]>
1 parent 12d20e2 commit 75798c4

File tree

6 files changed (+9, -13 lines changed)

docs/source/torch/features/feature_combination_matrix.md

Lines changed: 1 addition & 1 deletion
@@ -15,4 +15,4 @@
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
 | Slide Window Attention | Yes | Yes | Yes | Untested | No | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
 | Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
-| Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
+| Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | Yes | Yes | Yes | Yes | Yes | Yes | --- |

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 3 additions & 0 deletions
@@ -933,6 +933,9 @@ def _executor_loop(self):

                     self.resource_manager.prepare_resources(scheduled_batch)
                     if self.drafter is not None and self.use_spec_decode:
+                        if self.guided_decoder is not None:
+                            self.guided_decoder.rollback_rejected_tokens(
+                                scheduled_batch)
                         self.drafter.prepare_draft_tokens(
                             scheduled_batch, self.resource_manager)


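For context, the hunk above hoists the guided-decoding rollback out of the individual drafters and into the executor loop, so it runs once before any drafter prepares draft tokens. The sketch below is a minimal, stand-alone illustration of that control flow under stated assumptions: `_GuidedDecoderSketch`, `_DrafterSketch`, and the dict-shaped batch are invented stand-ins, not TensorRT-LLM classes.

```python
from typing import Optional


class _GuidedDecoderSketch:
    """Invented stand-in for GuidedDecoder: tracks how far the grammar matcher advanced."""

    def __init__(self):
        self.matched_tokens = []

    def rollback_rejected_tokens(self, scheduled_batch):
        # Drop matcher state for draft tokens the target model rejected last step.
        self.matched_tokens = self.matched_tokens[:scheduled_batch["num_accepted"]]


class _DrafterSketch:
    """Invented stand-in for any drafter (model-based or NGram)."""

    def prepare_draft_tokens(self, scheduled_batch, resource_manager=None):
        return ["<draft>"] * scheduled_batch["max_draft_len"]


def executor_loop_step(scheduled_batch,
                       drafter: Optional[_DrafterSketch],
                       guided_decoder: Optional[_GuidedDecoderSketch],
                       use_spec_decode: bool = True):
    # Mirrors the hoisted logic: roll back first, then let the drafter run,
    # regardless of which drafter implementation is in use.
    if drafter is not None and use_spec_decode:
        if guided_decoder is not None:
            guided_decoder.rollback_rejected_tokens(scheduled_batch)
        return drafter.prepare_draft_tokens(scheduled_batch)
    return []


if __name__ == "__main__":
    batch = {"num_accepted": 2, "max_draft_len": 4}
    gd = _GuidedDecoderSketch()
    gd.matched_tokens = ["a", "b", "c", "d"]  # pretend the matcher advanced over 4 draft tokens
    drafts = executor_loop_step(batch, _DrafterSketch(), gd)
    print(gd.matched_tokens)  # ['a', 'b'] -> rejected tokens rolled back before drafting
    print(len(drafts))        # 4 new draft tokens
```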
tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 4 additions & 0 deletions
@@ -331,6 +331,10 @@ def create_py_executor(
             _ExecutorCreationStage.GUIDED_DECODER):
         guided_decoder: Optional[GuidedDecoder] = None
         if executor_config.guided_decoding_config is not None:
+            if spec_config is not None and not has_spec_drafter:
+                raise ValueError(
+                    "Guided decoding is only supported with speculative decoding that has a dedicated drafter (two-model engine)."
+                )
             if mapping.is_last_pp_rank():
                 max_num_draft_tokens = 0
                 if spec_config is not None:

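The creator-side hunk above adds a fail-fast check: if guided decoding is requested together with a speculative-decoding config that has no dedicated drafter (a one-model engine), executor creation raises immediately instead of failing later. Below is a hedged sketch of that guard as a free-standing helper; the name `validate_guided_decoding` and the untyped config arguments are assumptions for illustration only, since the real check lives inline in `create_py_executor`.

```python
from typing import Any, Optional


def validate_guided_decoding(guided_decoding_config: Optional[Any],
                             spec_config: Optional[Any],
                             has_spec_drafter: bool) -> None:
    """Reject unsupported combinations before any engine resources are allocated."""
    if guided_decoding_config is None:
        return  # guided decoding disabled, nothing to check
    if spec_config is not None and not has_spec_drafter:
        # Same condition as the new guard: speculative modes without a
        # dedicated drafter (one-model engines) cannot be combined with
        # guided decoding.
        raise ValueError(
            "Guided decoding is only supported with speculative decoding "
            "that has a dedicated drafter (two-model engine).")


if __name__ == "__main__":
    validate_guided_decoding(None, spec_config=object(), has_spec_drafter=False)     # ok: no guided decoding
    validate_guided_decoding(object(), spec_config=object(), has_spec_drafter=True)  # ok: drafter present
    try:
        validate_guided_decoding(object(), spec_config=object(), has_spec_drafter=False)
    except ValueError as err:
        print(f"rejected as expected: {err}")
```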
tensorrt_llm/_torch/speculative/model_drafter.py

Lines changed: 0 additions & 3 deletions
@@ -340,9 +340,6 @@ def prepare_draft_tokens(
             raise ValueError("Resource manager is required")

         try:
-            if self.guided_decoder is not None:
-                self.guided_decoder.rollback_rejected_tokens(scheduled_requests)
-
             draft_batch = self._prepare_draft_batch(scheduled_requests)

             if draft_batch.batch_size == 0:

tensorrt_llm/_torch/speculative/ngram.py

Lines changed: 0 additions & 6 deletions
@@ -6,7 +6,6 @@
 from tensorrt_llm.llmapi import NGramDecodingConfig
 from tensorrt_llm.logger import logger

-from ..pyexecutor.guided_decoder import GuidedDecoder
 from ..pyexecutor.llm_request import LlmRequest, LlmRequestState
 from ..pyexecutor.resource_manager import BaseResourceManager, ResourceManager
 from ..pyexecutor.scheduler import ScheduledRequests
@@ -168,22 +167,17 @@ def __init__(
         self,
         spec_config: NGramDecodingConfig,
         ngram_pool_manager: NGramPoolManager = None,
-        guided_decoder: Optional[GuidedDecoder] = None,
     ):
         assert ngram_pool_manager is not None, "NGram needs a resource manager to maintain the pool."
         self.spec_config = spec_config
         self.max_draft_len = spec_config.max_draft_len
         self.spec_resource_manager = ngram_pool_manager
-        self.guided_decoder = guided_decoder

     def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
         resource_manager: Optional[ResourceManager] = None,
     ) -> None:
-        if self.guided_decoder is not None:
-            self.guided_decoder.rollback_rejected_tokens(scheduled_requests)
-
         # Disable NGram speculative decoding auto heuristic for batch size > 32.
         if self.spec_config.is_auto_heuristic and len(
                 scheduled_requests.all_requests()) > 32:

tensorrt_llm/_torch/speculative/utils.py

Lines changed: 1 addition & 3 deletions
@@ -140,9 +140,7 @@ def get_spec_drafter(model_engine,
                              guided_decoder=guided_decoder)

     if spec_config.spec_dec_mode.is_ngram():
-        return NGramDrafter(spec_config,
-                            ngram_pool_manager=spec_resource_manager,
-                            guided_decoder=guided_decoder)
+        return NGramDrafter(spec_config, spec_resource_manager)

     return None


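Taken together, the ngram.py and utils.py hunks remove guided-decoding awareness from the NGram drafter: the constructor no longer accepts a guided_decoder, the factory passes only the spec config and pool manager, and rollback is handled centrally in the executor loop. The sketch below illustrates the slimmed-down interface using invented stand-in classes (`_NGramDrafterSketch`, `_NGramPoolManagerSketch`, a dict-shaped config); it is not the real NGramDrafter API.

```python
from typing import Optional


class _NGramPoolManagerSketch:
    """Invented stand-in for the NGram pool resource manager."""


class _NGramDrafterSketch:
    """Invented stand-in mirroring the reduced constructor: no guided_decoder."""

    def __init__(self,
                 spec_config: dict,
                 ngram_pool_manager: Optional[_NGramPoolManagerSketch] = None):
        assert ngram_pool_manager is not None, (
            "NGram needs a resource manager to maintain the pool.")
        self.spec_config = spec_config
        self.max_draft_len = spec_config["max_draft_len"]
        self.spec_resource_manager = ngram_pool_manager
        # Note: no self.guided_decoder attribute anymore.


def get_spec_drafter_sketch(spec_config: dict,
                            spec_resource_manager: _NGramPoolManagerSketch,
                            is_ngram: bool):
    # Mirrors the simplified call site in utils.py: config and pool only.
    if is_ngram:
        return _NGramDrafterSketch(spec_config, spec_resource_manager)
    return None


if __name__ == "__main__":
    drafter = get_spec_drafter_sketch({"max_draft_len": 4},
                                      _NGramPoolManagerSketch(),
                                      is_ngram=True)
    print(type(drafter).__name__, drafter.max_draft_len)
```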