@@ -6,7 +6,6 @@
 from tensorrt_llm.llmapi import NGramDecodingConfig
 from tensorrt_llm.logger import logger
 
-from ..pyexecutor.guided_decoder import GuidedDecoder
 from ..pyexecutor.llm_request import LlmRequest, LlmRequestState
 from ..pyexecutor.resource_manager import BaseResourceManager, ResourceManager
 from ..pyexecutor.scheduler import ScheduledRequests
@@ -168,22 +167,17 @@ def __init__(
         self,
         spec_config: NGramDecodingConfig,
         ngram_pool_manager: NGramPoolManager = None,
-        guided_decoder: Optional[GuidedDecoder] = None,
     ):
         assert ngram_pool_manager is not None, "NGram needs a resource manager to maintain the pool."
         self.spec_config = spec_config
         self.max_draft_len = spec_config.max_draft_len
         self.spec_resource_manager = ngram_pool_manager
-        self.guided_decoder = guided_decoder
 
     def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
         resource_manager: Optional[ResourceManager] = None,
     ) -> None:
-        if self.guided_decoder is not None:
-            self.guided_decoder.rollback_rejected_tokens(scheduled_requests)
-
         # Disable NGram speculative decoding auto heuristic for batch size > 32.
         if self.spec_config.is_auto_heuristic and len(
                 scheduled_requests.all_requests()) > 32:
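
The hunks above remove the GuidedDecoder dependency from the NGram drafter: the guided_decoder constructor argument and attribute are dropped, and prepare_draft_tokens no longer calls guided_decoder.rollback_rejected_tokens, so rejected-token rollback is no longer performed inside the drafter. What remains is the batch-size guard at the end of the hunk. The following is a minimal, self-contained sketch of that gate, not the TensorRT-LLM implementation; SpecConfig, Scheduled, and should_skip_ngram_drafting are hypothetical stand-ins for NGramDecodingConfig, ScheduledRequests, and the inline check in prepare_draft_tokens.

# Sketch only: illustrates the guard kept in the diff above, assuming the
# semantics visible in the hunk (skip NGram drafting when the auto heuristic
# is active and more than 32 requests are scheduled).
from dataclasses import dataclass, field
from typing import List


@dataclass
class SpecConfig:
    # Hypothetical stand-in for NGramDecodingConfig.
    is_auto_heuristic: bool = True
    max_draft_len: int = 4


@dataclass
class Scheduled:
    # Hypothetical stand-in for ScheduledRequests.
    requests: List[int] = field(default_factory=list)

    def all_requests(self) -> List[int]:
        return self.requests


def should_skip_ngram_drafting(cfg: SpecConfig, sched: Scheduled) -> bool:
    # Mirrors the guard in prepare_draft_tokens: the auto heuristic is
    # disabled once the scheduled batch exceeds 32 requests.
    return cfg.is_auto_heuristic and len(sched.all_requests()) > 32


print(should_skip_ngram_drafting(SpecConfig(), Scheduled(list(range(8)))))   # False
print(should_skip_ngram_drafting(SpecConfig(), Scheduled(list(range(64)))))  # True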