@@ -325,6 +325,7 @@ def test_guided_decoding_4gpus(self, backend: str, mocker):
325325 def test_guided_decoding_with_eagle3 (self , backend : str , mocker ):
326326 mocker .patch .dict (os .environ , {"TRTLLM_XGUIDANCE_LENIENT" : "1" })
327327 kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.8 )
328+ cuda_graph_config = CudaGraphConfig (enable_padding = True )
328329 spec_config = EagleDecodingConfig (
329330 max_draft_len = 3 ,
330331 speculative_model_dir =
@@ -333,6 +334,8 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
333334 llm = LLM (self .MODEL_PATH ,
334335 guided_decoding_backend = backend ,
335336 kv_cache_config = kv_cache_config ,
337+ cuda_graph_config = cuda_graph_config ,
338+ enable_chunked_prefill = True ,
336339 speculative_config = spec_config ,
337340 disable_overlap_scheduler = True )
338341 with llm :
@@ -344,11 +347,14 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
344347 def test_guided_decoding_with_ngram (self , backend : str , mocker ):
345348 mocker .patch .dict (os .environ , {"TRTLLM_XGUIDANCE_LENIENT" : "1" })
346349 kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.8 )
350+ cuda_graph_config = CudaGraphConfig (enable_padding = True )
347351 spec_config = NGramDecodingConfig (max_draft_len = 3 ,
348352 max_matching_ngram_size = 3 )
349353 llm = LLM (self .MODEL_PATH ,
350354 guided_decoding_backend = backend ,
351355 kv_cache_config = kv_cache_config ,
356+ cuda_graph_config = cuda_graph_config ,
357+ enable_chunked_prefill = True ,
352358 speculative_config = spec_config ,
353359 disable_overlap_scheduler = True )
354360 with llm :
0 commit comments