@@ -323,13 +323,15 @@ def test_guided_decoding_4gpus(self, backend: str, mocker):
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
     def test_guided_decoding_with_eagle3(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         spec_config = EagleDecodingConfig(
             max_draft_len=3,
             speculative_model_dir=
             f"{llm_models_root()},
             eagle3_one_model=False)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
@@ -340,10 +342,12 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
     def test_guided_decoding_with_ngram(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         spec_config = NGramDecodingConfig(max_draft_len=3,
                                           max_matching_ngram_size=3)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
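Both hunks apply the same change: instead of letting the KV cache claim its default share of GPU memory, each test now caps it at 80% of the memory that is free at startup, presumably to leave headroom for the speculative-decoding machinery running on the same device. A minimal standalone sketch of the pattern, assuming the public `tensorrt_llm` LLM API used elsewhere in this file; the model path and prompt below are placeholders, not values from these tests:

```python
# Sketch only: imports assumed from the public tensorrt_llm API,
# and "model_dir" is a hypothetical placeholder path.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Let the KV cache use at most 80% of the GPU memory free at startup,
# leaving headroom for other allocations (e.g. a draft model).
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

llm = LLM("model_dir", kv_cache_config=kv_cache_config)
with llm:
    for output in llm.generate(["The capital of France is"]):
        print(output.outputs[0].text)
```

Passing the config per `LLM` instance keeps the cap local to these two tests rather than altering any global default.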