@@ -325,13 +325,15 @@ def test_guided_decoding_4gpus(self, backend: str, mocker):
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
     def test_guided_decoding_with_eagle3(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         spec_config = EagleDecodingConfig(
             max_draft_len=3,
             speculative_model_dir=
             f"{llm_models_root()},
             eagle3_one_model=False)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
@@ -342,10 +344,12 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
     def test_guided_decoding_with_ngram(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         spec_config = NGramDecodingConfig(max_draft_len=3,
                                           max_matching_ngram_size=3)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
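For context, here is a minimal sketch (not part of this diff) of how the added `kv_cache_config` plugs into the LLM API. It assumes the `tensorrt_llm` `LLM` and `KvCacheConfig` classes these tests already use; the model path and prompt are placeholders.

```python
# Minimal sketch, assuming the tensorrt_llm LLM API used by these tests.
# The model path and prompt below are placeholders, not taken from the PR.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Reserve only 80% of free GPU memory for the KV cache, leaving headroom
# for the extra weights and buffers that speculative decoding needs.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

llm = LLM("/path/to/model",  # placeholder checkpoint path
          kv_cache_config=kv_cache_config)
with llm:
    output = llm.generate("Hello, world!")
    print(output.outputs[0].text)
```

Lowering the fraction below the library default trades some KV-cache capacity for that headroom, which is presumably why both tests set it once speculative decoding is enabled.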