@@ -325,13 +325,15 @@ def test_guided_decoding_4gpus(self, backend: str, mocker):
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
     def test_guided_decoding_with_eagle3(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         spec_config = EagleDecodingConfig(
             max_draft_len=3,
             speculative_model_dir=
             f"{llm_models_root()},
             eagle3_one_model=False)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
@@ -342,10 +344,12 @@ def test_guided_decoding_with_eagle3(self, backend: str, mocker):
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
     def test_guided_decoding_with_ngram(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         spec_config = NGramDecodingConfig(max_draft_len=3,
                                           max_matching_ngram_size=3)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
+                  kv_cache_config=kv_cache_config,
                   speculative_config=spec_config,
                   disable_overlap_scheduler=True)
         with llm:
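For context, here is a minimal sketch (not part of this diff) of how the added `kv_cache_config` plugs into the LLM API. It assumes the `tensorrt_llm` `LLM` and `KvCacheConfig` classes these tests already use; the model path and prompt are placeholders.

```python
# Minimal sketch, assuming the tensorrt_llm LLM API used by these tests.
# The model path and prompt below are placeholders, not taken from the PR.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Reserve only 80% of free GPU memory for the KV cache, leaving headroom
# for the extra weights and buffers that speculative decoding needs.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

llm = LLM("/path/to/model",  # placeholder checkpoint path
          kv_cache_config=kv_cache_config)
with llm:
    output = llm.generate("Hello, world!")
    print(output.outputs[0].text)
```

Lowering the fraction below the library default trades some KV-cache capacity for that headroom, which is presumably why both tests set it once speculative decoding is enabled.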