diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index b82d36fe8e8..f4bcc441940 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -1,4 +1,5 @@ import copy +import gc import importlib import os from concurrent.futures import ThreadPoolExecutor @@ -687,6 +688,9 @@ def drafting_loop_wrapper(model): with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES, RestoreMode.PINNED): + + # Run gc.collect() to free the memory of the previous py_executor, so that cudaFree does not overlap with CUDA graph capture. + gc.collect() py_executor = create_py_executor_instance( dist=dist, resources=resources, diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2ae474014ba..3f250477296 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -401,7 +401,6 @@ examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5 full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551) unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911) full:RTXPro6000D/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5684703) -disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8] SKIP (https://nvbugs/5685143) test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153) accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438) accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] SKIP (https://nvbugs/5680905)