From e9435e4c6161fe1da798bbdcc5087c5b3ce8af44 Mon Sep 17 00:00:00 2001
From: yunruis <205571022+yunruis@users.noreply.github.com>
Date: Thu, 30 Oct 2025 23:18:15 -0700
Subject: [PATCH 1/2] change var mem to heap, to manually control

Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com>
---
 cpp/tensorrt_llm/common/opUtils.cpp | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp
index 053f9d9ece7..ae3810a255f 100644
--- a/cpp/tensorrt_llm/common/opUtils.cpp
+++ b/cpp/tensorrt_llm/common/opUtils.cpp
@@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator
     PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
         : mCreator{std::move(creator)}
         , mDeleter{std::move(deleter)}
+        , mObservers{new std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>()}
     {
     }
 
+    ~PerCudaCtxPerThreadSingletonCreator()
+    {
+        std::lock_guard<std::mutex> lk{mMutex};
+        delete mObservers;
+        mObservers = nullptr;
+    }
+
     std::shared_ptr<T> operator()()
     {
         std::lock_guard<std::mutex> lk{mMutex};
         CUcontext ctx{getCurrentCudaCtx()};
         std::thread::id thread = std::this_thread::get_id();
         auto const key = std::make_tuple(ctx, thread);
-        std::shared_ptr<T> result = mObservers[key].lock();
+        std::shared_ptr<T> result = (*mObservers)[key].lock();
         if (result == nullptr)
         {
             TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread);
@@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator
                     }
                     mDeleter(obj);
 
+                    if (mObservers == nullptr)
+                    {
+                        return;
+                    }
+
                     // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
                     // frequently.
                     std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
@@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator
                     // thread just before we lock mMutex. We can't infer that the observer is stale from the fact that
                     // obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one
                     // atomic operation, and the observer may be changed to observe another instance.
-                    if (mObservers.find(key) == mObservers.end())
+                    auto it = mObservers->find(key);
+                    if (it == mObservers->end())
                     {
                         return;
                     }
-                    observedObjHolder = mObservers.at(key).lock();
+                    observedObjHolder = it->second.lock();
                     if (observedObjHolder == nullptr)
                     {
-                        mObservers.erase(key);
+                        mObservers->erase(it);
                     }
                 }};
-            mObservers.at(key) = result;
+            (*mObservers)[key] = result;
         }
         else
         {
@@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator
     mutable std::mutex mMutex;
     // CUDA resources are per-context and per-thread.
    using CacheKey = std::tuple<CUcontext, std::thread::id>;
-    std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>> mObservers;
+    std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>* mObservers;
 };
 
 } // namespace
@@ -253,6 +267,7 @@ std::shared_ptr<cublasHandle_t> getCublasHandle()
         {
             TLLM_CUDA_CHECK(cublasDestroy(*handle));
             delete handle;
+            handle = nullptr;
         });
     return creator();
 }
@@ -270,6 +285,7 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
         {
             TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
             delete handle;
+            handle = nullptr;
         });
     return creator();
 }
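Note on PATCH 1/2: the change replaces the mObservers value member with a heap pointer that the destructor deletes and sets to nullptr, so a handle deleter that fires late (e.g. during static teardown, after the singleton creator itself has been destroyed) can detect that the map is gone and skip touching it. The standalone program below is a minimal sketch of that pattern, not the TensorRT-LLM code; the names Registry and get, the int payloads, and the int keys are hypothetical, and like the patch it assumes the mutex remains accessible to late-running deleters. It also omits the delayed-destroy holder the real code uses.

#include <memory>
#include <mutex>
#include <unordered_map>

class Registry
{
public:
    Registry()
        : mObservers{new std::unordered_map<int, std::weak_ptr<int>>()}
    {
    }

    ~Registry()
    {
        std::lock_guard<std::mutex> lk{mMutex};
        delete mObservers;    // reclaim the map explicitly...
        mObservers = nullptr; // ...and leave a tombstone for late deleters
    }

    std::shared_ptr<int> get(int key, int value)
    {
        std::lock_guard<std::mutex> lk{mMutex};
        std::shared_ptr<int> result = (*mObservers)[key].lock();
        if (result == nullptr)
        {
            // Custom deleter unregisters the observer when the last user drops the handle.
            result = std::shared_ptr<int>{new int{value}, [this, key](int* obj)
                {
                    delete obj;
                    std::lock_guard<std::mutex> lk{mMutex};
                    if (mObservers == nullptr)
                    {
                        return; // registry already destroyed; nothing to clean up
                    }
                    mObservers->erase(key);
                }};
            (*mObservers)[key] = result;
        }
        return result;
    }

private:
    std::mutex mMutex;
    std::unordered_map<int, std::weak_ptr<int>>* mObservers;
};

int main()
{
    Registry registry;
    auto a = registry.get(0, 42); // first call creates the instance
    auto b = registry.get(0, 7);  // same key: returns the cached instance, *b == 42
    return 0;                     // a and b release before registry is destroyed
}

With the map as a plain value member, the deleter of a handle outliving the creator would dereference an already-destroyed unordered_map; the pointer-plus-null-check makes that case an early return instead.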
From aa32a278105cfc56344278d39e3bff0b53cee88f Mon Sep 17 00:00:00 2001
From: yunruis <205571022+yunruis@users.noreply.github.com>
Date: Mon, 3 Nov 2025 10:56:22 +0800
Subject: [PATCH 2/2] drop tiny llama in waives.txt

Removed a skipped test case related to MPI single GPU.

Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index a05a27b68e6..64fe6dbeb19 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -358,6 +358,5 @@ triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-T
 accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233)
 examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233)
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5606266)
-examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5606268)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5626197)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5628952)