From e9435e4c6161fe1da798bbdcc5087c5b3ce8af44 Mon Sep 17 00:00:00 2001
From: yunruis <205571022+yunruis@users.noreply.github.com>
Date: Thu, 30 Oct 2025 23:18:15 -0700
Subject: [PATCH 1/2] change var mem to heap, to manually control

Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com>
---
 cpp/tensorrt_llm/common/opUtils.cpp | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp
index 053f9d9ece7..ae3810a255f 100644
--- a/cpp/tensorrt_llm/common/opUtils.cpp
+++ b/cpp/tensorrt_llm/common/opUtils.cpp
@@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator
     PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter)
         : mCreator{std::move(creator)}
         , mDeleter{std::move(deleter)}
+        , mObservers{new std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>()}
     {
     }
 
+    ~PerCudaCtxPerThreadSingletonCreator()
+    {
+        std::lock_guard<std::mutex> lk{mMutex};
+        delete mObservers;
+        mObservers = nullptr;
+    }
+
     std::shared_ptr<T> operator()()
     {
         std::lock_guard<std::mutex> lk{mMutex};
         CUcontext ctx{getCurrentCudaCtx()};
         std::thread::id thread = std::this_thread::get_id();
         auto const key = std::make_tuple(ctx, thread);
-        std::shared_ptr<T> result = mObservers[key].lock();
+        std::shared_ptr<T> result = (*mObservers)[key].lock();
         if (result == nullptr)
         {
             TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread);
@@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator
                     }
                     mDeleter(obj);
 
+                    if (mObservers == nullptr)
+                    {
+                        return;
+                    }
+
                     // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts
                     // frequently.
                     std::shared_ptr<T> observedObjHolder; // Delay destroy to avoid dead lock.
@@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator
                     // thread just before we lock mMutex. We can't infer that the observer is stale from the fact that
                     // obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one
                     // atomic operation, and the observer may be changed to observe another instance.
-                    if (mObservers.find(key) == mObservers.end())
+                    auto it = mObservers->find(key);
+                    if (it == mObservers->end())
                     {
                         return;
                     }
-                    observedObjHolder = mObservers.at(key).lock();
+                    observedObjHolder = it->second.lock();
                     if (observedObjHolder == nullptr)
                     {
-                        mObservers.erase(key);
+                        mObservers->erase(it);
                     }
                 }};
-            mObservers.at(key) = result;
+            (*mObservers)[key] = result;
         }
         else
         {
@@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator
     mutable std::mutex mMutex;
     // CUDA resources are per-context and per-thread.
    using CacheKey = std::tuple<CUcontext, std::thread::id>;
-    std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>> mObservers;
+    std::unordered_map<CacheKey, std::weak_ptr<T>, hash<CacheKey>>* mObservers;
 };
 
 } // namespace
@@ -253,6 +267,7 @@ std::shared_ptr<cublasHandle_t> getCublasHandle()
         {
             TLLM_CUDA_CHECK(cublasDestroy(*handle));
             delete handle;
+            handle = nullptr;
         });
     return creator();
 }
@@ -270,6 +285,7 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
         {
             TLLM_CUDA_CHECK(cublasLtDestroy(*handle));
             delete handle;
+            handle = nullptr;
         });
     return creator();
 }
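Note on PATCH 1/2: the change replaces the mObservers value member with a heap pointer that the destructor deletes and sets to nullptr, so a handle deleter that fires late (e.g. during static teardown, after the singleton creator itself has been destroyed) can detect that the map is gone and skip touching it. The standalone program below is a minimal sketch of that pattern, not the TensorRT-LLM code; the names Registry and get, the int payloads, and the int keys are hypothetical, and like the patch it assumes the mutex remains accessible to late-running deleters. It also omits the delayed-destroy holder the real code uses.

#include <memory>
#include <mutex>
#include <unordered_map>

class Registry
{
public:
    Registry()
        : mObservers{new std::unordered_map<int, std::weak_ptr<int>>()}
    {
    }

    ~Registry()
    {
        std::lock_guard<std::mutex> lk{mMutex};
        delete mObservers;    // reclaim the map explicitly...
        mObservers = nullptr; // ...and leave a tombstone for late deleters
    }

    std::shared_ptr<int> get(int key, int value)
    {
        std::lock_guard<std::mutex> lk{mMutex};
        std::shared_ptr<int> result = (*mObservers)[key].lock();
        if (result == nullptr)
        {
            // Custom deleter unregisters the observer when the last user drops the handle.
            result = std::shared_ptr<int>{new int{value}, [this, key](int* obj)
                {
                    delete obj;
                    std::lock_guard<std::mutex> lk{mMutex};
                    if (mObservers == nullptr)
                    {
                        return; // registry already destroyed; nothing to clean up
                    }
                    mObservers->erase(key);
                }};
            (*mObservers)[key] = result;
        }
        return result;
    }

private:
    std::mutex mMutex;
    std::unordered_map<int, std::weak_ptr<int>>* mObservers;
};

int main()
{
    Registry registry;
    auto a = registry.get(0, 42); // first call creates the instance
    auto b = registry.get(0, 7);  // same key: returns the cached instance, *b == 42
    return 0;                     // a and b release before registry is destroyed
}

With the map as a plain value member, the deleter of a handle outliving the creator would dereference an already-destroyed unordered_map; the pointer-plus-null-check makes that case an early return instead.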
From aa32a278105cfc56344278d39e3bff0b53cee88f Mon Sep 17 00:00:00 2001
From: yunruis <205571022+yunruis@users.noreply.github.com>
Date: Mon, 3 Nov 2025 10:56:22 +0800
Subject: [PATCH 2/2] drop tiny llama in waives.txt

Removed a skipped test case related to MPI single GPU.

Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index a05a27b68e6..64fe6dbeb19 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -358,6 +358,5 @@ triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-T
 accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 SKIP (https://nvbugs/5606233)
 examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233)
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5606266)
-examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5606268)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5626197)
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_deepseek[True-True-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/5628952)