
Commit 33978ba

Fix disagg pp bug
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent: c4535e6

5 files changed: +56 -68 lines changed


cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 52 additions & 63 deletions

@@ -101,10 +101,52 @@ class DataResponder::Impl
     {
         TLLM_CHECK(mSender);
         TLLM_CUDA_CHECK(cudaGetDevice(&mDeviceId));
-        mCurrentRequest = std::nullopt;
         mResponseFuture = std::async(std::launch::async, &Impl::response, this);
     }
 
+    void sendResponse(RequestIdType reqId) noexcept
+    {
+        std::unique_lock lk(mSendMutex);
+        // Send context cache is not called for this request yet.
+        std::unique_lock<std::mutex> lkResp(mResponderMutex);
+        auto readyResponseIt = mReadyResponses.find(reqId);
+        if (readyResponseIt == mReadyResponses.end())
+        {
+            return;
+        }
+        lkResp.unlock();
+        auto it = mRequestInfoMap.find(reqId);
+        if (it == mRequestInfoMap.end())
+        {
+            return;
+        }
+        auto blockHashes = it->second.getBlockHashes();
+        auto count = --mRemainSendCount[reqId];
+        TLLM_CHECK(count >= 0);
+        if (count == 0)
+        {
+            mRequestInfoMap.erase(it);
+            mRemainSendCount.erase(reqId);
+
+            // TODO(zhengd): pass the hashes directly instead of update llmRequest
+            auto llmRequest = readyResponseIt->second.mRequest;
+            llmRequest->setRequestedBlockHashes(std::move(blockHashes));
+
+            if (common::getEnvParallelCacheSend())
+            {
+                // TODO: Use a thread pool and check for thread safety.
+                std::thread(
+                    &DataResponder::Impl::sendAndRemoveResponse, this, reqId, std::move(readyResponseIt->second))
+                    .detach();
+            }
+            else
+            {
+                DataResponder::Impl::sendAndRemoveResponse(reqId, std::move(readyResponseIt->second));
+            }
+            removeResponse(readyResponseIt);
+        }
+    }
+
     [[nodiscard]] std::future<void> respondAndSendAsync(LlmRequest& llmRequest)
     {
         std::promise<void> promise;
@@ -115,6 +157,7 @@ class DataResponder::Impl
             mReadyResponses.emplace(
                 llmRequest.mRequestId, Response{std::addressof(llmRequest), std::move(promise)});
         }
+        sendResponse(llmRequest.mRequestId);
         std::unique_lock lkCond(mCondMutex);
         mAnyReady = true;
     }
@@ -178,56 +221,18 @@ class DataResponder::Impl
                     break;
                 }
                 std::vector<size_t> blockHashes;
-                if (!isSending() && !mReadyResponses.empty())
+                auto const& requestInfo = mSender->recvRequestInfo();
+                auto reqId = requestInfo.getRequestId();
+                blockHashes = requestInfo.getBlockHashes();
                 {
-                    auto const& requestInfo = mSender->recvRequestInfo();
-                    auto reqId = requestInfo.getRequestId();
-                    blockHashes = requestInfo.getBlockHashes();
-
-                    mCurrentRequest = reqId;
+                    std::unique_lock lk(mSendMutex);
+                    mRequestInfoMap[reqId] = std::move(requestInfo);
                     if (mRemainSendCount.find(reqId) == mRemainSendCount.end())
                     {
                         mRemainSendCount[reqId] = mSender->getCounterpartsCount(reqId);
                     }
                 }
-                auto it = getCurrentResponse();
-                if (it != mReadyResponses.end())
-                {
-                    auto reqId = mCurrentRequest.value();
-                    auto count = --mRemainSendCount[reqId];
-                    TLLM_CHECK(count >= 0);
-                    if (count == 0)
-                    {
-                        mRemainSendCount.erase(reqId);
-
-                        // TODO(zhengd): pass the hashes directly instead of update llmRequest
-                        auto llmRequest = it->second.mRequest;
-                        llmRequest->setRequestedBlockHashes(std::move(blockHashes));
-
-                        if (common::getEnvParallelCacheSend())
-                        {
-                            // TODO: Use a thread pool and check for thread safety.
-                            std::thread(
-                                &DataResponder::Impl::sendAndRemoveResponse, this, it->first, std::move(it->second))
-                                .detach();
-                        }
-                        else
-                        {
-                            DataResponder::Impl::sendAndRemoveResponse(it->first, std::move(it->second));
-                        }
-                        removeResponse(it);
-                    }
-                    mCurrentRequest = std::nullopt;
-                }
-                else
-                {
-                    TLLM_CHECK_WITH_INFO(!mCurrentRequest.has_value(),
-                        "This executor does not have a prepared KV cache for request ID: %zu, and the "
-                        "mReadyResponses size is: %zu. mpi rank :%d ",
-                        mCurrentRequest.value(), mReadyResponses.size(), mpi::MpiComm::world().getRank());
-                    std::unique_lock lk(mCondMutex);
-                    mResponderCv.wait(lk, [this]() { return (mAnyReady || mTerminate); });
-                }
+                sendResponse(reqId);
             }
         }
         catch (std::exception const& err)
@@ -264,31 +269,15 @@ class DataResponder::Impl
         }
     }
 
-    [[nodiscard]] bool isSending() const
-    {
-        return mCurrentRequest.has_value();
-    }
-
-    [[nodiscard]] RequestIdType getCurrentRequestId() const
-    {
-        return mCurrentRequest.value();
-    }
-
-    [[nodiscard]] std::map<RequestIdType, Response>::iterator getCurrentResponse()
-    {
-        std::unique_lock lk(mResponderMutex);
-        return mReadyResponses.find(getCurrentRequestId());
-    }
-
 private:
-    std::optional<RequestIdType> mCurrentRequest;
     std::map<RequestIdType, Response> mReadyResponses;
-    std::mutex mResponderMutex, mCondMutex;
+    std::mutex mCondMutex, mSendMutex, mResponderMutex;
     std::atomic<bool> mAnyReady{false}, mTerminate{false};
     std::condition_variable mResponderCv;
     std::future<void> mResponseFuture;
     std::unique_ptr<DataSender> mSender;
     std::unordered_map<LlmRequest::RequestIdType, int> mRemainSendCount;
+    std::unordered_map<LlmRequest::RequestIdType, RequestInfo> mRequestInfoMap;
     int mDeviceId{-1};
 };
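
The interesting part of this change is the rendezvous it introduces. The old response loop assumed a single in-flight request (mCurrentRequest) and a fixed arrival order; under pipeline parallelism the remote request info and the local ready response can arrive in either order, for several requests at once. Now the received request info is stashed in mRequestInfoMap, and sendResponse is invoked from both paths: respondAndSendAsync when the local KV cache response becomes ready, and the receive loop when the remote request info arrives. Whichever event completes second performs the send. Below is a minimal standalone sketch of that pattern; the class, member names, payload types, and single-mutex locking are simplified stand-ins for illustration, not the TensorRT-LLM API:

#include <cstdint>
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>

// Sketch: a send fires only once BOTH the locally prepared response and the
// remotely received request info exist for the same request ID.
class ResponderSketch
{
public:
    // Local path: the response for reqId is ready to be served.
    void onResponseReady(std::uint64_t reqId, std::string response)
    {
        std::lock_guard<std::mutex> lk(mMutex);
        mReadyResponses[reqId] = std::move(response);
        trySend(reqId);
    }

    // Remote path: the peer's request info for reqId has arrived.
    void onRequestInfo(std::uint64_t reqId, std::string info)
    {
        std::lock_guard<std::mutex> lk(mMutex);
        mRequestInfo[reqId] = std::move(info);
        trySend(reqId);
    }

private:
    // Precondition: mMutex held. A no-op until both halves are present.
    void trySend(std::uint64_t reqId)
    {
        auto respIt = mReadyResponses.find(reqId);
        auto infoIt = mRequestInfo.find(reqId);
        if (respIt == mReadyResponses.end() || infoIt == mRequestInfo.end())
        {
            return; // Other half not here yet; the later caller will send.
        }
        std::cout << "sending " << respIt->second << " using " << infoIt->second << '\n';
        mReadyResponses.erase(respIt);
        mRequestInfo.erase(infoIt);
    }

    std::mutex mMutex;
    std::unordered_map<std::uint64_t, std::string> mReadyResponses;
    std::unordered_map<std::uint64_t, std::string> mRequestInfo;
};

Because the per-request state lives in maps keyed by request ID, any number of requests can rendezvous concurrently, which the old single optional slot could not express.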

cpp/tensorrt_llm/batch_manager/llmRequest.cpp

Lines changed: 0 additions & 1 deletion

@@ -69,7 +69,6 @@ void LlmRequest::createSerializedResult(
 /// Note that there is some dependency on the order of operations in this method. Modify with care!
 std::optional<executor::Result> LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank)
 {
-    TLLM_CHECK(!isDisaggContextCompleteState());
     if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)))
     {
         return std::nullopt;
tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 3 additions & 0 deletions

@@ -815,6 +815,9 @@ def _executor_loop_pp(self):
                         ),
                         dest=self.dist.next_pp_rank,
                         tag=prev_microbatch_id)
+                    # TODO: remove this wait, without this wait
+                    # there is an intermittent hang on some nodes.
+                    self.send_handles[prev_microbatch_id].wait()
                     torch.cuda.nvtx.range_pop()
 
             # Stage 3: Finalize previous batch that finished tokens communication
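
The wait added here turns a previously fire-and-forget inter-stage send into one that completes before the loop continues. As a general property of non-blocking sends, the send buffer may only be reused (and ordering relative to later operations relied upon) once the returned handle has completed, so waiting on the handle is the conservative workaround for the intermittent hang the TODO describes. A rough analogy in plain MPI follows; it is an illustration only, and the internals of the self.dist wrapper used in the diff are not assumed:

#include <mpi.h>

// A non-blocking send returns a handle; completing the handle is what makes
// buffer reuse safe. This mirrors the effect of the wait() in the diff above.
int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int const next = (rank + 1) % size;
    int const prev = (rank + size - 1) % size;
    int sendBuf = rank;
    int recvBuf = -1;
    MPI_Request sendHandle, recvHandle;

    // Post the receive first so the ring cannot deadlock.
    MPI_Irecv(&recvBuf, 1, MPI_INT, prev, 0, MPI_COMM_WORLD, &recvHandle);
    MPI_Isend(&sendBuf, 1, MPI_INT, next, 0, MPI_COMM_WORLD, &sendHandle);

    // Analogous to send_handles[prev_microbatch_id].wait(): overwriting
    // sendBuf for the next microbatch before this returns would race with
    // the in-flight send.
    MPI_Wait(&sendHandle, MPI_STATUS_IGNORE);
    MPI_Wait(&recvHandle, MPI_STATUS_IGNORE);

    MPI_Finalize();
    return 0;
}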

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 1 addition & 1 deletion

@@ -183,7 +183,7 @@ def multi_popen(server_configs):
             )
             raise
 
-    with (MyThreadPoolExecutor(max_workers=16) as thread_pool, temp_dir):
+    with (MyThreadPoolExecutor(max_workers=4) as thread_pool, temp_dir):
         with multi_popen(ctx_servers + gen_servers):
             with popen([
                 trtllm_serve_path, "disaggregated", "-c",
tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 3 deletions

@@ -259,17 +259,14 @@ examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padd
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5421989)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5430124)
 examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5434320)
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True] SKIP (https://nvbugs/5427801)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5434320)
 accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache SKIP (https://nvbugs/5433541)
 accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2 SKIP (https://nvbugs/5433541)
 accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4 SKIP (https://nvbugs/5409414)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5409414)
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype SKIP (https://nvbugs/5433543)
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope SKIP (https://nvbugs/5433543)
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5433545)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5434320)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)