
Commit 2193ad3

[https://nvbugs/5387771] fix deadlocks due to insufficient numSemaphores (#6262)
Signed-off-by: Perkz Zheng <[email protected]>
1 parent 9538c8d commit 2193ad3

File tree

2 files changed: +8 -1 lines changed


cpp/tensorrt_llm/common/attentionOp.h

Lines changed: 5 additions & 0 deletions
@@ -341,6 +341,11 @@ class AttentionOp

     void debugCheckSemaphores(cudaStream_t stream);

+    [[nodiscard]] int getMultiProcessorCount() const
+    {
+        return mMultiProcessorCount;
+    }
+
     [[nodiscard]] std::string toString() const;

     int mLayerIdx = -1;
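
For context, a minimal self-contained sketch of how a member like mMultiProcessorCount is typically populated from the CUDA runtime. The surrounding struct and the initDeviceProperties helper are hypothetical, written for illustration; only the member name and the getter mirror the diff above.

#include <cuda_runtime.h>

struct AttentionOpSketch
{
    int mMultiProcessorCount = 0;

    void initDeviceProperties()
    {
        int device = 0;
        cudaGetDevice(&device);
        // cudaDevAttrMultiProcessorCount reports the number of SMs on the device.
        cudaDeviceGetAttribute(&mMultiProcessorCount, cudaDevAttrMultiProcessorCount, device);
    }

    // Mirrors the getter added in this commit.
    [[nodiscard]] int getMultiProcessorCount() const
    {
        return mMultiProcessorCount;
    }
};

The SM count is a natural lower bound here: the number of thread blocks that can make progress concurrently scales with the number of SMs, so reserving at least that many semaphores ensures no resident block waits on a semaphore that was never allocated.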

cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,9 @@ class Runner : public RunnerBase
101101

102102
// Always reserve SemaphoreArray (for multi-block mode) as MMHA may enable multi-block mode when shared memory
103103
// is not enough.
104-
op.reserveSemaphoreArray(op.mNumHeads * max_num_requests);
104+
// The attention kernel might split the heads into multiple blocks, so we might need to reserve more semaphores.
105+
// Use mMultiProcessorCount as the lower-bound to make sure we reserve enough semaphores.
106+
op.reserveSemaphoreArray(std::max(op.mNumHeads * max_num_requests, op.getMultiProcessorCount()));
105107
}
106108

107109
int64_t getWorkspaceSize(AttentionOp const& op, int const num_tokens, int const max_attention_window_size,
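
To make the new bound concrete, here is a small standalone sketch of the reservation arithmetic. numSemaphoresToReserve is a hypothetical helper written for illustration, not a TensorRT-LLM API, and the 132-SM figure is just an example (matching an H100-class GPU).

#include <algorithm>
#include <cstdio>

int numSemaphoresToReserve(int numHeads, int maxNumRequests, int multiProcessorCount)
{
    // Before the fix: numHeads * maxNumRequests could be smaller than the
    // number of concurrently resident blocks when the kernel splits heads
    // across multiple blocks, which could deadlock the multi-block reduction.
    // After the fix: the SM count serves as a lower bound.
    return std::max(numHeads * maxNumRequests, multiProcessorCount);
}

int main()
{
    // Example: 8 heads, 1 in-flight request, GPU with 132 SMs.
    // The old code would reserve only 8 semaphores; the fix reserves 132.
    std::printf("%d\n", numSemaphoresToReserve(8, 1, 132));
    return 0;
}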
