
Commit 2193ad3

[https://nvbugs/5387771] fix deadlocks due to insufficient numSemaphores (#6262)
Signed-off-by: Perkz Zheng <[email protected]>
1 parent 9538c8d commit 2193ad3

File tree

2 files changed: +8 -1 lines changed


cpp/tensorrt_llm/common/attentionOp.h

Lines changed: 5 additions & 0 deletions
@@ -341,6 +341,11 @@ class AttentionOp

     void debugCheckSemaphores(cudaStream_t stream);

+    [[nodiscard]] int getMultiProcessorCount() const
+    {
+        return mMultiProcessorCount;
+    }
+
     [[nodiscard]] std::string toString() const;

     int mLayerIdx = -1;
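
For context, a minimal self-contained sketch of how a member like mMultiProcessorCount is typically populated from the CUDA runtime. The surrounding struct and the initDeviceProperties helper are hypothetical, written for illustration; only the member name and the getter mirror the diff above.

#include <cuda_runtime.h>

struct AttentionOpSketch
{
    int mMultiProcessorCount = 0;

    void initDeviceProperties()
    {
        int device = 0;
        cudaGetDevice(&device);
        // cudaDevAttrMultiProcessorCount reports the number of SMs on the device.
        cudaDeviceGetAttribute(&mMultiProcessorCount, cudaDevAttrMultiProcessorCount, device);
    }

    // Mirrors the getter added in this commit.
    [[nodiscard]] int getMultiProcessorCount() const
    {
        return mMultiProcessorCount;
    }
};

The SM count is a natural lower bound here: the number of thread blocks that can make progress concurrently scales with the number of SMs, so reserving at least that many semaphores ensures no resident block waits on a semaphore that was never allocated.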

cpp/tensorrt_llm/thop/attentionOp.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,9 @@ class Runner : public RunnerBase
101101

102102
// Always reserve SemaphoreArray (for multi-block mode) as MMHA may enable multi-block mode when shared memory
103103
// is not enough.
104-
op.reserveSemaphoreArray(op.mNumHeads * max_num_requests);
104+
// The attention kernel might split the heads into multiple blocks, so we might need to reserve more semaphores.
105+
// Use mMultiProcessorCount as the lower-bound to make sure we reserve enough semaphores.
106+
op.reserveSemaphoreArray(std::max(op.mNumHeads * max_num_requests, op.getMultiProcessorCount()));
105107
}
106108

107109
int64_t getWorkspaceSize(AttentionOp const& op, int const num_tokens, int const max_attention_window_size,
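
To make the new bound concrete, here is a small standalone sketch of the reservation arithmetic. numSemaphoresToReserve is a hypothetical helper written for illustration, not a TensorRT-LLM API, and the 132-SM figure is just an example (matching an H100-class GPU).

#include <algorithm>
#include <cstdio>

int numSemaphoresToReserve(int numHeads, int maxNumRequests, int multiProcessorCount)
{
    // Before the fix: numHeads * maxNumRequests could be smaller than the
    // number of concurrently resident blocks when the kernel splits heads
    // across multiple blocks, which could deadlock the multi-block reduction.
    // After the fix: the SM count serves as a lower bound.
    return std::max(numHeads * maxNumRequests, multiProcessorCount);
}

int main()
{
    // Example: 8 heads, 1 in-flight request, GPU with 132 SMs.
    // The old code would reserve only 8 semaphores; the fix reserves 132.
    std::printf("%d\n", numSemaphoresToReserve(8, 1, 132));
    return 0;
}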
