Commit 5cc1d38

fix: nvbugs/5187237: fix deterministic mode crash (#3448)
* nvbugs/5187237 nvbugs/5112075: fix deterministic mode error

* remove waive
  Signed-off-by: Xiwen Yu <[email protected]>

* Revert "remove waive"
  This reverts commit 0bf5486d19906d692bfb7a6262333c296b0087ac.
  Signed-off-by: Xiwen Yu <[email protected]>

* revert ar fusion
  Signed-off-by: Xiwen Yu <[email protected]>

---------

Signed-off-by: Xiwen Yu <[email protected]>
1 parent e36092b commit 5cc1d38

5 files changed (+15, -5 lines)


cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ __global__ void lamport_initialize_kernel(float* ptr, int size)
 
 void lamport_initialize(void* ptr, int bytes, cudaStream_t stream)
 {
-    lamport_initialize_kernel<<<bytes / 128, 128, 0, stream>>>(reinterpret_cast<float*>(ptr), bytes / sizeof(float));
+    int grid_size = (bytes + 127) / 128;
+    lamport_initialize_kernel<<<grid_size, 128, 0, stream>>>(reinterpret_cast<float*>(ptr), bytes / sizeof(float));
 }
 
 Workspace::Workspace(int rank, int tp_size, int max_token_num, int hidden_dim,
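For readers skimming the hunk above: the launch grid is now sized with ceiling division rather than truncating division, so a buffer whose byte count is not a multiple of 128 still gets a block for its trailing bytes. A minimal sketch of the arithmetic in Python (the function names here are illustrative, not part of the codebase):

def grid_size_old(num_bytes: int, block: int = 128) -> int:
    # Truncating division, as in the removed line: a 200-byte buffer
    # would launch only one block and leave the tail uninitialized.
    return num_bytes // block

def grid_size_new(num_bytes: int, block: int = 128) -> int:
    # Ceiling division, as in the added lines: round up so every byte
    # is covered by some block.
    return (num_bytes + block - 1) // block

assert grid_size_old(200) == 1
assert grid_size_new(200) == 2
# Both forms still yield 0 for an empty buffer, which is why the
# size == 0 early return added in customAllReduceKernels.cu matters.
assert grid_size_new(0) == 0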

cpp/tensorrt_llm/kernels/customAllReduceKernels.cu

Lines changed: 4 additions & 0 deletions
@@ -1989,6 +1989,10 @@ void residualRmsNorm(
 void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, cudaStream_t stream)
 {
     sync_check_cuda_error(stream);
+    if (size == 0)
+    {
+        return;
+    }
     switch (dataType)
     {
     case nvinfer1::DataType::kFLOAT:
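The early return above lets lamportInitialize be called with an empty buffer without ever reaching a kernel launch; a zero-sized grid is an invalid CUDA launch configuration and would show up as an error in the surrounding sync_check_cuda_error checks. A small Python sketch of the guard pattern, with a callable standing in for the real dtype-specific launch:

from typing import Callable

def lamport_initialize(size: int, launch: Callable[[int], None]) -> None:
    # Guard added by this commit: skip the launch entirely when there is
    # nothing to initialize, instead of submitting an empty grid.
    if size == 0:
        return
    launch(size)

# Usage: the lambda stands in for the real kernel launch.
lamport_initialize(0, lambda n: print(f"launch over {n} elements"))     # no output
lamport_initialize(4096, lambda n: print(f"launch over {n} elements"))  # launches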

tensorrt_llm/_torch/distributed/ops.py

Lines changed: 2 additions & 1 deletion
@@ -32,7 +32,8 @@ def get_deepseek_allreduce_workspace(mapping: Mapping) -> torch.LongTensor:
     if mapping not in deepseek_allreduce_workspaces:
         ipc_buffers, workspace = CustomAllReduceHelper.allocate_allreduce_fusion_workspace(
             mapping,
-            CustomAllReduceHelper.max_workspace_size_auto(mapping.tp_size),
+            CustomAllReduceHelper.max_workspace_size_auto(
+                mapping.tp_size, support_deterministic=False),
         )
         deepseek_allreduce_workspaces[mapping] = (ipc_buffers, workspace)
     return deepseek_allreduce_workspaces[mapping][1]

tensorrt_llm/llmapi/llm.py

Lines changed: 4 additions & 0 deletions
@@ -517,6 +517,10 @@ def _build_model(self):
         if self.args.kv_cache_config is not None:
             executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
                 self.args.kv_cache_config)
+        if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
+            # Disable KV cache reuse for deterministic mode
+            executor_config.kv_cache_config.enable_block_reuse = False
+            executor_config.kv_cache_config.enable_partial_reuse = False
         if self.args.peft_cache_config is not None:
             executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
                 self.args.peft_cache_config)
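The llm.py hunk gates KV cache behaviour on the FORCE_DETERMINISTIC environment variable: when it is set to "1", block reuse and partial reuse are turned off, presumably so cache reuse cannot introduce run-to-run variation. A sketch of that gating in isolation, with a stand-in dataclass instead of the real pybind config object:

import os
from dataclasses import dataclass

@dataclass
class KvCacheConfig:
    # Stand-in for the real pybind KvCacheConfig; only the two fields
    # touched by this commit are modelled here.
    enable_block_reuse: bool = True
    enable_partial_reuse: bool = True

def apply_deterministic_overrides(cfg: KvCacheConfig) -> KvCacheConfig:
    # Same check as in _build_model: FORCE_DETERMINISTIC=1 disables reuse.
    if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
        cfg.enable_block_reuse = False
        cfg.enable_partial_reuse = False
    return cfg

os.environ["FORCE_DETERMINISTIC"] = "1"
print(apply_deterministic_overrides(KvCacheConfig()))
# KvCacheConfig(enable_block_reuse=False, enable_partial_reuse=False)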

tensorrt_llm/plugin/plugin.py

Lines changed: 3 additions & 3 deletions
@@ -704,8 +704,8 @@ def set_workspace_tensor(self,
         )
 
     @staticmethod
-    def max_workspace_size_auto(tp_size: int) -> int:
-        if force_all_reduce_deterministic():
+    def max_workspace_size_auto(tp_size: int, support_deterministic) -> int:
+        if force_all_reduce_deterministic() and support_deterministic:
             workspace_size = os.getenv("FORCE_ALLREDUCE_KERNEL_WORKSPACE_SIZE",
                                        "1000000000")
             return int(workspace_size)
@@ -746,7 +746,7 @@ def allocate_workspace(mapping: Mapping,
             lamport_buffers_0.local_ptr,
             lamport_buffers_1.local_ptr,
             lamport_buffers_2.local_ptr,
-            size * mapping.tp_size,
+            lamport_buffers_size,
         )
         buffers = [
             ipc_buffers_ping, ipc_buffers_pong, ipc_barriers_in,
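Two things change in plugin.py: max_workspace_size_auto gains a support_deterministic parameter, so the oversized FORCE_ALLREDUCE_KERNEL_WORKSPACE_SIZE workspace is requested only when the caller opts in (the fusion path in ops.py above passes False), and the second hunk passes the actual lamport buffer size instead of size * mapping.tp_size. A sketch of the updated helper, assuming force_all_reduce_deterministic reads an environment flag (the exact variable is not shown in this diff) and using a placeholder fallback size:

import os

def force_all_reduce_deterministic() -> bool:
    # Assumption: the real helper checks an environment flag.
    return os.getenv("FORCE_DETERMINISTIC", "0") == "1"

def max_workspace_size_auto(tp_size: int, support_deterministic: bool) -> int:
    # Sketch of the updated signature: the deterministic override is applied
    # only when the caller can actually use that workspace.
    if force_all_reduce_deterministic() and support_deterministic:
        return int(os.getenv("FORCE_ALLREDUCE_KERNEL_WORKSPACE_SIZE", "1000000000"))
    return 64 * 1024 * 1024  # placeholder; not the library's real sizing logic

# The deepseek allreduce fusion path opts out explicitly:
size = max_workspace_size_auto(8, support_deterministic=False)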
