Skip to content

Commit 08bf35d

Browse files
committed
first step new allreduce
Signed-off-by: Ludwig Schneider <[email protected]>

Squashed commit messages:
- better UB init handling (Signed-off-by: Ludwig Schneider <[email protected]>)
- accept multiple strategies (Signed-off-by: Ludwig Schneider <[email protected]>)
- test to debug mnnvl (Signed-off-by: Ludwig Schneider <[email protected]>)
- rebasing and addressing comments (Signed-off-by: Ludwig Schneider <[email protected]>)
- remove unneeded type decl (Signed-off-by: Ludwig Schneider <[email protected]>)
1 parent 9367328 commit 08bf35d

File tree

11 files changed

+178
-80
lines changed

11 files changed

+178
-80
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
1212
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
1313
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
1414
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
15+
ATTRIBUTIONS-CPP.md filter=lfs diff=lfs merge=lfs -text

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,6 +1395,7 @@ repos:
13951395
exclude: |
13961396
(?x)^(.*cubin.cpp | .*cubin.h)$
13971397
- id: check-merge-conflict
1398+
exclude: ^ATTRIBUTIONS-CPP\.md$
13981399
- id: check-symlinks
13991400
- id: detect-private-key
14001401
- id: end-of-file-fixer

ATTRIBUTIONS-CPP.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:7c56252c4d635377c3202c34f8f049053ceb9567fc1f243f4fa5ba91d762176b
3+
size 818494

cpp/tensorrt_llm/kernels/nccl_device/CMakeLists.txt

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
#
2+
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
3+
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
6+
# use this file except in compliance with the License. You may obtain a copy of
7+
# the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations under
15+
# the License.
16+
#
17+
118
# CMakeLists.txt for nccl_device This directory contains CUDA kernels and host
219
# launcher code
320

@@ -20,9 +37,3 @@ target_include_directories(
2037

2138
# Link libraries
2239
target_link_libraries(tensorrt_llm_nccl_device tensorrt_llm_common)
23-
24-
# Install target
25-
install(
26-
TARGETS tensorrt_llm_nccl_device
27-
LIBRARY DESTINATION lib
28-
ARCHIVE DESTINATION lib)

cpp/tensorrt_llm/kernels/nccl_device/config.cu

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
1-
/*************************************************************************
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
33
*
4-
* See LICENSE.txt for license information
5-
************************************************************************/
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
616

717
#include "config.h"
818
#include "nccl.h"
919
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)
10-
#include "kernels.h"
20+
#include "kernels.cuh"
1121
#endif
1222
#include "tensorrt_llm/common/cudaUtils.h"
1323
#include "tensorrt_llm/common/envUtils.h"
@@ -176,19 +186,9 @@ bool TypedLaunchConfig<T>::isValidConfig(int threadsPerBlock, int unrollFactor)
176186
{
177187
// Get CUDA device properties
178188
int dev = -1;
179-
cudaError_t cudaStatus = cudaGetDevice(&dev);
180-
if (cudaStatus != cudaSuccess)
181-
{
182-
TLLM_LOG_ERROR("Failed to get CUDA device: " + std::string(cudaGetErrorString(cudaStatus)));
183-
return false;
184-
}
189+
TLLM_CUDA_CHECK(cudaGetDevice(&dev));
185190
cudaDeviceProp deviceProp;
186-
cudaStatus = cudaGetDeviceProperties(&deviceProp, dev);
187-
if (cudaStatus != cudaSuccess)
188-
{
189-
TLLM_LOG_ERROR("Failed to get CUDA device properties: " + std::string(cudaGetErrorString(cudaStatus)));
190-
return false;
191-
}
191+
TLLM_CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
192192

193193
// Check threads per block limits
194194
if (threadsPerBlock <= 0 || threadsPerBlock > deviceProp.maxThreadsPerBlock)
@@ -217,13 +217,7 @@ bool TypedLaunchConfig<T>::isValidConfig(int threadsPerBlock, int unrollFactor)
217217

218218
// Get actual register and shared memory usage from the kernel
219219
cudaFuncAttributes funcAttrib;
220-
cudaError_t attrStatus = cudaFuncGetAttributes(&funcAttrib, reinterpret_cast<void const*>(kernelPtr));
221-
if (attrStatus != cudaSuccess)
222-
{
223-
TLLM_LOG_WARNING(
224-
"Failed to get kernel attributes for validation: " + std::string(cudaGetErrorString(attrStatus)));
225-
return false;
226-
}
220+
TLLM_CUDA_CHECK(cudaFuncGetAttributes(&funcAttrib, reinterpret_cast<void const*>(kernelPtr)));
227221

228222
// Check register usage
229223
int const totalRegistersPerBlock = funcAttrib.numRegs * threadsPerBlock;

cpp/tensorrt_llm/kernels/nccl_device/config.h

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
1-
/*************************************************************************
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
33
*
4-
* See LICENSE.txt for license information
5-
************************************************************************/
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
616

717
#ifndef TRTLLM_NCCL_DEVICE_CONFIG_H
818
#define TRTLLM_NCCL_DEVICE_CONFIG_H
@@ -115,8 +125,6 @@ template <typename T>
115125
class TypedLaunchConfig : public LaunchConfig
116126
{
117127
private:
118-
nvinfer1::DataType mType;
119-
120128
// Private templated helper function to get kernel pointer for specific unroll factor
121129
template <int Nunroll>
122130
void* getKernelPtrForUnroll() const;

cpp/tensorrt_llm/kernels/nccl_device/kernels.h renamed to cpp/tensorrt_llm/kernels/nccl_device/kernels.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
* limitations under the License.
1515
*/
1616

17-
#ifndef TRTLLM_NCCL_DEVICE_KERNELS_H
18-
#define TRTLLM_NCCL_DEVICE_KERNELS_H
17+
#ifndef TRTLLM_NCCL_DEVICE_KERNELS_CUH
18+
#define TRTLLM_NCCL_DEVICE_KERNELS_CUH
1919

2020
#include "constants.h"
2121
#include "multimem.h"
@@ -252,4 +252,4 @@ __global__ void fusedAllReduceRMSNormKernel(ncclWindow_t input_win, ncclWindow_t
252252

253253
} // namespace tensorrt_llm::kernels::nccl_device
254254

255-
#endif // TRTLLM_NCCL_DEVICE_KERNELS_H
255+
#endif // TRTLLM_NCCL_DEVICE_KERNELS_CUH

cpp/tensorrt_llm/thop/allreduceOp.cpp

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -558,19 +558,14 @@ class AllreduceOp
558558
norm_weight.value().data_ptr(), nullptr, devComm, mEps, stream);
559559
return {norm_out, residual_out};
560560
}
561-
else
562-
{
563-
// Fall back to old strategy with warning
564-
TLLM_LOG_WARNING(
565-
"NCCL device Fused AR not supported for data type %d, hidden size %d & %d nRanks on current "
566-
"architecture. Falling back to standard allreduce + separate RMSNorm.",
567-
static_cast<int>(mType), hidden_size, nRanks);
568-
569-
goto default_case;
570-
}
561+
// Fall back to old strategy with warning
562+
TLLM_LOG_WARNING(
563+
"NCCL device Fused AR not supported for data type %d, hidden size %d & %d nRanks on current "
564+
"architecture. Falling back to standard allreduce + separate RMSNorm.",
565+
static_cast<int>(mType), hidden_size, nRanks);
571566
}
567+
// Intentional fallthrough to default
572568
default:
573-
default_case:
574569
NCCLCHECK(ncclAllReduce(
575570
ub_buffer0.addr, ub_buffer1.addr, size, (*getDtypeMap())[mType], ncclSum, *rawComm, stream));
576571
return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, norm_out);

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str],
275275
package_data={
276276
'tensorrt_llm': package_data,
277277
},
278-
license_files=get_license(),
278+
license_files=["LICENSE", "ATTRIBUTIONS-CPP.md"],
279279
entry_points={
280280
'console_scripts': [
281281
'trtllm-build=tensorrt_llm.commands.build:main',

0 commit comments

Comments (0)