
Commit 56bf9d0

Introducing a new AR strategy that makes use of NCCL symmetric memory and the new NCCL device API to fuse RMS Norm with AllReduce.
Signed-off-by: Ludwig Schneider <[email protected]>
1 parent b181568 commit 56bf9d0
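
For readers skimming the diff: the idea behind the new strategy is to do the all-reduce and the RMSNorm epilogue in a single kernel instead of two passes over the hidden dimension. The sketch below is purely illustrative and is not the kernel added in config.cu (that file's diff is not rendered here); it uses plain float peer pointers, whereas the real implementation goes through NCCL windows (ncclWindow_t) and the NCCL device communicator. All names in it are hypothetical.

// Illustrative only: a naive one-shot AllReduce fused with an RMSNorm epilogue.
// One block per token; peerIn[r] is assumed to point at rank r's partial
// activations in symmetric memory.
#include <cuda_runtime.h>
#include <math.h>

__global__ void fusedAllReduceRmsNorm(float* const* peerIn, float* out, float const* weight,
    int nRanks, int hiddenDim, float eps)
{
    int const token = blockIdx.x;
    extern __shared__ float red[]; // blockDim.x floats, passed as dynamic shared memory

    float localSq = 0.f;
    for (int i = threadIdx.x; i < hiddenDim; i += blockDim.x)
    {
        // AllReduce step: sum the same element across every rank's buffer.
        float acc = 0.f;
        for (int r = 0; r < nRanks; ++r)
            acc += peerIn[r][token * hiddenDim + i];
        out[token * hiddenDim + i] = acc; // keep the reduced value for the epilogue
        localSq += acc * acc;
    }

    // Block-wide sum of squares (assumes blockDim.x is a power of two).
    red[threadIdx.x] = localSq;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if (threadIdx.x < stride)
            red[threadIdx.x] += red[threadIdx.x + stride];
        __syncthreads();
    }
    float const rms = rsqrtf(red[0] / hiddenDim + eps);

    // RMSNorm epilogue fused into the same kernel: scale and apply gamma.
    for (int i = threadIdx.x; i < hiddenDim; i += blockDim.x)
        out[token * hiddenDim + i] *= rms * weight[i];
}

The committed kernel additionally handles residual and bias inputs, vectorized loads, and a sharded or unsharded residual output, as the LaunchConfig interface further down suggests.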

File tree

20 files changed: +1803 −28 lines


cpp/tensorrt_llm/kernels/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -86,3 +86,4 @@ add_subdirectory(groupRmsNormKernels)
 add_subdirectory(llama4MinLatencyKernels)
 add_subdirectory(dsv3MinLatencyKernels)
 add_subdirectory(causalConv1d)
+add_subdirectory(nccl_device)

cpp/tensorrt_llm/kernels/customAllReduceKernels.h

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ enum class AllReduceStrategyType : int8_t
     LOWPRECISION = 6,
     MNNVL = 7,
     NCCL_SYMMETRIC = 8,
+    NCCL_DEVICE = 9,
 };
 
 enum class AllReduceStrategyConfig : int8_t
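
This hunk only adds the enumerator; the dispatch side is elsewhere in the tree. A hedged sketch of how a caller would opt into the new path (the dispatcher and its namespace are assumptions, only the enum values come from this diff):

// Hypothetical caller-side dispatch on the strategy enum; handleAllReduce()
// is a stand-in, and the tensorrt_llm::kernels namespace is assumed.
#include "tensorrt_llm/kernels/customAllReduceKernels.h"

void handleAllReduce(tensorrt_llm::kernels::AllReduceStrategyType strategy)
{
    using tensorrt_llm::kernels::AllReduceStrategyType;
    switch (strategy)
    {
    case AllReduceStrategyType::NCCL_SYMMETRIC:
        // existing path: NCCL collectives over window-registered (symmetric) buffers
        break;
    case AllReduceStrategyType::NCCL_DEVICE: // new in this commit
        // new path: NCCL device API kernel that fuses RMSNorm with the AllReduce
        break;
    default:
        break;
    }
}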

cpp/tensorrt_llm/kernels/nccl_device/CMakeLists.txt

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+# CMakeLists.txt for nccl_device
+# This directory contains CUDA kernels and host launcher code
+
+# Enable CUDA
+enable_language(CUDA)
+
+# Create CUDA library
+add_library(tensorrt_llm_nccl_device
+    config.cu
+)
+
+# Set properties for the CUDA library
+set_target_properties(tensorrt_llm_nccl_device PROPERTIES
+    CUDA_STANDARD 17
+    CUDA_SEPARABLE_COMPILATION ON
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Include directories
+target_include_directories(tensorrt_llm_nccl_device PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../..
+)
+
+# Link libraries
+target_link_libraries(tensorrt_llm_nccl_device
+    tensorrt_llm_common
+)
+
+# Install target
+install(TARGETS tensorrt_llm_nccl_device
+    LIBRARY DESTINATION lib
+    ARCHIVE DESTINATION lib
+)

cpp/tensorrt_llm/kernels/nccl_device/config.cu

Lines changed: 389 additions & 0 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/nccl_device/config.h

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TRTLLM_NCCL_DEVICE_CONFIG_H
+#define TRTLLM_NCCL_DEVICE_CONFIG_H
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <typeinfo>
+#include <vector>
+#include <cassert>
+#include <cuda_runtime.h>
+#include "nccl.h"
+#include "nccl_device.h"
+#include "vector_types.h"
+#include "constants.h"
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/dataType.h"
+#include "tensorrt_llm/runtime/iBuffer.h"
+
+namespace tensorrt_llm::kernels::nccl_device {
+
+// Kernel launch information helper class
+class LaunchConfig {
+public:
+    const int hidden_dim;
+    const int num_tokens;
+    const int nRanks;
+    const int rank;
+    const bool useResidual;
+    const bool useBias;
+    const bool unshardResidualOut;
+protected:
+    int token_per_rank;
+    int start_token;
+    bool valid;
+    int threadsPerBlock;
+    int unrollFactor;
+
+    std::pair<int, int> pickLaunchCombo(const std::vector<std::pair<int, int>>& options);
+
+public:
+    // Constructor with dynamic block size calculation
+    LaunchConfig(const int hidden_dim, const int num_tokens, const int rank, const int nRanks, bool useResidual, bool useBias, bool unshardResidualOut);
+
+    inline int getThreadsPerBlock() const { return this->threadsPerBlock; }
+    int getUnrollFactor() const { return this->unrollFactor; }
+    virtual bool getValid() const = 0;
+    int getBlocksPerRank() const { return this->token_per_rank; }
+    int getStartToken() const { return this->start_token; }
+    virtual int getElementsPerVector() const = 0;
+    virtual nvinfer1::DataType getDataType() const = 0;
+    virtual void* getKernelPtr() const = 0;
+    virtual bool isValidConfig(int threadsPerBlock, int unrollFactor, int blocksPerRank) const = 0;
+
+    // Launcher functions as member functions
+    void launchRMSNorm(ncclWindow_t inWindow, ncclWindow_t outWindow,
+        const void* const residual, ncclWindow_t residualOutWindow,
+        const void* const weight, const void* const bias,
+        ncclDevComm devComm, const float eps, cudaStream_t stream) const;
+
+    bool supportsMultimem() const;
+
+protected:
+    // Pure virtual launch function that must be implemented by derived classes
+    virtual void launchKernel(ncclWindow_t inWindow, ncclWindow_t outWindow,
+        const void* const residual, ncclWindow_t residualOutWindow,
+        const void* const weight, const void* const bias,
+        ncclDevComm devComm, const float eps, cudaStream_t stream) const = 0;
+
+    // Logging output
+    std::string getLoggingString() const;
+};
+
+
+// Kernel launch information helper class
+template<typename T>
+class TypedLaunchConfig : public LaunchConfig {
+private:
+    nvinfer1::DataType mType;
+
+    // Private templated helper function to get kernel pointer for specific unroll factor
+    template<int Nunroll>
+    void* getKernelPtrForUnroll() const;
+
+    // Private helper function to get kernel pointer for any unroll factor
+    void* getKernelPtrForUnrollFactor(int unrollFactor) const;
+
+    // Private helper function to launch kernel for any unroll factor
+    void launchKernelForUnrollFactor(ncclWindow_t inWindow, ncclWindow_t outWindow,
+        const void* const residual, ncclWindow_t residualOutWindow,
+        const void* const weight, const void* const bias,
+        ncclDevComm devComm, const float eps, cudaStream_t stream,
+        const dim3& gridDim, const dim3& blockDim, const size_t sharedMemSize) const;
+
+    // Private templated helper function to launch kernel for specific unroll factor
+    template<int Nunroll>
+    void launchKernelForUnrollImpl(ncclWindow_t inWindow, ncclWindow_t outWindow,
+        const void* const residual, ncclWindow_t residualOutWindow,
+        const void* const weight, const void* const bias,
+        ncclDevComm devComm, const float eps, cudaStream_t stream,
+        const dim3& gridDim, const dim3& blockDim, const size_t sharedMemSize,
+        bool useResidual, bool useBias, bool unshardResidualOut,
+        int startToken, int hiddenDim, int numTokens) const;
+
+public:
+    using TN = typename VectorType<T>::type;
+    constexpr static int elementsPerVector = sizeof(TN) / sizeof(T);
+public:
+
+    virtual int getElementsPerVector() const { return this->elementsPerVector; }
+    virtual void* getKernelPtr() const override { return getKernelPtrForUnrollFactor(this->unrollFactor); }
+    virtual bool isValidConfig(int threadsPerBlock, int unrollFactor, int blocksPerRank) const override;
+
+    // Launch function that handles all the type-specific logic internally
+    virtual void launchKernel(ncclWindow_t inWindow, ncclWindow_t outWindow,
+        const void* const residual, ncclWindow_t residualOutWindow,
+        const void* const weight, const void* const bias,
+        ncclDevComm devComm, const float eps, cudaStream_t stream) const override;
+
+    // Constructor with dynamic block size calculation
+    TypedLaunchConfig(const int hidden_dim, const int num_tokens, const int rank, const int nRanks, bool useResidual, bool useBias, bool unshardResidualOut);
+    nvinfer1::DataType getDataType() const { return tensorrt_llm::runtime::TRTDataType<T>::value; }
+    virtual bool getValid() const { return this->valid; }
+
+};
+
+std::shared_ptr<LaunchConfig> makeLaunchConfig(nvinfer1::DataType dataType, const int hidden_dim, const int num_tokens, const int rank, const int nRanks, bool useResidual, bool useBias, bool unshardResidualOut);
+
+} // namespace tensorrt_llm::kernels::nccl_device
+
+#endif // TRTLLM_NCCL_DEVICE_CONFIG_H
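
As a rough orientation, host code drives this interface along the following lines. A minimal sketch, assuming the NCCL windows, device communicator, and weight/residual/bias buffers are created elsewhere; only the declarations above are used, the wrapper function itself and the include path are hypothetical:

// Hypothetical wrapper: pick a launch configuration for this shape/dtype and,
// if one exists, launch the fused AllReduce + RMSNorm kernel.
#include <memory>
#include "tensorrt_llm/kernels/nccl_device/config.h" // include path assumed

using namespace tensorrt_llm::kernels::nccl_device;

bool tryFusedAllReduceRmsNorm(nvinfer1::DataType dtype, int hiddenDim, int numTokens, int rank, int nRanks,
    ncclWindow_t inWindow, ncclWindow_t outWindow, ncclWindow_t residualOutWindow, void const* residual,
    void const* weight, void const* bias, ncclDevComm devComm, float eps, cudaStream_t stream)
{
    // Factory returns the TypedLaunchConfig<T> matching dtype; it also derives
    // threadsPerBlock, the unroll factor, and the per-rank token slice.
    std::shared_ptr<LaunchConfig> cfg = makeLaunchConfig(dtype, hiddenDim, numTokens, rank, nRanks,
        /*useResidual=*/residual != nullptr, /*useBias=*/bias != nullptr,
        /*unshardResidualOut=*/kUnshardCompletely);

    if (!cfg || !cfg->getValid())
    {
        return false; // no valid configuration: fall back to another AllReduce strategy
    }

    cfg->launchRMSNorm(inWindow, outWindow, residual, residualOutWindow, weight, bias, devComm, eps, stream);
    return true;
}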

cpp/tensorrt_llm/kernels/nccl_device/constants.h

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRTLLM_NCCL_DEVICE_CONSTANTS_H
+#define TRTLLM_NCCL_DEVICE_CONSTANTS_H
+
+#include <cstdint>
+
+namespace tensorrt_llm::kernels::nccl_device {
+
+// CUDA and kernel constants
+constexpr int kWarpSize = 32;
+constexpr int kMaxThreadsPerBlock = 256; // Maximum block size configurable for performance. Corresponds to the shared memory requirement for cub::BlockReduce
+constexpr int kMinThreadsPerBlock = kWarpSize; // Minimum block size is a warp.
+constexpr int kMaxUnrollFactor = 8; // We require manual instantiation and switches. Changing the number alone is not enough; see the launcher function for details
+constexpr bool kUnshardCompletely = true;
+} // namespace tensorrt_llm::kernels::nccl_device
+
+#endif // TRTLLM_NCCL_DEVICE_CONSTANTS_H
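
The kMaxUnrollFactor comment refers to the usual pattern of bridging a runtime unroll factor to a compile-time template parameter; the generic sketch below shows why simply raising the constant is not enough. The names, the launcher, and the exact set of supported factors are hypothetical; the real dispatch lives in config.cu.

#include <cstdio>
#include <stdexcept>

// Stand-in for a kernel launch that needs the unroll factor as a compile-time
// constant (e.g. to drive #pragma unroll and register allocation).
template <int Nunroll>
void launchForUnroll()
{
    std::printf("launching kernel instantiated with Nunroll=%d\n", Nunroll);
}

// Runtime-to-compile-time bridge: every supported unroll factor needs its own
// case and its own explicit instantiation, so extending kMaxUnrollFactor also
// means extending this switch (and the instantiations) by hand.
inline void launchForUnrollFactor(int unrollFactor)
{
    switch (unrollFactor)
    {
    case 1: launchForUnroll<1>(); break;
    case 2: launchForUnroll<2>(); break;
    case 4: launchForUnroll<4>(); break;
    case 8: launchForUnroll<8>(); break; // kMaxUnrollFactor
    default: throw std::invalid_argument("unsupported unroll factor");
    }
}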
