Commit 1f9ac3f

Revert "[None][feat] Update TRTLLM MoE cubins; reduce mxfp4 weight padding requirement; tighten TMA bound (#9025)"
This reverts commit 86cfb3e.
1 parent 24f5cd7 commit 1f9ac3f

File tree

1,434 files changed: +9892 / -22312 lines


cpp/include/tensorrt_llm/common/cudaUtils.h

Lines changed: 0 additions & 6 deletions
@@ -19,9 +19,6 @@
 #include "tensorrt_llm/common/cudaBf16Wrapper.h"
 #include "tensorrt_llm/common/cudaDriverWrapper.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
-#if ENABLE_FP4
-#include <cuda_fp4.h>
-#endif
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/tllmException.h"
 #include <algorithm>
@@ -548,9 +545,6 @@ template void printArrayInfo(__nv_bfloat16 const* ptr, uint64_t nElement, std::s
 #ifdef ENABLE_FP8
 template void printArrayInfo(__nv_fp8_e4m3 const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 #endif
-#ifdef ENABLE_FP4
-template void printArrayInfo(__nv_fp4_e2m1 const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
-#endif
 template void printArrayInfo(uint32_t const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 template void printArrayInfo(uint64_t const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 template void printArrayInfo(int const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
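
Note: the deleted lines follow the usual feature-macro guard pattern, where the header that defines a type and the explicit instantiation that uses it are gated on the same macro (the original mixes #if and #ifdef for ENABLE_FP4). Below is a minimal, self-contained, hypothetical sketch of that pattern; printArrayInfoDemo is a stand-in, not the real printArrayInfo (which also takes a bPrintElement flag and inspects device memory).

#include <cstdint>
#include <cstdio>
#include <string>

#ifdef ENABLE_FP4
#include <cuda_fp4.h> // defines __nv_fp4_e2m1 on toolkits that ship FP4 support
#endif

// Hypothetical stand-in for printArrayInfo.
template <typename T>
void printArrayInfoDemo(T const* ptr, uint64_t nElement, std::string name)
{
    std::printf("%s: %llu elements at %p\n", name.c_str(), static_cast<unsigned long long>(nElement),
        static_cast<void const*>(ptr));
}

// Unconditional instantiations compile on every toolkit.
template void printArrayInfoDemo(uint32_t const*, uint64_t, std::string);

#ifdef ENABLE_FP4
// Gated on the same macro as the include: the symbol only exists when the
// toolkit actually provides the FP4 type.
template void printArrayInfoDemo(__nv_fp4_e2m1 const*, uint64_t, std::string);
#endif

Gating the instantiation on the same macro as the include keeps the translation unit compiling on toolkits that lack <cuda_fp4.h>.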

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp

Lines changed: 58 additions & 252 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h

Lines changed: 16 additions & 20 deletions
@@ -68,54 +68,50 @@ class TrtllmGenBatchedGemmRunner
         int32_t configIndex) const;

     // Generic GEMM interface
-    void run(int32_t m, int32_t n, int32_t k, int32_t validM, int32_t validN, int32_t validK,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
-        void const* a, void const* sfA, void const* b, void const* sfB, void const* perTokensSfA,
-        void const* perTokensSfB, float const* scaleC, float const* scaleGateC, float const* bias,
-        float const* swiGluAlpha, float const* swiGluBeta, float const* clampLimit, void* c, void* outSfC,
-        int32_t const* routeMap, int32_t const* totalNumPaddedTokens, int32_t const* ctaIdxXyToBatchIdx,
-        int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, void* workspace, CUstream stream,
-        int device, int32_t configIndex);
+    void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, int32_t numTokens,
+        int32_t numBatches, int32_t maxNumCtasInBatchDim, void const* a, void const* sfA, void const* b,
+        void const* sfB, void const* perTokensSfA, void const* perTokensSfB, float const* scaleC,
+        float const* scaleGateC, float const* bias, float const* swiGluAlpha, float const* swiGluBeta,
+        float const* clampLimit, void* c, void* outSfC, int32_t const* routeMap, int32_t const* totalNumPaddedTokens,
+        int32_t const* ctaIdxXyToBatchIdx, int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas,
+        void* workspace, CUstream stream, int device, int32_t configIndex);

     // Block-scaling GEMM
     void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* sfA,
         void const* b, void const* sfB, void* c, void* outSfC, void* workspace, CUstream stream, int device,
-        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1);
+        int32_t configIndex);

     // Block-scaling GEMM with SwiGLU activation
     void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* sfA,
         void const* b, void const* sfB, float const* bias, float const* swiGluAlpha, float const* swiGluBeta,
         float const* clampLimit, void* c, void* outSfC, void* workspace, CUstream stream, int device,
-        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1);
+        int32_t configIndex);

     // FP8 per-tensor scaling GEMM
     void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* b,
         float const* scaleC, float const* scaleGateC, void* c, void* workspace, CUstream stream, int device,
-        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1);
+        int32_t configIndex);

     // Get the list of configs that passed the validation based on the constructor options
     [[nodiscard]] std::vector<int64_t> getPassingConfigIndices() const
     {
         return mPassingConfigIndices;
     }

-    // Get the kernel name from the config index
-    [[nodiscard]] std::string getKernelNameFromConfigIndex(int32_t configIndex) const;
-
     // Get the list of config indices that are valid for the given problem shape
     [[nodiscard]] std::vector<int64_t> getValidConfigIndices(int32_t m, int32_t n, int32_t k,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
-        int32_t validM = -1, int32_t validN = -1, int32_t validK = -1) const;
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches,
+        int32_t maxNumCtasInBatchDim) const;

     // Get a default config index that is valid for the given problem shape
     // This will be used as the fallback config if using auto-tuning
     [[nodiscard]] int64_t getDefaultValidConfigIndex(int32_t m, int32_t n, int32_t k,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
-        int32_t validM = -1, int32_t validN = -1, int32_t validK = -1) const;
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches,
+        int32_t maxNumCtasInBatchDim) const;

     [[nodiscard]] bool isValidConfigIndex(int32_t configIndex, int32_t m, int32_t n, int32_t k,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
-        int32_t validM = -1, int32_t validN = -1, int32_t validK = -1) const;
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches,
+        int32_t maxNumCtasInBatchDim) const;

 private:
     void selectGemmConfig(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, int32_t numTokens,
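
For context on the signature change: the reverted commit had threaded optional valid extents (validM/validN/validK) through these overloads as trailing parameters defaulted to -1, which keeps existing call sites source-compatible; the revert drops them again. Note the generic run() overload instead placed them right after m/n/k, so its callers did need updating. Below is a minimal, hypothetical sketch of the trailing-default pattern; ToyRunner and runWithValid are stand-ins, not the real TrtllmGenBatchedGemmRunner API.

#include <cstdint>
#include <cstdio>
#include <vector>

struct ToyRunner // hypothetical stand-in
{
    // Shape restored by this revert: no valid extents.
    void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, int32_t configIndex)
    {
        std::printf("run: m=%d n=%d k=%d batches=%zu config=%d\n", m, n, k, batchedTokens.size(), configIndex);
    }

    // Shape the revert removes (sketch): trailing defaults, assumed to mean
    // "use the full extent" when left at -1, keep old callers compiling.
    void runWithValid(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens,
        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1)
    {
        (void) batchedTokens;
        std::printf("runWithValid: m=%d n=%d k=%d config=%d valid=(%d,%d,%d)\n", m, n, k, configIndex, validM,
            validN, validK);
    }
};

int main()
{
    ToyRunner runner;
    std::vector<int32_t> batchedTokens{128, 64};
    runner.run(256, 512, 1024, batchedTokens, /*configIndex=*/0);
    runner.runWithValid(256, 512, 1024, batchedTokens, 0);                 // defaults apply: (-1, -1, -1)
    runner.runWithValid(256, 512, 1024, batchedTokens, 0, 250, 512, 1000); // explicit valid extents
    return 0;
}

Defaulted trailing parameters are a common way to extend an interface without touching existing callers; the limitation is that the new arguments must sit at the end of the parameter list.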

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/.clang-format

Lines changed: 0 additions & 78 deletions
This file was deleted.

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h

Lines changed: 2 additions & 11 deletions
@@ -16,8 +16,8 @@
 */
 #pragma once

-#include <string>
 #include <cassert>
+#include <string>

 namespace batchedGemm
 {
@@ -34,9 +34,7 @@ enum class RouteImpl
     // Use LDGSTS to do the routing
     Ldgsts = 1,
     // Use UTMALDG.GATHER4 to do the routing
-    Tma = 2,
-    // Use LDG+STS to do the routing
-    LdgPlusSts = 3
+    Tma = 2
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -62,13 +60,6 @@ inline bool doesRouteImplUseTma(RouteImpl mode)

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-inline bool doesRouteImplUseLdgPlusSts(RouteImpl mode)
-{
-    return (mode == RouteImpl::LdgPlusSts);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
 } // namespace batchedGemm

 ////////////////////////////////////////////////////////////////////////////////////////////////////
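
After the revert, RouteImpl loses the LdgPlusSts enumerator and its doesRouteImplUseLdgPlusSts() predicate, leaving LDGSTS- and TMA-based routing. Below is a minimal, self-contained sketch of the resulting enum-plus-predicate pattern; the NoRoute enumerator at value 0 and the doesRouteImplUseLdgsts() helper are assumptions (their definitions sit above the excerpted hunks), not verbatim source.

#include <cassert>

namespace sketch
{

enum class RouteImpl
{
    NoRoute = 0, // assumed: not visible in the excerpt above
    // Use LDGSTS to do the routing
    Ldgsts = 1,
    // Use UTMALDG.GATHER4 to do the routing
    Tma = 2
};

// Assumed by analogy with the deleted doesRouteImplUseLdgPlusSts().
inline bool doesRouteImplUseLdgsts(RouteImpl mode)
{
    return (mode == RouteImpl::Ldgsts);
}

inline bool doesRouteImplUseTma(RouteImpl mode)
{
    return (mode == RouteImpl::Tma);
}

} // namespace sketch

int main()
{
    using sketch::RouteImpl;
    assert(sketch::doesRouteImplUseTma(RouteImpl::Tma));
    assert(!sketch::doesRouteImplUseLdgsts(RouteImpl::Tma));
    return 0;
}

Each routing mode gets a doesRouteImplUse*() predicate, so call sites test the routing semantics they need rather than comparing raw enumerator values inline; removing a mode then means deleting one enumerator and one predicate, exactly as in this diff.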
