Skip to content
Merged
Show file tree
Hide file tree
Changes from 83 commits
Commits
Show all changes
108 commits
Select commit Hold shift + click to select a range
303604f
upgrade to base image and new TRT, fix many dependency issues
VALLIS-NERIA Jun 17, 2025
5c09dc8
CUDA13 breaking changes: c++ compile successful
VALLIS-NERIA Jun 17, 2025
1b84604
fix kernel select code to recognize sm103/sm100f
VALLIS-NERIA Jul 2, 2025
3a94d80
Update SM100f cubins
Tom-Zheng Jul 2, 2025
469a38d
feat: Add support for SM103 3xFP4 tile shapes
djns99 Jul 8, 2025
52ad443
disable 3xfp4
VALLIS-NERIA Jul 21, 2025
345c2bc
update trtllm-gen sm100f cubins of gemm kernels
VALLIS-NERIA Aug 4, 2025
e27cbb5
Ampere moe kernel should build to all arch
VALLIS-NERIA Aug 4, 2025
78a55b8
fix vicuna dependency
VALLIS-NERIA Aug 4, 2025
271916d
fix deep_gemm & CUDA13
VALLIS-NERIA Aug 5, 2025
886437d
merge existing env fix
VALLIS-NERIA Aug 6, 2025
b782b6e
fix sm check of kv reuse and chunked context
VALLIS-NERIA Aug 6, 2025
84f96b4
update triton and fix deepgemm pip
VALLIS-NERIA Aug 6, 2025
759e7a0
Merge remote-tracking branch 'gitlab/main' into feat/gb110_bringup
VALLIS-NERIA Aug 6, 2025
bee1df9
remove deepgemm war
VALLIS-NERIA Aug 6, 2025
97a3788
update triton image
VALLIS-NERIA Aug 6, 2025
ebec4ea
infra: upgrade to DLFW 25.08-pre and TRT 10.13.2.4
ZhanruiSunCh Aug 12, 2025
36f2e88
Merge branch 'user/zhanruis/update_dlfw_and_cu13' into 'feat/b300_cu13'
ZhanruiSunCh Aug 12, 2025
0bf6a18
Fix and waive to clean L0
VALLIS-NERIA Aug 15, 2025
f12a90b
Merge branch 'feat/gb110_bringup' into 'feat/b300_cu13'
VALLIS-NERIA Aug 15, 2025
8c99853
infra: Support build for both CU12 and CU13
ZhanruiSunCh Aug 18, 2025
c1014e8
Merge branch 'user/zhanruis/update_dlfw_and_cu13_2' into 'feat/b300_c…
ZhanruiSunCh Aug 18, 2025
4a95d88
revert tlg kernels for ease of merge
VALLIS-NERIA Aug 19, 2025
8b53236
Merge remote-tracking branch 'gitlab/main' into user/xiweny/merge_mai…
VALLIS-NERIA Aug 19, 2025
5391191
update tg cubins (temp ver)
VALLIS-NERIA Aug 21, 2025
f4de884
Merge remote-tracking branch 'gitlab/main' into user/xiweny/merge_mai…
VALLIS-NERIA Aug 21, 2025
b7cc06c
disable merge waive list stage
VALLIS-NERIA Aug 21, 2025
fa8b52e
fix more sm version check
VALLIS-NERIA Aug 22, 2025
808059d
Merge remote-tracking branch 'gitlab/main' into user/xiweny/merge_mai…
VALLIS-NERIA Aug 23, 2025
90a9bc4
fix build error
VALLIS-NERIA Aug 23, 2025
80ea062
fix cubins
VALLIS-NERIA Aug 24, 2025
66b1d8d
Update flashinfer
VALLIS-NERIA Aug 25, 2025
ab7febd
Merge commit '31979aefacbf80d2742c98ef30385db162788c84' into feat/b30…
VALLIS-NERIA Aug 26, 2025
b1c6f6a
update cutlass and DeepGEMM
VALLIS-NERIA Aug 27, 2025
9ad68de
Merge branch 'user/xiweny/update_cutlass_4.2' into 'feat/b300_cu13'
VALLIS-NERIA Aug 27, 2025
ee37589
infra: update DLFW 25.08 GA, triton 25.08 GA
ZhanruiSunCh Aug 28, 2025
c2e1ad9
Merge branch 'user/zhanruis/update_dlfw_and_cu13_3' into 'feat/b300_c…
ZhanruiSunCh Aug 28, 2025
6fd765f
[None][fix] fix trtllm moe backend error when running gptoss on b300
jiaganc Aug 28, 2025
f14c740
Merge branch 'dev-jiaganc-fix-b300-gptoss-trtllm' into 'feat/b300_cu13'
VALLIS-NERIA Aug 28, 2025
3c06303
[TRTLLM-7755][infra] Add DGX_B300 and GB300 tests in CI
yiqingy0 Aug 29, 2025
c425c12
Merge branch 'user/yiqingy/add_b300_tests' into 'feat/b300_cu13'
yiqingy0 Aug 29, 2025
0fb835d
fix cutlass moe not falling back
VALLIS-NERIA Aug 30, 2025
8d5a7ea
[https://nvbugs/5443053][fix] Disable finalize fusion when Lora is used
jiaganc Sep 1, 2025
3cc2591
Merge branch 'dev-jiaganc-fix-b300-moe-lora' into 'feat/b300_cu13'
VALLIS-NERIA Sep 1, 2025
3805f61
[https://nvbugs/5453949][infra] unwaive test_llama_eagle3
bo-nv Aug 27, 2025
a765ee4
Merge branch 'feat/b300_cu13-latest' into 'feat/b300_cu13'
VALLIS-NERIA Sep 1, 2025
14154ec
disable sm103 moe kernel
VALLIS-NERIA Sep 1, 2025
38ef850
Merge remote-tracking branch 'gitlab/main' into user/xiweny/merge_0901
VALLIS-NERIA Sep 1, 2025
62a7897
Merge remote-tracking branch 'origin/main' into user/xiweny/merge_0901
VALLIS-NERIA Sep 2, 2025
90ce786
Fix arg name in _test_trtllm_serve_multimodal_benchmark.py
VALLIS-NERIA Sep 2, 2025
5bd50d4
update mha cubins and support 103a
VALLIS-NERIA Sep 3, 2025
1978227
Merge branch 'user/xiweny/mha_103' into 'feat/b300_cu13'
VALLIS-NERIA Sep 3, 2025
5ca3376
Support DLFW sanity check use CU13 image
ZhanruiSunCh Sep 5, 2025
9ae01a8
Merge branch 'user/zhanruis/0828_support_cuda_13_for_sanity_check' in…
ZhanruiSunCh Sep 5, 2025
973fd37
add 3xfp4 cutlass gemm
VALLIS-NERIA Sep 5, 2025
fcf413e
Merge branch 'user/xiweny/3xfp4_gemm' into 'feat/b300_cu13'
VALLIS-NERIA Sep 5, 2025
5d4f7f4
update flashinfer and waive bug
VALLIS-NERIA Sep 5, 2025
22219bc
Add B300 & GB300 CI
VALLIS-NERIA Sep 5, 2025
2c3f4cb
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 5, 2025
f8864b9
update trtllm gemm
VALLIS-NERIA Sep 5, 2025
cca347e
[TRTLLM-4629] [feat] Step1: trtllm-gen kernels support sm103
VALLIS-NERIA Sep 5, 2025
5e7aa76
Merge branch 'user/sm103_trtllmgen' into feat/b300_cu13
VALLIS-NERIA Sep 5, 2025
10af4f4
[TRTLLM-4629] [feat] Step1: trtllm-gen kernels support sm103
VALLIS-NERIA Sep 5, 2025
1d7979a
fix
VALLIS-NERIA Sep 5, 2025
3e71ec7
Merge branch 'user/sm103_trtllmgen' into feat/b300_cu13
VALLIS-NERIA Sep 5, 2025
65f8478
fix trtllm-gen interface change
VALLIS-NERIA Sep 5, 2025
bec1e71
fix
VALLIS-NERIA Sep 5, 2025
0b0781f
fix
VALLIS-NERIA Sep 6, 2025
3d4f49e
fix missing gemm kernels
VALLIS-NERIA Sep 6, 2025
1150def
Merge branch 'user/sm103_trtllmgen' into feat/b300_cu13
VALLIS-NERIA Sep 6, 2025
d12eb4b
fix CI build archs
VALLIS-NERIA Sep 6, 2025
322db71
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 6, 2025
8f8766a
waive
VALLIS-NERIA Sep 7, 2025
2912908
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 7, 2025
e6bb1fe
remove non-exist cases
VALLIS-NERIA Sep 7, 2025
77657de
fix build args
VALLIS-NERIA Sep 8, 2025
d42201e
remove waivers and cleanup
VALLIS-NERIA Sep 8, 2025
caea58a
increase build memory
VALLIS-NERIA Sep 8, 2025
d4d9e77
reset build memory
VALLIS-NERIA Sep 8, 2025
019b1db
fix 5505835
VALLIS-NERIA Sep 8, 2025
fdaf4e2
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 8, 2025
e30e0c8
waive
VALLIS-NERIA Sep 8, 2025
4cf9fed
Merge commit 'ed27a72bcf71f7ab0e7137f7999988c9de82386f' into feat/b30…
VALLIS-NERIA Sep 8, 2025
b573e07
[None][infra] Disable CU12 build to save build time (cost > 5 hours o…
ZhanruiSunCh Sep 9, 2025
82833fa
address comments
VALLIS-NERIA Sep 9, 2025
8cc5ea3
add comment
VALLIS-NERIA Sep 9, 2025
a8b630f
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 9, 2025
2c287d5
don't throw in ctor
VALLIS-NERIA Sep 9, 2025
11d603b
fix
VALLIS-NERIA Sep 9, 2025
d16d98c
fix missing change
VALLIS-NERIA Sep 9, 2025
5f508b7
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 9, 2025
2e61526
fix
VALLIS-NERIA Sep 10, 2025
0b73a57
refine sm version check
VALLIS-NERIA Sep 10, 2025
27c73de
add a line of comment
VALLIS-NERIA Sep 10, 2025
b8d1ee6
exclude sm70
VALLIS-NERIA Sep 10, 2025
6133354
fix sm check
VALLIS-NERIA Sep 11, 2025
41d3cf6
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 11, 2025
ced6e74
[None][infra] Remove WAR on feat branch (#7642)
ZhanruiSunCh Sep 11, 2025
98cbab0
[None][infra] Update images (#7690)
ZhanruiSunCh Sep 11, 2025
514ebc2
remove sm70 from fmha_v2 completely
VALLIS-NERIA Sep 12, 2025
9bd8df7
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 12, 2025
ad20048
remove sm72 & 75
VALLIS-NERIA Sep 14, 2025
93195ec
waive
VALLIS-NERIA Sep 15, 2025
98d42f9
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 15, 2025
cf74f40
fix testdb
VALLIS-NERIA Sep 15, 2025
d48e82a
fix testdb
VALLIS-NERIA Sep 15, 2025
7657d83
fix
VALLIS-NERIA Sep 15, 2025
0192299
Merge remote-tracking branch 'origin/main' into feat/b300_cu13
VALLIS-NERIA Sep 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/DeepGEMM
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Barry-Delaney could you help check whether this DeepGEMM version is the one we want? Thanks!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DeepGEMM version seems fine.
I tried compiling locally for verification, but the build failed here.
Once it is fixed, I can double-check this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just fixed it in latest commit.

Submodule DeepGEMM updated 36 files
+4 −2 README.md
+471 −0 csrc/apis/gemm.hpp
+85 −0 csrc/apis/layout.hpp
+28 −0 csrc/apis/runtime.hpp
+6 −4 csrc/jit/compiler.hpp
+4 −2 csrc/jit/device_runtime.hpp
+1 −1 csrc/jit/handle.hpp
+2 −2 csrc/jit/kernel_runtime.hpp
+6 −3 csrc/jit_kernels/heuristics/common.hpp
+2 −2 csrc/jit_kernels/heuristics/sm100.hpp
+7 −3 csrc/jit_kernels/heuristics/sm90.hpp
+143 −0 csrc/jit_kernels/impls/sm100_bf16_gemm.hpp
+3 −2 csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp
+3 −2 csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp
+229 −0 csrc/jit_kernels/impls/sm90_bf16_gemm.hpp
+3 −2 csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp
+55 −8 csrc/jit_kernels/impls/smxx_layout.hpp
+6 −399 csrc/python_api.cpp
+10 −3 csrc/utils/exception.hpp
+38 −10 deep_gemm/__init__.py
+6 −5 deep_gemm/include/deep_gemm/common/scheduler.cuh
+76 −0 deep_gemm/include/deep_gemm/common/sm90_utils.cuh
+18 −0 deep_gemm/include/deep_gemm/common/utils.cuh
+495 −1 deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh
+3 −4 deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh
+8 −5 deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh
+341 −1 deep_gemm/include/deep_gemm/impls/sm90_bf16_gemm.cuh
+1 −1 deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh
+39 −0 deep_gemm/include/deep_gemm/impls/smxx_layout.cuh
+0 −3 pyproject.toml
+4 −0 setup.py
+34 −22 tests/generators.py
+125 −0 tests/test_bf16.py
+3 −3 tests/test_fp8.py
+29 −17 tests/test_layout.py
+15 −0 tests/test_lazy_init.py
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 606 files
2 changes: 1 addition & 1 deletion 3rdparty/json
Submodule json updated 856 files
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ endif()
include_directories(
SYSTEM
${CUDAToolkit_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}/cccl
${CUDNN_ROOT_DIR}/include
$<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
${3RDPARTY_DIR}/cutlass/include
Expand Down Expand Up @@ -477,7 +478,6 @@ print(os.path.dirname(torch.__file__),end='');"
endif()
endif()
endif()

else()
if(NOT WIN32)
if(NOT USE_CXX11_ABI)
Expand Down
15 changes: 14 additions & 1 deletion cpp/cmake/modules/cuda_configuration.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ function(setup_cuda_architectures)
message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
endif()
endforeach()
if("103" IN_LIST CMAKE_CUDA_ARCHITECTURES_CLEAN)
list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN "100")
endif()
list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_CLEAN)
set(CMAKE_CUDA_ARCHITECTURES_RAW ${CMAKE_CUDA_ARCHITECTURES_CLEAN})
endif()
Expand All @@ -150,6 +153,9 @@ function(setup_cuda_architectures)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.7")
list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 100 120)
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 103)
endif()
endif()

# CMAKE_CUDA_ARCHITECTURES_ORIG contains all architectures enabled, without
Expand All @@ -160,7 +166,14 @@ function(setup_cuda_architectures)
${CMAKE_CUDA_ARCHITECTURES_ORIG}
PARENT_SCOPE)

set(ARCHITECTURES_WITH_KERNELS 80 86 89 90 100 120)
set(ARCHITECTURES_WITH_KERNELS
80
86
89
90
100
103
120)
foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
if(NOT ${CUDA_ARCH} IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
Expand Down
10 changes: 10 additions & 0 deletions cpp/include/tensorrt_llm/common/cudaUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,16 @@ inline int getSMVersion()
return sm;
}

// Collapse an exact SM version into its architecture family.
// SM 100 and SM 103 share the same kernel family (Blackwell SM100f),
// so both report as 100; every other version is returned unchanged.
inline int getSMFamily()
{
    int const version = getSMVersion();
    return (version == 100 || version == 103) ? 100 : version;
}

inline int getDevice()
{
int deviceID{0};
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()
}
}

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
{
// Get pointer to `cuTensorMapEncodeTiled`
cudaDriverEntryPointQueryResult driver_status;
Expand All @@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()

if (driver_status != cudaDriverEntryPointSuccess)
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess");
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);
}

template <typename T>
CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes,
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr)
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
{
CUtensorMap tensor_map{};
constexpr uint32_t rank = 2;
Expand Down
8 changes: 4 additions & 4 deletions cpp/tensorrt_llm/common/attentionOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2531,22 +2531,22 @@ int AttentionOp::initialize() noexcept
if (mFP8ContextFMHA)
{
TLLM_CHECK_WITH_INFO(mEnableContextFMHA, "FP8 FMHA cannot be enabled because Context FMHA is not supported.");
TLLM_CHECK_WITH_INFO(mSM == 89 || mSM == 90 || mSM == 100 || mSM == 120 || mSM == 121,
TLLM_CHECK_WITH_INFO(mSM == 89 || mSM == 90 || mSM == 100 || mSM == 103 || mSM == 120 || mSM == 121,
"FP8 FMHA can only be enabled on sm_89, sm_90, sm_100, sm_120 or sm_121.");
}

// Pre-Check of FP8 Generation MLA.
if (mFP8GenerationMLA)
{
TLLM_CHECK_WITH_INFO(mIsMLAEnabled, "FP8 Generation MLA cannot be enabled because MLA is not supported.");
TLLM_CHECK_WITH_INFO(mSM == 89 || mSM == 90 || mSM == 100 || mSM == 120 || mSM == 121,
TLLM_CHECK_WITH_INFO(mSM == 89 || mSM == 90 || mSM == 100 || mSM == 103 || mSM == 120 || mSM == 121,
"FP8 Generation MLA is supported on Ada, Hopper or Blackwell architecture.");
}

// Check requirements for FP4 output.
TLLM_CHECK_WITH_INFO(!mFuseFp4Quant || mEnableContextFMHA, "Context FMHA must enable if fuse_fp4_quant is enabled");
TLLM_CHECK_WITH_INFO(!mFuseFp4Quant || mSM == 100 || mSM == 120 || mSM == 121,
"fuse_fp4_quant only supports SM100 or SM120 or SM121 devices.");
TLLM_CHECK_WITH_INFO(!mFuseFp4Quant || (mSM == 100 || mSM == 103) || mSM == 120 || mSM == 121,
"fuse_fp4_quant only supports SM100f or SM120 or SM121 devices.");

// Check requirements for FP4 KV cache.
TLLM_CHECK_WITH_INFO(!mKVCacheQuantMode.hasFp4KvCache() || mFP8ContextFMHA,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ enum class CutlassTileConfigSM100 : int
CtaShape128x256x256B = shape_tuple_to_enum(128, 256, 256),
};

using CutlassTileConfigSM103 = CutlassTileConfigSM100;

enum class CutlassTileConfigSM120 : int
{
// Signals that we should run heuristics do choose a config
Expand Down Expand Up @@ -411,16 +413,17 @@ struct CutlassGemmConfig
CutlassGemmConfig(CutlassTileConfigSM100 tile_config_sm100, MainloopScheduleType mainloop_schedule,
EpilogueScheduleType epilogue_schedule, ClusterShape cluster_shape,
ClusterShape dynamic_cluster_shape = ClusterShape::Undefined,
ClusterShape fallback_cluster_shape = ClusterShape::Undefined)
ClusterShape fallback_cluster_shape = ClusterShape::Undefined, int sm_version = 100)
: tile_config_sm100(tile_config_sm100)
, mainloop_schedule(mainloop_schedule)
, epilogue_schedule(epilogue_schedule)
, cluster_shape(cluster_shape)
, dynamic_cluster_shape(dynamic_cluster_shape)
, fallback_cluster_shape(fallback_cluster_shape)
, sm_version(100)
, sm_version(sm_version)
, is_tma_warp_specialized(true)
{
assert(sm_version >= 100 && sm_version < 120 && "Expected SM 10x version");
}

CutlassGemmConfig(CutlassTileConfigSM120 tile_config_sm120, MainloopScheduleType mainloop_schedule,
Expand Down
11 changes: 10 additions & 1 deletion cpp/tensorrt_llm/deep_ep/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9)
# The FP4-related conversion instructions in DeepEP require SM100a, SM110a,
# or SM120a.
if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
if(${CUDA_ARCH_MAJOR} EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.31)
list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100f${CUDA_ARCH_POSTFIX}")
else()
list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100a${CUDA_ARCH_POSTFIX}"
"103a${CUDA_ARCH_POSTFIX}")
endif()
elseif(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
list(APPEND DEEP_EP_CUDA_ARCHITECTURES
"${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}")
else()
Expand Down Expand Up @@ -134,6 +141,8 @@ ExternalProject_Add(
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i
src/CMakeLists.txt
COMMAND sed "s/_STANDARD 11/_STANDARD 17/" -i src/device/CMakeLists.txt
COMMAND sed "s/_STANDARD 11/_STANDARD 17/" -i src/CMakeLists.txt
COMMAND patch -p1 --forward --batch -i
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
CMAKE_CACHE_ARGS
Expand Down
4 changes: 2 additions & 2 deletions cpp/tensorrt_llm/executor/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ DataType Tensor::getDataType() const
case nvinfer1::DataType::kBF16: return DataType::kBF16;
case nvinfer1::DataType::kINT64: return DataType::kINT64;
case nvinfer1::DataType::kINT4: [[fallthrough]] /* do nothing */;
case nvinfer1::DataType::kFP4: /* do nothing */;
case nvinfer1::DataType::kFP4: [[fallthrough]] /* do nothing */;
default: TLLM_THROW("Unsupported data type");
}
TLLM_THROW("Unsupported data type");
}

MemoryType Tensor::getMemoryType() const
Expand Down
32 changes: 24 additions & 8 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,14 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
sync_check_cuda_error(stream);
}

template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
int const bid = blockIdx.x; // Index of request in batch
runtime::SizeType32 const slot = batchSlots[bid];
float const diversityRate{diversityRates[slot]};
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;
float* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
Expand All @@ -160,13 +159,30 @@ __global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __res
return;
}

template __global__ void addCumLogProbs<float>(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
int const bid = blockIdx.x; // Index of request in batch
runtime::SizeType32 const slot = batchSlots[bid];
float const diversityRate{diversityRates[slot]};
half* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2;

template __global__ void addCumLogProbs<half>(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x)
{
int const iBMIn = i / (nBMOut * 2);
if (finished[slot * nBMIn + iBMIn].isFinished())
{
pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f;
}
else
{
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer
pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn;
}
}
return;
}

__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
size_t const nBMIn, size_t const nBMOut, size_t const nV)
Expand Down
7 changes: 5 additions & 2 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,11 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea
void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh,
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);

template <typename T>
__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs,
__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);

__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ FusedMHARunnerV2::FusedMHARunnerV2(MHARunnerFixedParams fixedParams)
: mFixedParams(fixedParams)
{
TLLM_CHECK_WITH_INFO((mSM == kSM_80 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_90 || mSM == kSM_100
|| mSM == kSM_120 || mSM == kSM_121),
|| mSM == kSM_103 || mSM == kSM_120 || mSM == kSM_121),
"Unsupported architecture");
TLLM_CHECK_WITH_INFO((mFixedParams.dataType == DATA_TYPE_FP16 || mFixedParams.dataType == DATA_TYPE_BF16
|| mFixedParams.dataType == DATA_TYPE_E4M3),
Expand Down Expand Up @@ -347,7 +347,7 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams)
bool const isSm8x = (mSM == kSM_86 || mSM == kSM_89);
bool const isSm80 = (mSM == kSM_80);
bool const isSm89 = (mSM == kSM_89);
bool const isSm100 = (mSM == kSM_100);
bool const isSm100f = (mSM == kSM_100 || mSM == kSM_103);
bool const isSm120f = (mSM == kSM_120 || mSM == kSM_121);

// Sliding_or_chunked_causal mask.
Expand Down Expand Up @@ -416,7 +416,7 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams)
// flash attention tiled kernel is faster on Ada and Ampere derivatives when head_size>=256
mLaunchParams.granular_tiling = false;
}
else if (isSm80 || isSm8x || isSm100 || isSm120f)
else if (isSm80 || isSm8x || isSm100f || isSm120f)
{
// otherwise, choose tiled kernel for Ampere/Ada/Gb20x
mLaunchParams.granular_tiling = true;
Expand Down
22 changes: 18 additions & 4 deletions cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,17 +72,28 @@ function(process_target target_name enable_hopper enable_blackwell)

if(${enable_blackwell}
AND ("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
OR "103" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
OR "120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
OR "121" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG))
OR "121" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
))

if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
# Both 100 and 103 support these kernels
if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
OR "103" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
# No kernels should be parsed, unless blackwell is specified. This is a
# build time improvement
target_compile_definitions(${target_name}
PUBLIC COMPILE_BLACKWELL_TMA_GEMMS)
target_compile_definitions(${target_name}
PUBLIC COMPILE_BLACKWELL_TMA_GROUPED_GEMMS)
endif()
# SM103 only kernels
if("103" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
target_compile_definitions(${target_name}
PUBLIC COMPILE_BLACKWELL_SM103_TMA_GEMMS)
target_compile_definitions(
${target_name} PUBLIC COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS)
endif()
if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
OR "121" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
target_compile_definitions(${target_name}
Expand Down Expand Up @@ -113,6 +124,8 @@ function(add_instantiations library base_dir)
list(LENGTH INSTANTIATIONS_GENERATED_${ARCH} n)
if(${n} GREATER 0)
set(TARGET_NAME "_${library}_instantiations_${ARCH}")
message(
STATUS "Adding target ${TARGET_NAME} with instantiations for ${ARCH}")
add_library(${TARGET_NAME} OBJECT ${INSTANTIATIONS_GENERATED_${ARCH}})
target_link_libraries(${library} PRIVATE ${TARGET_NAME})
set_cuda_architectures(${TARGET_NAME} ${BUILD_ARCHS})
Expand All @@ -125,9 +138,10 @@ function(add_instantiations library base_dir)
endif()
endmacro()

glob_src_create_target(80 "80;86")
glob_src_create_target(80 "80;86;90;100f;120f")
glob_src_create_target(90 90)
glob_src_create_target(100 100f)
glob_src_create_target(103 103)
glob_src_create_target(120 120f)
endfunction()

Expand Down Expand Up @@ -240,7 +254,7 @@ if(USING_OSS_CUTLASS_MOE_GEMM)
process_target(_moe_gemm_hopper_fp4 true false)

add_library(_moe_gemm_fp4 OBJECT ${MOE_GEMM_SRC_CU_FP4})
set_cuda_architectures(_moe_gemm_fp4 100f 120f)
set_cuda_architectures(_moe_gemm_fp4 100f 103 120f)
process_target(_moe_gemm_fp4 false true)

add_library(_moe_gemm_fp8 OBJECT ${MOE_GEMM_SRC_CU_FP8})
Expand Down
Loading
Loading