Commits (70)
030c1ec
feat: support deepseek v4
zyongye Apr 23, 2026
30faa6a
chore: pass mypy
ivanium Apr 25, 2026
1df2a80
fix: update cuda requirements
ivanium Apr 25, 2026
8779f9d
fix: config
ivanium Apr 25, 2026
0ff736a
Integrate MegaMoE kernel (#232)
WoosukKwon Apr 25, 2026
8188e4a
Prototype SM120 DeepSeek V4 reference attention
jasl Apr 25, 2026
3856a3f
Allow DeepGEMM to build for SM120 with CUDA 13
jasl Apr 25, 2026
5241faf
Split SM120 sparse attention reference into LSE merge stages
jasl Apr 25, 2026
4034d5d
Add SM120 FP8 indexer logits fallback
jasl Apr 25, 2026
359e334
Register SM120 reference attention env vars
jasl Apr 25, 2026
34b6717
Pin DeepGEMM SM120 prototype dependency
jasl Apr 25, 2026
dd18dc2
Prototype DeepSeek V4 pipeline parallelism
jasl Apr 25, 2026
ca911d7
Generalize sparse MLA reference fallback controls
jasl Apr 25, 2026
0d45c12
Let sparse MLA dump control override legacy alias
jasl Apr 25, 2026
b15fe88
Avoid pinning DeepGEMM SM120 fork
jasl Apr 25, 2026
88ace9e
Keep DeepGEMM SM120 prototype pin
jasl Apr 25, 2026
bf42dc8
Add DeepSeek V4 sparse MLA reference tests
jasl Apr 25, 2026
916b19f
Extract sparse MLA reference helpers
jasl Apr 25, 2026
7674619
Move sparse MLA prefill reference into helper
jasl Apr 25, 2026
ba0771a
Share sparse MLA fallback env handling
jasl Apr 25, 2026
63ac6e7
Use workspace for DeepSeek V4 einsum output
jasl Apr 25, 2026
f2bde65
Add sparse MLA env helper tests
jasl Apr 25, 2026
7b38dd3
Use Triton for sparse MLA sink merge
jasl Apr 25, 2026
d78fc65
Use Triton for sparse MLA subset accumulation
jasl Apr 25, 2026
f21b837
Fuse fp8_ds_mla sparse MLA decode accumulation
jasl Apr 25, 2026
1ac48e4
Fuse fp8_ds_mla paged SWA decode accumulation
jasl Apr 25, 2026
f28fe76
Fuse fp8_ds_mla SWA-only decode fallback
jasl Apr 25, 2026
c918470
Use Triton indexed accumulation for sparse MLA prefill
jasl Apr 25, 2026
9afedca
Fix sparse MLA ruff import ordering
jasl Apr 25, 2026
68f3236
Fuse sparse MLA finish with sink merge
jasl Apr 25, 2026
59172ac
Use multi-head fp8 sparse MLA accumulation
jasl Apr 25, 2026
eeeeee8
Optimize SM12x sparse MLA decode kernels
jasl Apr 25, 2026
1894bda
Fix sparse MLA padded-head state launches
jasl Apr 25, 2026
4ddd9e8
Handle padded sparse MLA output heads
jasl Apr 25, 2026
58f4ee5
Accept padded sparse MLA attention sinks
jasl Apr 25, 2026
a75f327
Drop stale sparse MLA dummy workspace reservation
jasl Apr 25, 2026
6249f9e
Stabilize SM12x DeepSeek V4 sparse MLA fallback
jasl Apr 25, 2026
fad559c
Allow opt-in cudagraphs for SM12x sparse MLA
jasl Apr 25, 2026
617788a
Fuse SM12x sparse MLA decode fallback
jasl Apr 25, 2026
f6302ab
Fix sparse MLA env test import order after refresh
jasl Apr 25, 2026
026e6cb
Update sparse MLA env default test after refresh
jasl Apr 25, 2026
f681984
Update DeepGEMM SM120 pin for HC kernel
jasl Apr 25, 2026
3cca90b
Skip unsupported FlashInfer sparse MLA tests on SM12x
jasl Apr 25, 2026
4c0983e
Tune SM12x sparse MLA decode head grouping
jasl Apr 25, 2026
3fe9199
Fix DeepSeek V4 FP8 einsum config on SM12x
jasl Apr 25, 2026
227b15e
Default SM12x sparse MLA runtime knobs
jasl Apr 25, 2026
53be759
Reject CUTLASS block FP8 scaled MM on SM12x
jasl Apr 25, 2026
6ea0d61
Add SM12x Triton FP8 einsum for DeepSeek V4
jasl Apr 25, 2026
7b158e1
Bump FlashInfer CUDA packages to 0.6.9
jasl Apr 25, 2026
4a08ccb
Add sparse MLA head block tuning
jasl Apr 25, 2026
eab717d
Bump DeepGEMM SM120 reference
jasl Apr 25, 2026
b037165
Add DeepGEMM SM120 paged MQA toggle
jasl Apr 25, 2026
6d6d6f7
Update DeepGEMM SM120 pin
jasl Apr 26, 2026
cc06fe8
temporary disable persistent topk for 1024
zyongye Apr 24, 2026
0ad4dea
Support dummy loading
WoosukKwon Apr 25, 2026
0e8b532
free up unused weights
WoosukKwon Apr 25, 2026
310ac26
Fix DeepSeek V4 MegaMoE test fixture
jasl Apr 26, 2026
6130a6d
Address DeepSeek V4 review nits
jasl Apr 26, 2026
b56e53c
Tune SM12x sparse MLA decode head grouping
jasl Apr 26, 2026
5958f37
Use short-row topk on SM120 indexer
jasl Apr 26, 2026
43ce641
[Kernel] Marlin MoE: include SM 12.x in default arch list
Apr 26, 2026
3e3a30a
[Kernel] Tune default fp8 block-scaled Triton config for M<=8 decode
Apr 26, 2026
8573555
Guard low-M FP8 Triton stages on ROCm
jasl Apr 26, 2026
4d5335d
Speed up SM12x sparse MLA decode with matmul path
jasl Apr 26, 2026
681a817
Reduce SM12x sparse MLA decode KV staging
jasl Apr 26, 2026
3a2dd99
Fuse SM12x sparse MLA decode mask build
jasl Apr 26, 2026
9ffda0c
Extend SM12x low-M FP8 block config
jasl Apr 26, 2026
b7a70b9
Reduce SM12x long-prefill sparse MLA memory
jasl Apr 26, 2026
6652949
Clean up SM12x sparse MLA review issues
jasl Apr 26, 2026
8d0ebb7
Restore SM12x sparse MLA MTP decode fallback
jasl Apr 26, 2026
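Many of these commits follow one pattern: detect SM 12.x at runtime and route the DeepSeek V4 sparse MLA path through a reference or Triton fallback, controlled by env vars (e.g. "Register SM120 reference attention env vars", "Generalize sparse MLA reference fallback controls"). A minimal sketch of that gating pattern follows; the helper and env-var names are illustrative, not vLLM's actual API:

    import os
    import torch

    def use_sparse_mla_reference_fallback() -> bool:
        """Sketch: decide whether to take the SM12x sparse-MLA fallback."""
        major, _minor = torch.cuda.get_device_capability()
        # Hypothetical env var; the PR registers similarly scoped knobs.
        forced = os.environ.get("VLLM_SPARSE_MLA_REFERENCE")
        if forced is not None:
            return forced == "1"
        # Consumer Blackwell (SM 12.x) lacks the kernels the fast path
        # needs, per the commit titles above.
        return major == 12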
CMakeLists.txt (10 changes: 6 additions & 4 deletions)

@@ -294,6 +294,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/fused_qknorm_rope_kernel.cu"
+  "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu"
   "csrc/layernorm_quant_kernels.cu"
   "csrc/sampler.cu"
   "csrc/topk.cu"

@@ -357,11 +358,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.

   # marlin arches for fp16 output
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
-  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction

@@ -1045,7 +1046,8 @@ endif()
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/topk_softmax_kernels.cu")
+  "csrc/moe/topk_softmax_kernels.cu"
+  "csrc/moe/topk_softplus_sqrt_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC

@@ -1078,7 +1080,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # moe marlin arches
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # moe marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input
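To sanity-check that a build with these arch lists actually targets a consumer Blackwell card, one can compare the device's compute capability against the arch list a wheel was built for. A small sketch: note that torch.cuda.get_arch_list() reports PyTorch's own compiled arches and is used here only as the analogous check, since vLLM's extension arches come from CUDA_ARCHS above:

    import torch

    # e.g. (12, 0) on an RTX 50-series card, matching the new "12.0" entries
    print(torch.cuda.get_device_capability())
    # Arches this PyTorch build ships kernels for, e.g. ['sm_80', ..., 'sm_120']
    print(torch.cuda.get_arch_list())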
cmake/external_projects/deepgemm.cmake (12 changes: 10 additions & 2 deletions)

@@ -19,8 +19,8 @@ else()
   # This ref should be kept in sync with tools/install_deepgemm.sh
   FetchContent_Declare(
     deepgemm
-    GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
-    GIT_TAG 477618cd51baffca09c4b0b87e97c03fe827ef03
+    GIT_REPOSITORY https://github.com/jasl/DeepGEMM.git
+    GIT_TAG 7a7a41a1bac7dacabe74057e7600e59f98f85bce
     GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""

@@ -46,6 +46,9 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
 elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
   list(APPEND DEEPGEMM_SUPPORT_ARCHS "10.0a")
 endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  list(APPEND DEEPGEMM_SUPPORT_ARCHS "12.0f")
Review thread on the "12.0f" line:

Reviewer: Do we also need 12.1a here for DGX Spark?

Contributor (author): 12.0f covers the whole 12.x family.

Reviewer: I see this in the log:

[gpu_model_runner.py:4884] Model loading took 39.77 GiB memory and 522.056257 seconds
Running NVCC command: cd /root/.cache/vllm/deep_gemm/tmp && /usr/local/cuda/bin/nvcc /root/.cache/vllm/deep_gemm/tmp/203-af1d4b4f-d7902775-636c4fd8/kernel.cu -cubin -o /root/.cache/vllm/deep_gemm/tmp/203-af1d4b4f-d7902775-636c4fd8/kernel.cubin -std=c++20 --diag-suppress=39,161,174,177,186,940 --ptxas-options=--register-usage-level=10 -I/usr/local/lib/python3.12/dist-packages/vllm/third_party/deep_gemm/include -gencode=arch=compute_120f,code=sm_120f --compiler-options=-fPIC,-O3,-fconcepts,-Wno-deprecated-declarations,-Wno-abi -O3 --expt-relaxed-constexpr --expt-extended-lambda

I was wondering whether this is a problem with my environment; not sure, but I would expect to see 121f there :) okay, will continue with testing...
+endif()

 cuda_archs_loose_intersection(DEEPGEMM_ARCHS
   "${DEEPGEMM_SUPPORT_ARCHS}" "${CUDA_ARCHS}")
@@ -120,6 +123,11 @@ if(DEEPGEMM_ARCHS)
     COMPONENT _deep_gemm_C
     FILES_MATCHING PATTERN "*.py")

+  install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/mega/"
+          DESTINATION vllm/third_party/deep_gemm/mega
+          COMPONENT _deep_gemm_C
+          FILES_MATCHING PATTERN "*.py")
+
   # Generate envs.py (normally generated by DeepGEMM's setup.py build step)
   file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
     "# Pre-installed environment variables\npersistent_envs = dict()\n")
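A minimal way to confirm the MegaMoE Python sources landed where the install rule above puts them (a sketch; the module layout under mega/ is not shown in this diff):

    import importlib.util

    # Assumed install location from the rule above.
    spec = importlib.util.find_spec("vllm.third_party.deep_gemm")
    print(spec.submodule_search_locations if spec else "deep_gemm not installed")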
cmake/external_projects/flashmla.cmake (2 changes: 1 addition & 1 deletion)

@@ -19,7 +19,7 @@ else()
   FetchContent_Declare(
     flashmla
     GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-    GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
+    GIT_TAG a6ec2ba7bd0a7dff98b3f4d3e6b52b159c48d78b
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""