Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
434d934
feat: support deepseek v4
zyongye Apr 23, 2026
908ab01
chore: pass mypy
ivanium Apr 25, 2026
cf3e417
fix: update cuda requirements
ivanium Apr 25, 2026
c75c382
fix: config
ivanium Apr 25, 2026
5e3525c
Integrate MegaMoE kernel
WoosukKwon Apr 25, 2026
9abe2bd
free up unused weights and support dummy weights
WoosukKwon Apr 25, 2026
f704cf3
[Bugfix] Flatten DeepSeek V32 indexer next_n on non-SM100 archs
zixi-qi Apr 25, 2026
b353527
chore: fix pre-commit
ivanium Apr 25, 2026
618e3b6
fix (ci): interface mismatches
ivanium Apr 25, 2026
6fac86c
Add model information
jeejeelee Apr 26, 2026
d95a973
fix (ci): misc api mismatches
ivanium Apr 26, 2026
36992a0
[Bugfix][CI] Run mooncake HMA worker tests on GPU lane (#241)
zhewenl Apr 26, 2026
6a2e1ed
Merge branch 'main' into feat/dsv4-support
ivanium Apr 26, 2026
e5108f7
CI Failure for deep_gemm and layernorm_fp8_quant
zyongye Apr 26, 2026
f35845c
FIX
jeejeelee Apr 26, 2026
6f95a90
Merge branch 'main' into feat/dsv4-support
jeejeelee Apr 26, 2026
9e5f0da
fix (ci): an e2e OOM issue and a MTP model registry issue
ivanium Apr 26, 2026
f21fcc1
chore: pin tilelang version
ivanium Apr 26, 2026
5cd8311
fix (ci): pre-commit happy
ivanium Apr 26, 2026
999637f
FIX
jeejeelee Apr 26, 2026
ab72e57
FIX
jeejeelee Apr 26, 2026
fe61cd4
Merge branch 'main' into feat/dsv4-support
jeejeelee Apr 26, 2026
b2a9e98
Support DeepSeek V4 on SM120 with Triton fallback
bbbearxyz Apr 26, 2026
d521d3e
add comment
bbbearxyz Apr 26, 2026
10934ad
Merge branch 'main' into support_sm120_deepseekv4
bbbearxyz Apr 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,9 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC "csrc/minimax_reduce_rms_kernel.cu")
list(APPEND VLLM_EXT_SRC
"csrc/minimax_reduce_rms_kernel.cu"
"csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu")

SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

Expand Down Expand Up @@ -1051,7 +1053,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu"
"csrc/moe/router_gemm.cu")
"csrc/moe/router_gemm.cu"
"csrc/moe/topk_softplus_sqrt_kernels.cu")
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
Expand Down
7 changes: 6 additions & 1 deletion cmake/external_projects/deepgemm.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ else()
FetchContent_Declare(
deepgemm
GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
GIT_TAG 477618cd51baffca09c4b0b87e97c03fe827ef03
GIT_TAG 891d57b4db1071624b5c8fa0d1e51cb317fa709f
GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
Expand Down Expand Up @@ -120,6 +120,11 @@ if(DEEPGEMM_ARCHS)
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")

install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/mega/"
DESTINATION vllm/third_party/deep_gemm/mega
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")

# Generate envs.py (normally generated by DeepGEMM's setup.py build step)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
"# Pre-installed environment variables\npersistent_envs = dict()\n")
Expand Down
2 changes: 1 addition & 1 deletion cmake/external_projects/flashmla.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_TAG a6ec2ba7bd0a7dff98b3f4d3e6b52b159c48d78b
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
Expand Down
7 changes: 6 additions & 1 deletion csrc/cpu/pos_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,12 @@ void rotary_embedding_gptj_impl(

void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
std::optional<torch::Tensor> key, int64_t head_size,
torch::Tensor& cos_sin_cache, bool is_neox) {
torch::Tensor& cos_sin_cache, bool is_neox,
int64_t rope_dim_offset, bool inverse) {
TORCH_CHECK(rope_dim_offset == 0,
"rope_dim_offset != 0 is not supported on CPU");
TORCH_CHECK(!inverse, "inverse rotary embedding is not supported on CPU");

int num_tokens = positions.numel();
int rot_dim = cos_sin_cache.size(1);
int num_heads = query.size(-1) / head_size;
Expand Down
3 changes: 2 additions & 1 deletion csrc/cpu/torch_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def(
"rotary_embedding(Tensor positions, Tensor! query,"
" Tensor!? key, int head_size,"
" Tensor cos_sin_cache, bool is_neox) -> ()");
" Tensor cos_sin_cache, bool is_neox, int "
"rope_dim_offset=0, bool inverse=False) -> ()");
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);

// Quantization
Expand Down
Loading
Loading