Commits (70)
030c1ec
feat: support deepseek v4
zyongye Apr 23, 2026
30faa6a
chore: pass mypy
ivanium Apr 25, 2026
1df2a80
fix: update cuda requirements
ivanium Apr 25, 2026
8779f9d
fix: config
ivanium Apr 25, 2026
0ff736a
Integrate MegaMoE kernel (#232)
WoosukKwon Apr 25, 2026
8188e4a
Prototype SM120 DeepSeek V4 reference attention
jasl Apr 25, 2026
3856a3f
Allow DeepGEMM to build for SM120 with CUDA 13
jasl Apr 25, 2026
5241faf
Split SM120 sparse attention reference into LSE merge stages
jasl Apr 25, 2026
4034d5d
Add SM120 FP8 indexer logits fallback
jasl Apr 25, 2026
359e334
Register SM120 reference attention env vars
jasl Apr 25, 2026
34b6717
Pin DeepGEMM SM120 prototype dependency
jasl Apr 25, 2026
dd18dc2
Prototype DeepSeek V4 pipeline parallelism
jasl Apr 25, 2026
ca911d7
Generalize sparse MLA reference fallback controls
jasl Apr 25, 2026
0d45c12
Let sparse MLA dump control override legacy alias
jasl Apr 25, 2026
b15fe88
Avoid pinning DeepGEMM SM120 fork
jasl Apr 25, 2026
88ace9e
Keep DeepGEMM SM120 prototype pin
jasl Apr 25, 2026
bf42dc8
Add DeepSeek V4 sparse MLA reference tests
jasl Apr 25, 2026
916b19f
Extract sparse MLA reference helpers
jasl Apr 25, 2026
7674619
Move sparse MLA prefill reference into helper
jasl Apr 25, 2026
ba0771a
Share sparse MLA fallback env handling
jasl Apr 25, 2026
63ac6e7
Use workspace for DeepSeek V4 einsum output
jasl Apr 25, 2026
f2bde65
Add sparse MLA env helper tests
jasl Apr 25, 2026
7b38dd3
Use Triton for sparse MLA sink merge
jasl Apr 25, 2026
d78fc65
Use Triton for sparse MLA subset accumulation
jasl Apr 25, 2026
f21b837
Fuse fp8_ds_mla sparse MLA decode accumulation
jasl Apr 25, 2026
1ac48e4
Fuse fp8_ds_mla paged SWA decode accumulation
jasl Apr 25, 2026
f28fe76
Fuse fp8_ds_mla SWA-only decode fallback
jasl Apr 25, 2026
c918470
Use Triton indexed accumulation for sparse MLA prefill
jasl Apr 25, 2026
9afedca
Fix sparse MLA ruff import ordering
jasl Apr 25, 2026
68f3236
Fuse sparse MLA finish with sink merge
jasl Apr 25, 2026
59172ac
Use multi-head fp8 sparse MLA accumulation
jasl Apr 25, 2026
eeeeee8
Optimize SM12x sparse MLA decode kernels
jasl Apr 25, 2026
1894bda
Fix sparse MLA padded-head state launches
jasl Apr 25, 2026
4ddd9e8
Handle padded sparse MLA output heads
jasl Apr 25, 2026
58f4ee5
Accept padded sparse MLA attention sinks
jasl Apr 25, 2026
a75f327
Drop stale sparse MLA dummy workspace reservation
jasl Apr 25, 2026
6249f9e
Stabilize SM12x DeepSeek V4 sparse MLA fallback
jasl Apr 25, 2026
fad559c
Allow opt-in cudagraphs for SM12x sparse MLA
jasl Apr 25, 2026
617788a
Fuse SM12x sparse MLA decode fallback
jasl Apr 25, 2026
f6302ab
Fix sparse MLA env test import order after refresh
jasl Apr 25, 2026
026e6cb
Update sparse MLA env default test after refresh
jasl Apr 25, 2026
f681984
Update DeepGEMM SM120 pin for HC kernel
jasl Apr 25, 2026
3cca90b
Skip unsupported FlashInfer sparse MLA tests on SM12x
jasl Apr 25, 2026
4c0983e
Tune SM12x sparse MLA decode head grouping
jasl Apr 25, 2026
3fe9199
Fix DeepSeek V4 FP8 einsum config on SM12x
jasl Apr 25, 2026
227b15e
Default SM12x sparse MLA runtime knobs
jasl Apr 25, 2026
53be759
Reject CUTLASS block FP8 scaled MM on SM12x
jasl Apr 25, 2026
6ea0d61
Add SM12x Triton FP8 einsum for DeepSeek V4
jasl Apr 25, 2026
7b158e1
Bump FlashInfer CUDA packages to 0.6.9
jasl Apr 25, 2026
4a08ccb
Add sparse MLA head block tuning
jasl Apr 25, 2026
eab717d
Bump DeepGEMM SM120 reference
jasl Apr 25, 2026
b037165
Add DeepGEMM SM120 paged MQA toggle
jasl Apr 25, 2026
6d6d6f7
Update DeepGEMM SM120 pin
jasl Apr 26, 2026
cc06fe8
temporary disable persistent topk for 1024
zyongye Apr 24, 2026
0ad4dea
Support dummy loading
WoosukKwon Apr 25, 2026
0e8b532
free up unused weights
WoosukKwon Apr 25, 2026
310ac26
Fix DeepSeek V4 MegaMoE test fixture
jasl Apr 26, 2026
6130a6d
Address DeepSeek V4 review nits
jasl Apr 26, 2026
b56e53c
Tune SM12x sparse MLA decode head grouping
jasl Apr 26, 2026
5958f37
Use short-row topk on SM120 indexer
jasl Apr 26, 2026
43ce641
[Kernel] Marlin MoE: include SM 12.x in default arch list
Apr 26, 2026
3e3a30a
[Kernel] Tune default fp8 block-scaled Triton config for M<=8 decode
Apr 26, 2026
8573555
Guard low-M FP8 Triton stages on ROCm
jasl Apr 26, 2026
4d5335d
Speed up SM12x sparse MLA decode with matmul path
jasl Apr 26, 2026
681a817
Reduce SM12x sparse MLA decode KV staging
jasl Apr 26, 2026
3a2dd99
Fuse SM12x sparse MLA decode mask build
jasl Apr 26, 2026
9ffda0c
Extend SM12x low-M FP8 block config
jasl Apr 26, 2026
b7a70b9
Reduce SM12x long-prefill sparse MLA memory
jasl Apr 26, 2026
6652949
Clean up SM12x sparse MLA review issues
jasl Apr 26, 2026
8d0ebb7
Restore SM12x sparse MLA MTP decode fallback
jasl Apr 26, 2026
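Many of these commits follow one pattern: detect SM 12.x at runtime and route the DeepSeek V4 sparse MLA path through a reference or Triton fallback, controlled by env vars (e.g. "Register SM120 reference attention env vars", "Generalize sparse MLA reference fallback controls"). A minimal sketch of that gating pattern follows; the helper and env-var names are illustrative, not vLLM's actual API:

    import os
    import torch

    def use_sparse_mla_reference_fallback() -> bool:
        """Sketch: decide whether to take the SM12x sparse-MLA fallback."""
        major, _minor = torch.cuda.get_device_capability()
        # Hypothetical env var; the PR registers similarly scoped knobs.
        forced = os.environ.get("VLLM_SPARSE_MLA_REFERENCE")
        if forced is not None:
            return forced == "1"
        # Consumer Blackwell (SM 12.x) lacks the kernels the fast path
        # needs, per the commit titles above.
        return major == 12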
CMakeLists.txt (10 changes: 6 additions & 4 deletions)

@@ -294,6 +294,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/fused_qknorm_rope_kernel.cu"
+  "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu"
   "csrc/layernorm_quant_kernels.cu"
   "csrc/sampler.cu"
   "csrc/topk.cu"

@@ -357,11 +358,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.

   # marlin arches for fp16 output
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
-  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction

@@ -1045,7 +1046,8 @@ endif()
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/topk_softmax_kernels.cu")
+  "csrc/moe/topk_softmax_kernels.cu"
+  "csrc/moe/topk_softplus_sqrt_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC

@@ -1078,7 +1080,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # moe marlin arches
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;12.0;12.1" "${CUDA_ARCHS}")
   # moe marlin has limited support for turing
   cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input
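To sanity-check that a build with these arch lists actually targets a consumer Blackwell card, one can compare the device's compute capability against the arch list a wheel was built for. A small sketch: note that torch.cuda.get_arch_list() reports PyTorch's own compiled arches and is used here only as the analogous check, since vLLM's extension arches come from CUDA_ARCHS above:

    import torch

    # e.g. (12, 0) on an RTX 50-series card, matching the new "12.0" entries
    print(torch.cuda.get_device_capability())
    # Arches this PyTorch build ships kernels for, e.g. ['sm_80', ..., 'sm_120']
    print(torch.cuda.get_arch_list())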
cmake/external_projects/deepgemm.cmake (12 changes: 10 additions & 2 deletions)

@@ -19,8 +19,8 @@ else()
   # This ref should be kept in sync with tools/install_deepgemm.sh
   FetchContent_Declare(
     deepgemm
-    GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
-    GIT_TAG 477618cd51baffca09c4b0b87e97c03fe827ef03
+    GIT_REPOSITORY https://github.com/jasl/DeepGEMM.git
+    GIT_TAG 7a7a41a1bac7dacabe74057e7600e59f98f85bce
     GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""

@@ -46,6 +46,9 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
 elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
   list(APPEND DEEPGEMM_SUPPORT_ARCHS "10.0a")
 endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  list(APPEND DEEPGEMM_SUPPORT_ARCHS "12.0f")
Review thread on the "12.0f" line:

Reviewer: Do we also need 12.1a here for DGX Spark?

Contributor (author): 12.0f covers the whole 12.x family.

Reviewer: I see this in the log:

[gpu_model_runner.py:4884] Model loading took 39.77 GiB memory and 522.056257 seconds
Running NVCC command: cd /root/.cache/vllm/deep_gemm/tmp && /usr/local/cuda/bin/nvcc /root/.cache/vllm/deep_gemm/tmp/203-af1d4b4f-d7902775-636c4fd8/kernel.cu -cubin -o /root/.cache/vllm/deep_gemm/tmp/203-af1d4b4f-d7902775-636c4fd8/kernel.cubin -std=c++20 --diag-suppress=39,161,174,177,186,940 --ptxas-options=--register-usage-level=10 -I/usr/local/lib/python3.12/dist-packages/vllm/third_party/deep_gemm/include -gencode=arch=compute_120f,code=sm_120f --compiler-options=-fPIC,-O3,-fconcepts,-Wno-deprecated-declarations,-Wno-abi -O3 --expt-relaxed-constexpr --expt-extended-lambda

I was wondering whether this is a problem with my environment; not sure, but I would expect to see 121f there :) okay, will continue with testing...
+endif()

 cuda_archs_loose_intersection(DEEPGEMM_ARCHS
   "${DEEPGEMM_SUPPORT_ARCHS}" "${CUDA_ARCHS}")
@@ -120,6 +123,11 @@ if(DEEPGEMM_ARCHS)
     COMPONENT _deep_gemm_C
     FILES_MATCHING PATTERN "*.py")

+  install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/mega/"
+          DESTINATION vllm/third_party/deep_gemm/mega
+          COMPONENT _deep_gemm_C
+          FILES_MATCHING PATTERN "*.py")
+
   # Generate envs.py (normally generated by DeepGEMM's setup.py build step)
   file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
     "# Pre-installed environment variables\npersistent_envs = dict()\n")
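A minimal way to confirm the MegaMoE Python sources landed where the install rule above puts them (a sketch; the module layout under mega/ is not shown in this diff):

    import importlib.util

    # Assumed install location from the rule above.
    spec = importlib.util.find_spec("vllm.third_party.deep_gemm")
    print(spec.submodule_search_locations if spec else "deep_gemm not installed")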
cmake/external_projects/flashmla.cmake (2 changes: 1 addition & 1 deletion)

@@ -19,7 +19,7 @@ else()
   FetchContent_Declare(
     flashmla
     GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-    GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
+    GIT_TAG a6ec2ba7bd0a7dff98b3f4d3e6b52b159c48d78b
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""