Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
434d934
feat: support deepseek v4
zyongye Apr 23, 2026
908ab01
chore: pass mypy
ivanium Apr 25, 2026
cf3e417
fix: update cuda requirements
ivanium Apr 25, 2026
c75c382
fix: config
ivanium Apr 25, 2026
5e3525c
Integrate MegaMoE kernel
WoosukKwon Apr 25, 2026
9abe2bd
free up unused weights and support dummy weights
WoosukKwon Apr 25, 2026
f704cf3
[Bugfix] Flatten DeepSeek V32 indexer next_n on non-SM100 archs
zixi-qi Apr 25, 2026
b353527
chore: fix pre-commit
ivanium Apr 25, 2026
618e3b6
fix (ci): interface mismatches
ivanium Apr 25, 2026
6fac86c
Add model information
jeejeelee Apr 26, 2026
d95a973
fix (ci): misc api mismatches
ivanium Apr 26, 2026
36992a0
[Bugfix][CI] Run mooncake HMA worker tests on GPU lane (#241)
zhewenl Apr 26, 2026
6a2e1ed
Merge branch 'main' into feat/dsv4-support
ivanium Apr 26, 2026
e5108f7
CI Failure for deep_gemm and layernorm_fp8_quant
zyongye Apr 26, 2026
f35845c
FIX
jeejeelee Apr 26, 2026
6f95a90
Merge branch 'main' into feat/dsv4-support
jeejeelee Apr 26, 2026
9e5f0da
fix (ci): an e2e OOM issue and a MTP model registry issue
ivanium Apr 26, 2026
f21fcc1
chore: pin tilelang version
ivanium Apr 26, 2026
5cd8311
fix (ci): pre-commit happy
ivanium Apr 26, 2026
999637f
FIX
jeejeelee Apr 26, 2026
ab72e57
FIX
jeejeelee Apr 26, 2026
fe61cd4
Merge branch 'main' into feat/dsv4-support
jeejeelee Apr 26, 2026
b2a9e98
Support DeepSeek V4 on SM120 with Triton fallback
bbbearxyz Apr 26, 2026
d521d3e
add comment
bbbearxyz Apr 26, 2026
10934ad
Merge branch 'main' into support_sm120_deepseekv4
bbbearxyz Apr 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,9 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC "csrc/minimax_reduce_rms_kernel.cu")
list(APPEND VLLM_EXT_SRC
"csrc/minimax_reduce_rms_kernel.cu"
"csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu")

SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

Expand Down Expand Up @@ -1051,7 +1053,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu"
"csrc/moe/router_gemm.cu")
"csrc/moe/router_gemm.cu"
"csrc/moe/topk_softplus_sqrt_kernels.cu")
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
Expand Down
7 changes: 6 additions & 1 deletion cmake/external_projects/deepgemm.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ else()
FetchContent_Declare(
deepgemm
GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
GIT_TAG 477618cd51baffca09c4b0b87e97c03fe827ef03
GIT_TAG 891d57b4db1071624b5c8fa0d1e51cb317fa709f
GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
Expand Down Expand Up @@ -120,6 +120,11 @@ if(DEEPGEMM_ARCHS)
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")

install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/mega/"
DESTINATION vllm/third_party/deep_gemm/mega
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")

# Generate envs.py (normally generated by DeepGEMM's setup.py build step)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
"# Pre-installed environment variables\npersistent_envs = dict()\n")
Expand Down
2 changes: 1 addition & 1 deletion cmake/external_projects/flashmla.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_TAG a6ec2ba7bd0a7dff98b3f4d3e6b52b159c48d78b
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
Expand Down
7 changes: 6 additions & 1 deletion csrc/cpu/pos_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,12 @@ void rotary_embedding_gptj_impl(

void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
std::optional<torch::Tensor> key, int64_t head_size,
torch::Tensor& cos_sin_cache, bool is_neox) {
torch::Tensor& cos_sin_cache, bool is_neox,
int64_t rope_dim_offset, bool inverse) {
TORCH_CHECK(rope_dim_offset == 0,
"rope_dim_offset != 0 is not supported on CPU");
TORCH_CHECK(!inverse, "inverse rotary embedding is not supported on CPU");

int num_tokens = positions.numel();
int rot_dim = cos_sin_cache.size(1);
int num_heads = query.size(-1) / head_size;
Expand Down
3 changes: 2 additions & 1 deletion csrc/cpu/torch_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def(
"rotary_embedding(Tensor positions, Tensor! query,"
" Tensor!? key, int head_size,"
" Tensor cos_sin_cache, bool is_neox) -> ()");
" Tensor cos_sin_cache, bool is_neox, int "
"rope_dim_offset=0, bool inverse=False) -> ()");
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);

// Quantization
Expand Down
Loading
Loading