Merged
Changes from all commits
1081 commits
2ea20a1
[Bugfix] fix when skip tokenizer init (#21922)
lengrongfu Aug 1, 2025
5e0a899
security policy: take 1 (#21119)
sidhpurwala-huzaifa Aug 1, 2025
024bae4
[Bugfix] [Performance] DeepEPHighThroughput + DeepSeek : Quant before…
varun-sundar-rabindranath Aug 1, 2025
23e9231
Enable headless models for pooling in the Transformers backend (#21767)
hmellor Aug 1, 2025
d939b3f
[Misc] Minor enhancement of benchmark_moe (#22068)
jeejeelee Aug 1, 2025
8d45d88
Fix pre-commit failure for SECURTIY.md (#22102)
mgoin Aug 1, 2025
0361fbc
[compile][startup] Disable C++ compilation of symbolic shapes (#20836)
anijain2305 Aug 1, 2025
46b5ada
Introduce RayPPCommunicator for ray-based PP (#21660)
ruisearch42 Aug 1, 2025
3c9cf54
Add lora test for tp>1 case for TPU. (#21970)
vanbasten23 Aug 1, 2025
cfa5d09
[BugFix] Harden distributed DP startup (#21538)
njhill Aug 1, 2025
61cfee8
[CI] Initial tests for SM100 Blackwell runner (#21877)
mgoin Aug 1, 2025
8854ac4
[Perf] Optimize `reshape_and_cache_flash` CUDA Kernel (#22036)
yewentao256 Aug 1, 2025
c30510f
feat: Add Support GPTQ Quantization MOE on ROCM vllm serve (#21733)
JartX Aug 2, 2025
fe51612
[V1][CUDA] Full cudagraph support for FlashInfer (#21367)
fhl2000 Aug 2, 2025
e01266e
[Model] Qwen2.5 VL SiLU-and-Mul (#22066)
vllmellm Aug 2, 2025
8e1e504
[Misc] `VLLM_TARGET_DEVICE.lower()` (#22101)
NickLucche Aug 2, 2025
cf16bc2
[Misc] DeepGemmExperts : Avoid JIT generation in the hot-path (#21955)
varun-sundar-rabindranath Aug 2, 2025
1b1dcc7
[Speculators][Speculative Decoding] Add Qwen Eagle3 Support (#21835)
dsikka Aug 2, 2025
a698cfc
[BugFix] Improve internal DP load balancing (#21617)
njhill Aug 2, 2025
5adc4f7
[Test] Add Unit Test for Batched DeepGEMM (#21559)
yewentao256 Aug 2, 2025
d857eba
[Attention][DBO] Add support for "splitting" the CommonAttentionMetad…
SageMoore Aug 2, 2025
5e1e33c
[FEAT][ROCm] Enable running Flash Attention as ViT attn backend for Q…
vllmellm Aug 2, 2025
9280075
[Misc] Getting and passing ray runtime_env to workers (#22040)
ruisearch42 Aug 2, 2025
aab9737
Fix test_kv_sharing_fast_prefill flakiness (#22038)
sarckk Aug 2, 2025
b81b304
[Bugfix] Mamba2 remove bugged initial state condition in chunk scan (…
cyang49 Aug 2, 2025
cc3a06f
docs: remove deprecated disable-log-requests flag (#22113)
Aug 2, 2025
2d6070c
[PERF] Use faster way of decode in tokenizer: avoid useless list-to-l…
vadiklyutiy Aug 2, 2025
3ebe718
for glm-4.1V update (#22000)
zRzRzRzRzRzRzR Aug 2, 2025
ee9e5c1
[Model] Mamba2 preallocate SSM output tensor to avoid d2d copy overhe…
cyang49 Aug 2, 2025
852dbd9
[Frontend] Improve error message for too many mm items (#22114)
DarkLight1337 Aug 2, 2025
f2f2c1b
[V1] [Hybrid] Validate compatibility of attention backend batch reord…
tdoublep Aug 2, 2025
a118857
[xpu]support moe models on XPU platform (#21643)
yma11 Aug 2, 2025
8dbd196
Revert "[compile][startup] Disable C++ compilation of symbolic shapes…
xiszishu Aug 2, 2025
e9d7c1d
[Misc] Bump ray to 2.48.0 (#22123)
ruisearch42 Aug 3, 2025
3294e45
[Fix] Fix llama4 modelopt weight loading error (#22107)
jiahanc Aug 3, 2025
c4f893f
[Misc] Add tensor schema test coverage for multimodal models (#21754)
Isotr0py Aug 3, 2025
93610de
[Benchmark] Support ready check timeout in `vllm bench serve` (#21696)
yeqcharlotte Aug 3, 2025
9bb57ec
Support CUTLASS NVFP4 (w4a4) for Blackwell Geforce GPUs (SM120) (#21309)
LopezCastroRoberto Aug 3, 2025
6437727
[Misc] update doc comment for send (#22026)
andyxning Aug 3, 2025
4dcef5b
[executor] feat: add supports_pp attr to executors (#21786)
eric-haibin-lin Aug 3, 2025
6e053b9
[V1] [P/D] Refactor KV Connector Path (#21980)
sdavidbd Aug 3, 2025
a383e60
[Responses API] Disable response store by default (#22137)
WoosukKwon Aug 3, 2025
11b59d1
[CI/Build][Bugfix] Fix Qwen2.5 tests in CPU CI via fallback silu_and_…
bigPYJ1151 Aug 3, 2025
2f2a07f
Add chat doc in quick start (#21213)
TankNee Aug 3, 2025
2dad4e6
fuse fp32 for GLM-4.5 e_score_correction_bias (#22143)
zRzRzRzRzRzRzR Aug 3, 2025
ce5621e
[Bugfix] Fix failing multimodal standard test (#22153)
Isotr0py Aug 3, 2025
e76ae81
Use `aiohttp` connection pool for benchmarking (#21981)
eicherseiji Aug 4, 2025
8438063
[fix] fix correct assertion syntax error in attention utils. (#22154)
skyloevil Aug 4, 2025
ce75d12
[RLHF] Fix torch.dtype not serializable in example (#22158)
22quinn Aug 4, 2025
a2b02eb
[PD] add test for chat completions endpoint (#21925)
Abirdcfly Aug 4, 2025
c362f3c
remove duplicate code within cleanup_dist_env_and_memory (#22147)
andyxning Aug 4, 2025
7ba6a65
Add tree attention backend for v1 (part 1) (#20401)
TheEpicDolphin Aug 4, 2025
925ae38
[refactor] improve ConstantList exception specificity (#22156)
skyloevil Aug 4, 2025
6dbbfa2
Remove index_put from MM embeddings merging (#22105)
chenxi-yang Aug 4, 2025
941ff09
[CI Bugfix] Fix wNa16 kernel not found for test_shared_storage_connec…
tlrmchlsmth Aug 4, 2025
0e1c84d
[Misc] Modify the organization of GLM series (#22171)
jeejeelee Aug 4, 2025
65e17c1
[feat] move WEIGHT_SCALE_SUPPORTED into raise block to accelerate RLH…
weixiao-huang Aug 4, 2025
34df04c
[Bugfix] Fix failing GGUF models test (#22174)
Isotr0py Aug 4, 2025
254d1e8
[Sampler] Support returning all logprobs or logits (#21792)
22quinn Aug 4, 2025
2090680
[Doc] Update pooling model docs (#22186)
DarkLight1337 Aug 4, 2025
5f73d90
Fix Arcee model weight loading: Add custom load_weights (#21725)
alyosha-swamy Aug 4, 2025
0732912
[Responses API] Ignore `store=True` and process the request by defaul…
WoosukKwon Aug 4, 2025
48e3973
[Bug] Update auto_tune.sh to separate benchmarking and profiling. (#2…
ericehanley Aug 4, 2025
4207e96
[Bugfix][V1][P/D]Fix the uneven polling issue in the toy proxy for P2…
Abatom Aug 4, 2025
efca991
[NVIDIA] Auto detect modelopt quant and fix DSR1-FP4 weight loading (…
nvpohanh Aug 5, 2025
4344398
[Bugfix] V1 Fix the cursor leakage issue during request scheduling. (…
CLFutureX Aug 5, 2025
87b30bc
Revert "[Bugfix] V1 Fix the cursor leakage issue during request sched…
WoosukKwon Aug 5, 2025
99eea67
[V1] reduce block size for tree attention correctness test to fix 'ou…
TheEpicDolphin Aug 5, 2025
8e3248b
[V0 deprecation][P/D] Deprecate v0 `KVConnectorBase` code (1/2) (#21785)
lk-chen Aug 5, 2025
09585eb
[FEAT] Refactor ROPE into module (#22192)
tjtanaa Aug 5, 2025
bbeddff
[ROCm][Bugfix] Compilation passes fix (#22202)
gshtras Aug 5, 2025
6bada90
self.gate dtype update for GLM-4.5 (#22203)
zRzRzRzRzRzRzR Aug 5, 2025
e88a388
[Log] DeepGEMM Update Log for Unaligned Problem Size (#22208)
yewentao256 Aug 5, 2025
1004673
fix: kimi_k2 return empty tool call list (#22149)
tlipoca9 Aug 5, 2025
d496544
[Misc] Remove pass_config from CompilationConfig dump_json excluded (…
elvischenv Aug 5, 2025
a239863
[Doc] add backend to doc string of initialize_model_parallel (#22142)
andyxning Aug 5, 2025
8ea25cd
[Misc] log more detailed message for ensure_model_parallel_initialize…
andyxning Aug 5, 2025
0f9fda8
Optimize configuration access with LRU cache in custom ops (#22204)
skyloevil Aug 5, 2025
3ef9666
[Bugfix] Misaligned params in TreeAttentionImpl (#22226)
DarkLight1337 Aug 5, 2025
1368161
[UX] Fail if an invalid attention backend is specified (#22217)
mgoin Aug 5, 2025
2535ed4
[Core] Factor out common logic for MM budget calculation (#22228)
DarkLight1337 Aug 5, 2025
5559f6c
[Model] Pooling model activation supports per request control by Pool…
noooop Aug 5, 2025
7270051
[Docs][TPU] Highlight TPU Software version selection (#22242)
NickLucche Aug 5, 2025
36f9361
Migrate KimiVLImagePixelInputs to TensorSchema (#21769)
bbeckca Aug 5, 2025
9c14e03
[Feature] Non-contiguous Support for FP8 Quantization (#21961)
yewentao256 Aug 5, 2025
fb2712f
[NVIDIA] Support Flashinfer TRT-LLM Prefill Attention Kernel (#22095)
elvischenv Aug 5, 2025
690f05c
[Misc] correct static type check for GroupCoordinator (#21946)
andyxning Aug 5, 2025
2ec868e
[V0 Deprecation][TPU] Remove V1 flag check from tests (#22248)
NickLucche Aug 5, 2025
8550273
Use UV_LINK_MODE=copy in Dockerfile to avoid hardlink fail (#22128)
mgoin Aug 5, 2025
43b3fce
[CI/Build] Update flashinfer to 0.2.9 (#22233)
mgoin Aug 5, 2025
2ac177c
[Refactor] Remove Unused Environment Variable `VLLM_NO_DEPRECATION_WA…
yewentao256 Aug 5, 2025
41deb30
[V1] port xformers backend to v1 (#21342)
TheEpicDolphin Aug 5, 2025
364baf6
[bugfix] fix blackwell deepep installation (#22255)
youkaichao Aug 5, 2025
3e95f70
[CI][TPU] Fix docker clean up (#22271)
lsy323 Aug 5, 2025
7429b46
[Bugfix] Remove faulty test for oot attention backend (#22286)
mgoin Aug 6, 2025
6a621c3
[Bugfix] Fix 3D input passed into cutlass_scaled_mm (#22278)
mgoin Aug 6, 2025
ec9612f
[Bugfix] Fix MoE BNB version (#22260)
jeejeelee Aug 6, 2025
226eeea
[Perf] Parallelize fill_bitmask to accelerate high-throughput guided …
benchislett Aug 6, 2025
56763b6
[Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#2…
ruisearch42 Aug 6, 2025
7b16b53
[Bugfix][CI/Build][ROCm] Make sure to use the headers from the build …
gshtras Aug 6, 2025
a463fb5
Upgrade FA3 for attention sink (#22313)
WoosukKwon Aug 6, 2025
f8ac961
Increase openai-python version (#22316)
WoosukKwon Aug 6, 2025
25ad72a
Add attention sink in attention backends (#22320)
WoosukKwon Aug 6, 2025
4dfff78
Update transformers to `v4.55` (#21931)
hmellor Aug 6, 2025
cf9ea63
Add GPT-OSS model code and config [1/N] (#22327)
WoosukKwon Aug 6, 2025
e326c3d
[ROCm] Add attention sink to use_rocm_custom_paged_attention (#22329)
WoosukKwon Aug 6, 2025
e23598a
[GptOss] Add GptOss reasoning parser to support structure output (#22…
heheda12345 Aug 6, 2025
de251f9
[gpt-oss] flashinfer attention sink init (#22330)
zyongye Aug 6, 2025
b3bd1f2
[gpt-oss] Add openai-harmony as default dependency (#22332)
WoosukKwon Aug 6, 2025
bbf3923
[Misc] Clean up duplicated hf overrides (#22311)
Isotr0py Aug 6, 2025
1fa11ea
[gpt-oss] Add Tool/ConversationContext classes and harmony_utils (#22…
WoosukKwon Aug 6, 2025
a482ecb
[gpt-oss] add model to supported models doc (#22336)
Aug 6, 2025
11be35f
[gpt-oss] Support chat completion api (#22342)
WoosukKwon Aug 6, 2025
1655e4c
[Minor] Fix type (#22347)
WoosukKwon Aug 6, 2025
48d892e
[BugFix] Fix FA2 RuntimeError when sinks is provided (#22365)
LucasWilkinson Aug 6, 2025
8b14e38
add the codes to check AMD Instinct GPU number (#22367)
zhangnju Aug 6, 2025
cc074b2
fix
jinzhen-lin Aug 7, 2025
cdb6d54
fix
jinzhen-lin Aug 7, 2025
ff94983
fix
jinzhen-lin Aug 7, 2025
4694099
fix
jinzhen-lin Aug 7, 2025
e2ee111
fix
jinzhen-lin Aug 7, 2025
a94893a
fix fp4 layer process
jinzhen-lin Aug 7, 2025
a29da80
[BugFix] Fix triton compile error in `kernel_unified_attention_2/3d` …
LucasWilkinson Aug 6, 2025
5912df5
[Bugfix] Make condition in triton kernel constexpr (#22370)
gshtras Aug 6, 2025
ceb4a80
[gpt-oss] Add loop for built-in tool call (#22374)
WoosukKwon Aug 6, 2025
1310438
[gpt-oss] Enhance error msg on attention sink init (#22335)
zyongye Aug 6, 2025
daaf5c7
[gpt-oss] flashinfer mxfp4 (#22339)
zyongye Aug 6, 2025
3d6ead9
[v1] - Mamba1 Attention Metadata (#21249)
Josephasafg Aug 7, 2025
2c9ea84
[Bug] Fix B200 DeepGEMM E8M0 Accuracy Issue (#22399)
yewentao256 Aug 7, 2025
4bf71e3
[gpt-oss] add demo tool server (#22393)
heheda12345 Aug 7, 2025
f89f198
[gpt-oss] fix model config with hf_config (#22401)
zyongye Aug 7, 2025
a9d7b0b
Fix trtllm-gen attention env and add attention sink (#22378)
IwakuraRein Aug 7, 2025
9051985
Update `flashinfer-python==0.2.10` (#22389)
mgoin Aug 7, 2025
f8101c6
[model] Support MiniCPM-V 4.0 (#22166)
tc-mb Aug 7, 2025
11bb5da
Support encoder_only attention for FlexAttention (#22273)
maxdebayser Aug 7, 2025
b2c7ff2
[Attention] Support multiple attention metadata builders per kv_cache…
LucasWilkinson Aug 7, 2025
677076f
[XPU]Fix `flash_attn_varlen_func` interface on xpu (#22350)
jikunshang Aug 7, 2025
79dd15f
[Qwen3] Enable dual-chunk-attention support for Qwen3 models. (#21924)
sighingnow Aug 7, 2025
a84af5c
[Bugfix] Fix wrong method name in Intern-S1 image processor (#22417)
DarkLight1337 Aug 7, 2025
600f0f2
Use float32 for test_completion.py (#22385)
mgoin Aug 7, 2025
220b984
[Bugfix]: Fix the streaming output for function calls in the minimax …
qscqesze Aug 7, 2025
f92c018
[Bugfix] Add proper comparison for package versions (#22314)
syedmba Aug 7, 2025
5329d9a
Update `hf_xet` pin to resolve hangs (#22356)
hmellor Aug 7, 2025
ceeafed
Optimize logger init performance by using module-level constants (#22…
skyloevil Aug 7, 2025
0405339
preload heavy modules when mp method is forkserver (#22214)
lionelvillard Aug 7, 2025
6002d81
[gpt-oss] Convert user input to harmony format (#22402)
heheda12345 Aug 7, 2025
e285926
[Bugfix] EPLB load statistics problem (#22167)
david6666666 Aug 7, 2025
8fb5c56
[CI] Skip the pooling models that do not support transformers v4.55 (…
noooop Aug 7, 2025
de47ec7
[Bench] Split serve.py:main into async/async versions (#22405)
lk-chen Aug 7, 2025
f0e4a8f
[Model] Switch to Fused RMS norm in Qwen2.5_VL model. (#22184)
vllmellm Aug 7, 2025
380a826
[Frontend] Update OpenAI error response to upstream format (#22099)
msanft Aug 7, 2025
1bee7ec
[Misc] Support routing logic simulation (#21990)
minosfuture Aug 7, 2025
2a8e85e
feat: Add --enable-log-outputs flag for logging model generations (#2…
mizadri Aug 7, 2025
5a35780
init frondend
jinzhen-lin Aug 7, 2025
e1b2854
fix
jinzhen-lin Aug 7, 2025
64874b1
fix scale
jinzhen-lin Aug 7, 2025
27f67f8
fix interleave
jinzhen-lin Aug 7, 2025
e949f61
activation func test
jinzhen-lin Aug 7, 2025
657f9ad
fix activation
jinzhen-lin Aug 7, 2025
1b3037b
fix
jinzhen-lin Aug 8, 2025
2af5b48
Update csrc/moe/marlin_moe_wna16/kernel.h
jinzhen-lin Aug 8, 2025
e07f4b6
fix format
jinzhen-lin Aug 8, 2025
ff5d7c9
fix
jinzhen-lin Aug 8, 2025
f329092
fix format
jinzhen-lin Aug 8, 2025
88a15f6
[Docs] Add missing dependency for docs build (#22435)
hmellor Aug 7, 2025
8987866
Add H20-3e fused MoE kernel tuning configs for GLM-4.5 (#22433)
JaceyShao Aug 7, 2025
8838a14
[Misc] Enhance code formatting in mxfp4.py (#22423)
WoosukKwon Aug 7, 2025
bc5711f
[Doc] Fix link to prefix caching design (#22384)
sarckk Aug 7, 2025
7c50c73
[Docs] Factor out troubleshooting to its own guide; add section for R…
crypdick Aug 7, 2025
69ea93f
[Doc] update docs for nightly benchmarks (#12022)
andrewkchan Aug 7, 2025
c50d484
[Docs] Update features/disagg_prefill, add v1 examples and developmen…
david6666666 Aug 7, 2025
5ba005b
[Core] Store only the keys for multi-modal data in P0 (#22198)
DarkLight1337 Aug 7, 2025
8841232
[Bugfix] Add missing `packed_modules_mapping` to `DeepseekV2ForCausal…
fxmarty-amd Aug 7, 2025
9f0edb0
[Tool] Fix auto tool call (#22434)
heheda12345 Aug 7, 2025
4b3ac39
[gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)
heheda12345 Aug 7, 2025
f2b502f
Fix pre-commit error in main (#22462)
WoosukKwon Aug 7, 2025
b84e781
[Core] Simplify mm processing cache (#22457)
DarkLight1337 Aug 7, 2025
7ec1dfa
[Frontend] Use engine argument to control MM cache size (#22441)
DarkLight1337 Aug 7, 2025
dc8ffbf
Remove `from_dict` from `SpeculativeConfig` (#22451)
hmellor Aug 7, 2025
df5e699
[Misc] normalize multiprocessing Queue usage (#22371)
andyxning Aug 8, 2025
c8df47b
[ROCm] [V1] [SpecDec] Enable Speculative Decoding on ROCm V1 Engine (…
tjtanaa Aug 8, 2025
c5d21f4
[PERF] Use pybase64 to more quickly decode prompt embeddings (#22469)
qthequartermasterman Aug 8, 2025
4f12acf
Add ModelOpt Qwen3 nvfp4 support (#20101)
Edwardf0t1 Aug 8, 2025
a0484d1
Support Tensorrt-LLM MoE fp4 for low-latency (#21331)
wenscarl Aug 8, 2025
21759ac
Fix Flashinfer CUTLASS MOE Allgather (#21963)
wenscarl Aug 8, 2025
cc5977f
[Kernel] Add support for block FP8 on SM120 (NVIDIA 5090 and RTX PRO …
0xjunhao Aug 8, 2025
3308d73
[Bugfix] Fix RuntimeError: Index put requires the source and destinat…
chaunceyjiang Aug 8, 2025
9e76de0
not tie_word_embeddings for glm-4.5 and glm-4.5v (#22460)
zRzRzRzRzRzRzR Aug 8, 2025
cc12828
Optimize MiniCPMO mask creation with vectorized implementation (#22464)
skyloevil Aug 8, 2025
63c8132
Fix pre-commit (#22487)
DarkLight1337 Aug 8, 2025
0e49a59
[bugfix] Fix Llama3/4 issues caused by FlashInfer 0.2.10 (#22426)
nvpohanh Aug 8, 2025
b354589
[Doc] Sleep mode documentation (#22310)
iAmir97 Aug 8, 2025
784c4e3
[bench] Fix benchmark/serve.py to ignore unavailable results (#22382)
lk-chen Aug 8, 2025
1bd12bc
fix topk
jinzhen-lin Aug 8, 2025
43c3dea
fix topk
jinzhen-lin Aug 8, 2025
e3a4420
[CI/Build] Fix multimodal tests (#22491)
DarkLight1337 Aug 8, 2025
ed4b805
[Misc] Begin deprecation of `get_tensor_model_*_group` (#22494)
DarkLight1337 Aug 8, 2025
60c4022
[Misc] fix openai version (#22485)
lengrongfu Aug 8, 2025
0c882d0
[BugFix] Don't cancel asyncio tasks directly from destructors (#22476)
njhill Aug 8, 2025
bb65f55
[Docs] Improve API docs (+small tweaks) (#22459)
hmellor Aug 8, 2025
108ee94
Remove exception for Python 3.8 typing from linter (#22506)
hmellor Aug 8, 2025
c219316
[gpt-oss] triton kernel mxfp4 (#22421)
zyongye Aug 8, 2025
6484dad
disable m_block_size_8 temporarily
jinzhen-lin Aug 8, 2025
4b63d40
fix moe_block_size_8
jinzhen-lin Aug 8, 2025
53871fa
fix idx
jinzhen-lin Aug 8, 2025
78710bd
fix
jinzhen-lin Aug 8, 2025
2634144
fix format
jinzhen-lin Aug 8, 2025
481bf21
disable on hopper
jinzhen-lin Aug 8, 2025
5a93660
[Benchmark] Add benchmark tool for multi turn conversations (#20267)
pliops-daniels Aug 8, 2025
20473c8
[gpt-oss] guard import when triton kernel is not installed (#22529)
zyongye Aug 8, 2025
4153b92
[Docs] Rename “Distributed inference and serving” to “Parallelism & S…
crypdick Aug 8, 2025
8f4ee95
[gpt-oss] Support tool call and implement MCP tool server (#22427)
heheda12345 Aug 8, 2025
17ab1c1
[BugFix] Fix IMA FlashMLA full cuda-graph and DP + Update FlashMLA (#…
LucasWilkinson Aug 8, 2025
327a7f5
[Misc] DeepGEMM : Avoid JIT generation in the hot-path (#22215)
varun-sundar-rabindranath Aug 8, 2025
9ce2b24
[Bugfix] Update FA commit hash (#22546)
tdoublep Aug 8, 2025
555741f
Skip Qwen 1 in CI because remote code is no longer compatible with Tr…
hmellor Aug 8, 2025
ccd6439
[Docs] fix broken links in metrics.md (#22315)
GuyStone Aug 8, 2025
d4d6bdf
[Frontend] Add unix domain socket support (#18097)
yyweiss Aug 8, 2025
229d7da
Extract `CompilationConfig` from `config.py` (#22524)
hmellor Aug 8, 2025
11240e0
Drop flaky test_healthcheck_response_time (#22539)
russellb Aug 8, 2025
4af55c2
[XPU] upgrade torch 2.8 on for XPU (#22300)
jikunshang Aug 9, 2025
4e9d93a
[BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec…
Pradyun92 Aug 9, 2025
2eebe6d
update use_marlin condition
jinzhen-lin Aug 9, 2025
234d02b
Merge branch 'main' of github.com:vllm-project/vllm into marlin-mxfp4…
jinzhen-lin Aug 9, 2025
1f48276
fix
jinzhen-lin Aug 9, 2025
59a1e4f
Merge branch 'main' of github.com:vllm-project/vllm into marlin-mxfp4…
jinzhen-lin Aug 10, 2025
1af6f1f
update activation and use_marlin condition
jinzhen-lin Aug 10, 2025
3e918f6
Merge branch 'main' into marlin-mxfp4-bias
mgoin Aug 10, 2025
7b61670
Fix precommit
mgoin Aug 10, 2025
b04cdbb
Fix _should_use_marlin
mgoin Aug 10, 2025
71a1246
fix
jinzhen-lin Aug 11, 2025
1dbcc42
Merge branch 'marlin-mxfp4-bias' of github.com:jinzhen-lin/vllm into …
jinzhen-lin Aug 11, 2025
37df2f0
fix _can_support_mxfp4
jinzhen-lin Aug 11, 2025
b34ca30
fix nvcc warning
jinzhen-lin Aug 11, 2025
53022e6
fix opcheck args
jinzhen-lin Aug 12, 2025
4b43856
fix shared memory size
jinzhen-lin Aug 12, 2025
3d14a28
fix format
jinzhen-lin Aug 12, 2025
da93a71
fix _gptq_marlin_gemm_fake
jinzhen-lin Aug 12, 2025
eb3214e
fix nvcc warning
jinzhen-lin Aug 12, 2025
f3e34ef
fix format
jinzhen-lin Aug 12, 2025
2293ff1
Merge branch 'main' of github.com:vllm-project/vllm into marlin-mxfp4…
jinzhen-lin Aug 12, 2025
b63a6c0
Merge branch 'main' of github.com:vllm-project/vllm into marlin-mxfp4…
jinzhen-lin Aug 12, 2025
2032ded
update CMakeLists.txt
jinzhen-lin Aug 12, 2025
7b5632d
Merge branch 'main' into marlin-mxfp4-bias
mgoin Aug 12, 2025
62357e8
fix gptq marlin bias permute
jinzhen-lin Aug 13, 2025
3143ff4
fix get_kernel_cache_size
jinzhen-lin Aug 13, 2025
590a3b1
Merge branch 'marlin-mxfp4-bias' of github.com:jinzhen-lin/vllm into …
jinzhen-lin Aug 13, 2025
b764bbb
add bias permute for fp8 marlin
jinzhen-lin Aug 13, 2025
69fe493
add missing bias permute
jinzhen-lin Aug 13, 2025
1dff08b
Merge branch 'main' into marlin-mxfp4-bias
mgoin Aug 14, 2025
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -351,6 +351,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
       CUDA_ARCHS "${MARLIN_ARCHS}")
+    set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
 
     list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
@@ -364,7 +366,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_SRCS}"
       CUDA_ARCHS "${MARLIN_ARCHS}")
+    set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
     list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+
     message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
   else()
     message(STATUS "Not building Marlin kernels as no compatible archs found"
@@ -854,6 +859,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_gencode_flags_for_srcs(
       SRCS "${MOE_WNAA16_MARLIN_SRC}"
       CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+    set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
 
     list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
1 change: 1 addition & 0 deletions benchmarks/kernels/benchmark_machete.py
@@ -236,6 +236,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
         a=bt.a,
         c=None,
         b_q_weight=w_q,
+        b_bias=None,
         b_scales=w_s,
         global_scale=None,
         b_zeros=w_zp,
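The new `b_bias` argument is left as `None` in the benchmark, but it threads an optional per-output-channel bias into the Marlin GEMM entry point. A minimal NumPy sketch of the semantics the argument implies; `marlin_gemm_ref` is a hypothetical reference, not the vLLM API:

```python
# Reference semantics for an optional GEMM bias (illustration only).
import numpy as np

def marlin_gemm_ref(a, w_deq, b_bias=None):
    """out = a @ w_deq, plus a per-column bias when one is supplied."""
    out = a @ w_deq
    if b_bias is not None:       # b_bias has shape [n], broadcast over rows
        out = out + b_bias
    return out

a = np.random.randn(4, 8).astype(np.float32)
w = np.random.randn(8, 16).astype(np.float32)
bias = np.random.randn(16).astype(np.float32)
assert np.allclose(marlin_gemm_ref(a, w, bias), a @ w + bias)
```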
2 changes: 2 additions & 0 deletions csrc/core/scalar_type.hpp
@@ -321,6 +321,8 @@ static inline constexpr auto kFE3M2f =
     ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
 static inline constexpr auto kFE4M3fn =
     ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
+static inline constexpr auto kFE8M0fnu =
+    ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
 static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
 static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
 static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
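`kFE8M0fnu` describes an 8-bit exponent-only scale: no sign bit, no mantissa, so every code is a power of two. Under the OCP Microscaling (MX) convention that mxfp4 follows, one such scale covers a group of 32 FE2M1 (fp4) elements. A decoding sketch, assumed from the MX spec rather than taken from the kernels:

```python
# E8M0 scale and E2M1 element decoding per the OCP MX convention (sketch).
E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # 3-bit magnitudes

def decode_e8m0(code: int) -> float:
    """Unsigned, mantissa-free: value = 2**(code - 127); 0xFF encodes NaN."""
    return float("nan") if code == 0xFF else 2.0 ** (code - 127)

def decode_e2m1(nibble: int) -> float:
    sign = -1.0 if nibble & 0x8 else 1.0
    return sign * E2M1_MAGNITUDES[nibble & 0x7]

def dequant_mxfp4_group(nibbles: list[int], scale_code: int) -> list[float]:
    # One mxfp4 group: 32 fp4 nibbles sharing a single e8m0 scale.
    s = decode_e8m0(scale_code)
    return [decode_e2m1(q) * s for q in nibbles]

print(dequant_mxfp4_group([0x1, 0x9, 0x7], 126))  # [0.25, -0.25, 3.0]
```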
15 changes: 15 additions & 0 deletions csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -20,6 +20,7 @@
 TEMPLATE = ("template __global__ void Marlin<"
             "{{scalar_t}}, "
             "{{w_type_id}}, "
+            "{{s_type_id}}, "
             "{{threads}}, "
             "{{thread_m_blocks}}, "
             "{{thread_n_blocks}}, "
@@ -77,6 +78,7 @@ def generate_new_kernels():
             if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
                 continue
             # nvfp4 only supports group_size == 16
+            # mxfp4 only supports group_size == 32
             if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
                 continue
             # other quantization methods don't support group_size = 16
@@ -89,9 +91,22 @@ def generate_new_kernels():
 
             c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
 
+            if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
+                s_type = "vllm::kFE4M3fn"
+            elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
+                s_type = "vllm::kFE8M0fnu"
+                if dtype == "fp16":
+                    # we cannot safely dequantize e8m0 to fp16, so skip this
+                    continue
+            elif dtype == "fp16":
+                s_type = "vllm::kFloat16"
+            elif dtype == "bf16":
+                s_type = "vllm::kBFloat16"
+
             template_str = jinja2.Template(TEMPLATE).render(
                 scalar_t=c_dtype,
                 w_type_id=scalar_type + ".id()",
+                s_type_id=s_type + ".id()",
                 threads=threads,
                 thread_m_blocks=max(m_blocks, 1),
                 thread_n_blocks=n_blocks,
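The generator now pairs every weight type with an explicit scale type (`s_type_id`), and drops the e8m0-scale/fp16-compute combination; `group_blocks` appears to count 16-wide groups, so `group_blocks == 2` matches the mxfp4 group size of 32. The skip is about range, not precision: an E8M0 scale can be as large as 2**127, which bf16 (8 exponent bits) represents but fp16 (max 65504) overflows. A quick illustrative check:

```python
# Why (e8m0 scale, fp16 compute) is skipped: the scale's range exceeds fp16.
import torch

big_scale = torch.tensor(2.0 ** 100)  # a representable e8m0 scale, 2**(227-127)
print(big_scale.to(torch.bfloat16))   # ~1.2677e+30 -- finite in bf16
print(big_scale.to(torch.float16))    # inf -- fp16 tops out at 65504
```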
26 changes: 14 additions & 12 deletions csrc/moe/marlin_moe_wna16/kernel.h
@@ -7,23 +7,25 @@
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"
 
-#define MARLIN_KERNEL_PARAMS                                          \
-  const int4 *__restrict__ A, const int4 *__restrict__ B,             \
-      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                 \
-      const int4 *__restrict__ scales_ptr,                            \
-      const uint16_t *__restrict__ scale2_ptr,                        \
-      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
-      const int32_t *__restrict__ sorted_token_ids_ptr,               \
-      const int32_t *__restrict__ expert_ids_ptr,                     \
-      const int32_t *__restrict__ num_tokens_past_padded_ptr,         \
-      const float *__restrict__ topk_weights_ptr, int top_k,          \
-      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,  \
-      int prob_n, int prob_k, int *locks, bool use_atomic_add,        \
+#define MARLIN_KERNEL_PARAMS                                                  \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
+      const int4 *__restrict__ b_bias_ptr,                                    \
+      const int4 *__restrict__ scales_ptr,                                    \
+      const uint16_t *__restrict__ scale2_ptr,                                \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
+      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
+      const int32_t *__restrict__ expert_ids_ptr,                             \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
+      const float *__restrict__ topk_weights_ptr, int top_k,                  \
+      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
+      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
       bool use_fp32_reduce, int max_shared_mem
 
 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,          // compute dtype, half or nv_float16
           const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
           const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
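`MARLIN_KERNEL_PARAMS` gains `b_bias_ptr` and `has_bias`, so every instantiated MoE kernel now receives the bias operand alongside the top-k routing weights, and the template takes the new `s_type_id` scale-type parameter. A NumPy reference of one plausible per-expert epilogue ordering (bias applied before routing-weight scaling); the hunk doesn't show the kernel's actual ordering, so treat this as an assumption:

```python
# Assumed per-expert epilogue: accumulator -> (+ bias) -> (* top-k weight).
import numpy as np

def moe_expert_epilogue(acc, bias, topk_weight, has_bias, mul_topk_weights):
    out = acc                             # [tokens, n] GEMM accumulator
    if has_bias:
        out = out + bias                  # b_bias_ptr: one [n] bias per expert
    if mul_topk_weights:
        out = out * topk_weight[:, None]  # topk_weights_ptr: per-token scale
    return out

acc = np.ones((2, 4), dtype=np.float32)
print(moe_expert_epilogue(acc, np.full(4, 0.5, np.float32),
                          np.array([1.0, 0.25], np.float32), True, True))
# [[1.5   1.5   1.5   1.5  ]
#  [0.375 0.375 0.375 0.375]]
```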