Merged
428 commits
3cc1c8b
add dsv4 flash coherence sanity tests
hnyls2002 Apr 30, 2026
623a314
add server sanity kit + dsv4 flash sanity tests
hnyls2002 Apr 30, 2026
df00677
nextn subclass owns post_load_weights is_nextn
hnyls2002 Apr 30, 2026
7d38986
remove deprecated environ
hnyls2002 Apr 30, 2026
ed98c6f
fix lint
hnyls2002 Apr 30, 2026
e055a5b
reduce one duplicate
hnyls2002 Apr 30, 2026
626c862
revert noisy log prefix in _try_load_model_cls
hnyls2002 Apr 30, 2026
c68c649
restore _build_hisparse_decode_batch docstring
hnyls2002 Apr 30, 2026
ce2ff84
Add manual AIME25 tests for DeepSeek-V4 cookbook launch configs (#24104)
Fridge003 Apr 30, 2026
5cb0a57
inline Compressor.compress_fused into Compressor.forward (single caller)
fzyzcjy Apr 30, 2026
f3e040b
remove redundant self.rotary_emb in MQALayer/Compressor/C4Indexer
fzyzcjy Apr 30, 2026
ba7ef7b
remove unused Compressor.overlap_transform / overlap_transform_decode
fzyzcjy Apr 30, 2026
567b6a1
remove duplicate 'from sglang.srt.environ import envs' import
fzyzcjy May 1, 2026
45bd371
remove unused rms_normalize function in deepseek_v4.py
fzyzcjy May 1, 2026
354c102
remove unused Compressor.compute_state_len{,_indices} static helpers
fzyzcjy May 1, 2026
5f9247d
remove unused freqs_cis param from MQALayer prepare/compute helpers
fzyzcjy May 1, 2026
3a63091
remove unused debug_return_kv param from MQALayer.forward
fzyzcjy May 1, 2026
e8158c5
drop unused pp_proxy_tensors from inner DeepseekV4Model.forward
fzyzcjy May 1, 2026
d1907d3
drop unused 'as err' in encoding_dsv4 tool_call exception
fzyzcjy May 1, 2026
2dba457
remove unused parse_message_from_completion_text in encoding_dsv4
fzyzcjy May 1, 2026
f10765d
remove unused expand_seq_lens helper in paged_prefill
fzyzcjy May 1, 2026
8df1c9f
remove unused make_swa_ring_buffer_indices helper in paged_prefill
fzyzcjy May 1, 2026
8aa52bc
delete unused paged_prefill module (prepare_swa_ring_buffer_cache)
fzyzcjy May 1, 2026
6d83cd2
remove unused RaggedCoreMetadata and RaggedIndexerMetadata dataclasses
fzyzcjy May 1, 2026
371f94b
remove unused init_c4_metadata / init_c128_metadata wrappers
fzyzcjy May 1, 2026
642752b
drop dead seq_lens_sum=bs in IDLE replay branch
fzyzcjy May 1, 2026
7d35c91
drop unused _is_cuda/_is_cpu/_is_cpu_amx_available/_use_aiter in deep…
fzyzcjy May 1, 2026
86ca10d
remove unused fused_norm_rope_inplace wrapper in jit_kernel/deepseek_v4
fzyzcjy May 1, 2026
e4eb8ac
remove unused HiSparseCoordinator.get_front_topk_tokens
fzyzcjy May 1, 2026
0353cb9
remove MOE/ATTN/COMPRESSOR_BIT_WISE_EQUAL_MODE flags
fzyzcjy May 1, 2026
612aa66
Revert "remove unused parse_message_from_completion_text in encoding_…
fzyzcjy May 1, 2026
00370f2
Revert "drop unused 'as err' in encoding_dsv4 tool_call exception"
fzyzcjy May 1, 2026
c8a561d
Revert "drop dead seq_lens_sum=bs in IDLE replay branch"
fzyzcjy May 1, 2026
83b48df
unify cuda graph metadata dicts via _GraphBucket enum in deepseek_v4_…
fzyzcjy May 1, 2026
c8f263a
remove unused yarn_get_mscale helper in deepseek_v4
fzyzcjy May 1, 2026
1bf0585
remove unused is_layer_sparse/_is_layer_sparse on DeepseekV4DecoderLayer
fzyzcjy May 1, 2026
0ea8709
drop dead BumpAllocator setup in DeepseekV4Model.forward
fzyzcjy May 1, 2026
37d499a
remove dead num_fused_shared_experts > 0 branches in DSv4
fzyzcjy May 1, 2026
ec693f8
remove unused self.padding_id in DeepseekV4ModelNextN
fzyzcjy May 1, 2026
da1e4bc
remove unused self.layers_to_capture in DeepseekV4ModelNextN
fzyzcjy May 1, 2026
6382a13
remove unused self.enable_a2a_moe in DeepseekV4ModelNextN
fzyzcjy May 1, 2026
2250c7b
Revert "remove dead num_fused_shared_experts > 0 branches in DSv4"
fzyzcjy May 1, 2026
145be01
remove unused PagedCoreMetadata class
fzyzcjy May 1, 2026
89b4fd1
remove unused DeepseekV4Metadata class
fzyzcjy May 1, 2026
1cb30cd
remove unused CoreMetadata.init_swa_slice method
fzyzcjy May 1, 2026
267ca2a
drop unread c4_positions field on DSV4AttnMetadataRadix
fzyzcjy May 1, 2026
6b8cea5
drop unread c128_positions field on DSV4AttnMetadataRadix
fzyzcjy May 1, 2026
4736050
drop unused real_metadata field on DSV4MetadataRawVerify/RawDecode
fzyzcjy May 1, 2026
a374819
remove unused apply_rotary_emb in deepseek_v4_rope
fzyzcjy May 1, 2026
410ec93
remove unused tilelang_make_swa_prefill_indices and helper kernel
fzyzcjy May 1, 2026
2c72be8
remove unused CompressorPrefillPlan.copy_ method
fzyzcjy May 1, 2026
8631d6b
remove unused CompressorDecodePlan.copy_ method
fzyzcjy May 1, 2026
0b8ac0c
Revert "remove unused CompressorDecodePlan.copy_ method"
fzyzcjy May 1, 2026
c4b9411
Revert "remove unused CompressorPrefillPlan.copy_ method"
fzyzcjy May 1, 2026
17fd7c8
drop unused start_event field on HiSparseAct namedtuple
fzyzcjy May 1, 2026
83f41d0
Revert "drop unused start_event field on HiSparseAct namedtuple"
fzyzcjy May 1, 2026
9f849e3
fix: re-add is_hip import in deepseek_v4_topk
fzyzcjy May 1, 2026
fc84e1d
fix lint after dead-code cleanup
fzyzcjy May 1, 2026
b503ba6
rename is_deepseek_compressed; move dsv4 fp4 autodetect out of model_…
hnyls2002 May 1, 2026
c344aa8
fix dsv4 dataclass import-order crash; move auto-detect helper to wei…
hnyls2002 May 1, 2026
0d291a3
Revert "fix dsv4 dataclass import-order crash; move auto-detect helpe…
hnyls2002 May 1, 2026
1045918
Revert "rename is_deepseek_compressed; move dsv4 fp4 autodetect out o…
hnyls2002 May 1, 2026
cda4bea
fix DSv4 config dataclass ordering with transformers 5.6 PretrainedCo…
hnyls2002 May 1, 2026
3175d0b
fix v4 nextn post_load_weights signature accept is_nextn kwarg
hnyls2002 May 1, 2026
6a90273
stop on newline in determinism probe to avoid drift on continuation
hnyls2002 May 1, 2026
178dcdf
simplify determinism probe comments
hnyls2002 May 1, 2026
c4d2056
rename is_deepseek_compressed; move dsv4 fp4 autodetect out of model_…
hnyls2002 May 1, 2026
d7f7a72
fix dsv4 dataclass import-order crash; move auto-detect helper to wei…
hnyls2002 May 1, 2026
955eb87
move dsv4 fp4 autodetect from weight_utils into configs/deepseek_v4
hnyls2002 May 1, 2026
02a6950
add swa unit tests
ispobock May 1, 2026
1a48196
Replace SGLANG_DSV4_FP4_EXPERTS env with ModelConfig.is_fp4_experts a…
hnyls2002 May 1, 2026
335a2b3
add marlin to MOE_RUNNER_BACKEND_CHOICES (CLI choices list was out of…
fzyzcjy May 1, 2026
2a30899
Revert "Replace SGLANG_DSV4_FP4_EXPERTS env with ModelConfig.is_fp4_e…
hnyls2002 May 1, 2026
98d35bc
remove SGLANG_OPT_DEEPGEMM_SCALE_CONVERT_AT_INIT (default True), inli…
fzyzcjy May 1, 2026
c0d402e
fix(support_triton): restore "ascend" backend (lost in rebase merge)
fzyzcjy May 1, 2026
8fe4670
remove duplicate get_libnuma/numa_bind_to_node from utils/common.py (…
fzyzcjy May 1, 2026
a5f4cc7
restore top-level 'import gc' in utils/common.py to match main
fzyzcjy May 1, 2026
9d5a84e
MoEGate: gate prefill_cp F.linear shortcut on not is_deepseek_v4 (dro…
fzyzcjy May 1, 2026
84cfee1
MoEGate: gate linear_bf16_fp32 fallback on is_deepseek_v4 (restore F.…
fzyzcjy May 1, 2026
9144676
DeepseekV2MoE: drop incorrect 'assert hasattr(self, shared_experts)' …
fzyzcjy May 1, 2026
7bd615b
remove SGLANG_OPT_ALLOW_SHARED_EXPERT_DUAL_STREAM (default True), inl…
fzyzcjy May 1, 2026
27cb98c
topk: drop duplicate '_is_xpu = is_xpu()' assignment
fzyzcjy May 1, 2026
1f7708d
Revert "topk: drop duplicate '_is_xpu = is_xpu()' assignment"
fzyzcjy May 1, 2026
c2bae49
DSV4 fp4 experts: env user override + try-detect → ModelConfig.is_fp4…
hnyls2002 May 1, 2026
c76ea69
cleanup swa fix env and test
ispobock May 1, 2026
73d128e
Merge remote-tracking branch 'origin' into dsv4-rebase
hnyls2002 May 1, 2026
e9b8d3e
group all DSV4 envs into one section with sub-group headers
hnyls2002 May 1, 2026
a49605d
is_deepseek_nsa/v4: accept dict-or-object via shared _hf_arch/_hf_att…
hnyls2002 May 1, 2026
229c0fe
[Dep] Add tilelang to pyproject.toml (#24178)
Fridge003 May 1, 2026
111dba6
drop SGLANG_DISABLE_REQUEST_LOGGING doc entry; env has no reader
hnyls2002 May 1, 2026
a18ceb5
drop dead branches in DeepSeekV4SingleKVPool
hnyls2002 May 1, 2026
708f215
add is_swa_like_pool helper
hnyls2002 May 1, 2026
b5e4991
add BaseSWAKVPool ABC
hnyls2002 May 1, 2026
4523bc7
drop is_swa_like_pool; use BaseSWAKVPool isinstance
hnyls2002 May 1, 2026
a18f1c1
all req pools reserve slot 0 as padding
hnyls2002 May 1, 2026
66572a6
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 1, 2026
008e7b8
rename expected_free back to req_total_size to match main
hnyls2002 May 1, 2026
850281a
restore SWAChunkCache assert; allow HiSparse allocator
hnyls2002 May 2, 2026
8fe0104
restore session_held_mamba_slots chain
hnyls2002 May 2, 2026
a8149dd
fix mamba slot leak in release_session
hnyls2002 May 2, 2026
d96d731
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 2, 2026
2c4693b
align with main's dependencies
hnyls2002 May 2, 2026
3203982
drop CoreMetadata + IndexerMetadata dead code
hnyls2002 May 2, 2026
cf3d985
rename maybe_torch_compile -> compile_in_capture_mode; move to cuda_g…
hnyls2002 May 2, 2026
778ec36
align custom_all_reduce_v2 to main
hnyls2002 May 2, 2026
6de214b
drop SGLANG_DISABLE_REQUEST_LOGGING doc; env has no reader
hnyls2002 May 2, 2026
31ef73a
resolve hisparse conflict
xiezhq-hermann May 2, 2026
5e68a6f
hisparse cleaning
xiezhq-hermann May 2, 2026
b3175f6
Merge origin/main into dsv4-rebase
hnyls2002 May 2, 2026
4d711cb
drop redundant f-prefix
hnyls2002 May 2, 2026
53e6703
rename topk
ispobock May 2, 2026
0f2db17
restore docs
ispobock May 2, 2026
38902df
fix rebase swa evict
ispobock May 2, 2026
9c3e572
update
ispobock May 2, 2026
022e293
add ut for leaf split
ispobock May 2, 2026
5a65728
Fix weight checker float dtype detection
yueming-yuan May 2, 2026
1dc3397
Align weight checker reset handling
yueming-yuan May 2, 2026
d6d1240
Handle DeepSeek KV cache scales in weight checker
yueming-yuan May 3, 2026
60cb388
force DSV4 topk_group=n_group so router takes ungrouped sqrtsoftplus …
hnyls2002 May 3, 2026
591a42f
register deepseek_v4 via DeepseekV3Config subclass alias
hnyls2002 May 3, 2026
87d385c
move allreduce v2 env flag to match main location
hnyls2002 May 3, 2026
bed994e
restore HIP buf_numel_per_page assertion
hnyls2002 May 3, 2026
0f4d53a
restore HIP page_size assertion
hnyls2002 May 3, 2026
57e1b98
restore blank line in combine_a
hnyls2002 May 3, 2026
3f9037a
extract mega-moe to layers/moe/mega_moe.py (#24301)
hnyls2002 May 3, 2026
d6418f4
restore registry log message
hnyls2002 May 3, 2026
0cd8b6e
extract MxFP4 fused RSF+shared_add helper
hnyls2002 May 3, 2026
698797e
revert HIP page_size to main
hnyls2002 May 3, 2026
6bdb5e1
Merge origin/main into dsv4-rebase
hnyls2002 May 3, 2026
62cbbb8
extract dsv4 server-args hooks to arg_groups (#24326)
hnyls2002 May 3, 2026
d564605
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 3, 2026
66fa6ac
sync utils/common.py with main; revert NPU bugfix revert
hnyls2002 May 3, 2026
6025a50
extract hisparse hook to arg_groups
hnyls2002 May 3, 2026
61b26dc
drop SGLANG_ENABLE_THINKING force on HF path
hnyls2002 May 3, 2026
e107dd7
rename SGLANG_ENABLE_THINKING to SGLANG_DEFAULT_THINKING
hnyls2002 May 4, 2026
bc4fab4
rename deepseekv4_memory_pool to deepseek_v4_memory_pool
hnyls2002 May 4, 2026
8e88bb8
rename compress_state to deepseek_v4_compress_state
hnyls2002 May 4, 2026
bfa804b
rebase main with 91fa2340ed
hnyls2002 May 4, 2026
b56d72e
drop SGLANG_OPT_V4_DRAFT_EXTEND_CUDA_GRAPH
hnyls2002 May 4, 2026
ce278b1
rename DeepseekV4BackendRadix to DeepseekV4AttnBackend; drop Radix su…
hnyls2002 May 4, 2026
581c44d
lock deps to cu129 baseline; main adaptation pending
hnyls2002 May 4, 2026
6ea6048
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
009bbc7
move v4 pd pp=1 check to server args
hnyls2002 May 4, 2026
b22221f
revert rocm artifacts from dsv4-rebase (cuda-only scope) (#24339)
hnyls2002 May 4, 2026
df16061
rename mxfp4_deepseek to mxfp4_flashinfer_trtllm_moe
hnyls2002 May 4, 2026
7ce156b
reconcile v4 nixl with main; add v4 dispatch branch
hnyls2002 May 4, 2026
16afced
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
d88c329
move dsv4 metadata init kernel to compressed/
hnyls2002 May 4, 2026
77b5774
rename on_after_cuda_graph_warmup; tidy comments
hnyls2002 May 4, 2026
eecf0fb
rename attention/compressed to compression
hnyls2002 May 4, 2026
70d5e29
drop SGLANG_FIX_MTP_HC_HIDDEN; default-on
hnyls2002 May 4, 2026
e46b1e3
minor: use main
DarkSharpness May 4, 2026
9267502
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
6617c26
rename compressed attn backend to dsv4; deprecate alias
hnyls2002 May 4, 2026
89411ab
rename is_swa_with_compressed_attention -> is_deepseek_v4_arch
hnyls2002 May 4, 2026
be95d76
rename init_compressed_metadata -> init_compression_metadata
hnyls2002 May 4, 2026
d1d3e57
rename SGLANG_REASONING_EFFORT -> SGLANG_DSV4_REASONING_EFFORT
hnyls2002 May 4, 2026
8996c5a
fix SGLANG_DSV4_ISOLATE type: EnvInt -> EnvBool
hnyls2002 May 4, 2026
d39be6d
describe head_dim/block_size asserts in fp8_paged_mqa_logits_torch
hnyls2002 May 4, 2026
8a2a47f
minor: clean up
DarkSharpness May 4, 2026
1127c78
drop DeepseekRefRMSNorm; use standard RMSNorm in Compressor
hnyls2002 May 4, 2026
99e3141
drop unused methods on v4 compress state pool and attn backend
hnyls2002 May 4, 2026
2008b54
drop unused _DSV4_RAW_TYPES constant
hnyls2002 May 4, 2026
dc0c318
move Compressor/C4Indexer nn.Modules to layers/attention/compression
hnyls2002 May 4, 2026
470a380
rename Compressor/C4IndexerBackend to *BackendMixin
hnyls2002 May 4, 2026
48f7b07
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
a96fd95
import ReplicatedLinear from layers.linear instead of models.dbrx
hnyls2002 May 4, 2026
f325ce6
describe magic-number asserts in dsv4 attn / compressor / pool
hnyls2002 May 4, 2026
618317a
drop unused create_flashmla_metadata helper
hnyls2002 May 4, 2026
23eab17
rename layers/attention/compression to layers/attention/dsv4
hnyls2002 May 4, 2026
8f36a94
unify Dsv4/DSv4 mixed-case to DSV4
hnyls2002 May 4, 2026
6ce24a8
rename is_v4_model to is_dsv4_model
hnyls2002 May 4, 2026
88c5754
consolidate copy_metadata: drop duplicate _copy_metadata in v4 backend
hnyls2002 May 4, 2026
fcda6fc
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
884eaeb
use _skip_weight_check marker
yueming-yuan May 4, 2026
3f55938
upd
Fridge003 May 4, 2026
0f27c21
upd
Fridge003 May 4, 2026
3bf02ce
align c4 hisparse translate_loc names with NSA
hnyls2002 May 4, 2026
38827bf
DeepGemm fixes for v4 rebasing (#24399)
Fridge003 May 5, 2026
17591f3
sync attention/utils.py with main
hnyls2002 May 5, 2026
62969b3
fix glm4-moe-lite missing is_hash attr
hnyls2002 May 5, 2026
ed35a74
minor: remove flag
DarkSharpness May 5, 2026
40f07b0
minor: remove seemingly dangerous flags
DarkSharpness May 5, 2026
85c63af
restore hisparse comments to align with main
hnyls2002 May 5, 2026
024911f
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 5, 2026
c398860
drop duplicate maybe_collect_indexer_topk left by merge
hnyls2002 May 5, 2026
66b9453
drop duplicate indexer_topk kwarg left by merge
hnyls2002 May 5, 2026
d688b21
isort dsv4/indexer.py imports
hnyls2002 May 5, 2026
e11e351
tiny recover comments
hnyls2002 May 6, 2026
25a9fab
rename state_type v4 -> dsv4
hnyls2002 May 6, 2026
a9ef99b
guard self.is_hash with getattr; drop glm4 patch
hnyls2002 May 6, 2026
7609096
move nsa _v4 artifacts to attention/dsv4
hnyls2002 May 6, 2026
fa3e030
extract dsv4 paged_mqa_logits to attention/dsv4/tilelang_kernel
hnyls2002 May 6, 2026
1b38b68
drop SGLANG_OPT_DG_PAGED_MQA_LOGITS_CHUNK_SIZE
hnyls2002 May 6, 2026
bcca716
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 6, 2026
4e0b431
drop SGLANG_DSV4_ISOLATE
hnyls2002 May 6, 2026
8fa2b96
fixme: hisparse negative pool counter clamp
hnyls2002 May 6, 2026
584c86f
Remove dsv4 compress_state dead code (#24472)
hnyls2002 May 6, 2026
3593e22
restore non-dsv4 silu_and_mul fallback; gate dsv4 by swiglu_limit
hnyls2002 May 6, 2026
976a43d
remove requirement on fast hadamard for sm103
Fridge003 May 6, 2026
b0df768
tiny
hnyls2002 May 6, 2026
28dae6f
remove dead non-triton k cache quant
hnyls2002 May 6, 2026
c3d8770
drop dead swa_page_size and unused fields on CompressStatePool
hnyls2002 May 6, 2026
3039f4f
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
b69a724
drop duplicate TestDeepSeekV4Detector from rebase
hnyls2002 May 6, 2026
a3b4935
minor: remove selector
DarkSharpness May 6, 2026
72679d7
fix has_attention_sinks unset for non-hybrid-swa
hnyls2002 May 6, 2026
973f098
split hisparse mla swap-in by buffer layout
hnyls2002 May 6, 2026
087d83b
small upgrade sglang-kernel version
Fridge003 May 6, 2026
3ffc34d
fix non-dsv4 cuda graph replay typeerror
hnyls2002 May 6, 2026
a17de73
try revert
hnyls2002 May 6, 2026
a5fff9b
guard on_after_cuda_graph_warmup for non-dsv4 draft backends
hnyls2002 May 6, 2026
5760cc1
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
c918888
fix swa eviction test mock req
hnyls2002 May 6, 2026
dba0adc
restore admit_request_direct on HiSparseCoordinator
hnyls2002 May 6, 2026
7298f63
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
4acbcca
gate masked deep_gemm V4 path on swiglu_limit
hnyls2002 May 6, 2026
fce5ad6
fix _postprocess_tensors test calls
hnyls2002 May 6, 2026
c822e91
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
23e62ce
fix _make_req mock missing seqlen
hnyls2002 May 6, 2026
c39f6d9
restore naive_load_topk on HiSparseCoordinator
hnyls2002 May 7, 2026
1875af4
fix: NSA prefill context parallel crash (dsv4-rebase) (#24560)
yhyang201 May 7, 2026
900eac3
Merge branch 'main' into dsv4-rebase
hnyls2002 May 7, 2026
cfdce82
Merge remote-tracking branch 'origin/dsv4-rebase' into dsv4-rebase
hnyls2002 May 7, 2026
1623ce0
Merge remote-tracking branch 'origin' into dsv4-rebase
hnyls2002 May 7, 2026
6efeee8
Port MXFP4 Marlin MoE support to JIT kernel path (#24490)
yhyang201 May 7, 2026
0826876
gate jit mask_topk_ids; default off
hnyls2002 May 7, 2026
f490e24
Merge remote-tracking branch 'origin' into dsv4-rebase
hnyls2002 May 7, 2026
cbcd3a3
Inline SGLANG_OPT_MXFP4_FUSE_RSF_SHARED_ADD (default True)
fzyzcjy May 7, 2026
2267007
Inline SGLANG_OPT_MXFP4_STATIC_SCALE_ONES (default True)
fzyzcjy May 7, 2026
5d8779f
Inline SGLANG_OPT_MXFP4_SKIP_DISPATCHER_MAPPING (default True)
fzyzcjy May 7, 2026
f64d12f
Remove unused SGLANG_FIX_ATTN_BACKEND_IDLE env var
fzyzcjy May 7, 2026
1bc545e
Inline SGLANG_FIX_PD_IDLE (default True)
fzyzcjy May 7, 2026
fe60306
Drop unused envs import in mxfp4_flashinfer_trtllm_moe.py
fzyzcjy May 7, 2026
0d7379f
Revert "gate jit mask_topk_ids; default off"
hnyls2002 May 7, 2026
3a469e5
fix lint
hnyls2002 May 7, 2026
a32a55c
Merge remote-tracking branch 'upstream/dsv4-rebase' into dsv4-rebase
fzyzcjy May 7, 2026
4c3689b
fix error when model has no swiglu limit
fzyzcjy May 7, 2026
47d6c2b
fix AMD build: guard enable_cluster with USE_ROCM
fzyzcjy May 7, 2026
9e45b2d
add comments
fzyzcjy May 7, 2026
7fde762
fix _mask_topk_ids_padded_region
fzyzcjy May 7, 2026
37f4a77
fmt code
fzyzcjy May 7, 2026
f54b6ee
fix: cast kv_compressed to bf16 before rotate_activation in compressor
yhyang201 May 7, 2026
dfaf8a9
add B200 CI tests for DSV4 Flash FP4 (per-commit + nightly)
yhyang201 May 7, 2026
f81a88a
add H200 CI test for DSV4 Flash FP4 Marlin (per-commit sanity + GSM8K)
yhyang201 May 7, 2026
97d22df
fix glm5 grouped topk; route by model not n_group>topk_group
hnyls2002 May 7, 2026
ab450c6
topk init: explicit main default + dsv4 override
hnyls2002 May 7, 2026
b481bf0
minor adjust for hisparse
xiezhq-hermann May 7, 2026
644dee6
[CI] Add flash_mla installation script for dsv4 ci tests (#24634)
Fridge003 May 7, 2026
7978aa7
Merge branch 'main' into dsv4-rebase
Fridge003 May 7, 2026
rebase main with 658a281
hnyls2002 committed Apr 29, 2026
commit 5fc88fc9bc758a165f1e6fa689a8818d76e82222
266 changes: 0 additions & 266 deletions python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
@@ -45,23 +45,6 @@ def fast_round_scale(amax, fp8_max_inv):
return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))


@lru_cache(maxsize=8)
def _pick_inner_iter(seq: int, ni: int, cu: int, block_per_cu: int) -> int:
"""
Pick the largest valid inner_iter (power-of-two divisor of ni) that keeps
enough work per CU (seq * ni / inner_iter / cu >= block_per_cu), so we avoid
under-utilization while minimizing the number of partial groups.
"""

max_it = int(seq * ni / (cu * block_per_cu))
it = ni
while it >= 2:
if it <= max_it and ni % it == 0:
return it
it //= 2
return 1
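
For intuition about the heuristic above, two worked cases, assuming `_pick_inner_iter` as defined in this hunk (the CU count and block budget below are hypothetical, not measured values):

# Plenty of work: max_it = int(4096 * 8 / (304 * 4)) = 26, and it=8
# satisfies it <= 26 and ni % it == 0, so the whole ni forms one group.
assert _pick_inner_iter(seq=4096, ni=8, cu=304, block_per_cu=4) == 8

# Tiny sequence: max_it = int(16 * 8 / (304 * 4)) = 0, so no power-of-two
# candidate fits -> inner_iter = 1, i.e. the maximum number of groups,
# trading partial-result combines for CU occupancy.
assert _pick_inner_iter(seq=16, ni=8, cu=304, block_per_cu=4) == 1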


@tilelang.jit(pass_configs=pass_configs)
def act_quant_kernel(
N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
@@ -1024,255 +1007,6 @@ def main(
return main


@tilelang.jit(out_idx=[-2, -1], pass_configs=pass_configs)
def sparse_mla_fwd_decode_partial_fp8(
num_heads: int,
d_v: int,
d_tail: int,
topk: int,
*,
sm_scale=None,
block_I=64,
inner_iter=1,
threads=256,
):
assert d_v == 512, f"only support d_v=512"
assert (
topk % block_I == 0
), "otherwise will load some index=0 thus causing wrong kv to be loaded"

# Softmax scores are in [0, 1]. We scale by fp8_max_val before FP8 cast
# to better utilize FP8 dynamic range, then apply the inverse scale after GEMM.
# This is numerically safe because softmax output is bounded by 1.
fp8_dtype = "float8_e4m3fnuz" if _is_fp8_fnuz else "float8_e4m3fn"
fp8_max_val = 240.0 if _is_fp8_fnuz else 448.0
s_inv_scale_const = fp8_max_val
s_scale_const = 1.0 / fp8_max_val

BI = block_I
group_size = 128
dim_quant_fp8 = d_v + d_tail
rope_offset_fp8 = d_v
n_groups = topk // (BI * inner_iter)

if sm_scale is None:
sm_scale = (1.0 / (d_v + d_tail)) ** 0.5 * 1.44269504
else:
sm_scale = sm_scale * 1.44269504

h_per_block = 16
# Match bf16 partial behavior: keep fixed 16-head tiles and use
# sliced T.copy on H0:H1 for tail handling.
assert (
num_heads <= h_per_block or num_heads % h_per_block == 0
), "num_heads must be <=16 or divisible by 16"
head_blocks_per_seq = (num_heads + h_per_block - 1) // h_per_block

batch = 1
kv_group = 1
seq_len = T.symbolic("seq_len")
num_pages = T.symbolic("num_pages")

q_fp8_shape = [batch, seq_len, num_heads, d_v + d_tail]
kv_fp8_shape = [batch, num_pages, kv_group, dim_quant_fp8]
idx_shape = [batch, seq_len, kv_group, topk]
partial_o_shape = [batch, seq_len, n_groups, num_heads, d_v]
partial_lse_shape = [batch, seq_len, n_groups, num_heads]

accum_dtype = T.float32
dtype_bf16 = T.bfloat16

@T.prim_func
def main(
q_fp8: T.Tensor(q_fp8_shape, fp8_dtype),
kv_fp8: T.Tensor(kv_fp8_shape, fp8_dtype),
indices: T.Tensor(idx_shape, T.int32),
partial_o: T.Tensor(partial_o_shape, dtype_bf16),
partial_lse: T.Tensor(partial_lse_shape, accum_dtype),
):
with T.Kernel(seq_len * head_blocks_per_seq, n_groups, threads=threads) as (
bx,
by,
):
b_i, g_i = 0, 0
s_i = bx // head_blocks_per_seq
group_i = by
H0 = (bx % head_blocks_per_seq) * h_per_block
H1 = H0 + h_per_block

# We intentionally split the K=512 GEMM into 4x128 tiles.
# Although this adds extra intermediate memory traffic,
# it shortens the MFMA accumulation dependency chain and improves performance.
q_tile0 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
q_tile1 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
q_tile2 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
q_tile3 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
kv_tile0 = T.alloc_shared([BI, group_size], fp8_dtype)
kv_tile1 = T.alloc_shared([BI, group_size], fp8_dtype)
kv_tile2 = T.alloc_shared([BI, group_size], fp8_dtype)
kv_tile3 = T.alloc_shared([BI, group_size], fp8_dtype)
q_tail_buf = T.alloc_shared([h_per_block, d_tail], fp8_dtype)
k_tail_shared = T.alloc_shared([BI, d_tail], fp8_dtype)
s_fp8_shared = T.alloc_shared([h_per_block, BI], fp8_dtype)
page_idx_shared = T.alloc_shared([BI], T.int32)

mask = T.alloc_fragment([BI], T.bool)
acc_s = T.alloc_fragment([h_per_block, BI], accum_dtype)
acc_tile = T.alloc_fragment([h_per_block, BI], accum_dtype)
sv_tile = T.alloc_fragment([h_per_block, group_size], accum_dtype)
sumexp = T.alloc_fragment([h_per_block], accum_dtype)
sumexp_i = T.alloc_fragment([h_per_block], accum_dtype)
alpha = T.alloc_fragment([h_per_block], accum_dtype)
m_i = T.alloc_fragment([h_per_block], accum_dtype)
m_i_prev = T.alloc_fragment([h_per_block], accum_dtype)
inv_denom = T.alloc_fragment([h_per_block], accum_dtype)

acc_o_tile0 = T.alloc_fragment([h_per_block, group_size], accum_dtype)
acc_o_tile1 = T.alloc_fragment([h_per_block, group_size], accum_dtype)
acc_o_tile2 = T.alloc_fragment([h_per_block, group_size], accum_dtype)
acc_o_tile3 = T.alloc_fragment([h_per_block, group_size], accum_dtype)

T.fill(acc_o_tile0, 0)
T.fill(acc_o_tile1, 0)
T.fill(acc_o_tile2, 0)
T.fill(acc_o_tile3, 0)
T.fill(sumexp, 0)
T.fill(m_i, -(2**30))

T.copy(q_fp8[b_i, s_i, H0:H1, d_v:], q_tail_buf)
T.copy(q_fp8[b_i, s_i, H0:H1, 0 * group_size : 1 * group_size], q_tile0)
T.copy(q_fp8[b_i, s_i, H0:H1, 1 * group_size : 2 * group_size], q_tile1)
T.copy(q_fp8[b_i, s_i, H0:H1, 2 * group_size : 3 * group_size], q_tile2)
T.copy(q_fp8[b_i, s_i, H0:H1, 3 * group_size : 4 * group_size], q_tile3)

for k_i in T.serial(inner_iter):
topk_block_i = group_i * inner_iter + k_i

for bi_i in T.Parallel(BI):
idx = indices[b_i, s_i, g_i, topk_block_i * BI + bi_i]
valid = idx >= 0
page_idx_shared[bi_i] = T.if_then_else(valid, idx, 0)
mask[bi_i] = valid

for bi_i, j in T.Parallel(BI, group_size):
page = page_idx_shared[bi_i]
kv_tile0[bi_i, j] = kv_fp8[b_i, page, g_i, 0 * group_size + j]
kv_tile1[bi_i, j] = kv_fp8[b_i, page, g_i, 1 * group_size + j]
kv_tile2[bi_i, j] = kv_fp8[b_i, page, g_i, 2 * group_size + j]
kv_tile3[bi_i, j] = kv_fp8[b_i, page, g_i, 3 * group_size + j]

for bi_i, j in T.Parallel(BI, d_tail):
page = page_idx_shared[bi_i]
k_tail_shared[bi_i, j] = kv_fp8[b_i, page, g_i, rope_offset_fp8 + j]

for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] = T.if_then_else(
mask[bi_i], 0, -T.infinity(acc_s.dtype)
)

T.gemm(q_tile0, kv_tile0, acc_s, transpose_B=True, clear_accum=False)
T.gemm(q_tile1, kv_tile1, acc_tile, transpose_B=True, clear_accum=True)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] += acc_tile[h_i, bi_i]
T.gemm(q_tile2, kv_tile2, acc_tile, transpose_B=True, clear_accum=True)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] += acc_tile[h_i, bi_i]
T.gemm(q_tile3, kv_tile3, acc_tile, transpose_B=True, clear_accum=True)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] += acc_tile[h_i, bi_i]
T.gemm(
q_tail_buf,
k_tail_shared,
acc_s,
transpose_B=True,
policy=T.GemmWarpPolicy.FullCol,
)

T.copy(m_i, m_i_prev)
T.reduce_max(acc_s, m_i, dim=1, clear=False)
for h_i in T.Parallel(h_per_block):
alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] = T.exp2(
acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
)
T.reduce_sum(acc_s, sumexp_i, dim=1)
for h_i in T.Parallel(h_per_block):
sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i]
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile0[h_i, j] = acc_o_tile0[h_i, j] * alpha[h_i]
acc_o_tile1[h_i, j] = acc_o_tile1[h_i, j] * alpha[h_i]
acc_o_tile2[h_i, j] = acc_o_tile2[h_i, j] * alpha[h_i]
acc_o_tile3[h_i, j] = acc_o_tile3[h_i, j] * alpha[h_i]

for h_i, bi_i in T.Parallel(h_per_block, BI):
s_fp8_shared[h_i, bi_i] = T.clamp(
acc_s[h_i, bi_i] * s_inv_scale_const,
-fp8_max_val,
fp8_max_val,
)
T.gemm(s_fp8_shared, kv_tile0, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile0[h_i, j] = (
acc_o_tile0[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

T.gemm(s_fp8_shared, kv_tile1, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile1[h_i, j] = (
acc_o_tile1[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

T.gemm(s_fp8_shared, kv_tile2, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile2[h_i, j] = (
acc_o_tile2[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

T.gemm(s_fp8_shared, kv_tile3, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile3[h_i, j] = (
acc_o_tile3[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

for h_i in T.Parallel(h_per_block):
denom = T.if_then_else(sumexp[h_i] == 0.0, 1.0, sumexp[h_i])
inv_denom[h_i] = 1.0 / denom
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile0[h_i, j] = acc_o_tile0[h_i, j] * inv_denom[h_i]
acc_o_tile1[h_i, j] = acc_o_tile1[h_i, j] * inv_denom[h_i]
acc_o_tile2[h_i, j] = acc_o_tile2[h_i, j] * inv_denom[h_i]
acc_o_tile3[h_i, j] = acc_o_tile3[h_i, j] * inv_denom[h_i]

for h_i in T.Parallel(h_per_block):
sumexp[h_i] = T.if_then_else(
sumexp[h_i] == 0.0,
-(2**30),
T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale,
)

T.copy(
acc_o_tile0,
partial_o[b_i, s_i, group_i, H0:H1, 0 * group_size : 1 * group_size],
)
T.copy(
acc_o_tile1,
partial_o[b_i, s_i, group_i, H0:H1, 1 * group_size : 2 * group_size],
)
T.copy(
acc_o_tile2,
partial_o[b_i, s_i, group_i, H0:H1, 2 * group_size : 3 * group_size],
)
T.copy(
acc_o_tile3,
partial_o[b_i, s_i, group_i, H0:H1, 3 * group_size : 4 * group_size],
)

T.copy(sumexp, partial_lse[b_i, s_i, group_i, H0:H1])

return main
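
The scale-before-cast trick described in the comment near the top of this deleted kernel can be checked outside tilelang. A minimal PyTorch sketch (PyTorch >= 2.1 for float8 dtypes; the shapes are illustrative, not the kernel's tile sizes):

import torch

fp8_max = 448.0  # float8_e4m3fn; the kernel uses 240.0 on fnuz (HIP) hardware
h, k, d = 16, 64, 128

scores = torch.softmax(torch.randn(h, k), dim=-1)  # bounded by 1, as the comment notes
v = torch.randn(k, d)

# Scale into the FP8 range before the cast, invert after the GEMM.
s_fp8 = (scores * fp8_max).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
out = (s_fp8.to(torch.float32) @ v) / fp8_max

# Without the scale, small probabilities fall into e4m3's coarse low range.
ref = scores @ v
print((out - ref).abs().max())  # small: the cast now uses the full dynamic range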


def tilelang_sparse_fwd(
q: torch.Tensor,
kv: torch.Tensor,
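Not shown in this hunk is how callers merge the per-group `partial_o` / `partial_lse` the deleted kernel emits. Assuming the standard log2-domain split-K combine (which matches the kernel's `log2(sumexp) + m_i * sm_scale` encoding and its `-(2**30)` sentinel for empty groups), a plausible reduction looks like:

import torch

def combine_partials(partial_o: torch.Tensor, partial_lse: torch.Tensor) -> torch.Tensor:
    """partial_o: [n_groups, heads, d_v], each group already normalized by its
    own sumexp; partial_lse: [n_groups, heads] in the log2 domain.
    Assumes at least one non-empty group per head (the kernel guards the
    zero-mass case separately via its sumexp == 0 branch)."""
    lse_max = partial_lse.max(dim=0, keepdim=True).values  # [1, heads]
    w = torch.exp2(partial_lse - lse_max)                  # empty groups underflow to 0
    o = (partial_o.float() * w.unsqueeze(-1)).sum(dim=0)   # [heads, d_v]
    return o / w.sum(dim=0).unsqueeze(-1)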
49 changes: 1 addition & 48 deletions python/sglang/srt/managers/hisparse_coordinator.py
@@ -50,7 +50,7 @@ def __init__(
self.mem_pool_host.kv_cache_total_dim * self.mem_pool_host.dtype.itemsize
)

max_num_reqs = req_to_token_pool.req_to_token.shape[0]
max_num_reqs = req_to_token_pool.size
max_context_len = req_to_token_pool.max_context_len

self.padded_buffer_size = (
@@ -163,53 +163,6 @@ def admit_request_into_staging(self, req: Req) -> None:

self.ack_staging_queue.append(HiSparseAct(start_event, finish_event, req))

def admit_request_direct(self, req: Req) -> None:
"""Direct-to-host path: KV data already resides in host pool via RDMA.

Skips staging DMA entirely. Only allocates a small device buffer
(4KB) for decode-time swap-in, then marks the request as ready.
Host indices were already written to req_to_host_pool.

Metadata fixups after alloc_device_buffer():
- alloc_device_buffer() sets device_buffer_tokens = [0, 1, ..., buf_size-1],
which tells the swap-in kernel that those tokens are cached in the device
buffer. In the staging path this is correct (prefill filled the buffer),
but here the buffer is empty.
"""
self.alloc_device_buffer(req)

if req.kv_allocated_len <= self.device_buffer_size:
# Short sequences (seq_len <= device_buffer_size): the kernel fast path
# returns device_buffer_locs directly without any host loading, so we
# must preload all tokens from host pool into the device buffer
# TODO(hzh0425): Optimize this.
self._preload_to_device_buffer(req)
else:
# Long sequence: reset device_buffer_tokens to -1 so the kernel
# sees all slots as empty → every top-k lookup is a miss → host load.
self.req_device_buffer_tokens[
:, req.req_pool_idx, : self.device_buffer_size
] = -1

req.staging = False
self._skip_first_backup[req.req_pool_idx] = True
logger.debug("HiSparse: admitting request %s directly", req.rid)

def _preload_to_device_buffer(self, req: Req) -> None:
"""Preload all tokens from host pool into the device buffer."""
n = req.kv_allocated_len
host_indices = self.req_to_host_pool[req.req_pool_idx, :n]
device_locs = self.req_to_device_buffer[req.req_pool_idx, :n]

for layer_id in range(self.mem_pool_device.layer_num):
self.mem_pool_host.load_to_device_per_layer(
self.mem_pool_device,
host_indices,
device_locs,
layer_id,
io_backend="kernel",
)

def alloc_device_buffer(self, req: Req) -> None:
prefill_len = len(req.fill_ids)
compressed_logical_indices = (
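The `device_buffer_tokens` contract described in the deleted docstring can be sketched outside the kernel. This is an assumption about the swap-in kernel's lookup semantics (the kernel itself is not reproduced here); the function name is hypothetical:

import torch

def device_buffer_hits(topk_tokens: torch.Tensor, device_buffer_tokens: torch.Tensor):
    """Hit mask over top-k token positions: True where the token already
    resides in the device buffer. -1 slots never match, so resetting the
    buffer to -1 (the long-sequence direct path above) forces every lookup
    to miss and therefore a host load."""
    resident = device_buffer_tokens[device_buffer_tokens >= 0]
    return (topk_tokens.unsqueeze(-1) == resident).any(dim=-1)

buf = torch.full((8,), -1, dtype=torch.long)          # buffer after the -1 reset
print(device_buffer_hits(torch.tensor([3, 5]), buf))  # tensor([False, False])

buf[:4] = torch.arange(4)                             # staging path: tokens 0..3 cached
print(device_buffer_hits(torch.tensor([3, 5]), buf))  # tensor([ True, False])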
1 change: 1 addition & 0 deletions python/sglang/srt/managers/schedule_batch.py
@@ -822,6 +822,7 @@ def __init__(
# For diffusion LLM
self.init_diffusion_llm(dllm_config)

# For hisparse
self.hisparse_staging = False

@property
23 changes: 0 additions & 23 deletions python/sglang/srt/mem_cache/hisparse_memory_pool.py
@@ -208,29 +208,6 @@ def alloc(self, need_size: int):
"Page size = 1 is not supported in HiSparse allocator"
)

def alloc_logical_only(
self,
prefix_lens: torch.Tensor,
prefix_lens_cpu: torch.Tensor,
seq_lens: torch.Tensor,
seq_lens_cpu: torch.Tensor,
last_loc: torch.Tensor,
extend_num_tokens: int,
):
"""Allocate only logical indices without hisparse device indices.

Used in the direct-to-host transfer path where KV data is written
directly to host memory by the prefill node, skipping GPU staging.
"""
return self.logical_attn_allocator.alloc_extend(
prefix_lens,
prefix_lens_cpu,
seq_lens,
seq_lens_cpu,
last_loc,
extend_num_tokens,
)

def alloc_device_buffer(self, allocated_indices, need_size: int):
assert need_size % self.page_size == 0
hisparse_indices = self.full_to_hisparse_device_index_mapping[allocated_indices]
You are viewing a condensed version of this merge commit.