Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
136 commits
Select commit Hold shift + click to select a range
aa27d8f
Implementation of lighter mamba prefix caching on V1
peakcrosser7 Oct 28, 2025
81d3561
[BugFix] Resolve compatibility issues in lighter mamba prefix cache
peakcrosser7 Nov 5, 2025
8a652af
[BugFix] Resolve compatibility issues for mamba
peakcrosser7 Nov 5, 2025
8a39893
Add base implementation for lighter mamba cache with standard layout
peakcrosser7 Nov 23, 2025
ee08f54
update and fix bugs
peakcrosser7 Nov 24, 2025
ce840b3
fix bugs after rebasing
peakcrosser7 Nov 24, 2025
865ea28
add running script (just for testing)
peakcrosser7 Nov 26, 2025
7316d03
add debug logs
peakcrosser7 Nov 26, 2025
ad37d09
fix schedule
heheda12345 Nov 26, 2025
f1295e5
add test script (just for testing)
peakcrosser7 Nov 26, 2025
bf445fc
fix runner
heheda12345 Nov 26, 2025
d8acf4b
support sps (still issues)
peakcrosser7 Nov 26, 2025
b48f042
add some logs for debug
peakcrosser7 Nov 26, 2025
336a140
fix block reuse bug when SPS is enabled
peakcrosser7 Nov 29, 2025
08feffb
fix block_table bug when sps is enabled
peakcrosser7 Nov 30, 2025
12306ed
fix the bug only mamba new blocks are empty
peakcrosser7 Nov 30, 2025
60142b6
add postprocess for prefix caching
peakcrosser7 Nov 30, 2025
fa3d810
refactor preprocess_mamba
peakcrosser7 Nov 30, 2025
9082d16
add some logs for debug
peakcrosser7 Nov 30, 2025
4b5e074
fix the wrong kv_cache_gid bug
peakcrosser7 Dec 1, 2025
ea92ef4
adjust preprocess_mamba only copying state when new blocks exist
peakcrosser7 Dec 1, 2025
0b3a6b5
update mamba_gather_indices and apply to mamba models
peakcrosser7 Dec 2, 2025
381ab34
fix bugs with mamba and lfm2
peakcrosser7 Dec 2, 2025
1da8032
fix the bug when prefix-caching is disabled
peakcrosser7 Dec 3, 2025
1667210
fix the bug between LPC and mamba mixer
peakcrosser7 Dec 4, 2025
cb45505
tmp from commit "b15e6fd"
heheda12345 Dec 1, 2025
f2ac29b
add e2e impl (still has bug)
heheda12345 Dec 3, 2025
2f103fc
update runner
heheda12345 Dec 7, 2025
08aa710
[WIP] write unit test
heheda12345 Dec 9, 2025
5a39c44
extract mamba state
heheda12345 Dec 10, 2025
fd7e340
update
heheda12345 Dec 10, 2025
30c6a5a
set block size to max_model_len when prefix caching is disabled
peakcrosser7 Dec 10, 2025
9e6724e
update mamba manager
peakcrosser7 Dec 10, 2025
ac577b7
update max mem usage bytes for mamba spec
peakcrosser7 Dec 10, 2025
6f036d7
init result checker
heheda12345 Dec 10, 2025
9162095
format code
heheda12345 Dec 10, 2025
84e7a3f
preprocess copy need to consider accept token
heheda12345 Dec 11, 2025
6d810c2
test decode
heheda12345 Dec 11, 2025
6260f9e
unit test
heheda12345 Dec 11, 2025
ebc6149
revert
heheda12345 Dec 11, 2025
da789b2
fix
heheda12345 Dec 11, 2025
77480e4
add more tests
heheda12345 Dec 12, 2025
247986b
add more tests
heheda12345 Dec 12, 2025
816c57a
Merge branch 'main' of github.com:vllm-project/vllm into ups/mamba_pr…
peakcrosser7 Dec 13, 2025
01af070
Merge branch 'main' of github.com:vllm-project/vllm into ups/mamba_pr…
peakcrosser7 Dec 13, 2025
f438ee0
change env to config
heheda12345 Dec 12, 2025
1052cb6
nit update
heheda12345 Dec 12, 2025
dc37673
remove enable_caching from mamba_spec
heheda12345 Dec 12, 2025
2390ec3
revert
heheda12345 Dec 12, 2025
fdba531
fix pre-commit
heheda12345 Dec 12, 2025
5da5063
fix pre-commit
heheda12345 Dec 12, 2025
0114da1
fix pre-commit
heheda12345 Dec 12, 2025
e0f607c
fix pre-commit
heheda12345 Dec 12, 2025
0ec5e29
fix pre-commit
heheda12345 Dec 12, 2025
1be5e6d
fix pre-commit
heheda12345 Dec 12, 2025
ef2e9f2
clean up
heheda12345 Dec 12, 2025
7aab212
code cleanup
heheda12345 Dec 12, 2025
5aedba9
code cleanup
heheda12345 Dec 12, 2025
1e15448
code cleanup
heheda12345 Dec 12, 2025
0dd525e
revert
heheda12345 Dec 12, 2025
239b7d5
update
heheda12345 Dec 12, 2025
b09d8ce
update
heheda12345 Dec 12, 2025
167f35f
update
heheda12345 Dec 12, 2025
5e91598
introduce mamba_utils
heheda12345 Dec 13, 2025
9fcc6ee
fix mamba block size usage before update
peakcrosser7 Dec 13, 2025
ab39578
fix prefill chunk incorrectly including draft tokens
peakcrosser7 Dec 13, 2025
e41e973
fix the bug in mamba_get_block_table_tensor when cuda graph is enabled
peakcrosser7 Dec 17, 2025
7b9f90c
rm test script
peakcrosser7 Dec 18, 2025
0afca42
general copy spec demo
peakcrosser7 Dec 15, 2025
51f88d0
copy spec v2
peakcrosser7 Dec 16, 2025
7448c6a
mamba copy v3
peakcrosser7 Dec 16, 2025
6ec0e48
clean up code
peakcrosser7 Dec 17, 2025
73cca3c
mamba copy v4
peakcrosser7 Dec 18, 2025
cfb85d1
support general mamba copy
peakcrosser7 Dec 18, 2025
c52024f
fix a bug of conv copy spec
peakcrosser7 Dec 19, 2025
4f401ef
support other mamba models
peakcrosser7 Dec 19, 2025
55f98e1
update mamba_cache_mode config
peakcrosser7 Dec 19, 2025
07b6f0c
format code
peakcrosser7 Dec 20, 2025
fec5b52
format code
peakcrosser7 Dec 20, 2025
9a3a556
update mamba copy func in the test
peakcrosser7 Dec 20, 2025
25ca2a9
remove lpc env var
peakcrosser7 Dec 20, 2025
da3be73
format code
peakcrosser7 Dec 20, 2025
6ef2bc8
remove unused _mamba_copy_block
peakcrosser7 Dec 21, 2025
c2839b0
batch mamba copy block
peakcrosser7 Dec 21, 2025
fbcae5b
Merge branch 'main' into ups/mamba_prefix_cache_align
peakcrosser7 Dec 22, 2025
3319579
update interface for mamba copy
peakcrosser7 Dec 22, 2025
4518c4a
move get_mamba_copy_func to the model
peakcrosser7 Dec 24, 2025
bf57e49
update mamba cache mode
peakcrosser7 Dec 25, 2025
7e845f9
cleanup mamba manager
peakcrosser7 Dec 25, 2025
a6b7d08
fix test
heheda12345 Dec 26, 2025
19bb239
Merge branch 'main' into ups/mamba_prefix_cache_align
peakcrosser7 Jan 3, 2026
ad71cce
fix sinkfullattn
peakcrosser7 Jan 3, 2026
961845f
update mamba_attn for align mode
peakcrosser7 Jan 3, 2026
04f4c45
remove duplicate get_num_skipped_tokens
peakcrosser7 Jan 3, 2026
fc51f32
update builder.update_block_table
peakcrosser7 Jan 3, 2026
61af735
update builder.update_block_table: rm blk_table and slot_mapping
peakcrosser7 Jan 3, 2026
189b956
update note in cache_full_blocks
peakcrosser7 Jan 3, 2026
ba297c7
fix the max_num_blocks_per_req bug for mamba models with sps
peakcrosser7 Jan 3, 2026
81f6b80
fix test
heheda12345 Jan 4, 2026
0cb4c04
cleanup unit test
heheda12345 Jan 4, 2026
1882efd
remove debug scripts
heheda12345 Jan 4, 2026
b15f6ca
Revert "update builder.update_block_table: rm blk_table and slot_mapp…
peakcrosser7 Jan 4, 2026
8a92d97
update mamba_get_block_table_tensor api
peakcrosser7 Jan 4, 2026
477747e
fix update_block_table in chunk_local_attn
peakcrosser7 Jan 4, 2026
49b533a
update builder.update_block_table: add seq_lens arg
peakcrosser7 Jan 4, 2026
1c667d3
update builder.update_block_table: rm seq_lens arg
peakcrosser7 Jan 4, 2026
9cdfaa7
revert change
heheda12345 Jan 4, 2026
6e51e1c
use mamba_get_block_table_tensor in basic mamba
heheda12345 Jan 4, 2026
85fc6d6
remove unrelated changes
heheda12345 Jan 4, 2026
0eafca2
fix
heheda12345 Jan 5, 2026
7b2044e
add mamba cache mode to mamba_spec and rm cache_config in kv_manager
peakcrosser7 Jan 5, 2026
42de03c
prefill exclude sps tokens
peakcrosser7 Jan 5, 2026
b39990d
fix tests
peakcrosser7 Jan 5, 2026
f480c08
revert InputBatch api
peakcrosser7 Jan 5, 2026
67d4e03
revert code
peakcrosser7 Jan 6, 2026
8dcf54d
update according to suggestions
peakcrosser7 Jan 6, 2026
0ad13e2
update mamba cache mode config
peakcrosser7 Jan 7, 2026
f0d49ef
fix get_num_blocks_to_allocate tests
peakcrosser7 Jan 7, 2026
f02f265
add _get_num_evictable_blocks
peakcrosser7 Jan 7, 2026
7c2a6f2
add todo
peakcrosser7 Jan 7, 2026
21e8cee
fix allocate_new_blocks
peakcrosser7 Jan 8, 2026
aa13dc5
revert mamba_block_size code
peakcrosser7 Jan 14, 2026
3481f16
remove unused comments
peakcrosser7 Jan 14, 2026
7502358
add classmethod
peakcrosser7 Jan 14, 2026
c808dd0
block_aligned_split support resume
peakcrosser7 Jan 14, 2026
976081f
block_aligned_split support resume v2
peakcrosser7 Jan 14, 2026
e381dd9
revert block_align_split
peakcrosser7 Jan 14, 2026
f5580ed
add comments
peakcrosser7 Jan 15, 2026
8f347a6
revert the prefill logic back to include draft tokens
peakcrosser7 Jan 18, 2026
b29b026
temporarily disable spec decoding
peakcrosser7 Jan 18, 2026
e61518d
Merge branch 'main' into ups/mamba_prefix_cache_align
peakcrosser7 Jan 18, 2026
fd6e24f
fix pre-commit
peakcrosser7 Jan 18, 2026
c200506
skip test
heheda12345 Jan 19, 2026
738e7f4
skip test
heheda12345 Jan 19, 2026
74c60f5
update comments
peakcrosser7 Jan 19, 2026
760a312
Merge branch 'main' into ups/mamba_prefix_cache_align
peakcrosser7 Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions tests/v1/core/test_single_type_kv_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
def get_sliding_window_manager(sliding_window_spec, block_pool, enable_caching=True):
return SlidingWindowManager(
sliding_window_spec,
block_pool,
block_pool=block_pool,
enable_caching=enable_caching,
kv_cache_group_id=0,
)
Expand All @@ -35,7 +35,7 @@ def get_chunked_local_attention_manager(
):
return ChunkedLocalAttentionManager(
chunked_local_attention_spec,
block_pool,
block_pool=block_pool,
enable_caching=enable_caching,
kv_cache_group_id=0,
)
Expand Down Expand Up @@ -342,11 +342,15 @@ def test_get_num_blocks_to_allocate():
]

assert (
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
manager.get_num_blocks_to_allocate(
"1", 20 * block_size, cached_blocks_1, 0, 20 * block_size
)
== 20
)
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
manager.get_num_blocks_to_allocate(
"2", 20 * block_size, cached_blocks_2, 0, 20 * block_size
)
== 15
)

Expand Down Expand Up @@ -375,6 +379,7 @@ def test_evictable_cached_blocks_not_double_allocated():
num_tokens=2 * block_size,
new_computed_blocks=[evictable_block],
total_computed_tokens=block_size,
num_tokens_main_model=2 * block_size,
)
# Free capacity check should count evictable cached blocks, but allocation
# should only allocate the truly new block.
Expand All @@ -386,7 +391,9 @@ def test_evictable_cached_blocks_not_double_allocated():
num_local_computed_tokens=block_size,
num_external_computed_tokens=0,
)
new_blocks = manager.allocate_new_blocks(request_id, num_tokens=4)
new_blocks = manager.allocate_new_blocks(
request_id, num_tokens=4, num_tokens_main_model=4
)
assert len(new_blocks) == 1
assert len(manager.req_to_blocks[request_id]) == 2

Expand All @@ -411,10 +418,14 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
]

assert (
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1, 0)
manager.get_num_blocks_to_allocate(
"1", 20 * block_size, cached_blocks_1, 0, 20 * block_size
)
== 20
)
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2, 0)
manager.get_num_blocks_to_allocate(
"2", 20 * block_size, cached_blocks_2, 0, 20 * block_size
)
== 15
)
Loading