[FIX_FOR_VLLM_CUSTOM=3975eb6de6ea914b9d7b27fd517e0c971ddeb6fc] Fix upstream breakages: NIXL connector, TpKVTopology rename, MoE refactor, transformers v5 #1377
```diff
@@ -91,20 +91,19 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
     runner.new_request(token_ids=[1] * offloaded_block_size)
     runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output([]))
     runner.run(decoded_tokens=[EOS_TOKEN_ID])
-    runner.manager.lookup.assert_called()
-    assert len(list(runner.manager.lookup.call_args.args[0])) == 1
+    runner.manager.lookup.assert_called_once()

     # single block lookup with a hit
     runner.scheduler.reset_prefix_cache()
     runner.new_request(token_ids=[0] * offloaded_block_size)
     runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output([]))
-    runner.manager.lookup.return_value = 1
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
     runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(0, 1, 2))

     # single block lookup with a hit in a middle block
     runner.new_request(token_ids=[0] * offloaded_block_size * 2 + [1] * offloaded_block_size)
     runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output([]))
-    runner.manager.lookup.return_value = 1
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
     runner.run(decoded_tokens=[EOS_TOKEN_ID], expected_loaded_gpu_block_indexes=(3, 4, 5))

     # test take_events
```

A review comment is attached to lines 97 to 101 (the "single block lookup with a hit" stanza); see below.
```diff
@@ -182,7 +181,7 @@ def test_request_preemption(request_runner, async_scheduling: bool):

     # request should now return from preemption
     # re-load [0, ..., 8] from the CPU and store [9, 10, 11]
-    runner.manager.lookup.return_value = 3
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 3
     runner.manager.prepare_store.side_effect = (lambda block_hashes, req_context: generate_store_output(block_hashes))
     runner.run(
         decoded_tokens=[0] * gpu_block_size,
```
```diff
@@ -219,7 +218,7 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling: bool):
     # start a request to load the first block, but don't complete
     runner.scheduler.reset_prefix_cache()
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.lookup.return_value = 1
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
     runner.run(
         decoded_tokens=[],
         complete_transfers=False,
```
```diff
@@ -231,7 +230,7 @@ def test_concurrent_lookups_of_the_same_prefix(request_runner, async_scheduling: bool):

     # start a new request to load the same first block
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.lookup.return_value = 1
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
     runner.run(
         decoded_tokens=[],
         complete_transfers=False,
```
```diff
@@ -275,7 +274,7 @@ def test_abort_loading_requests(request_runner, async_scheduling: bool):
     # start a request to load the first block, but don't complete
     runner.scheduler.reset_prefix_cache()
     runner.new_request(token_ids=[0] * offloaded_block_size)
-    runner.manager.lookup.return_value = 1
+    runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
     runner.run(
         decoded_tokens=[],
         complete_transfers=False,
```
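The same substitution recurs across all five tests: rather than stubbing the return value of the mocked manager's `lookup`, the tests now monkey-patch `_maximal_prefix_lookup` on the connector scheduler. A minimal before/after sketch of the pattern, written against the `runner` fixture used in these tests — the semantics of the return value (apparently the number of matching offloaded blocks) and of the `req_context` parameter are inferred from the diff, not confirmed:

```python
# Old pattern: the manager mock's lookup() returned the hit count directly,
# and the scheduler passed it an iterable of block hashes.
runner.manager.lookup.return_value = 1

# New pattern: the connector scheduler resolves prefix hits through
# _maximal_prefix_lookup(key, req_context); tests stub it with a lambda
# reporting how many offloaded blocks match the prefix.
runner.connector_scheduler._maximal_prefix_lookup = lambda key, req_context: 1
```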
Review comment (on lines 97 to 101):
The assertion was weakened from validating the lookup input (previously checking the length of the iterable passed in) to only checking that `lookup()` was called once. To keep coverage of the signature change, consider asserting that `lookup` was called with an `OffloadKey` (and, if applicable, that `req_context` is passed via args/kwargs), so the test will fail if the old iterable-based call is accidentally reintroduced.
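A minimal sketch of the stronger check the comment suggests, assuming `runner.manager.lookup` is a `unittest.mock` mock as in these tests, and that `OffloadKey` is importable from the connector's module (the exact import path and call shape are assumptions):

```python
# Hedged sketch — OffloadKey's import path and lookup()'s exact signature
# are assumptions based on the review comment, not on this diff.
runner.manager.lookup.assert_called_once()
args, kwargs = runner.manager.lookup.call_args
# Fails if the old iterable-of-block-hashes call is ever reintroduced.
assert isinstance(args[0], OffloadKey)
# If the new signature also threads a request context through, pin that too:
assert "req_context" in kwargs or len(args) > 1
```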