|
15 | 15 | ) |
16 | 16 | from vllm import PoolingParams |
17 | 17 | from vllm.logprobs import PromptLogprobs, SampleLogprobs |
| 18 | +from vllm.lora.request import LoRARequest |
18 | 19 | from vllm.outputs import CompletionOutput, RequestOutput |
19 | 20 | from vllm.sampling_params import RequestOutputKind, SamplingParams |
20 | 21 | from vllm.transformers_utils.tokenizer import AnyTokenizer |
21 | | -from vllm.v1.engine import EngineCoreRequest |
| 22 | +from vllm.v1.engine import ( |
| 23 | + EngineCoreEvent, |
| 24 | + EngineCoreEventType, |
| 25 | + EngineCoreOutputs, |
| 26 | + EngineCoreRequest, |
| 27 | + FinishReason, |
| 28 | +) |
22 | 29 | from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector |
23 | | -from vllm.v1.metrics.stats import IterationStats |
| 30 | +from vllm.v1.metrics.stats import IterationStats, SchedulerStats |
24 | 31 |
|
25 | 32 |
|
26 | 33 | def _ref_convert_id_to_token( |
@@ -895,6 +902,170 @@ def test_iteration_stats(dummy_test_vectors): |
895 | 902 | assert iteration_stats.num_generation_tokens == num_active |
896 | 903 |
|
897 | 904 |
|
@pytest.mark.parametrize("log_stats", [True, False])
def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    """Track LoRA adapter accounting through waiting -> running -> finished.

    Three requests are added: request-0 uses lora-1, request-1 uses lora-2,
    and request-2 runs without an adapter. With log_stats enabled, the
    OutputProcessor must surface per-adapter waiting/running counts and drop
    an adapter's state once its last request finishes; with log_stats
    disabled, no LoRA state may accumulate at all.
    """
    processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=log_stats)
    core = MockEngineCore(dummy_test_vectors.generation_tokens)
    ts = time.monotonic()

    # Adapter assignment per request index; None means "no LoRA".
    adapters = [
        LoRARequest(lora_name="lora-1", lora_int_id=1, lora_path="/path/to/lora1"),
        LoRARequest(lora_name="lora-2", lora_int_id=2, lora_path="/path/to/lora2"),
        None,
    ]

    for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens):
        processor.add_request(
            EngineCoreRequest(
                request_id=f"request-{idx}",
                prompt_token_ids=prompt_tokens,
                mm_features=None,
                eos_token_id=None,
                arrival_time=0,
                lora_request=adapters[idx],
                cache_salt=None,
                data_parallel_rank=None,
                sampling_params=SamplingParams(),
                pooling_params=None,
            ),
            None,
        )

    def _step(event_type=None, finish_id=None):
        """Run one engine-core iteration through the processor.

        Optionally tags every output with `event_type`, and/or marks the
        output whose request_id equals `finish_id` as finished (LENGTH).
        Returns the EngineCoreOutputs and the IterationStats used.
        """
        step = EngineCoreOutputs(
            outputs=core.get_outputs(), scheduler_stats=SchedulerStats()
        )
        for out in step.outputs:
            if event_type is not None:
                out.events = [EngineCoreEvent.new_event(event_type, ts)]
            if finish_id is not None and out.request_id == finish_id:
                out.finish_reason = FinishReason.LENGTH
        stats = IterationStats() if log_stats else None
        processor.process_outputs(step.outputs, ts, stats)
        processor.update_scheduler_stats(step.scheduler_stats)
        return step, stats

    # Iteration 1: every request reports QUEUED -> adapters counted as waiting.
    outputs, stats = _step(event_type=EngineCoreEventType.QUEUED)
    if log_stats:
        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-1") == 1
        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-2") == 1
        assert outputs.scheduler_stats.running_lora_adapters.get("lora-1") == 0
        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 0
        # Internal state tracks exactly the two adapters, not the bare request.
        assert set(processor.lora_states.requests) == {"lora-1", "lora-2"}
    else:
        # With stats disabled, nothing should have been tracked.
        assert stats is None
        assert len(processor.lora_states.requests) == 0

    # Iteration 2: every request reports SCHEDULED -> adapters move to running.
    outputs, stats = _step(event_type=EngineCoreEventType.SCHEDULED)
    if log_stats:
        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-1") == 0
        assert outputs.scheduler_stats.waiting_lora_adapters.get("lora-2") == 0
        assert outputs.scheduler_stats.running_lora_adapters.get("lora-1") == 1
        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 1
    else:
        assert stats is None
        assert len(processor.lora_states.requests) == 0

    # Iteration 3: request-0 finishes -> lora-1 has no requests left and is dropped.
    outputs, _ = _step(finish_id="request-0")
    if log_stats:
        assert "lora-1" not in processor.lora_states.requests
        assert outputs.scheduler_stats.running_lora_adapters.get("lora-2") == 1
        assert len(processor.lora_states.requests) == 1
    else:
        assert len(processor.lora_states.requests) == 0

    # Iteration 4: request-1 finishes -> lora-2 dropped, no adapters remain.
    outputs, _ = _step(finish_id="request-1")
    if log_stats:
        assert "lora-2" not in processor.lora_states.requests
        assert len(outputs.scheduler_stats.running_lora_adapters) == 0
        assert len(processor.lora_states.requests) == 0
    else:
        assert len(processor.lora_states.requests) == 0

    # Final iteration: request-2 (no LoRA) finishes as well.
    _step(finish_id="request-2")

    # Nothing should be left outstanding in the processor.
    assert processor.get_num_unfinished_requests() == 0
| 1068 | + |
898 | 1069 | @pytest.mark.asyncio |
899 | 1070 | async def test_request_output_collector(): |
900 | 1071 | NUM_REQS = 3 |
|
0 commit comments