From 44d1740c2b1e12e85a8d81ad8c674b52ec60e7ee Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 27 Apr 2026 17:23:10 +0800 Subject: [PATCH 1/9] init Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 34 +-- .buildkite/test_areas/distributed.yaml | 14 +- .buildkite/test_areas/misc.yaml | 6 +- .buildkite/test_areas/model_runner_v2.yaml | 6 +- .github/mergify.yml | 4 +- docs/contributing/profiling.md | 2 +- docs/features/automatic_prefix_caching.md | 2 +- docs/features/context_extension.md | 2 +- docs/features/lora.md | 2 +- docs/features/prompt_embeds.md | 4 +- docs/features/speculative_decoding/README.md | 2 +- docs/features/speculative_decoding/eagle.md | 2 +- .../models/extensions/runai_model_streamer.md | 2 +- docs/serving/data_parallel_deployment.md | 2 +- docs/usage/reproducibility.md | 2 +- .../automatic_prefix_caching_offline.py} | 2 +- .../prefix_caching_offline.py} | 0 .../reproducibility_offline.py} | 0 .../context_extension_offline.py} | 2 +- .../data_parallel/data_parallel_offline.py} | 6 +- .../multi_instance_data_parallel.py | 0 .../kv_events}/kv_events_subscriber.py | 0 .../logits_processor/README.md | 0 .../logits_processor/custom.py | 0 .../logits_processor/custom_req.py | 0 .../logits_processor/custom_req_init.py | 0 .../lora/lora_with_quantization_offline.py} | 0 .../lora/multilora_offline.py} | 0 .../openai_batch/README.md | 0 .../openai_batch/openai_example_batch.jsonl | 0 .../data_parallel_pause_resume.py | 270 +++++++++--------- .../pause_resume/pause_resume_offline.py} | 0 .../profiling/run_one_batch_offline.py} | 0 .../profiling/simple_profiling_offline.py} | 0 ...ompt_embed_inference_with_openai_client.py | 0 .../prompt_embed/prompt_embed_offline.py} | 2 +- .../reset_kv/reset_kv_offline.py} | 0 .../routed_experts/routed_experts_offline.py} | 2 +- .../load_sharded_state_offline.py} | 6 +- .../save_sharded_state_offline.py} | 2 +- .../extract_hidden_states_offline.py} | 0 .../mlpspeculator_offline.py} | 0 .../spec_decode_offline.py} 
| 0 .../structured_outputs/README.md | 8 +- .../structured_outputs/pyproject.toml | 0 .../structured_outputs_client.py} | 0 .../structured_outputs_offline.py} | 0 .../torchrun/torchrun_dp_example_offline.py} | 4 +- .../torchrun/torchrun_example_offline.py} | 2 +- .../skip_loading_weights_in_engine_init.py | 0 tests/distributed/test_torchrun_example.py | 2 +- .../distributed/test_torchrun_example_moe.py | 2 +- .../v1/spec_decode/test_acceptance_length.py | 2 +- vllm/entrypoints/llm.py | 2 +- .../model_loader/sharded_state_loader.py | 2 +- vllm/v1/engine/utils.py | 2 +- vllm/v1/executor/uniproc_executor.py | 2 +- 57 files changed, 203 insertions(+), 203 deletions(-) rename examples/{offline_inference/automatic_prefix_caching.py => features/automatic_prefix_caching/automatic_prefix_caching_offline.py} (98%) rename examples/{offline_inference/prefix_caching.py => features/automatic_prefix_caching/prefix_caching_offline.py} (100%) rename examples/{offline_inference/reproducibility.py => features/batch_invariance/reproducibility_offline.py} (100%) rename examples/{offline_inference/context_extension.py => features/context_extension/context_extension_offline.py} (96%) rename examples/{offline_inference/data_parallel.py => features/data_parallel/data_parallel_offline.py} (96%) rename examples/{online_serving => features/data_parallel}/multi_instance_data_parallel.py (100%) rename examples/{online_serving => features/kv_events}/kv_events_subscriber.py (100%) rename examples/{offline_inference => features}/logits_processor/README.md (100%) rename examples/{offline_inference => features}/logits_processor/custom.py (100%) rename examples/{offline_inference => features}/logits_processor/custom_req.py (100%) rename examples/{offline_inference => features}/logits_processor/custom_req_init.py (100%) rename examples/{offline_inference/lora_with_quantization_inference.py => features/lora/lora_with_quantization_offline.py} (100%) rename 
examples/{offline_inference/multilora_inference.py => features/lora/multilora_offline.py} (100%) rename examples/{offline_inference => features}/openai_batch/README.md (100%) rename examples/{offline_inference => features}/openai_batch/openai_example_batch.jsonl (100%) rename examples/{online_serving => features/pause_resume}/data_parallel_pause_resume.py (96%) rename examples/{offline_inference/pause_resume.py => features/pause_resume/pause_resume_offline.py} (100%) rename examples/{offline_inference/run_one_batch.py => features/profiling/run_one_batch_offline.py} (100%) rename examples/{offline_inference/simple_profiling.py => features/profiling/simple_profiling_offline.py} (100%) rename examples/{online_serving => features/prompt_embed}/prompt_embed_inference_with_openai_client.py (100%) rename examples/{offline_inference/prompt_embed_inference.py => features/prompt_embed/prompt_embed_offline.py} (97%) rename examples/{offline_inference/llm_engine_reset_kv.py => features/reset_kv/reset_kv_offline.py} (100%) rename examples/{offline_inference/routed_experts_e2e.py => features/routed_experts/routed_experts_offline.py} (99%) rename examples/{offline_inference/load_sharded_state.py => features/sharded_state/load_sharded_state_offline.py} (94%) rename examples/{offline_inference/save_sharded_state.py => features/sharded_state/save_sharded_state_offline.py} (98%) rename examples/{offline_inference/extract_hidden_states.py => features/speculative_decoding/extract_hidden_states_offline.py} (100%) rename examples/{offline_inference/mlpspeculator.py => features/speculative_decoding/mlpspeculator_offline.py} (100%) rename examples/{offline_inference/spec_decode.py => features/speculative_decoding/spec_decode_offline.py} (100%) rename examples/{online_serving => features}/structured_outputs/README.md (87%) rename examples/{online_serving => features}/structured_outputs/pyproject.toml (100%) rename examples/{online_serving/structured_outputs/structured_outputs.py => 
features/structured_outputs/structured_outputs_client.py} (100%) rename examples/{offline_inference/structured_outputs.py => features/structured_outputs/structured_outputs_offline.py} (100%) rename examples/{offline_inference/torchrun_dp_example.py => features/torchrun/torchrun_dp_example_offline.py} (98%) rename examples/{offline_inference/torchrun_example.py => features/torchrun/torchrun_example_offline.py} (99%) rename examples/{offline_inference => rl}/skip_loading_weights_in_engine_init.py (100%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 68179dcb68cd..62b68b74ffe6 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -395,11 +395,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 offline_inference/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode_offline.py 
--test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi250 · kernels ----------------------------------------------------------# @@ -1168,13 +1168,13 @@ steps: - vllm/v1/attention/backends/ - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - - examples/offline_inference/data_parallel.py + - examples/offline_inference/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - label: Distributed Tests (4xA100-4xMI300) # TBD timeout_in_minutes: 180 @@ -1203,7 +1203,7 @@ steps: - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - examples/rl/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/offline_inference/data_parallel_offline.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 @@ -1213,7 +1213,7 @@ steps: - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - python3 
../examples/offline_inference/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py @@ -1266,7 +1266,7 @@ steps: optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py + - examples/offline_inference/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -1275,7 +1275,7 @@ steps: - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -1654,11 +1654,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 offline_inference/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 
--top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# @@ -2302,7 +2302,7 @@ steps: commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -2713,7 +2713,7 @@ steps: - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - tests/v1/distributed/test_dbo.py - - examples/offline_inference/data_parallel.py + - examples/offline_inference/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: @@ -2937,11 +2937,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 offline_inference/prefix_caching_offline.py - python3 
offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 093f3ab4fe1f..81a45f54465f 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -90,7 +90,7 @@ steps: - tests/distributed/test_torchrun_example_moe.py - examples/offline_inference/rlhf_colocate.py - examples/rl/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/offline_inference/data_parallel_offline.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 @@ -107,7 
+107,7 @@ steps: # test with torchrun tp=2 and dp=2 with ep - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py # test with internal dp - - python3 examples/offline_inference/data_parallel.py --enforce-eager + - python3 examples/offline_inference/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py @@ -159,7 +159,7 @@ steps: num_devices: 8 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py + - examples/offline_inference/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -169,7 +169,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and dp=4 with ep - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: Distributed Tests (4 GPUs)(A100) device: a100 @@ -194,7 +194,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s 
tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -222,9 +222,9 @@ steps: - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/offline_inference/data_parallel_offline.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 
distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Pipeline + Context Parallelism (4 GPUs) timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index d0930be156d2..558d24851ab0 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -120,12 +120,12 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching.py + - python3 offline_inference/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill 
--max-model-len 2048 + - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Metrics, Tracing (2 GPUs) timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 2b88c00d6b77..10d093878ecd 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -51,12 +51,12 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching.py + - python3 offline_inference/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 
offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Model Runner V2 Distributed (2 GPUs) timeout_in_minutes: 45 diff --git a/.github/mergify.yml b/.github/mergify.yml index 8ca00d6e7d2d..c73a876d3f14 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -308,8 +308,8 @@ pull_request_rules: - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - files=docs/features/structured_outputs.md - - files=examples/offline_inference/structured_outputs.py - - files=examples/online_serving/structured_outputs/structured_outputs.py + - files=examples/offline_inference/structured_outputs_offline.py + - files=examples/online_serving/structured_outputs/structured_outputs_offline.py - files~=^tests/v1/structured_output/ - files=tests/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index addda300d020..650aad9e3470 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -42,7 +42,7 @@ Traces can be visualized using . 
#### Offline Inference -Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example. +Refer to [examples/offline_inference/simple_profiling.py](../../examples/features/profiling/simple_profiling_offline.py) for an example. #### OpenAI Server diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md index 3718a4b74eb2..32163d3b10f1 100644 --- a/docs/features/automatic_prefix_caching.md +++ b/docs/features/automatic_prefix_caching.md @@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: -[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py) +[examples/offline_inference/automatic_prefix_caching.py](../../examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py) ## Example workloads diff --git a/docs/features/context_extension.md b/docs/features/context_extension.md index f622191aebc6..9b75249923ef 100644 --- a/docs/features/context_extension.md +++ b/docs/features/context_extension.md @@ -11,7 +11,7 @@ The [`context_extension.py`](../../examples/offline_inference/context_extension) ### Usage ```bash -python examples/offline_inference/context_extension.py +python examples/offline_inference/context_extension_offline.py ``` ## OpenAI Online Method diff --git a/docs/features/lora.md b/docs/features/lora.md index 2e7b36545d46..4cd523572b84 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter. ) ``` -Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
+Check out [examples/offline_inference/multilora_inference.py](../../examples/features/lora/multilora_offline.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index b81d2f28e3b9..9db46182d0f6 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples: -[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py) +[examples/offline_inference/prompt_embed_inference.py](../../examples/features/prompt_embed/prompt_embed_offline.py) ## Online Serving @@ -41,4 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \ Then, you can use the OpenAI client as follows: -[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py) +[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py) diff --git a/docs/features/speculative_decoding/README.md b/docs/features/speculative_decoding/README.md index 25cda8059b24..4450cfb82874 100644 --- a/docs/features/speculative_decoding/README.md +++ b/docs/features/speculative_decoding/README.md @@ -32,7 +32,7 @@ depend on your model family, traffic pattern, hardware, and sampling settings. | Suffix decoding | Low to medium gain | Medium gain | No extra draft model; dynamic speculation depth. 
| For reproducible measurements in your environment, use -[`examples/offline_inference/spec_decode.py`](../../../examples/offline_inference/spec_decode.py) +[`examples/offline_inference/spec_decode.py`](../../../examples/features/speculative_decoding/spec_decode_offline.py) or the [benchmark CLI guide](../../benchmarking/cli.md). ## `--speculative-config` schema diff --git a/docs/features/speculative_decoding/eagle.md b/docs/features/speculative_decoding/eagle.md index 3e0f3add416e..c52468eac4b5 100644 --- a/docs/features/speculative_decoding/eagle.md +++ b/docs/features/speculative_decoding/eagle.md @@ -1,6 +1,6 @@ # EAGLE Draft Models -The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/offline_inference/spec_decode.py](../../../examples/offline_inference/spec_decode.py) +The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. 
A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/offline_inference/spec_decode.py](../../../examples/features/speculative_decoding/spec_decode_offline.py) ## Eagle Drafter Example diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index 38c603b46e10..837aea23a404 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -101,7 +101,7 @@ vllm serve /path/to/sharded/model \ --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` -To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/offline_inference/save_sharded_state.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. +To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/features/sharded_state/save_sharded_state_offline.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way: diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md index f0946eaf407a..71957385bfd5 100644 --- a/docs/serving/data_parallel_deployment.md +++ b/docs/serving/data_parallel_deployment.md @@ -16,7 +16,7 @@ For MoE models, when any requests are in progress in any rank, we must ensure th In all cases, it is beneficial to load-balance requests between DP ranks. 
For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently. -This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/offline_inference/data_parallel.py). +This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/features/data_parallel/data_parallel_offline.py). There are two distinct modes supported for online deployments - self-contained with internal load balancing, or externally per-rank process deployment and load balancing. diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md index a8e49d0a3398..dab180ff8266 100644 --- a/docs/usage/reproducibility.md +++ b/docs/usage/reproducibility.md @@ -7,7 +7,7 @@ reproducible results: or enable [batch invariance](../features/batch_invariance.md) to make the outputs insensitive to scheduling. - In online mode, you can only enable [batch invariance](../features/batch_invariance.md). -Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py) +Example: [examples/offline_inference/reproducibility.py](../../examples/features/batch_invariance/reproducibility_offline.py) !!! 
warning diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py similarity index 98% rename from examples/offline_inference/automatic_prefix_caching.py rename to examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py index 2d3c28d9dd4f..d11e8547410f 100644 --- a/examples/offline_inference/automatic_prefix_caching.py +++ b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py @@ -15,7 +15,7 @@ but ask different questions. Run: -python examples/offline_inference/automatic_prefix_caching.py +python examples/offline_inference/automatic_prefix_caching_offline.py """ import time diff --git a/examples/offline_inference/prefix_caching.py b/examples/features/automatic_prefix_caching/prefix_caching_offline.py similarity index 100% rename from examples/offline_inference/prefix_caching.py rename to examples/features/automatic_prefix_caching/prefix_caching_offline.py diff --git a/examples/offline_inference/reproducibility.py b/examples/features/batch_invariance/reproducibility_offline.py similarity index 100% rename from examples/offline_inference/reproducibility.py rename to examples/features/batch_invariance/reproducibility_offline.py diff --git a/examples/offline_inference/context_extension.py b/examples/features/context_extension/context_extension_offline.py similarity index 96% rename from examples/offline_inference/context_extension.py rename to examples/features/context_extension/context_extension_offline.py index fae8590f914e..4ea0c356f1d1 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/features/context_extension/context_extension_offline.py @@ -6,7 +6,7 @@ and run a simple chat example. 
Usage: - python examples/offline_inference/context_extension.py + python examples/offline_inference/context_extension_offline.py """ from vllm import LLM, RequestOutput, SamplingParams diff --git a/examples/offline_inference/data_parallel.py b/examples/features/data_parallel/data_parallel_offline.py similarity index 96% rename from examples/offline_inference/data_parallel.py rename to examples/features/data_parallel/data_parallel_offline.py index 287409fa2b5c..fccfa782236c 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/features/data_parallel/data_parallel_offline.py @@ -3,14 +3,14 @@ """ Usage: Single node: - python examples/offline_inference/data_parallel.py \ + python examples/offline_inference/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): - python examples/offline_inference/data_parallel.py \ + python examples/offline_inference/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ @@ -19,7 +19,7 @@ --dp-master-addr=10.99.48.128 \ --dp-master-port=13345 Node 1: - python examples/offline_inference/data_parallel.py \ + python examples/offline_inference/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/features/data_parallel/multi_instance_data_parallel.py similarity index 100% rename from examples/online_serving/multi_instance_data_parallel.py rename to examples/features/data_parallel/multi_instance_data_parallel.py diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/features/kv_events/kv_events_subscriber.py similarity index 100% rename from examples/online_serving/kv_events_subscriber.py rename to examples/features/kv_events/kv_events_subscriber.py diff --git a/examples/offline_inference/logits_processor/README.md b/examples/features/logits_processor/README.md similarity index 100% rename from 
examples/offline_inference/logits_processor/README.md rename to examples/features/logits_processor/README.md diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/features/logits_processor/custom.py similarity index 100% rename from examples/offline_inference/logits_processor/custom.py rename to examples/features/logits_processor/custom.py diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/features/logits_processor/custom_req.py similarity index 100% rename from examples/offline_inference/logits_processor/custom_req.py rename to examples/features/logits_processor/custom_req.py diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/features/logits_processor/custom_req_init.py similarity index 100% rename from examples/offline_inference/logits_processor/custom_req_init.py rename to examples/features/logits_processor/custom_req_init.py diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/features/lora/lora_with_quantization_offline.py similarity index 100% rename from examples/offline_inference/lora_with_quantization_inference.py rename to examples/features/lora/lora_with_quantization_offline.py diff --git a/examples/offline_inference/multilora_inference.py b/examples/features/lora/multilora_offline.py similarity index 100% rename from examples/offline_inference/multilora_inference.py rename to examples/features/lora/multilora_offline.py diff --git a/examples/offline_inference/openai_batch/README.md b/examples/features/openai_batch/README.md similarity index 100% rename from examples/offline_inference/openai_batch/README.md rename to examples/features/openai_batch/README.md diff --git a/examples/offline_inference/openai_batch/openai_example_batch.jsonl b/examples/features/openai_batch/openai_example_batch.jsonl similarity index 100% rename from examples/offline_inference/openai_batch/openai_example_batch.jsonl rename to 
examples/features/openai_batch/openai_example_batch.jsonl diff --git a/examples/online_serving/data_parallel_pause_resume.py b/examples/features/pause_resume/data_parallel_pause_resume.py similarity index 96% rename from examples/online_serving/data_parallel_pause_resume.py rename to examples/features/pause_resume/data_parallel_pause_resume.py index e94de22a1271..1f11536e5366 100644 --- a/examples/online_serving/data_parallel_pause_resume.py +++ b/examples/features/pause_resume/data_parallel_pause_resume.py @@ -1,135 +1,135 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Test pause/resume with Data Parallel (DP) via HTTP API. - -This example demonstrates coordinated pause/resume across multiple DP ranks. -The pause synchronizes across all DP engines via all-reduce. - -Prerequisites: - Start a vLLM server with data parallelism: - - $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \ - --enforce-eager \ - --data-parallel-size 4 \ - --tensor-parallel-size 1 - - Then run this script: - - $ python data_parallel_pause_resume.py - -The test verifies pause works by: -1. Starting a streaming generation request -2. Pausing the server mid-generation -3. Sleeping for PAUSE_DURATION seconds -4. Resuming the server -5. 
Verifying there was a gap in token generation matching the pause duration -""" - -import argparse -import threading -import time - -import requests -from openai import OpenAI - -BASE_URL = "http://localhost:8000" -MODEL_NAME = "facebook/opt-125m" -PAUSE_DURATION = 3.0 - - -def pause_generation(base_url: str, mode: str = "keep") -> None: - """Pause generation via HTTP endpoint.""" - url = f"{base_url}/pause" - response = requests.post(url, params={"mode": mode}, timeout=60) - response.raise_for_status() - print("Server paused") - - -def resume_generation(base_url: str) -> None: - """Resume generation via HTTP endpoint.""" - url = f"{base_url}/resume" - response = requests.post(url, timeout=60) - response.raise_for_status() - print("Server resumed") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", default=BASE_URL) - parser.add_argument("--model", default=MODEL_NAME) - args = parser.parse_args() - - client = OpenAI( - base_url=f"{args.base_url}/v1", - api_key="EMPTY", - ) - - prompt = "Write a long story about a dragon. 
Once upon a time" - token_times: list[float] = [] - pause_token_idx = 0 - pause_triggered = threading.Event() - - def generator_thread(): - """Stream tokens and record timestamps.""" - stream = client.completions.create( - model=args.model, - prompt=prompt, - max_tokens=50, - stream=True, - ) - for chunk in stream: - if chunk.choices[0].text: - token_times.append(time.monotonic()) - token_count = len(token_times) - print(f"Token {token_count}: {chunk.choices[0].text!r}") - - # Signal controller after some tokens - if token_count >= 5 and not pause_triggered.is_set(): - pause_triggered.set() - - def controller_thread(): - """Pause and resume the server.""" - nonlocal pause_token_idx - - # Wait for some tokens - pause_triggered.wait() - - print(f"\nPausing server (keep mode) at token {len(token_times)}...") - pause_generation(args.base_url, mode="keep") - pause_token_idx = len(token_times) - print(f"Sleeping for {PAUSE_DURATION}s...") - - time.sleep(PAUSE_DURATION) - - print("Resuming server...") - resume_generation(args.base_url) - print("Resumed!\n") - - # Run both threads - gen_thread = threading.Thread(target=generator_thread) - ctrl_thread = threading.Thread(target=controller_thread) - - gen_thread.start() - ctrl_thread.start() - - gen_thread.join() - ctrl_thread.join() - - # Check gap at the pause point - if pause_token_idx < len(token_times): - pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1] - print( - f"\nGap after pause (token {pause_token_idx} -> " - f"{pause_token_idx + 1}): {pause_gap:.3f}s" - ) - if pause_gap >= PAUSE_DURATION * 0.9: - print("Test passed! Pause synchronized across DP ranks.") - else: - print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s") - else: - print("Test failed! 
No tokens were generated after resuming.") - - -if __name__ == "__main__": - main() +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test pause/resume with Data Parallel (DP) via HTTP API. + +This example demonstrates coordinated pause/resume across multiple DP ranks. +The pause synchronizes across all DP engines via all-reduce. + +Prerequisites: + Start a vLLM server with data parallelism: + + $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \ + --enforce-eager \ + --data-parallel-size 4 \ + --tensor-parallel-size 1 + + Then run this script: + + $ python data_parallel_pause_resume.py + +The test verifies pause works by: +1. Starting a streaming generation request +2. Pausing the server mid-generation +3. Sleeping for PAUSE_DURATION seconds +4. Resuming the server +5. Verifying there was a gap in token generation matching the pause duration +""" + +import argparse +import threading +import time + +import requests +from openai import OpenAI + +BASE_URL = "http://localhost:8000" +MODEL_NAME = "facebook/opt-125m" +PAUSE_DURATION = 3.0 + + +def pause_generation(base_url: str, mode: str = "keep") -> None: + """Pause generation via HTTP endpoint.""" + url = f"{base_url}/pause" + response = requests.post(url, params={"mode": mode}, timeout=60) + response.raise_for_status() + print("Server paused") + + +def resume_generation(base_url: str) -> None: + """Resume generation via HTTP endpoint.""" + url = f"{base_url}/resume" + response = requests.post(url, timeout=60) + response.raise_for_status() + print("Server resumed") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--base-url", default=BASE_URL) + parser.add_argument("--model", default=MODEL_NAME) + args = parser.parse_args() + + client = OpenAI( + base_url=f"{args.base_url}/v1", + api_key="EMPTY", + ) + + prompt = "Write a long story about a dragon. 
Once upon a time" + token_times: list[float] = [] + pause_token_idx = 0 + pause_triggered = threading.Event() + + def generator_thread(): + """Stream tokens and record timestamps.""" + stream = client.completions.create( + model=args.model, + prompt=prompt, + max_tokens=50, + stream=True, + ) + for chunk in stream: + if chunk.choices[0].text: + token_times.append(time.monotonic()) + token_count = len(token_times) + print(f"Token {token_count}: {chunk.choices[0].text!r}") + + # Signal controller after some tokens + if token_count >= 5 and not pause_triggered.is_set(): + pause_triggered.set() + + def controller_thread(): + """Pause and resume the server.""" + nonlocal pause_token_idx + + # Wait for some tokens + pause_triggered.wait() + + print(f"\nPausing server (keep mode) at token {len(token_times)}...") + pause_generation(args.base_url, mode="keep") + pause_token_idx = len(token_times) + print(f"Sleeping for {PAUSE_DURATION}s...") + + time.sleep(PAUSE_DURATION) + + print("Resuming server...") + resume_generation(args.base_url) + print("Resumed!\n") + + # Run both threads + gen_thread = threading.Thread(target=generator_thread) + ctrl_thread = threading.Thread(target=controller_thread) + + gen_thread.start() + ctrl_thread.start() + + gen_thread.join() + ctrl_thread.join() + + # Check gap at the pause point + if pause_token_idx < len(token_times): + pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1] + print( + f"\nGap after pause (token {pause_token_idx} -> " + f"{pause_token_idx + 1}): {pause_gap:.3f}s" + ) + if pause_gap >= PAUSE_DURATION * 0.9: + print("Test passed! Pause synchronized across DP ranks.") + else: + print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s") + else: + print("Test failed! 
No tokens were generated after resuming.") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/pause_resume.py b/examples/features/pause_resume/pause_resume_offline.py similarity index 100% rename from examples/offline_inference/pause_resume.py rename to examples/features/pause_resume/pause_resume_offline.py diff --git a/examples/offline_inference/run_one_batch.py b/examples/features/profiling/run_one_batch_offline.py similarity index 100% rename from examples/offline_inference/run_one_batch.py rename to examples/features/profiling/run_one_batch_offline.py diff --git a/examples/offline_inference/simple_profiling.py b/examples/features/profiling/simple_profiling_offline.py similarity index 100% rename from examples/offline_inference/simple_profiling.py rename to examples/features/profiling/simple_profiling_offline.py diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py similarity index 100% rename from examples/online_serving/prompt_embed_inference_with_openai_client.py rename to examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/features/prompt_embed/prompt_embed_offline.py similarity index 97% rename from examples/offline_inference/prompt_embed_inference.py rename to examples/features/prompt_embed/prompt_embed_offline.py index a0eaeb6810a2..60292e0006a4 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/features/prompt_embed/prompt_embed_offline.py @@ -15,7 +15,7 @@ - transformers Run: - python examples/offline_inference/prompt_embed_inference.py + python examples/offline_inference/prompt_embed_offline.py """ import torch diff --git a/examples/offline_inference/llm_engine_reset_kv.py b/examples/features/reset_kv/reset_kv_offline.py similarity index 100% rename from 
examples/offline_inference/llm_engine_reset_kv.py rename to examples/features/reset_kv/reset_kv_offline.py diff --git a/examples/offline_inference/routed_experts_e2e.py b/examples/features/routed_experts/routed_experts_offline.py similarity index 99% rename from examples/offline_inference/routed_experts_e2e.py rename to examples/features/routed_experts/routed_experts_offline.py index bb1d7b411f99..278f92b11bbe 100644 --- a/examples/offline_inference/routed_experts_e2e.py +++ b/examples/features/routed_experts/routed_experts_offline.py @@ -9,7 +9,7 @@ 3. Results are deterministic across runs (baseline vs reference). Usage: - python examples/offline_inference/routed_experts_e2e.py \ + python examples/offline_inference/routed_experts_offline.py \ --model Qwen/Qwen3-30B-A3B \ --tp 4 \ --max-model-len 4096 \ diff --git a/examples/offline_inference/load_sharded_state.py b/examples/features/sharded_state/load_sharded_state_offline.py similarity index 94% rename from examples/offline_inference/load_sharded_state.py rename to examples/features/sharded_state/load_sharded_state_offline.py index 0085e8e8e32b..e867db5d12fe 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/features/sharded_state/load_sharded_state_offline.py @@ -3,16 +3,16 @@ """ Validates the loading of a model saved with the sharded_state format. This script demonstrates how to load a model that was previously saved -using save_sharded_state.py and validates it by running inference. +using save_sharded_state_offline.py and validates it by running inference. 
Example usage: (First need to save a sharded_state mode) -python save_sharded_state.py \ +python save_sharded_state_offline.py \ --model /path/to/load \ --tensor-parallel-size 8 \ --output /path/to/save/sharded/model -python load_sharded_state.py \ +python load_sharded_state_offline.py \ --model /path/to/saved/sharded/model \ --load-format sharded_state \ --tensor-parallel-size 8 \ diff --git a/examples/offline_inference/save_sharded_state.py b/examples/features/sharded_state/save_sharded_state_offline.py similarity index 98% rename from examples/offline_inference/save_sharded_state.py rename to examples/features/sharded_state/save_sharded_state_offline.py index 14d472ee3f23..675f2e35a53f 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/features/sharded_state/save_sharded_state_offline.py @@ -7,7 +7,7 @@ Example usage: -python save_sharded_state.py \ +python save_sharded_state_offline.py \ --model /path/to/load \ --tensor-parallel-size 8 \ --output /path/to/save diff --git a/examples/offline_inference/extract_hidden_states.py b/examples/features/speculative_decoding/extract_hidden_states_offline.py similarity index 100% rename from examples/offline_inference/extract_hidden_states.py rename to examples/features/speculative_decoding/extract_hidden_states_offline.py diff --git a/examples/offline_inference/mlpspeculator.py b/examples/features/speculative_decoding/mlpspeculator_offline.py similarity index 100% rename from examples/offline_inference/mlpspeculator.py rename to examples/features/speculative_decoding/mlpspeculator_offline.py diff --git a/examples/offline_inference/spec_decode.py b/examples/features/speculative_decoding/spec_decode_offline.py similarity index 100% rename from examples/offline_inference/spec_decode.py rename to examples/features/speculative_decoding/spec_decode_offline.py diff --git a/examples/online_serving/structured_outputs/README.md b/examples/features/structured_outputs/README.md similarity index 87% rename 
from examples/online_serving/structured_outputs/README.md rename to examples/features/structured_outputs/README.md index 7f539716ecf8..cfa9282207b0 100644 --- a/examples/online_serving/structured_outputs/README.md +++ b/examples/features/structured_outputs/README.md @@ -34,19 +34,19 @@ See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.ht Run all constraints, non-streaming: ```bash -uv run structured_outputs.py +uv run structured_outputs_offline.py ``` Run all constraints, streaming: ```bash -uv run structured_outputs.py --stream +uv run structured_outputs_offline.py --stream ``` Run certain constraints, for example `structural_tag` and `regex`, streaming: ```bash -uv run structured_outputs.py \ +uv run structured_outputs_offline.py \ --constraint structural_tag regex \ --stream ``` @@ -54,5 +54,5 @@ uv run structured_outputs.py \ Run all constraints, with reasoning models and streaming: ```bash -uv run structured_outputs.py --reasoning --stream +uv run structured_outputs_offline.py --reasoning --stream ``` diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/features/structured_outputs/pyproject.toml similarity index 100% rename from examples/online_serving/structured_outputs/pyproject.toml rename to examples/features/structured_outputs/pyproject.toml diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/features/structured_outputs/structured_outputs_client.py similarity index 100% rename from examples/online_serving/structured_outputs/structured_outputs.py rename to examples/features/structured_outputs/structured_outputs_client.py diff --git a/examples/offline_inference/structured_outputs.py b/examples/features/structured_outputs/structured_outputs_offline.py similarity index 100% rename from examples/offline_inference/structured_outputs.py rename to examples/features/structured_outputs/structured_outputs_offline.py diff --git 
a/examples/offline_inference/torchrun_dp_example.py b/examples/features/torchrun/torchrun_dp_example_offline.py similarity index 98% rename from examples/offline_inference/torchrun_dp_example.py rename to examples/features/torchrun/torchrun_dp_example_offline.py index eb7ed969ea4b..64809f4b1560 100644 --- a/examples/offline_inference/torchrun_dp_example.py +++ b/examples/features/torchrun/torchrun_dp_example_offline.py @@ -7,12 +7,12 @@ To run this example: ```bash -$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py +$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example_offline.py ``` With custom parallelism settings: ```bash -$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \ +$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example_offline.py \ --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep ``` """ diff --git a/examples/offline_inference/torchrun_example.py b/examples/features/torchrun/torchrun_example_offline.py similarity index 99% rename from examples/offline_inference/torchrun_example.py rename to examples/features/torchrun/torchrun_example_offline.py index 2960d329968a..e41bcd420c20 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/features/torchrun/torchrun_example_offline.py @@ -4,7 +4,7 @@ experimental support for tensor-parallel inference with torchrun, see https://github.com/vllm-project/vllm/issues/11400 for the motivation and use case for this example. -run the script with `torchrun --nproc-per-node=4 torchrun_example.py`, +run the script with `torchrun --nproc-per-node=4 torchrun_example_offline.py`, the argument `4` should match the product of `tensor_parallel_size` and `pipeline_parallel_size` below. see `tests/distributed/test_torchrun_example.py` for the unit test. 
diff --git a/examples/offline_inference/skip_loading_weights_in_engine_init.py b/examples/rl/skip_loading_weights_in_engine_init.py similarity index 100% rename from examples/offline_inference/skip_loading_weights_in_engine_init.py rename to examples/rl/skip_loading_weights_in_engine_init.py diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index f56d037fa547..c99c3b3d32af 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# unit test for `examples/offline_inference/torchrun_example.py` +# unit test for `examples/offline_inference/torchrun_example_offline.py` import os import random diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py index 8c1d00561b16..25e5c173d3a8 100644 --- a/tests/distributed/test_torchrun_example_moe.py +++ b/tests/distributed/test_torchrun_example_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# unit test for `examples/offline_inference/torchrun_example.py` +# unit test for `examples/offline_inference/torchrun_example_offline.py` import os import random diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index ec65e20cbde1..75b601ad492f 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -43,7 +43,7 @@ class Eagle3ModelConfig: # Model configurations for EAGLE3 acceptance length tests. # Expected acceptance lengths are determined by running baseline benchmarks -# using examples/offline_inference/spec_decode.py with the MT-Bench dataset. +# using examples/offline_inference/spec_decode_offline.py with the MT-Bench dataset. 
EAGLE3_MODEL_CONFIGS = [ Eagle3ModelConfig( verifier="meta-llama/Llama-3.1-8B-Instruct", diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 29cc2b47e7be..5677c1bbb0d2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -334,7 +334,7 @@ def _make_config(value: Any, cls: type[_R]) -> _R: f"LLM(data_parallel_size={_dp_size}) is not supported for single-" "process usage and may hang. Please use " "the explicit multi-process data-parallel example at " - "'examples/offline_inference/data_parallel.py'." + "'examples/offline_inference/data_parallel_offline.py'." ) engine_args = EngineArgs( diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 87b4b72db2a1..2c4e3b3ce65c 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -31,7 +31,7 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/offline_inference/save_sharded_state.py` for creating a sharded + `examples/offline_inference/save_sharded_state_offline.py` for creating a sharded checkpoint. """ diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 53cad2bc153f..ab7b91a66ac3 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -952,7 +952,7 @@ def get_engine_zmq_addresses( # In offline mode there is an LLM instance per DP rank and # one core engine per LLM, see - # examples/offline_inference/data_parallel.py. + # examples/offline_inference/data_parallel_offline.py. 
offline_mode = local_start_index is not None # client_local_only = True for cases where this front-end diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index b616c3b7b8ad..537ef14a48ef 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -147,7 +147,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor): offline inference with tensor parallelism. see https://github.com/vllm-project/vllm/issues/11400 for - the motivation, and examples/offline_inference/torchrun_example.py + the motivation, and examples/offline_inference/torchrun_example_offline.py for the usage example. The key idea: although it is tensor-parallel inference, we only From 88254f445cce519bfc84ea28dcc25e249f5bc776 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 27 Apr 2026 18:00:49 +0800 Subject: [PATCH 2/9] refine Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 34 +++++++++---------- .buildkite/test_areas/distributed.yaml | 15 ++++---- .buildkite/test_areas/misc.yaml | 6 ++-- .buildkite/test_areas/model_runner_v2.yaml | 9 ++--- .github/mergify.yml | 5 ++- docs/cli/README.md | 2 +- docs/configuration/conserving_memory.md | 2 +- docs/contributing/profiling.md | 2 +- docs/features/automatic_prefix_caching.md | 2 +- docs/features/context_extension.md | 4 +-- docs/features/lora.md | 2 +- docs/features/prompt_embeds.md | 4 +-- docs/features/speculative_decoding/README.md | 2 +- docs/features/speculative_decoding/eagle.md | 2 +- docs/features/structured_outputs.md | 8 ++--- .../models/extensions/runai_model_streamer.md | 2 +- docs/serving/data_parallel_deployment.md | 2 +- docs/usage/reproducibility.md | 2 +- .../automatic_prefix_caching_offline.py | 2 +- .../context_extension_offline.py | 2 +- .../data_parallel/data_parallel_offline.py | 6 ++-- .../multi_instance_data_parallel.py | 2 +- examples/features/logits_processor/README.md | 6 ++-- examples/features/openai_batch/README.md | 12 +++---- 
.../prompt_embed/prompt_embed_offline.py | 2 +- .../routed_experts/routed_experts_offline.py | 2 +- .../features/structured_outputs/README.md | 2 +- .../torchrun/torchrun_dp_example_offline.py | 4 +-- tests/distributed/test_torchrun_example.py | 2 +- .../distributed/test_torchrun_example_moe.py | 2 +- .../v1/spec_decode/test_acceptance_length.py | 2 +- vllm/benchmarks/datasets/datasets.py | 4 --- vllm/entrypoints/llm.py | 2 +- .../model_loader/sharded_state_loader.py | 2 +- vllm/model_executor/models/config.py | 2 +- vllm/v1/engine/utils.py | 2 +- vllm/v1/executor/uniproc_executor.py | 2 +- 37 files changed, 80 insertions(+), 85 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 62b68b74ffe6..3ff5413f707e 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -395,11 +395,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching_offline.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 
--dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi250 · kernels ----------------------------------------------------------# @@ -1168,13 +1168,13 @@ steps: - vllm/v1/attention/backends/ - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - - examples/offline_inference/data_parallel_offline.py + - examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - label: Distributed Tests (4xA100-4xMI300) # TBD timeout_in_minutes: 180 @@ -1203,7 +1203,7 @@ steps: - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - examples/rl/ - - tests/examples/offline_inference/data_parallel_offline.py + - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 @@ -1213,7 +1213,7 @@ steps: - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - DP_SIZE=4 ENABLE_EP=1 torchrun 
--nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel_offline.py --enforce-eager + - python3 ../examples/features/data_parallel/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py @@ -1266,7 +1266,7 @@ steps: optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example_offline.py + - examples/features/torchrun/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -1275,7 +1275,7 @@ steps: - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -1654,11 +1654,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching_offline.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf 
--dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# @@ -2302,7 +2302,7 @@ steps: commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -2713,7 +2713,7 @@ steps: - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - tests/v1/distributed/test_dbo.py - - examples/offline_inference/data_parallel_offline.py + - 
examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: @@ -2937,11 +2937,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching_offline.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 81a45f54465f..e1d6e2039c59 100644 --- 
a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -88,9 +88,8 @@ steps: - vllm/distributed/ - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - - examples/offline_inference/rlhf_colocate.py - examples/rl/ - - tests/examples/offline_inference/data_parallel_offline.py + - tests/examples/features/data_parallel/data_parallel_offline.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 @@ -107,7 +106,7 @@ steps: # test with torchrun tp=2 and dp=2 with ep - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py # test with internal dp - - python3 examples/offline_inference/data_parallel_offline.py --enforce-eager + - python3 examples/features/data_parallel/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py @@ -159,7 +158,7 @@ steps: num_devices: 8 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example_offline.py + - examples/features/torchrun/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -169,7 +168,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and dp=4 with ep - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: Distributed Tests (4 GPUs)(A100) device: a100 @@ -194,7 +193,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py 
- - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -222,9 +221,9 @@ steps: - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ - - tests/examples/offline_inference/data_parallel_offline.py + - tests/examples/features/data_parallel/data_parallel_offline.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 
../examples/offline_inference/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Pipeline + Context Parallelism (4 GPUs) timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index 558d24851ab0..1552aceab4ab 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -120,12 +120,12 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching_offline.py + - python3 
features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Metrics, Tracing (2 GPUs) timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 10d093878ecd..74025d34f8b7 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -31,8 +31,9 @@ steps: - vllm/v1/worker/gpu/ - vllm/v1/core/sched/ - vllm/v1/worker/gpu_worker.py - - examples/offline_inference/ - examples/basic/offline_inference/ + - 
examples/generate/multimodal/ + - examples/features/ - examples/pooling/embed/vision_embedding_offline.py - examples/others/tensorize_vllm_model.py commands: @@ -51,12 +52,12 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching_offline.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Model Runner V2 Distributed (2 GPUs) timeout_in_minutes: 45 diff --git a/.github/mergify.yml 
b/.github/mergify.yml index c73a876d3f14..de3c76fd458b 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -308,8 +308,7 @@ pull_request_rules: - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - files=docs/features/structured_outputs.md - - files=examples/offline_inference/structured_outputs_offline.py - - files=examples/online_serving/structured_outputs/structured_outputs_offline.py + - files~=^examples/features/structured_outputs/ - files~=^tests/v1/structured_output/ - files=tests/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ @@ -325,7 +324,7 @@ pull_request_rules: - or: - files~=^vllm/v1/spec_decode/ - files~=^tests/v1/spec_decode/ - - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py + - files~=^examples/features/speculative_decoding/ - files~=^vllm/model_executor/models/.*eagle.*\.py - files=vllm/model_executor/models/mlp_speculator.py - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py diff --git a/docs/cli/README.md b/docs/cli/README.md index c708eb795898..d912fd7b06e3 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -172,7 +172,7 @@ Using remote file: ```bash vllm run-batch \ - -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 8ea241c582e5..2c098118dbb1 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) !!!
note With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). - You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + You can convert the model checkpoint to a sharded checkpoint using [examples/features/sharded_state/save_sharded_state_offline.py](../../examples/features/sharded_state/save_sharded_state_offline.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Quantization diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 650aad9e3470..91757c40e4f8 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -42,7 +42,7 @@ Traces can be visualized using . #### Offline Inference -Refer to [examples/offline_inference/simple_profiling.py](../../examples/features/profiling/simple_profiling_offline.py) for an example. +Refer to [examples/features/profiling/simple_profiling_offline.py](../../examples/features/profiling/simple_profiling_offline.py) for an example. #### OpenAI Server diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md index 32163d3b10f1..fe7977ee23d0 100644 --- a/docs/features/automatic_prefix_caching.md +++ b/docs/features/automatic_prefix_caching.md @@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, Set `enable_prefix_caching=True` in vLLM engine to enable APC.
Here is an example: -[examples/offline_inference/automatic_prefix_caching.py](../../examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py) +[examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py](../../examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py) ## Example workloads diff --git a/docs/features/context_extension.md b/docs/features/context_extension.md index 9b75249923ef..f96340c3183f 100644 --- a/docs/features/context_extension.md +++ b/docs/features/context_extension.md @@ -6,12 +6,12 @@ This directory contains examples for extending the context length of models usin ## Offline Inference Example -The [`context_extension.py`](../../examples/offline_inference/context_extension) script demonstrates how to extend the context length of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. +The [`context_extension.py`](../../examples/features/context_extension/context_extension_offline.py) script demonstrates how to extend the context length of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. ### Usage ```bash -python examples/offline_inference/context_extension_offline.py +python examples/features/context_extension/context_extension_offline.py ``` ## OpenAI Online Method diff --git a/docs/features/lora.md b/docs/features/lora.md index 4cd523572b84..d78fdc05792e 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter. ) ``` -Check out [examples/offline_inference/multilora_inference.py](../../examples/features/lora/multilora_offline.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. 
+Check out [examples/features/lora/multilora_offline.py](../../examples/features/lora/multilora_offline.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 9db46182d0f6..3d68b07a3ace 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples: -[examples/offline_inference/prompt_embed_inference.py](../../examples/features/prompt_embed/prompt_embed_offline.py) +[examples/features/prompt_embed/prompt_embed_offline.py](../../examples/features/prompt_embed/prompt_embed_offline.py) ## Online Serving @@ -41,4 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \ Then, you can use the OpenAI client as follows: -[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py) +[examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py](../../examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py) diff --git a/docs/features/speculative_decoding/README.md b/docs/features/speculative_decoding/README.md index 4450cfb82874..bef71a4f5a37 100644 --- a/docs/features/speculative_decoding/README.md +++ b/docs/features/speculative_decoding/README.md @@ -32,7 +32,7 @@ depend on your model family, traffic pattern, hardware, and sampling settings. | Suffix decoding | Low to medium gain | Medium gain | No extra draft model; dynamic speculation depth. 
| For reproducible measurements in your environment, use -[`examples/offline_inference/spec_decode.py`](../../../examples/features/speculative_decoding/spec_decode_offline.py) +[`examples/features/speculative_decoding/spec_decode_offline.py`](../../../examples/features/speculative_decoding/spec_decode_offline.py) or the [benchmark CLI guide](../../benchmarking/cli.md). ## `--speculative-config` schema diff --git a/docs/features/speculative_decoding/eagle.md b/docs/features/speculative_decoding/eagle.md index c52468eac4b5..cc9e4fd4c0c1 100644 --- a/docs/features/speculative_decoding/eagle.md +++ b/docs/features/speculative_decoding/eagle.md @@ -1,6 +1,6 @@ # EAGLE Draft Models -The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/offline_inference/spec_decode.py](../../../examples/features/speculative_decoding/spec_decode_offline.py) +The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. 
A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/features/speculative_decoding/spec_decode_offline.py](../../../examples/features/speculative_decoding/spec_decode_offline.py) ## Eagle Drafter Example diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 41cf7be89291..4849953b61c3 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -165,7 +165,7 @@ As an example, we can use to define a specific format of simplified SQL queries: print(completion.choices[0].message.content) ``` -See also: [full example](../examples/online_serving/structured_outputs.md) +See also: [full example](../examples/features/structured_outputs/README.md) ## Reasoning Outputs @@ -208,7 +208,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th print("content: ", completion.choices[0].message.content) ``` -See also: [full example](../examples/online_serving/structured_outputs.md) +See also: [full example](../examples/features/structured_outputs/README.md) !!! note When using Qwen3 Coder models with reasoning enabled, structured outputs might become disabled if the reasoning content does not get parsed into the `reasoning` field separately (v0.11.2+). 
@@ -304,7 +304,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` -An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs) +An example of using `structural_tag` can be found here: [examples/features/structured_outputs](../../examples/features/structured_outputs/README.md) ## Offline Inference @@ -339,4 +339,4 @@ shown below: print(outputs[0].outputs[0].text) ``` -See also: [full example](../examples/online_serving/structured_outputs.md) +See also: [full example](../examples/features/structured_outputsstructured_outputs.md) diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index 837aea23a404..965b2932ffaa 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -101,7 +101,7 @@ vllm serve /path/to/sharded/model \ --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` -To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/features/sharded_state/save_sharded_state_offline.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. +To create sharded model files, you can use the script provided in [examples/features/sharded_state/save_sharded_state_offline.py](../../../examples/features/sharded_state/save_sharded_state_offline.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. 
These can be configured in the same way: diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md index 71957385bfd5..7b963b99d565 100644 --- a/docs/serving/data_parallel_deployment.md +++ b/docs/serving/data_parallel_deployment.md @@ -16,7 +16,7 @@ For MoE models, when any requests are in progress in any rank, we must ensure th In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently. -This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/features/data_parallel/data_parallel_offline.py). +This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/features/data_parallel/data_parallel_offline.py](../../examples/features/data_parallel/data_parallel_offline.py). There are two distinct modes supported for online deployments - self-contained with internal load balancing, or externally per-rank process deployment and load balancing. diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md index dab180ff8266..680791bbe24a 100644 --- a/docs/usage/reproducibility.md +++ b/docs/usage/reproducibility.md @@ -7,7 +7,7 @@ reproducible results: or enable [batch invariance](../features/batch_invariance.md) to make the outputs insensitive to scheduling. - In online mode, you can only enable [batch invariance](../features/batch_invariance.md). 
-Example: [examples/offline_inference/reproducibility.py](../../examples/features/batch_invariance/reproducibility_offline.py) +Example: [examples/features/batch_invariance/reproducibility_offline.py](../../examples/features/batch_invariance/reproducibility_offline.py) !!! warning diff --git a/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py index d11e8547410f..801b4b769792 100644 --- a/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py +++ b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py @@ -15,7 +15,7 @@ but ask different questions. Run: -python examples/offline_inference/automatic_prefix_caching_offline.py +python examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py """ import time diff --git a/examples/features/context_extension/context_extension_offline.py b/examples/features/context_extension/context_extension_offline.py index 4ea0c356f1d1..3874288b5e11 100644 --- a/examples/features/context_extension/context_extension_offline.py +++ b/examples/features/context_extension/context_extension_offline.py @@ -6,7 +6,7 @@ and run a simple chat example. 
Usage: - python examples/offline_inference/context_extension_offline.py + python examples/features/context_extension/context_extension_offline.py """ from vllm import LLM, RequestOutput, SamplingParams diff --git a/examples/features/data_parallel/data_parallel_offline.py b/examples/features/data_parallel/data_parallel_offline.py index fccfa782236c..c38ff7297afc 100644 --- a/examples/features/data_parallel/data_parallel_offline.py +++ b/examples/features/data_parallel/data_parallel_offline.py @@ -3,14 +3,14 @@ """ Usage: Single node: - python examples/offline_inference/data_parallel_offline.py \ + python examples/features/data_parallel/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): - python examples/offline_inference/data_parallel_offline.py \ + python examples/features/data_parallel/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ @@ -19,7 +19,7 @@ --dp-master-addr=10.99.48.128 \ --dp-master-port=13345 Node 1: - python examples/offline_inference/data_parallel_offline.py \ + python examples/features/data_parallel/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ diff --git a/examples/features/data_parallel/multi_instance_data_parallel.py b/examples/features/data_parallel/multi_instance_data_parallel.py index 04d21e048940..66fcd3d24644 100644 --- a/examples/features/data_parallel/multi_instance_data_parallel.py +++ b/examples/features/data_parallel/multi_instance_data_parallel.py @@ -12,7 +12,7 @@ """ To run this example, run the following commands simultaneously with different CUDA_VISIBLE_DEVICES: - python examples/online_serving/multi_instance_data_parallel.py + python examples/features/data_parallel/multi_instance_data_parallel.py vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \ --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \ diff --git a/examples/features/logits_processor/README.md 
b/examples/features/logits_processor/README.md index 6b6e16942f85..07ca07dc71ed 100644 --- a/examples/features/logits_processor/README.md +++ b/examples/features/logits_processor/README.md @@ -9,7 +9,7 @@ This directory contains examples demonstrating how to use custom logits processo Demonstrates how to instantiate vLLM with a custom logits processor class that operates at the batch level. The example uses a `DummyLogitsProcessor` that masks out all tokens except a specified `target_token` when passed via `SamplingParams.extra_args`. ```bash -python examples/offline_inference/logits_processor/custom.py +python examples/features/logits_processor/custom.py ``` ### `custom_req.py` — Request-level logits processor wrapper @@ -17,7 +17,7 @@ python examples/offline_inference/logits_processor/custom.py Shows how to wrap a request-level logits processor (which operates on individual requests) to be compatible with vLLM's batch-level logits processing interface. ```bash -python examples/offline_inference/logits_processor/custom_req.py +python examples/features/logits_processor/custom_req.py ``` ### `custom_req_init.py` — Request-level processor with engine config @@ -25,7 +25,7 @@ python examples/offline_inference/logits_processor/custom_req.py A special case of wrapping a request-level logits processor where the processor needs access to engine configuration or model metadata during initialization (e.g., vocabulary size, tokenizer info). 
```bash -python examples/offline_inference/logits_processor/custom_req_init.py +python examples/features/logits_processor/custom_req_init.py ``` ## Key Concepts diff --git a/examples/features/openai_batch/README.md b/examples/features/openai_batch/README.md index ef4e438d6b72..5f020856c97d 100644 --- a/examples/features/openai_batch/README.md +++ b/examples/features/openai_batch/README.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/features/openai_batch/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -30,7 +30,7 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ```bash -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this @@ -77,11 +77,11 @@ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl`, you can run ```bash python -m vllm.entrypoints.openai.run_batch \ - -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -90,7 +90,7 @@ or use command-line: ```bash vllm run-batch \ - -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -113,7 +113,7 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
```bash -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this diff --git a/examples/features/prompt_embed/prompt_embed_offline.py b/examples/features/prompt_embed/prompt_embed_offline.py index 60292e0006a4..29853bce9673 100644 --- a/examples/features/prompt_embed/prompt_embed_offline.py +++ b/examples/features/prompt_embed/prompt_embed_offline.py @@ -15,7 +15,7 @@ - transformers Run: - python examples/offline_inference/prompt_embed_offline.py + python examples/features/prompt_embed/prompt_embed_offline.py """ import torch diff --git a/examples/features/routed_experts/routed_experts_offline.py b/examples/features/routed_experts/routed_experts_offline.py index 278f92b11bbe..6e82cd73ec00 100644 --- a/examples/features/routed_experts/routed_experts_offline.py +++ b/examples/features/routed_experts/routed_experts_offline.py @@ -9,7 +9,7 @@ 3. Results are deterministic across runs (baseline vs reference). 
Usage: - python examples/offline_inference/routed_experts_offline.py \ + python examples/features/routed_experts/routed_experts_offline.py \ --model Qwen/Qwen3-30B-A3B \ --tp 4 \ --max-model-len 4096 \ diff --git a/examples/features/structured_outputs/README.md b/examples/features/structured_outputs/README.md index cfa9282207b0..f2863eb0cbcf 100644 --- a/examples/features/structured_outputs/README.md +++ b/examples/features/structured_outputs/README.md @@ -20,7 +20,7 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ If you want to run this script standalone with `uv`, you can use the following: ```bash -uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \ +uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/features/structured_outputs \ structured-outputs ``` diff --git a/examples/features/torchrun/torchrun_dp_example_offline.py b/examples/features/torchrun/torchrun_dp_example_offline.py index 64809f4b1560..b6391373750b 100644 --- a/examples/features/torchrun/torchrun_dp_example_offline.py +++ b/examples/features/torchrun/torchrun_dp_example_offline.py @@ -7,12 +7,12 @@ To run this example: ```bash -$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example_offline.py +$ torchrun --nproc-per-node=2 examples/features/torchrun/torchrun_dp_example_offline.py ``` With custom parallelism settings: ```bash -$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example_offline.py \ +$ torchrun --nproc-per-node=8 examples/features/torchrun/torchrun_dp_example_offline.py \ --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep ``` """ diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index c99c3b3d32af..e72f00bc91e0 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project -# unit test for `examples/offline_inference/torchrun_example_offline.py` +# unit test for `examples/features/torchrun/torchrun_example_offline.py` import os import random diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py index 25e5c173d3a8..969b5e92e3fc 100644 --- a/tests/distributed/test_torchrun_example_moe.py +++ b/tests/distributed/test_torchrun_example_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# unit test for `examples/offline_inference/torchrun_example_offline.py` +# unit test for `examples/features/torchrun/torchrun_example_offline.py` import os import random diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index 75b601ad492f..bd13fb96ee26 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -43,7 +43,7 @@ class Eagle3ModelConfig: # Model configurations for EAGLE3 acceptance length tests. # Expected acceptance lengths are determined by running baseline benchmarks -# using examples/offline_inference/spec_decode_offline.py with the MT-Bench dataset. +# using examples/features/speculative_decoding/spec_decode_offline.py with the MT-Bench dataset. EAGLE3_MODEL_CONFIGS = [ Eagle3ModelConfig( verifier="meta-llama/Llama-3.1-8B-Instruct", diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py index 745b5ab2ff8f..7a9bdbd40cea 100644 --- a/vllm/benchmarks/datasets/datasets.py +++ b/vllm/benchmarks/datasets/datasets.py @@ -2918,10 +2918,6 @@ class MTBenchDataset(HuggingFaceDataset): """ MT-Bench Dataset. https://huggingface.co/datasets/philschmid/mt-bench - - We create a single turn dataset for MT-Bench. 
- This is similar to Spec decoding benchmark setup in vLLM - https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 """ # noqa: E501 DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5677c1bbb0d2..e67046b14117 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -334,7 +334,7 @@ def _make_config(value: Any, cls: type[_R]) -> _R: f"LLM(data_parallel_size={_dp_size}) is not supported for single-" "process usage and may hang. Please use " "the explicit multi-process data-parallel example at " - "'examples/offline_inference/data_parallel_offline.py'." + "'examples/features/data_parallel/data_parallel_offline.py'." ) engine_args = EngineArgs( diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 2c4e3b3ce65c..b96fa4f452ce 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -31,7 +31,7 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/offline_inference/save_sharded_state_offline.py` for creating a sharded + `examples/features/sharded_state/save_sharded_state_offline.py` for creating a sharded checkpoint. """ diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index e8f5101b577d..459c16f8ec97 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -517,7 +517,7 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: "Nomic context extension is disabled. " "Changing max_model_len from %s to %s. 
" "To enable context extension, see: " - "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.py", + "https://github.com/vllm-project/vllm/tree/main/examples/features/context_extension/context_extension_offline.py", max_model_len_before, model_config.max_model_len, ) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index ab7b91a66ac3..7b0f00d14c8a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -952,7 +952,7 @@ def get_engine_zmq_addresses( # In offline mode there is an LLM instance per DP rank and # one core engine per LLM, see - # examples/offline_inference/data_parallel_offline.py. + # examples/features/data_parallel/data_parallel_offline.py. offline_mode = local_start_index is not None # client_local_only = True for cases where this front-end diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index 537ef14a48ef..d006946079e7 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -147,7 +147,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor): offline inference with tensor parallelism. see https://github.com/vllm-project/vllm/issues/11400 for - the motivation, and examples/offline_inference/torchrun_example_offline.py + the motivation, and examples/features/torchrun/torchrun_example_offline.py for the usage example. 
The key idea: although it is tensor-parallel inference, we only From ea0636e5feda6a0d289cf6b88a82e3153e84eb00 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 10:12:44 +0800 Subject: [PATCH 3/9] refine Signed-off-by: wang.yuqi --- docs/cli/README.md | 2 +- examples/features/openai_batch/README.md | 10 +++++----- .../prompt_embed_inference_with_openai_client.py | 2 +- vllm/benchmarks/datasets/datasets.py | 4 ++++ 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index d912fd7b06e3..b27bd3b647b5 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -163,7 +163,7 @@ Running with a local file: ```bash vllm run-batch \ - -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -i features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` diff --git a/examples/features/openai_batch/README.md b/examples/features/openai_batch/README.md index 5f020856c97d..a9bd31691210 100644 --- a/examples/features/openai_batch/README.md +++ b/examples/features/openai_batch/README.md @@ -36,7 +36,7 @@ wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/ Once you've created your batch file it should look like this ```bash -cat offline_inference/openai_batch/openai_example_batch.jsonl +cat features/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ 
You can run the batch with the following command, which will write its results t ```bash python -m vllm.entrypoints.openai.run_batch \ - -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -i features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -58,7 +58,7 @@ or use command-line: ```bash vllm run-batch \ - -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -i features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -119,7 +119,7 @@ wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/ Once you've created your batch file it should look like this ```bash -cat offline_inference/openai_batch/openai_example_batch.jsonl +cat features/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -127,7 +127,7 @@ cat offline_inference/openai_batch/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
```bash -aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp features/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py index fa4b64c00703..40eae0c062dd 100644 --- a/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py +++ b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py @@ -15,7 +15,7 @@ --enable-prompt-embeds Run the client: -python examples/online_serving/prompt_embed_inference_with_openai_client.py +python examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py Model: meta-llama/Llama-3.2-1B-Instruct Note: This model is gated on Hugging Face Hub. diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py index 7a9bdbd40cea..40f5c5a46735 100644 --- a/vllm/benchmarks/datasets/datasets.py +++ b/vllm/benchmarks/datasets/datasets.py @@ -2918,6 +2918,10 @@ class MTBenchDataset(HuggingFaceDataset): """ MT-Bench Dataset. https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. 
+ This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 """ # noqa: E501 DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM From 30c23afff03ba31ae58676d401c691d6fffe987c Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 10:25:20 +0800 Subject: [PATCH 4/9] refine Signed-off-by: wang.yuqi --- vllm/benchmarks/datasets/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/datasets/datasets.py b/vllm/benchmarks/datasets/datasets.py index 40f5c5a46735..745b5ab2ff8f 100644 --- a/vllm/benchmarks/datasets/datasets.py +++ b/vllm/benchmarks/datasets/datasets.py @@ -2918,7 +2918,7 @@ class MTBenchDataset(HuggingFaceDataset): """ MT-Bench Dataset. https://huggingface.co/datasets/philschmid/mt-bench - + We create a single turn dataset for MT-Bench. This is similar to Spec decoding benchmark setup in vLLM https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 From b3f889a632116965cf56e6deb563832841d2d0c5 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 10:28:41 +0800 Subject: [PATCH 5/9] ruff Signed-off-by: wang.yuqi --- examples/features/torchrun/torchrun_dp_example_offline.py | 2 +- tests/v1/spec_decode/test_acceptance_length.py | 3 ++- vllm/model_executor/model_loader/sharded_state_loader.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/features/torchrun/torchrun_dp_example_offline.py b/examples/features/torchrun/torchrun_dp_example_offline.py index b6391373750b..2e356edd44e9 100644 --- a/examples/features/torchrun/torchrun_dp_example_offline.py +++ b/examples/features/torchrun/torchrun_dp_example_offline.py @@ -15,7 +15,7 @@ $ torchrun --nproc-per-node=8 examples/features/torchrun/torchrun_dp_example_offline.py \ --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep ``` -""" +""" # noqa: E501 import argparse diff --git 
a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index bd13fb96ee26..62ff100fdbf8 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -43,7 +43,8 @@ class Eagle3ModelConfig: # Model configurations for EAGLE3 acceptance length tests. # Expected acceptance lengths are determined by running baseline benchmarks -# using examples/features/speculative_decoding/spec_decode_offline.py with the MT-Bench dataset. +# using examples/features/speculative_decoding/spec_decode_offline.py +# with the MT-Bench dataset. EAGLE3_MODEL_CONFIGS = [ Eagle3ModelConfig( verifier="meta-llama/Llama-3.1-8B-Instruct", diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index b96fa4f452ce..3f57fe7e0265 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -31,8 +31,8 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/features/sharded_state/save_sharded_state_offline.py` for creating a sharded - checkpoint. + `examples/features/sharded_state/save_sharded_state_offline.py` for creating + a sharded checkpoint. 
""" DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors" From e65a2fc642e7f2842ccacae85c636d64dde7882c Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 10:34:52 +0800 Subject: [PATCH 6/9] ruff Signed-off-by: wang.yuqi --- examples/features/torchrun/torchrun_dp_example_offline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/features/torchrun/torchrun_dp_example_offline.py b/examples/features/torchrun/torchrun_dp_example_offline.py index 2e356edd44e9..f18f6042e9c6 100644 --- a/examples/features/torchrun/torchrun_dp_example_offline.py +++ b/examples/features/torchrun/torchrun_dp_example_offline.py @@ -15,7 +15,7 @@ $ torchrun --nproc-per-node=8 examples/features/torchrun/torchrun_dp_example_offline.py \ --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep ``` -""" # noqa: E501 +""" # noqa: E501 import argparse From ec1a17d64afb93b6579b747e263d102946930d00 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 10:59:53 +0800 Subject: [PATCH 7/9] mv routed_experts to rl Signed-off-by: wang.yuqi --- .../{features => rl}/routed_experts/routed_experts_offline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename examples/{features => rl}/routed_experts/routed_experts_offline.py (99%) diff --git a/examples/features/routed_experts/routed_experts_offline.py b/examples/rl/routed_experts/routed_experts_offline.py similarity index 99% rename from examples/features/routed_experts/routed_experts_offline.py rename to examples/rl/routed_experts/routed_experts_offline.py index 6e82cd73ec00..5a9a44ffb694 100644 --- a/examples/features/routed_experts/routed_experts_offline.py +++ b/examples/rl/routed_experts/routed_experts_offline.py @@ -9,7 +9,7 @@ 3. Results are deterministic across runs (baseline vs reference). 
Usage: - python examples/features/routed_experts/routed_experts_offline.py \ + python examples/rl/routed_experts/routed_experts_offline.py \ --model Qwen/Qwen3-30B-A3B \ --tp 4 \ --max-model-len 4096 \ From 3a3986daec404364ba43f0635da5cba72cf2262e Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 11:19:21 +0800 Subject: [PATCH 8/9] mv routed_experts_e2e.py to rl Signed-off-by: wang.yuqi --- .../routed_experts_offline.py => routed_experts_e2e.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename examples/rl/{routed_experts/routed_experts_offline.py => routed_experts_e2e.py} (99%) diff --git a/examples/rl/routed_experts/routed_experts_offline.py b/examples/rl/routed_experts_e2e.py similarity index 99% rename from examples/rl/routed_experts/routed_experts_offline.py rename to examples/rl/routed_experts_e2e.py index 5a9a44ffb694..1666bc3ffe16 100644 --- a/examples/rl/routed_experts/routed_experts_offline.py +++ b/examples/rl/routed_experts_e2e.py @@ -9,7 +9,7 @@ 3. Results are deterministic across runs (baseline vs reference). 
Usage: - python examples/rl/routed_experts/routed_experts_offline.py \ + python examples/rl/routed_experts_e2e.py \ --model Qwen/Qwen3-30B-A3B \ --tp 4 \ --max-model-len 4096 \ From 3a93356a0815d2090ca5aba7e508398045ed58d3 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 28 Apr 2026 13:57:06 +0800 Subject: [PATCH 9/9] fix docs Signed-off-by: wang.yuqi --- docs/features/structured_outputs.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 4849953b61c3..fa39f7ae6e48 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -165,7 +165,7 @@ As an example, we can use to define a specific format of simplified SQL queries: print(completion.choices[0].message.content) ``` -See also: [full example](../examples/features/structured_outputs/README.md) +See also: [full example](../../examples/features/structured_outputs/README.md) ## Reasoning Outputs @@ -208,7 +208,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th print("content: ", completion.choices[0].message.content) ``` -See also: [full example](../examples/features/structured_outputs/README.md) +See also: [full example](../../examples/features/structured_outputs/README.md) !!! note When using Qwen3 Coder models with reasoning enabled, structured outputs might become disabled if the reasoning content does not get parsed into the `reasoning` field separately (v0.11.2+). @@ -339,4 +339,4 @@ shown below: print(outputs[0].outputs[0].text) ``` -See also: [full example](../examples/features/structured_outputsstructured_outputs.md) +See also: [full example](../../examples/features/structured_outputs/structured_outputs_offline.py)