diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 68179dcb68cd..3ff5413f707e 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -395,11 +395,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi250 · kernels ----------------------------------------------------------# @@ -1168,13 +1168,13 @@ steps: - vllm/v1/attention/backends/ - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - - examples/offline_inference/data_parallel.py + - examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - label: Distributed Tests (4xA100-4xMI300) # TBD timeout_in_minutes: 180 @@ -1203,7 +1203,7 @@ steps: - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - examples/rl/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 @@ -1213,7 +1213,7 @@ steps: - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - python3 
../examples/features/data_parallel/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py @@ -1266,7 +1266,7 @@ steps: optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py + - examples/features/torchrun/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -1275,7 +1275,7 @@ steps: - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -1654,11 +1654,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# @@ -2302,7 +2302,7 @@ steps: commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -2713,7 +2713,7 @@ steps: - vllm/v1/attention/selector.py - 
tests/distributed/test_context_parallel.py - tests/v1/distributed/test_dbo.py - - examples/offline_inference/data_parallel.py + - examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: @@ -2937,11 +2937,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 093f3ab4fe1f..e1d6e2039c59 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -88,9 +88,8 @@ steps: - vllm/distributed/ - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - - examples/offline_inference/rlhf_colocate.py - examples/rl/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/features/data_parallel/data_parallel_offline.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 @@ -107,7 +106,7 @@ steps: # test with torchrun tp=2 and dp=2 with ep - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py # test with internal dp - - python3 examples/offline_inference/data_parallel.py --enforce-eager + - python3 examples/features/data_parallel/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py @@ -159,7 +158,7 @@ steps: num_devices: 8 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py + - examples/features/torchrun/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -169,7 +168,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and dp=4 with ep - - 
torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: Distributed Tests (4 GPUs)(A100) device: a100 @@ -194,7 +193,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -222,9 +221,9 @@ steps: - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/features/data_parallel/data_parallel_offline.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 
distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Pipeline + Context Parallelism (4 GPUs) timeout_in_minutes: 60 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index d0930be156d2..1552aceab4ab 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -120,12 +120,12 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Metrics, Tracing (2 GPUs) timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 2b88c00d6b77..74025d34f8b7 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -31,8 +31,9 @@ steps: - vllm/v1/worker/gpu/ - vllm/v1/core/sched/ - vllm/v1/worker/gpu_worker.py - - examples/offline_inference/ - examples/basic/offline_inference/ + - examples/generate/multimodal/ + - examples/features/ - examples/pooling/embed/vision_embedding_offline.py - examples/others/tensorize_vllm_model.py commands: @@ -51,12 +52,12 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors 
/tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Model Runner V2 Distributed (2 GPUs) timeout_in_minutes: 45 diff --git a/.github/mergify.yml b/.github/mergify.yml index 8ca00d6e7d2d..de3c76fd458b 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -308,8 +308,7 @@ pull_request_rules: - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - files=docs/features/structured_outputs.md - - files=examples/offline_inference/structured_outputs.py - - files=examples/online_serving/structured_outputs/structured_outputs.py + - files~=^examples/features/structured_outputs/ - files~=^tests/v1/structured_output/ - files=tests/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ @@ -325,7 +324,7 @@ pull_request_rules: - or: - files~=^vllm/v1/spec_decode/ - files~=^tests/v1/spec_decode/ - - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py + - files~=^examples/features/speculative_decoding/ - files~=^vllm/model_executor/models/.*eagle.*\.py - files=vllm/model_executor/models/mlp_speculator.py - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py diff --git a/docs/cli/README.md b/docs/cli/README.md index c708eb795898..b27bd3b647b5 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -163,7 +163,7 @@ Running with a local file: ```bash vllm run-batch \ - -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -i features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -172,7 +172,7 @@ Using remote file: ```bash vllm run-batch \ - -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 8ea241c582e5..2c098118dbb1 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -23,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) !!!
note With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). - You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + You can convert the model checkpoint to a sharded checkpoint using [examples/features/sharded_state/save_sharded_state_offline.py](../../examples/features/sharded_state/save_sharded_state_offline.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Quantization diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index addda300d020..91757c40e4f8 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -42,7 +42,7 @@ Traces can be visualized using . #### Offline Inference -Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example. +Refer to [examples/features/profiling/simple_profiling_offline.py](../../examples/features/profiling/simple_profiling_offline.py) for an example. #### OpenAI Server diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md index 3718a4b74eb2..fe7977ee23d0 100644 --- a/docs/features/automatic_prefix_caching.md +++ b/docs/features/automatic_prefix_caching.md @@ -11,7 +11,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: -[examples/offline_inference/automatic_prefix_caching.py](../../examples/offline_inference/automatic_prefix_caching.py) +[examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py](../../examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py) ## Example workloads diff --git a/docs/features/context_extension.md b/docs/features/context_extension.md index f622191aebc6..f96340c3183f 100644 --- a/docs/features/context_extension.md +++ b/docs/features/context_extension.md @@ -6,12 +6,12 @@ This directory contains examples for extending the context length of models usin ## Offline Inference Example -The [`context_extension.py`](../../examples/offline_inference/context_extension) script demonstrates how to extend the context length of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. +The [`context_extension_offline.py`](../../examples/features/context_extension/context_extension_offline.py) script demonstrates how to extend the context length of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. ### Usage ```bash -python examples/offline_inference/context_extension.py +python examples/features/context_extension/context_extension_offline.py ``` ## OpenAI Online Method diff --git a/docs/features/lora.md b/docs/features/lora.md index 2e7b36545d46..d78fdc05792e 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -47,7 +47,7 @@ the third parameter is the path to the LoRA adapter.
) ``` -Check out [examples/offline_inference/multilora_inference.py](../../examples/offline_inference/multilora_inference.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out [examples/features/lora/multilora_offline.py](../../examples/features/lora/multilora_offline.py) for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index b81d2f28e3b9..3d68b07a3ace 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -16,7 +16,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples: -[examples/offline_inference/prompt_embed_inference.py](../../examples/offline_inference/prompt_embed_inference.py) +[examples/features/prompt_embed/prompt_embed_offline.py](../../examples/features/prompt_embed/prompt_embed_offline.py) ## Online Serving @@ -41,4 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --runner generate \ Then, you can use the OpenAI client as follows: -[examples/online_serving/prompt_embed_inference_with_openai_client.py](../../examples/online_serving/prompt_embed_inference_with_openai_client.py) +[examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py](../../examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py) diff --git a/docs/features/speculative_decoding/README.md b/docs/features/speculative_decoding/README.md index 25cda8059b24..bef71a4f5a37 100644 --- a/docs/features/speculative_decoding/README.md +++ b/docs/features/speculative_decoding/README.md @@ -32,7 +32,7 @@ depend on your model family, traffic pattern, hardware, and sampling settings. | Suffix decoding | Low to medium gain | Medium gain | No extra draft model; dynamic speculation depth. | For reproducible measurements in your environment, use -[`examples/offline_inference/spec_decode.py`](../../../examples/offline_inference/spec_decode.py) +[`examples/features/speculative_decoding/spec_decode_offline.py`](../../../examples/features/speculative_decoding/spec_decode_offline.py) or the [benchmark CLI guide](../../benchmarking/cli.md). ## `--speculative-config` schema diff --git a/docs/features/speculative_decoding/eagle.md b/docs/features/speculative_decoding/eagle.md index 3e0f3add416e..cc9e4fd4c0c1 100644 --- a/docs/features/speculative_decoding/eagle.md +++ b/docs/features/speculative_decoding/eagle.md @@ -1,6 +1,6 @@ # EAGLE Draft Models -The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/offline_inference/spec_decode.py](../../../examples/offline_inference/spec_decode.py) +The following code configures vLLM to use speculative decoding where proposals are generated by an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. 
A more detailed example for offline mode, including how to extract request level acceptance rate, can be found in [examples/features/speculative_decoding/spec_decode_offline.py](../../../examples/features/speculative_decoding/spec_decode_offline.py) ## Eagle Drafter Example diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 41cf7be89291..fa39f7ae6e48 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -165,7 +165,7 @@ As an example, we can use to define a specific format of simplified SQL queries: print(completion.choices[0].message.content) ``` -See also: [full example](../examples/online_serving/structured_outputs.md) +See also: [full example](../../examples/features/structured_outputs/README.md) ## Reasoning Outputs @@ -208,7 +208,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th print("content: ", completion.choices[0].message.content) ``` -See also: [full example](../examples/online_serving/structured_outputs.md) +See also: [full example](../../examples/features/structured_outputs/README.md) !!! note When using Qwen3 Coder models with reasoning enabled, structured outputs might become disabled if the reasoning content does not get parsed into the `reasoning` field separately (v0.11.2+). @@ -304,7 +304,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` -An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs) +An example of using `structural_tag` can be found here: [examples/features/structured_outputs](../../examples/features/structured_outputs/README.md) ## Offline Inference @@ -339,4 +339,4 @@ shown below: print(outputs[0].outputs[0].text) ``` -See also: [full example](../examples/online_serving/structured_outputs.md) +See also: [full example](../../examples/features/structured_outputs/structured_outputs_offline.py) diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index 38c603b46e10..965b2932ffaa 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -101,7 +101,7 @@ vllm serve /path/to/sharded/model \ --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` -To create sharded model files, you can use the script provided in [examples/offline_inference/save_sharded_state.py](../../../examples/offline_inference/save_sharded_state.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. +To create sharded model files, you can use the script provided in [examples/features/sharded_state/save_sharded_state_offline.py](../../../examples/features/sharded_state/save_sharded_state_offline.py). This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. 
These can be configured in the same way: diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md index f0946eaf407a..7b963b99d565 100644 --- a/docs/serving/data_parallel_deployment.md +++ b/docs/serving/data_parallel_deployment.md @@ -16,7 +16,7 @@ For MoE models, when any requests are in progress in any rank, we must ensure th In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently. -This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/offline_inference/data_parallel.py](../../examples/offline_inference/data_parallel.py). +This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see [examples/features/data_parallel/data_parallel_offline.py](../../examples/features/data_parallel/data_parallel_offline.py). There are two distinct modes supported for online deployments - self-contained with internal load balancing, or externally per-rank process deployment and load balancing. diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md index a8e49d0a3398..680791bbe24a 100644 --- a/docs/usage/reproducibility.md +++ b/docs/usage/reproducibility.md @@ -7,7 +7,7 @@ reproducible results: or enable [batch invariance](../features/batch_invariance.md) to make the outputs insensitive to scheduling. - In online mode, you can only enable [batch invariance](../features/batch_invariance.md). -Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py) +Example: [examples/features/batch_invariance/reproducibility_offline.py](../../examples/features/batch_invariance/reproducibility_offline.py) !!! warning diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py similarity index 98% rename from examples/offline_inference/automatic_prefix_caching.py rename to examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py index 2d3c28d9dd4f..801b4b769792 100644 --- a/examples/offline_inference/automatic_prefix_caching.py +++ b/examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py @@ -15,7 +15,7 @@ but ask different questions. 
Run: -python examples/offline_inference/automatic_prefix_caching.py +python examples/features/automatic_prefix_caching/automatic_prefix_caching_offline.py """ import time diff --git a/examples/offline_inference/prefix_caching.py b/examples/features/automatic_prefix_caching/prefix_caching_offline.py similarity index 100% rename from examples/offline_inference/prefix_caching.py rename to examples/features/automatic_prefix_caching/prefix_caching_offline.py diff --git a/examples/offline_inference/reproducibility.py b/examples/features/batch_invariance/reproducibility_offline.py similarity index 100% rename from examples/offline_inference/reproducibility.py rename to examples/features/batch_invariance/reproducibility_offline.py diff --git a/examples/offline_inference/context_extension.py b/examples/features/context_extension/context_extension_offline.py similarity index 96% rename from examples/offline_inference/context_extension.py rename to examples/features/context_extension/context_extension_offline.py index fae8590f914e..3874288b5e11 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/features/context_extension/context_extension_offline.py @@ -6,7 +6,7 @@ and run a simple chat example. Usage: - python examples/offline_inference/context_extension.py + python examples/features/context_extension/context_extension_offline.py """ from vllm import LLM, RequestOutput, SamplingParams diff --git a/examples/offline_inference/data_parallel.py b/examples/features/data_parallel/data_parallel_offline.py similarity index 96% rename from examples/offline_inference/data_parallel.py rename to examples/features/data_parallel/data_parallel_offline.py index 287409fa2b5c..c38ff7297afc 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/features/data_parallel/data_parallel_offline.py @@ -3,14 +3,14 @@ """ Usage: Single node: - python examples/offline_inference/data_parallel.py \ + python examples/features/data_parallel/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): - python examples/offline_inference/data_parallel.py \ + python examples/features/data_parallel/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ @@ -19,7 +19,7 @@ --dp-master-addr=10.99.48.128 \ --dp-master-port=13345 Node 1: - python examples/offline_inference/data_parallel.py \ + python examples/features/data_parallel/data_parallel_offline.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/features/data_parallel/multi_instance_data_parallel.py similarity index 97% rename from examples/online_serving/multi_instance_data_parallel.py rename to examples/features/data_parallel/multi_instance_data_parallel.py index 04d21e048940..66fcd3d24644 100644 --- a/examples/online_serving/multi_instance_data_parallel.py +++ b/examples/features/data_parallel/multi_instance_data_parallel.py @@ -12,7 +12,7 @@ """ To run this example, run the following commands simultaneously with different CUDA_VISIBLE_DEVICES: - python examples/online_serving/multi_instance_data_parallel.py + python examples/features/data_parallel/multi_instance_data_parallel.py vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \ --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \ diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/features/kv_events/kv_events_subscriber.py similarity index 100% rename from 
examples/online_serving/kv_events_subscriber.py rename to examples/features/kv_events/kv_events_subscriber.py diff --git a/examples/offline_inference/logits_processor/README.md b/examples/features/logits_processor/README.md similarity index 90% rename from examples/offline_inference/logits_processor/README.md rename to examples/features/logits_processor/README.md index 6b6e16942f85..07ca07dc71ed 100644 --- a/examples/offline_inference/logits_processor/README.md +++ b/examples/features/logits_processor/README.md @@ -9,7 +9,7 @@ This directory contains examples demonstrating how to use custom logits processo Demonstrates how to instantiate vLLM with a custom logits processor class that operates at the batch level. The example uses a `DummyLogitsProcessor` that masks out all tokens except a specified `target_token` when passed via `SamplingParams.extra_args`. ```bash -python examples/offline_inference/logits_processor/custom.py +python examples/features/logits_processor/custom.py ``` ### `custom_req.py` — Request-level logits processor wrapper @@ -17,7 +17,7 @@ python examples/offline_inference/logits_processor/custom.py Shows how to wrap a request-level logits processor (which operates on individual requests) to be compatible with vLLM's batch-level logits processing interface. ```bash -python examples/offline_inference/logits_processor/custom_req.py +python examples/features/logits_processor/custom_req.py ``` ### `custom_req_init.py` — Request-level processor with engine config @@ -25,7 +25,7 @@ python examples/offline_inference/logits_processor/custom_req.py A special case of wrapping a request-level logits processor where the processor needs access to engine configuration or model metadata during initialization (e.g., vocabulary size, tokenizer info). 
```bash -python examples/offline_inference/logits_processor/custom_req_init.py +python examples/features/logits_processor/custom_req_init.py ``` ## Key Concepts diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/features/logits_processor/custom.py similarity index 100% rename from examples/offline_inference/logits_processor/custom.py rename to examples/features/logits_processor/custom.py diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/features/logits_processor/custom_req.py similarity index 100% rename from examples/offline_inference/logits_processor/custom_req.py rename to examples/features/logits_processor/custom_req.py diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/features/logits_processor/custom_req_init.py similarity index 100% rename from examples/offline_inference/logits_processor/custom_req_init.py rename to examples/features/logits_processor/custom_req_init.py diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/features/lora/lora_with_quantization_offline.py similarity index 100% rename from examples/offline_inference/lora_with_quantization_inference.py rename to examples/features/lora/lora_with_quantization_offline.py diff --git a/examples/offline_inference/multilora_inference.py b/examples/features/lora/multilora_offline.py similarity index 100% rename from examples/offline_inference/multilora_inference.py rename to examples/features/lora/multilora_offline.py diff --git a/examples/offline_inference/openai_batch/README.md b/examples/features/openai_batch/README.md similarity index 94% rename from examples/offline_inference/openai_batch/README.md rename to examples/features/openai_batch/README.md index ef4e438d6b72..a9bd31691210 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/features/openai_batch/README.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/features/openai_batch/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -30,13 +30,13 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
```bash -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ```bash -cat offline_inference/openai_batch/openai_example_batch.jsonl +cat features/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ You can run the batch with the following command, which will write its results t ```bash python -m vllm.entrypoints.openai.run_batch \ - -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -i features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -58,7 +58,7 @@ or use command-line: ```bash vllm run-batch \ - -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -i features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -77,11 +77,11 @@ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. -For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl`, you can run ```bash python -m vllm.entrypoints.openai.run_batch \ - -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -90,7 +90,7 @@ or use command-line: ```bash vllm run-batch \ - -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -113,13 +113,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
```bash -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/features/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ```bash -cat offline_inference/openai_batch/openai_example_batch.jsonl +cat features/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -127,7 +127,7 @@ cat offline_inference/openai_batch/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. ```bash -aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp features/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/offline_inference/openai_batch/openai_example_batch.jsonl b/examples/features/openai_batch/openai_example_batch.jsonl similarity index 100% rename from examples/offline_inference/openai_batch/openai_example_batch.jsonl rename to examples/features/openai_batch/openai_example_batch.jsonl diff --git a/examples/online_serving/data_parallel_pause_resume.py b/examples/features/pause_resume/data_parallel_pause_resume.py similarity index 96% rename from examples/online_serving/data_parallel_pause_resume.py rename to examples/features/pause_resume/data_parallel_pause_resume.py index e94de22a1271..1f11536e5366 100644 --- a/examples/online_serving/data_parallel_pause_resume.py +++ b/examples/features/pause_resume/data_parallel_pause_resume.py @@ -1,135 +1,135 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Test pause/resume with Data Parallel (DP) via HTTP API. - -This example demonstrates coordinated pause/resume across multiple DP ranks. -The pause synchronizes across all DP engines via all-reduce. - -Prerequisites: - Start a vLLM server with data parallelism: - - $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \ - --enforce-eager \ - --data-parallel-size 4 \ - --tensor-parallel-size 1 - - Then run this script: - - $ python data_parallel_pause_resume.py - -The test verifies pause works by: -1. Starting a streaming generation request -2. Pausing the server mid-generation -3. Sleeping for PAUSE_DURATION seconds -4. Resuming the server -5. 
Verifying there was a gap in token generation matching the pause duration -""" - -import argparse -import threading -import time - -import requests -from openai import OpenAI - -BASE_URL = "http://localhost:8000" -MODEL_NAME = "facebook/opt-125m" -PAUSE_DURATION = 3.0 - - -def pause_generation(base_url: str, mode: str = "keep") -> None: - """Pause generation via HTTP endpoint.""" - url = f"{base_url}/pause" - response = requests.post(url, params={"mode": mode}, timeout=60) - response.raise_for_status() - print("Server paused") - - -def resume_generation(base_url: str) -> None: - """Resume generation via HTTP endpoint.""" - url = f"{base_url}/resume" - response = requests.post(url, timeout=60) - response.raise_for_status() - print("Server resumed") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", default=BASE_URL) - parser.add_argument("--model", default=MODEL_NAME) - args = parser.parse_args() - - client = OpenAI( - base_url=f"{args.base_url}/v1", - api_key="EMPTY", - ) - - prompt = "Write a long story about a dragon. Once upon a time" - token_times: list[float] = [] - pause_token_idx = 0 - pause_triggered = threading.Event() - - def generator_thread(): - """Stream tokens and record timestamps.""" - stream = client.completions.create( - model=args.model, - prompt=prompt, - max_tokens=50, - stream=True, - ) - for chunk in stream: - if chunk.choices[0].text: - token_times.append(time.monotonic()) - token_count = len(token_times) - print(f"Token {token_count}: {chunk.choices[0].text!r}") - - # Signal controller after some tokens - if token_count >= 5 and not pause_triggered.is_set(): - pause_triggered.set() - - def controller_thread(): - """Pause and resume the server.""" - nonlocal pause_token_idx - - # Wait for some tokens - pause_triggered.wait() - - print(f"\nPausing server (keep mode) at token {len(token_times)}...") - pause_generation(args.base_url, mode="keep") - pause_token_idx = len(token_times) - print(f"Sleeping for {PAUSE_DURATION}s...") - - time.sleep(PAUSE_DURATION) - - print("Resuming server...") - resume_generation(args.base_url) - print("Resumed!\n") - - # Run both threads - gen_thread = threading.Thread(target=generator_thread) - ctrl_thread = threading.Thread(target=controller_thread) - - gen_thread.start() - ctrl_thread.start() - - gen_thread.join() - ctrl_thread.join() - - # Check gap at the pause point - if pause_token_idx < len(token_times): - pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1] - print( - f"\nGap after pause (token {pause_token_idx} -> " - f"{pause_token_idx + 1}): {pause_gap:.3f}s" - ) - if pause_gap >= PAUSE_DURATION * 0.9: - print("Test passed! Pause synchronized across DP ranks.") - else: - print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s") - else: - print("Test failed! No tokens were generated after resuming.") - - -if __name__ == "__main__": - main() +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test pause/resume with Data Parallel (DP) via HTTP API. + +This example demonstrates coordinated pause/resume across multiple DP ranks. +The pause synchronizes across all DP engines via all-reduce. + +Prerequisites: + Start a vLLM server with data parallelism: + + $ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \ + --enforce-eager \ + --data-parallel-size 4 \ + --tensor-parallel-size 1 + + Then run this script: + + $ python data_parallel_pause_resume.py + +The test verifies pause works by: +1. 
Starting a streaming generation request +2. Pausing the server mid-generation +3. Sleeping for PAUSE_DURATION seconds +4. Resuming the server +5. Verifying there was a gap in token generation matching the pause duration +""" + +import argparse +import threading +import time + +import requests +from openai import OpenAI + +BASE_URL = "http://localhost:8000" +MODEL_NAME = "facebook/opt-125m" +PAUSE_DURATION = 3.0 + + +def pause_generation(base_url: str, mode: str = "keep") -> None: + """Pause generation via HTTP endpoint.""" + url = f"{base_url}/pause" + response = requests.post(url, params={"mode": mode}, timeout=60) + response.raise_for_status() + print("Server paused") + + +def resume_generation(base_url: str) -> None: + """Resume generation via HTTP endpoint.""" + url = f"{base_url}/resume" + response = requests.post(url, timeout=60) + response.raise_for_status() + print("Server resumed") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--base-url", default=BASE_URL) + parser.add_argument("--model", default=MODEL_NAME) + args = parser.parse_args() + + client = OpenAI( + base_url=f"{args.base_url}/v1", + api_key="EMPTY", + ) + + prompt = "Write a long story about a dragon. Once upon a time" + token_times: list[float] = [] + pause_token_idx = 0 + pause_triggered = threading.Event() + + def generator_thread(): + """Stream tokens and record timestamps.""" + stream = client.completions.create( + model=args.model, + prompt=prompt, + max_tokens=50, + stream=True, + ) + for chunk in stream: + if chunk.choices[0].text: + token_times.append(time.monotonic()) + token_count = len(token_times) + print(f"Token {token_count}: {chunk.choices[0].text!r}") + + # Signal controller after some tokens + if token_count >= 5 and not pause_triggered.is_set(): + pause_triggered.set() + + def controller_thread(): + """Pause and resume the server.""" + nonlocal pause_token_idx + + # Wait for some tokens + pause_triggered.wait() + + print(f"\nPausing server (keep mode) at token {len(token_times)}...") + pause_generation(args.base_url, mode="keep") + pause_token_idx = len(token_times) + print(f"Sleeping for {PAUSE_DURATION}s...") + + time.sleep(PAUSE_DURATION) + + print("Resuming server...") + resume_generation(args.base_url) + print("Resumed!\n") + + # Run both threads + gen_thread = threading.Thread(target=generator_thread) + ctrl_thread = threading.Thread(target=controller_thread) + + gen_thread.start() + ctrl_thread.start() + + gen_thread.join() + ctrl_thread.join() + + # Check gap at the pause point + if pause_token_idx < len(token_times): + pause_gap = token_times[pause_token_idx] - token_times[pause_token_idx - 1] + print( + f"\nGap after pause (token {pause_token_idx} -> " + f"{pause_token_idx + 1}): {pause_gap:.3f}s" + ) + if pause_gap >= PAUSE_DURATION * 0.9: + print("Test passed! Pause synchronized across DP ranks.") + else: + print(f"Test failed! Expected ~{PAUSE_DURATION}s gap, got {pause_gap:.3f}s") + else: + print("Test failed! 
No tokens were generated after resuming.") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/pause_resume.py b/examples/features/pause_resume/pause_resume_offline.py similarity index 100% rename from examples/offline_inference/pause_resume.py rename to examples/features/pause_resume/pause_resume_offline.py diff --git a/examples/offline_inference/run_one_batch.py b/examples/features/profiling/run_one_batch_offline.py similarity index 100% rename from examples/offline_inference/run_one_batch.py rename to examples/features/profiling/run_one_batch_offline.py diff --git a/examples/offline_inference/simple_profiling.py b/examples/features/profiling/simple_profiling_offline.py similarity index 100% rename from examples/offline_inference/simple_profiling.py rename to examples/features/profiling/simple_profiling_offline.py diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py similarity index 96% rename from examples/online_serving/prompt_embed_inference_with_openai_client.py rename to examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py index fa4b64c00703..40eae0c062dd 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ b/examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py @@ -15,7 +15,7 @@ --enable-prompt-embeds Run the client: -python examples/online_serving/prompt_embed_inference_with_openai_client.py +python examples/features/prompt_embed/prompt_embed_inference_with_openai_client.py Model: meta-llama/Llama-3.2-1B-Instruct Note: This model is gated on Hugging Face Hub. diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/features/prompt_embed/prompt_embed_offline.py similarity index 97% rename from examples/offline_inference/prompt_embed_inference.py rename to examples/features/prompt_embed/prompt_embed_offline.py index a0eaeb6810a2..29853bce9673 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/features/prompt_embed/prompt_embed_offline.py @@ -15,7 +15,7 @@ - transformers Run: - python examples/offline_inference/prompt_embed_inference.py + python examples/features/prompt_embed/prompt_embed_offline.py """ import torch diff --git a/examples/offline_inference/llm_engine_reset_kv.py b/examples/features/reset_kv/reset_kv_offline.py similarity index 100% rename from examples/offline_inference/llm_engine_reset_kv.py rename to examples/features/reset_kv/reset_kv_offline.py diff --git a/examples/offline_inference/load_sharded_state.py b/examples/features/sharded_state/load_sharded_state_offline.py similarity index 94% rename from examples/offline_inference/load_sharded_state.py rename to examples/features/sharded_state/load_sharded_state_offline.py index 0085e8e8e32b..e867db5d12fe 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/features/sharded_state/load_sharded_state_offline.py @@ -3,16 +3,16 @@ """ Validates the loading of a model saved with the sharded_state format. This script demonstrates how to load a model that was previously saved -using save_sharded_state.py and validates it by running inference. +using save_sharded_state_offline.py and validates it by running inference. 
 Example usage:
 
 (First need to save a sharded_state mode)
-python save_sharded_state.py \
+python save_sharded_state_offline.py \
     --model /path/to/load \
     --tensor-parallel-size 8 \
     --output /path/to/save/sharded/model
 
-python load_sharded_state.py \
+python load_sharded_state_offline.py \
    --model /path/to/saved/sharded/model \
    --load-format sharded_state \
    --tensor-parallel-size 8 \
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/features/sharded_state/save_sharded_state_offline.py
similarity index 98%
rename from examples/offline_inference/save_sharded_state.py
rename to examples/features/sharded_state/save_sharded_state_offline.py
index 14d472ee3f23..675f2e35a53f 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/features/sharded_state/save_sharded_state_offline.py
@@ -7,7 +7,7 @@
 Example usage:
 
-python save_sharded_state.py \
+python save_sharded_state_offline.py \
    --model /path/to/load \
    --tensor-parallel-size 8 \
    --output /path/to/save
diff --git a/examples/offline_inference/extract_hidden_states.py b/examples/features/speculative_decoding/extract_hidden_states_offline.py
similarity index 100%
rename from examples/offline_inference/extract_hidden_states.py
rename to examples/features/speculative_decoding/extract_hidden_states_offline.py
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/features/speculative_decoding/mlpspeculator_offline.py
similarity index 100%
rename from examples/offline_inference/mlpspeculator.py
rename to examples/features/speculative_decoding/mlpspeculator_offline.py
diff --git a/examples/offline_inference/spec_decode.py b/examples/features/speculative_decoding/spec_decode_offline.py
similarity index 100%
rename from examples/offline_inference/spec_decode.py
rename to examples/features/speculative_decoding/spec_decode_offline.py
diff --git a/examples/online_serving/structured_outputs/README.md b/examples/features/structured_outputs/README.md
similarity index 85%
rename from examples/online_serving/structured_outputs/README.md
rename to examples/features/structured_outputs/README.md
index 7f539716ecf8..f2863eb0cbcf 100644
--- a/examples/online_serving/structured_outputs/README.md
+++ b/examples/features/structured_outputs/README.md
@@ -20,7 +20,7 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
 If you want to run this script standalone with `uv`, you can use the following:
 
 ```bash
-uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs \
+uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/features/structured_outputs \
   structured-outputs
 ```
 
@@ -34,19 +34,19 @@ See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.ht
 Run all constraints, non-streaming:
 
 ```bash
-uv run structured_outputs.py
+uv run structured_outputs_offline.py
 ```
 
 Run all constraints, streaming:
 
 ```bash
-uv run structured_outputs.py --stream
+uv run structured_outputs_offline.py --stream
 ```
 
 Run certain constraints, for example `structural_tag` and `regex`, streaming:
 
 ```bash
-uv run structured_outputs.py \
+uv run structured_outputs_offline.py \
   --constraint structural_tag regex \
   --stream
 ```
@@ -54,5 +54,5 @@ uv run structured_outputs.py \
 Run all constraints, with reasoning models and streaming:
 
 ```bash
-uv run structured_outputs.py --reasoning --stream
+uv run structured_outputs_offline.py --reasoning --stream
 ```
diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/features/structured_outputs/pyproject.toml
similarity index 100%
rename from examples/online_serving/structured_outputs/pyproject.toml
rename to examples/features/structured_outputs/pyproject.toml
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/features/structured_outputs/structured_outputs_client.py
similarity index 100%
rename from examples/online_serving/structured_outputs/structured_outputs.py
rename to examples/features/structured_outputs/structured_outputs_client.py
diff --git a/examples/offline_inference/structured_outputs.py b/examples/features/structured_outputs/structured_outputs_offline.py
similarity index 100%
rename from examples/offline_inference/structured_outputs.py
rename to examples/features/structured_outputs/structured_outputs_offline.py
diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/features/torchrun/torchrun_dp_example_offline.py
similarity index 95%
rename from examples/offline_inference/torchrun_dp_example.py
rename to examples/features/torchrun/torchrun_dp_example_offline.py
index eb7ed969ea4b..f18f6042e9c6 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/features/torchrun/torchrun_dp_example_offline.py
@@ -7,15 +7,15 @@
 To run this example:
 
 ```bash
-$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
+$ torchrun --nproc-per-node=2 examples/features/torchrun/torchrun_dp_example_offline.py
 ```
 
 With custom parallelism settings:
 
 ```bash
-$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+$ torchrun --nproc-per-node=8 examples/features/torchrun/torchrun_dp_example_offline.py \
    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 ```
-"""
+""" # noqa: E501
 
 import argparse
diff --git a/examples/offline_inference/torchrun_example.py b/examples/features/torchrun/torchrun_example_offline.py
similarity index 99%
rename from examples/offline_inference/torchrun_example.py
rename to examples/features/torchrun/torchrun_example_offline.py
index 2960d329968a..e41bcd420c20 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/features/torchrun/torchrun_example_offline.py
@@ -4,7 +4,7 @@
 experimental support for tensor-parallel inference with torchrun,
 see https://github.com/vllm-project/vllm/issues/11400 for
 the motivation and use case for this example.
-run the script with `torchrun --nproc-per-node=4 torchrun_example.py`,
+run the script with `torchrun --nproc-per-node=4 torchrun_example_offline.py`,
 the argument `4` should match the product of `tensor_parallel_size`
 and `pipeline_parallel_size` below.
 see `tests/distributed/test_torchrun_example.py` for the unit test.
diff --git a/examples/offline_inference/routed_experts_e2e.py b/examples/rl/routed_experts_e2e.py
similarity index 99%
rename from examples/offline_inference/routed_experts_e2e.py
rename to examples/rl/routed_experts_e2e.py
index bb1d7b411f99..1666bc3ffe16 100644
--- a/examples/offline_inference/routed_experts_e2e.py
+++ b/examples/rl/routed_experts_e2e.py
@@ -9,7 +9,7 @@
 3. Results are deterministic across runs (baseline vs reference).
 Usage:
-    python examples/offline_inference/routed_experts_e2e.py \
+    python examples/rl/routed_experts_e2e.py \
        --model Qwen/Qwen3-30B-A3B \
        --tp 4 \
        --max-model-len 4096 \
diff --git a/examples/offline_inference/skip_loading_weights_in_engine_init.py b/examples/rl/skip_loading_weights_in_engine_init.py
similarity index 100%
rename from examples/offline_inference/skip_loading_weights_in_engine_init.py
rename to examples/rl/skip_loading_weights_in_engine_init.py
diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py
index f56d037fa547..e72f00bc91e0 100644
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# unit test for `examples/offline_inference/torchrun_example.py`
+# unit test for `examples/features/torchrun/torchrun_example_offline.py`
 
 import os
 import random
diff --git a/tests/distributed/test_torchrun_example_moe.py b/tests/distributed/test_torchrun_example_moe.py
index 8c1d00561b16..969b5e92e3fc 100644
--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# unit test for `examples/offline_inference/torchrun_example.py`
+# unit test for `examples/features/torchrun/torchrun_example_offline.py`
 
 import os
 import random
diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py
index ec65e20cbde1..62ff100fdbf8 100644
--- a/tests/v1/spec_decode/test_acceptance_length.py
+++ b/tests/v1/spec_decode/test_acceptance_length.py
@@ -43,7 +43,8 @@ class Eagle3ModelConfig:
 
 # Model configurations for EAGLE3 acceptance length tests.
 # Expected acceptance lengths are determined by running baseline benchmarks
-# using examples/offline_inference/spec_decode.py with the MT-Bench dataset.
+# using examples/features/speculative_decoding/spec_decode_offline.py
+# with the MT-Bench dataset.
 EAGLE3_MODEL_CONFIGS = [
     Eagle3ModelConfig(
         verifier="meta-llama/Llama-3.1-8B-Instruct",
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 29cc2b47e7be..e67046b14117 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -334,7 +334,7 @@ def _make_config(value: Any, cls: type[_R]) -> _R:
                f"LLM(data_parallel_size={_dp_size}) is not supported for single-"
                "process usage and may hang. Please use "
                "the explicit multi-process data-parallel example at "
-                "'examples/offline_inference/data_parallel.py'."
+                "'examples/features/data_parallel/data_parallel_offline.py'."
            )
 
        engine_args = EngineArgs(
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 87b4b72db2a1..3f57fe7e0265 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -31,8 +31,8 @@ class ShardedStateLoader(BaseModelLoader):
     Model loader that directly loads each worker's model state dict, which
     enables a fast load path for large tensor-parallel models where each worker
     only needs to read its own shard rather than the entire checkpoint. See
-    `examples/offline_inference/save_sharded_state.py` for creating a sharded
-    checkpoint.
+    `examples/features/sharded_state/save_sharded_state_offline.py` for creating
+    a sharded checkpoint.
     """
 
     DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index e8f5101b577d..459c16f8ec97 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -517,7 +517,7 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None:
            "Nomic context extension is disabled. "
            "Changing max_model_len from %s to %s. "
            "To enable context extension, see: "
-            "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.py",
+            "https://github.com/vllm-project/vllm/tree/main/examples/features/context_extension/context_extension_offline.py",
            max_model_len_before,
            model_config.max_model_len,
        )
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 53cad2bc153f..7b0f00d14c8a 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -952,7 +952,7 @@ def get_engine_zmq_addresses(
 
    # In offline mode there is an LLM instance per DP rank and
    # one core engine per LLM, see
-    # examples/offline_inference/data_parallel.py.
+    # examples/features/data_parallel/data_parallel_offline.py.
    offline_mode = local_start_index is not None
 
    # client_local_only = True for cases where this front-end
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index b616c3b7b8ad..d006946079e7 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -147,7 +147,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
    offline inference with tensor parallelism.
    see https://github.com/vllm-project/vllm/issues/11400 for
-    the motivation, and examples/offline_inference/torchrun_example.py
+    the motivation, and examples/features/torchrun/torchrun_example_offline.py
    for the usage example.
 
    The key idea: although it is tensor-parallel inference, we only