2 changes: 0 additions & 2 deletions .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -136,8 +136,6 @@ run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
-run_and_track_test 5 "examples/offline_inference/tpu.py" \
-"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
12 changes: 6 additions & 6 deletions .buildkite/test-amd.yaml
@@ -394,8 +394,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# Features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 deployment/llm_engine_example.py
+- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

@@ -1649,8 +1649,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# Features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 deployment/llm_engine_example.py
+- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

@@ -2930,8 +2930,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# Features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 deployment/llm_engine_example.py
+- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

4 changes: 2 additions & 2 deletions .buildkite/test_areas/misc.yaml
@@ -117,8 +117,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 deployment/llm_engine_example.py
+- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
6 changes: 3 additions & 3 deletions .buildkite/test_areas/model_runner_v2.yaml
@@ -37,7 +37,7 @@ steps:
- examples/generate/multimodal/
- examples/features/
- examples/pooling/embed/vision_embedding_offline.py
-- examples/others/tensorize_vllm_model.py
+- examples/features/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
@@ -55,8 +55,8 @@
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
-- python3 offline_inference/llm_engine_example.py
-- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 deployment/llm_engine_example.py
+- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -860,7 +860,7 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" \
# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker

-COPY examples/online_serving/sagemaker-entrypoint.sh .
+COPY examples/deployment/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]

2 changes: 1 addition & 1 deletion docs/contributing/model/transcription.md
@@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
http://localhost:8000/v1/audio/translations
```

-Or check out more examples in [examples/online_serving](../../../examples/online_serving).
+Or check out more examples in [examples/speech_to_text](../../../examples/speech_to_text).

!!! note
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
2 changes: 1 addition & 1 deletion docs/deployment/frameworks/helm.md
@@ -17,7 +17,7 @@ Before you begin, ensure that you have the following:

## Installing the chart

-This guide uses the Helm chart at [examples/online_serving/chart-helm](../../../examples/online_serving/chart-helm).
+This guide uses the Helm chart at [examples/deployment/chart-helm](../../../examples/deployment/chart-helm).

To install the chart with the release name `test-vllm`:

4 changes: 2 additions & 2 deletions docs/deployment/frameworks/lws.md
@@ -40,7 +40,7 @@ Deploy the following yaml file `lws.yaml`
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
@@ -73,7 +73,7 @@ Deploy the following yaml file `lws.yaml`
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
4 changes: 2 additions & 2 deletions docs/deployment/frameworks/retrieval_augmented_generation.md
@@ -36,7 +36,7 @@ pip install -U vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

-1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
+1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_langchain.py](../../../examples/applications/rag/retrieval_augmented_generation_with_langchain.py)

1. Run the script

@@ -74,7 +74,7 @@ pip install vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

-1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
+1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py](../../../examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py)

1. Run the script:

4 changes: 2 additions & 2 deletions docs/deployment/frameworks/skypilot.md
@@ -59,7 +59,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil

echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
-python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
+python vllm/examples/applications/chatbot/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
@@ -305,7 +305,7 @@ It is also possible to access the Llama-3 service with a separate GUI frontend,

echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
-python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
+python vllm/examples/applications/api_client/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \
2 changes: 1 addition & 1 deletion docs/deployment/frameworks/streamlit.md
@@ -20,7 +20,7 @@ pip install vllm streamlit openai
vllm serve Qwen/Qwen1.5-0.5B-Chat
```

-1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
+1. Use the script: [examples/applications/chatbot/streamlit_openai_chatbot_webserver.py](../../../examples/applications/chatbot/streamlit_openai_chatbot_webserver.py)

1. Start the streamlit web UI and start to chat:

8 changes: 4 additions & 4 deletions docs/deployment/integrations/kthena.md
@@ -78,7 +78,7 @@ Key points from the example YAML:
- sh
- -c
- >
-bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
+bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2;
python3 -m vllm.entrypoints.openai.api_server
--port 8080
--model meta-llama/Llama-3.1-405B-Instruct
@@ -93,7 +93,7 @@ Key points from the example YAML:
- sh
- -c
- >
-bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
+bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
```

---
@@ -144,7 +144,7 @@ spec:
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
resources:
limits:
@@ -178,7 +178,7 @@ spec:
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
6 changes: 3 additions & 3 deletions docs/examples/README.md
@@ -2,6 +2,6 @@

vLLM's examples are split into three categories:

-- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section.
-- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section.
-- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section.
+- If you are using vLLM from within Python code, see the [Offline Inference](.) section.
+- If you are using vLLM from an HTTP application or client, see the [Online Serving](.) section.
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](.) section.
2 changes: 1 addition & 1 deletion docs/features/quantization/auto_awq.md
@@ -47,7 +47,7 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:

```bash
-python examples/offline_inference/llm_engine_example.py \
+python examples/deployment/llm_engine_example.py \
--model TheBloke/Llama-2-7b-Chat-AWQ \
--quantization awq
```
2 changes: 1 addition & 1 deletion docs/features/quantization/gptqmodel.md
@@ -58,7 +58,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:

```bash
-python examples/offline_inference/llm_engine_example.py \
+python examples/deployment/llm_engine_example.py \
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```

2 changes: 1 addition & 1 deletion docs/features/reasoning_outputs.md
@@ -157,7 +157,7 @@ OpenAI Python client library does not officially support `reasoning` attribute f
print(content, end="", flush=True)
```

-Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/reasoning/openai_chat_completion_with_reasoning_streaming.py).

## Tool Calling

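For reference, the check this doc recommends can be done defensively with `getattr`, since `reasoning` is not part of the official OpenAI schema — a minimal sketch, with an illustrative model name:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",  # illustrative reasoning model
    messages=[{"role": "user", "content": "What is 9.11 minus 9.8?"}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    # `reasoning` is a non-standard field, so probe for it instead of assuming it exists.
    reasoning = getattr(delta, "reasoning", None)
    if reasoning:
        print(reasoning, end="", flush=True)
    elif delta.content:
        print(delta.content, end="", flush=True)
```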
2 changes: 1 addition & 1 deletion docs/getting_started/installation/gpu.xpu.inc.md
@@ -88,7 +88,7 @@ vllm serve facebook/opt-13b \
-tp=8
```

-By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/online_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/run_cluster.sh) helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/ray_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/ray_serving/run_cluster.sh) helper script.

--8<-- [end:supported-features]
--8<-- [start:distributed-backend]
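For reference, the helper script automates standing up a Ray cluster across nodes; a bare-bones manual equivalent using the plain `ray` CLI (addresses and ports illustrative) is:

```bash
# On the head node: start Ray and note its address.
ray start --head --port=6379

# On each worker node: join the cluster at the head's address.
ray start --address=<head_node_ip>:6379

# Back on the head node, launch vLLM as usual; it attaches to the
# existing Ray cluster instead of spawning its own.
vllm serve facebook/opt-13b -tp=8
```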
8 changes: 4 additions & 4 deletions docs/models/extensions/tensorizer.md
@@ -14,7 +14,7 @@ To install `tensorizer`, run `pip install vllm[tensorizer]`.
## The basics

To load a model using Tensorizer, the model first needs to be serialized by
-Tensorizer. [The example script](../../examples/others/tensorize_vllm_model.md) takes care of this process.
+Tensorizer. [The example script](../../../examples/features/tensorize_vllm_model.py) takes care of this process.

Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference.

@@ -25,7 +25,7 @@ CLI arguments. The docstring for the script itself explains the CLI args
and how to use it properly in great detail, and we'll use one of the examples from the docstring directly, assuming we want to serialize and save our model at our S3 bucket example `s3://my-bucket`:

```bash
-python examples/others/tensorize_vllm_model.py \
+python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
@@ -35,7 +35,7 @@ python examples/others/tensorize_vllm_model.py \
This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If you intend on applying a LoRA adapter to your tensorized model, you can pass the HF id of the LoRA adapter in the above command, and the artifacts will be saved there too:

```bash
-python examples/others/tensorize_vllm_model.py \
+python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
--lora-path <lora_id> \
serialize \
@@ -71,7 +71,7 @@ llm = LLM(
As an example, CPU concurrency can be limited when serializing with `tensorizer` via the `limit_cpu_concurrency` parameter in the initializer for `TensorSerializer`. To set `limit_cpu_concurrency` to some arbitrary value, you would do so like this when serializing:

```bash
-python examples/others/tensorize_vllm_model.py \
+python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
--lora-path <lora_id> \
serialize \
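For reference, loading the serialized artifact back for inference might look like the following sketch — `load_format="tensorizer"` and the dict-style `model_loader_extra_config` are assumptions about vLLM's loader API, so defer to the example script's docstring:

```python
from vllm import LLM

# Sketch only: the URI mirrors where the serialize step above wrote the tensors.
llm = LLM(
    model="facebook/opt-125m",
    load_format="tensorizer",
    model_loader_extra_config={
        "tensorizer_uri": "s3://my-bucket/vllm/facebook/opt-125m/v1/model.tensors",
    },
)

print(llm.generate("Hello, my name is")[0].outputs[0].text)
```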