diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index feaf2b356267..61ebddf82e40 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -136,8 +136,6 @@ run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" run_and_track_test 4 "test_quantization_accuracy.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" -run_and_track_test 5 "examples/offline_inference/tpu.py" \ - "python3 /workspace/vllm/examples/offline_inference/tpu.py" run_and_track_test 6 "test_tpu_model_runner.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" run_and_track_test 7 "test_sampler.py" \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b3c77dcac7c8..02820250f9b0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -394,8 +394,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -1649,8 +1649,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 
features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -2930,8 +2930,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index c34d4c10b49a..a9ae038b7b60 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -117,8 +117,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 6a4338a5e40a..93b5197a6fda 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -37,7 +37,7 @@ steps: - 
examples/generate/multimodal/ - examples/features/ - examples/pooling/embed/vision_embedding_offline.py - - examples/others/tensorize_vllm_model.py + - examples/features/tensorize_vllm_model.py commands: - set -x - export VLLM_USE_V2_MODEL_RUNNER=1 @@ -55,8 +55,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/docker/Dockerfile b/docker/Dockerfile index fd0622e2416a..1051ca97d3d3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -860,7 +860,7 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" \ # define sagemaker first, so it is not default from `docker build` FROM vllm-openai-base AS vllm-sagemaker -COPY examples/online_serving/sagemaker-entrypoint.sh . +COPY examples/deployment/sagemaker-entrypoint.sh . RUN chmod +x sagemaker-entrypoint.sh ENTRYPOINT ["./sagemaker-entrypoint.sh"] diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 3e2ee38d2bdd..b076ef84a46c 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints ( http://localhost:8000/v1/audio/translations ``` -Or check out more examples in [examples/online_serving](../../../examples/online_serving). +Or check out more examples in [examples/speech_to_text](../../../examples/speech_to_text). !!! note - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking. diff --git a/docs/deployment/frameworks/helm.md b/docs/deployment/frameworks/helm.md index a0aee70b1b32..45924dd6f7c3 100644 --- a/docs/deployment/frameworks/helm.md +++ b/docs/deployment/frameworks/helm.md @@ -17,7 +17,7 @@ Before you begin, ensure that you have the following: ## Installing the chart -This guide uses the Helm chart at [examples/online_serving/chart-helm](../../../examples/online_serving/chart-helm). +This guide uses the Helm chart at [examples/deployment/chart-helm](../../../examples/deployment/chart-helm). 
To install the chart with the release name `test-vllm`: diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md index 14710a8dc333..47586bcd7003 100644 --- a/docs/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -40,7 +40,7 @@ Deploy the following yaml file `lws.yaml` command: - sh - -c - - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); + - "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2" resources: limits: @@ -73,7 +73,7 @@ Deploy the following yaml file `lws.yaml` command: - sh - -c - - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" + - "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" resources: limits: nvidia.com/gpu: "8" diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index 8a5d18807d06..c23862cde72f 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -36,7 +36,7 @@ pip install -U vllm \ vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 ``` -1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py) +1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_langchain.py](../../../examples/applications/rag/retrieval_augmented_generation_with_langchain.py) 1. Run the script @@ -74,7 +74,7 @@ pip install vllm \ vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 ``` -1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py) +1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py](../../../examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py) 1. Run the script: diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index e9b0d5f0671c..e032769f15e8 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -59,7 +59,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ + python vllm/examples/applications/chatbot/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://localhost:8081/v1 \ @@ -305,7 +305,7 @@ It is also possible to access the Llama-3 service with a separate GUI frontend, echo 'Starting gradio server...' 
git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ + python vllm/examples/applications/chatbot/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://$ENDPOINT/v1 \ diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md index 1b214e1a32aa..6c0c8c6c1430 100644 --- a/docs/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -20,7 +20,7 @@ pip install vllm streamlit openai vllm serve Qwen/Qwen1.5-0.5B-Chat ``` -1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py) +1. Use the script: [examples/applications/chatbot/streamlit_openai_chatbot_webserver.py](../../../examples/applications/chatbot/streamlit_openai_chatbot_webserver.py) 1. Start the streamlit web UI and start to chat: diff --git a/docs/deployment/integrations/kthena.md b/docs/deployment/integrations/kthena.md index 483dd7474440..0989e5d67f02 100644 --- a/docs/deployment/integrations/kthena.md +++ b/docs/deployment/integrations/kthena.md @@ -78,7 +78,7 @@ Key points from the example YAML: - sh - -c - > - bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2; + bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2; python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct @@ -93,7 +93,7 @@ Key points from the example YAML: - sh - -c - > - bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS) + bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS) ``` --- @@ -144,7 +144,7 @@ spec: command: - sh - -c - - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2; + - "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2; python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2" resources: limits: @@ -178,7 +178,7 @@ spec: command: - sh - -c - - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)" + - "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)" resources: limits: nvidia.com/gpu: "8" diff --git a/docs/examples/README.md b/docs/examples/README.md index f5707ab6eeed..e4cd1c87f4ea 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -2,6 +2,6 @@ vLLM's examples are split into three categories: -- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section. -- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section. -- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section. +- If you are using vLLM from within Python code, see the [Offline Inference](.) section. +- If you are using vLLM from an HTTP application or client, see the [Online Serving](.) section. +- For examples of using some of vLLM's advanced features (e.g. 
LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](.) section. diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index e77e8b5a1f41..e93005f26321 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -47,7 +47,7 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```bash -python examples/offline_inference/llm_engine_example.py \ +python examples/deployment/llm_engine_example.py \ --model TheBloke/Llama-2-7b-Chat-AWQ \ --quantization awq ``` diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index f14a931725da..636a952b6551 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -58,7 +58,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: ```bash -python examples/offline_inference/llm_engine_example.py \ +python examples/deployment/llm_engine_example.py \ --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 ``` diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 374149786e14..f1cc18a25cbb 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -157,7 +157,7 @@ OpenAI Python client library does not officially support `reasoning` attribute f print(content, end="", flush=True) ``` -Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). +Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/reasoning/openai_chat_completion_with_reasoning_streaming.py). ## Tool Calling diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md index 8c282582281e..e8b74a06f079 100644 --- a/docs/getting_started/installation/gpu.xpu.inc.md +++ b/docs/getting_started/installation/gpu.xpu.inc.md @@ -88,7 +88,7 @@ vllm serve facebook/opt-13b \ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/online_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/run_cluster.sh) helper script. +By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/ray_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/ray_serving/run_cluster.sh) helper script. 
--8<-- [end:supported-features] --8<-- [start:distributed-backend] diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 3df80d5af6c4..0f3ee1ede436 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -14,7 +14,7 @@ To install `tensorizer`, run `pip install vllm[tensorizer]`. ## The basics To load a model using Tensorizer, the model first needs to be serialized by -Tensorizer. [The example script](../../examples/others/tensorize_vllm_model.md) takes care of this process. +Tensorizer. [The example script](../../../examples/features/tensorize_vllm_model.py) takes care of this process. Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference. @@ -25,7 +25,7 @@ CLI arguments. The docstring for the script itself explains the CLI args and how to use it properly in great detail, and we'll use one of the examples from the docstring directly, assuming we want to serialize and save our model at our S3 bucket example `s3://my-bucket`: ```bash -python examples/others/tensorize_vllm_model.py \ +python examples/features/tensorize_vllm_model.py \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -35,7 +35,7 @@ python examples/others/tensorize_vllm_model.py \ This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If you intend on applying a LoRA adapter to your tensorized model, you can pass the HF id of the LoRA adapter in the above command, and the artifacts will be saved there too: ```bash -python examples/others/tensorize_vllm_model.py \ +python examples/features/tensorize_vllm_model.py \ --model facebook/opt-125m \ --lora-path \ serialize \ @@ -71,7 +71,7 @@ llm = LLM( As an example, CPU concurrency can be limited when serializing with `tensorizer` via the `limit_cpu_concurrency` parameter in the initializer for `TensorSerializer`. To set `limit_cpu_concurrency` to some arbitrary value, you would do so like this when serializing: ```bash -python examples/others/tensorize_vllm_model.py \ +python examples/features/tensorize_vllm_model.py \ --model facebook/opt-125m \ --lora-path \ serialize \ diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index bf1c153ec0d5..973c2ecdcb52 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -251,7 +251,7 @@ The following extra parameters are supported: Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. 
-Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/tool_calling/openai_responses_client_with_tools.py) +Code example: [examples/tool_calling/openai_responses_client_with_tools.py](../../examples/tool_calling/openai_responses_client_with_tools.py) #### Extra parameters @@ -456,8 +456,8 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan #### Example Clients -- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file -- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription +- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/speech_to_text/realtime/openai_realtime_client.py) - Upload and transcribe an audio file +- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/speech_to_text/realtime/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription ### Tokenizer API diff --git a/examples/online_serving/api_client.py b/examples/applications/chatbot/api_client.py similarity index 100% rename from examples/online_serving/api_client.py rename to examples/applications/chatbot/api_client.py diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/applications/chatbot/gradio_openai_chatbot_webserver.py similarity index 97% rename from examples/online_serving/gradio_openai_chatbot_webserver.py rename to examples/applications/chatbot/gradio_openai_chatbot_webserver.py index c76c60cc4472..2a67aefc0278 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/applications/chatbot/gradio_openai_chatbot_webserver.py @@ -5,7 +5,7 @@ vllm serve meta-llama/Llama-2-7b-chat-hf Start Gradio OpenAI Chatbot Webserver: - python examples/online_serving/gradio_openai_chatbot_webserver.py \ + python examples/applications/chatbot/gradio_openai_chatbot_webserver.py \ -m meta-llama/Llama-2-7b-chat-hf Note that `pip install --upgrade gradio` is needed to run this example. diff --git a/examples/online_serving/gradio_webserver.py b/examples/applications/chatbot/gradio_webserver.py similarity index 97% rename from examples/online_serving/gradio_webserver.py rename to examples/applications/chatbot/gradio_webserver.py index 86d9ceb48bb0..f75636409c2f 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/applications/chatbot/gradio_webserver.py @@ -6,7 +6,7 @@ --model meta-llama/Llama-2-7b-chat-hf Start Webserver: - python examples/online_serving/gradio_webserver.py + python examples/applications/chatbot/gradio_webserver.py Note that `pip install --upgrade gradio` is needed to run this example. 
More details: https://github.com/gradio-app/gradio diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/applications/chatbot/streamlit_openai_chatbot_webserver.py similarity index 100% rename from examples/online_serving/streamlit_openai_chatbot_webserver.py rename to examples/applications/chatbot/streamlit_openai_chatbot_webserver.py diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/applications/rag/retrieval_augmented_generation_with_langchain.py similarity index 100% rename from examples/online_serving/retrieval_augmented_generation_with_langchain.py rename to examples/applications/rag/retrieval_augmented_generation_with_langchain.py diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py similarity index 100% rename from examples/online_serving/retrieval_augmented_generation_with_llamaindex.py rename to examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py diff --git a/examples/offline_inference/async_llm_streaming.py b/examples/deployment/async_llm_streaming.py similarity index 98% rename from examples/offline_inference/async_llm_streaming.py rename to examples/deployment/async_llm_streaming.py index b876d536e3a1..ef69089a0460 100644 --- a/examples/offline_inference/async_llm_streaming.py +++ b/examples/deployment/async_llm_streaming.py @@ -8,7 +8,7 @@ streaming where you receive new tokens as they are generated. Usage: - python examples/offline_inference/async_llm_streaming.py + python examples/deployment/async_llm_streaming.py """ import asyncio diff --git a/examples/online_serving/chart-helm/.helmignore b/examples/deployment/chart-helm/.helmignore similarity index 100% rename from examples/online_serving/chart-helm/.helmignore rename to examples/deployment/chart-helm/.helmignore diff --git a/examples/online_serving/chart-helm/Chart.yaml b/examples/deployment/chart-helm/Chart.yaml similarity index 100% rename from examples/online_serving/chart-helm/Chart.yaml rename to examples/deployment/chart-helm/Chart.yaml diff --git a/examples/online_serving/chart-helm/README.md b/examples/deployment/chart-helm/README.md similarity index 100% rename from examples/online_serving/chart-helm/README.md rename to examples/deployment/chart-helm/README.md diff --git a/examples/online_serving/chart-helm/ct.yaml b/examples/deployment/chart-helm/ct.yaml similarity index 100% rename from examples/online_serving/chart-helm/ct.yaml rename to examples/deployment/chart-helm/ct.yaml diff --git a/examples/online_serving/chart-helm/lintconf.yaml b/examples/deployment/chart-helm/lintconf.yaml similarity index 100% rename from examples/online_serving/chart-helm/lintconf.yaml rename to examples/deployment/chart-helm/lintconf.yaml diff --git a/examples/online_serving/chart-helm/templates/_helpers.tpl b/examples/deployment/chart-helm/templates/_helpers.tpl similarity index 100% rename from examples/online_serving/chart-helm/templates/_helpers.tpl rename to examples/deployment/chart-helm/templates/_helpers.tpl diff --git a/examples/online_serving/chart-helm/templates/configmap.yaml b/examples/deployment/chart-helm/templates/configmap.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/configmap.yaml rename to examples/deployment/chart-helm/templates/configmap.yaml diff --git a/examples/online_serving/chart-helm/templates/custom-objects.yaml 
b/examples/deployment/chart-helm/templates/custom-objects.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/custom-objects.yaml rename to examples/deployment/chart-helm/templates/custom-objects.yaml diff --git a/examples/online_serving/chart-helm/templates/deployment.yaml b/examples/deployment/chart-helm/templates/deployment.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/deployment.yaml rename to examples/deployment/chart-helm/templates/deployment.yaml diff --git a/examples/online_serving/chart-helm/templates/hpa.yaml b/examples/deployment/chart-helm/templates/hpa.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/hpa.yaml rename to examples/deployment/chart-helm/templates/hpa.yaml diff --git a/examples/online_serving/chart-helm/templates/job.yaml b/examples/deployment/chart-helm/templates/job.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/job.yaml rename to examples/deployment/chart-helm/templates/job.yaml diff --git a/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml b/examples/deployment/chart-helm/templates/poddisruptionbudget.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml rename to examples/deployment/chart-helm/templates/poddisruptionbudget.yaml diff --git a/examples/online_serving/chart-helm/templates/pvc.yaml b/examples/deployment/chart-helm/templates/pvc.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/pvc.yaml rename to examples/deployment/chart-helm/templates/pvc.yaml diff --git a/examples/online_serving/chart-helm/templates/secrets.yaml b/examples/deployment/chart-helm/templates/secrets.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/secrets.yaml rename to examples/deployment/chart-helm/templates/secrets.yaml diff --git a/examples/online_serving/chart-helm/templates/service.yaml b/examples/deployment/chart-helm/templates/service.yaml similarity index 100% rename from examples/online_serving/chart-helm/templates/service.yaml rename to examples/deployment/chart-helm/templates/service.yaml diff --git a/examples/online_serving/chart-helm/tests/deployment_test.yaml b/examples/deployment/chart-helm/tests/deployment_test.yaml similarity index 100% rename from examples/online_serving/chart-helm/tests/deployment_test.yaml rename to examples/deployment/chart-helm/tests/deployment_test.yaml diff --git a/examples/online_serving/chart-helm/tests/job_test.yaml b/examples/deployment/chart-helm/tests/job_test.yaml similarity index 100% rename from examples/online_serving/chart-helm/tests/job_test.yaml rename to examples/deployment/chart-helm/tests/job_test.yaml diff --git a/examples/online_serving/chart-helm/tests/pvc_test.yaml b/examples/deployment/chart-helm/tests/pvc_test.yaml similarity index 100% rename from examples/online_serving/chart-helm/tests/pvc_test.yaml rename to examples/deployment/chart-helm/tests/pvc_test.yaml diff --git a/examples/online_serving/chart-helm/values.schema.json b/examples/deployment/chart-helm/values.schema.json similarity index 100% rename from examples/online_serving/chart-helm/values.schema.json rename to examples/deployment/chart-helm/values.schema.json diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/deployment/chart-helm/values.yaml similarity index 100% rename from examples/online_serving/chart-helm/values.yaml rename to 
examples/deployment/chart-helm/values.yaml diff --git a/examples/offline_inference/llm_engine_example.py b/examples/deployment/llm_engine_example.py similarity index 100% rename from examples/offline_inference/llm_engine_example.py rename to examples/deployment/llm_engine_example.py diff --git a/examples/online_serving/sagemaker-entrypoint.sh b/examples/deployment/sagemaker-entrypoint.sh similarity index 100% rename from examples/online_serving/sagemaker-entrypoint.sh rename to examples/deployment/sagemaker-entrypoint.sh diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_multiturn.py b/examples/disaggregated/disaggregated_serving/disagg_proxy_multiturn.py similarity index 100% rename from examples/online_serving/disaggregated_serving/disagg_proxy_multiturn.py rename to examples/disaggregated/disaggregated_serving/disagg_proxy_multiturn.py diff --git a/examples/others/logging_configuration.md b/examples/features/logging_configuration.md similarity index 100% rename from examples/others/logging_configuration.md rename to examples/features/logging_configuration.md diff --git a/examples/others/tensorize_vllm_model.py b/examples/features/tensorize_vllm_model.py similarity index 97% rename from examples/others/tensorize_vllm_model.py rename to examples/features/tensorize_vllm_model.py index 3644a03b32ed..a89b1781264d 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/features/tensorize_vllm_model.py @@ -33,7 +33,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python examples/others/tensorize_vllm_model.py \ +python examples/features/tensorize_vllm_model.py \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -53,7 +53,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python examples/others/tensorize_vllm_model.py \ +python examples/features/tensorize_vllm_model.py \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -71,11 +71,11 @@ model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.others.tensorize_vllm_model serialize --help`. +`python -m examples.features.tensorize_vllm_model serialize --help`. Or for deserializing: -`python examples/others/tensorize_vllm_model.py deserialize --help`. +`python examples/features/tensorize_vllm_model.py deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -100,7 +100,7 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python examples/others/tensorize_vllm_model.py deserialize --help` +`python examples/features/tensorize_vllm_model.py deserialize --help` under the `tensorizer options` section. 
These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/examples/observability/dashboards/README.md b/examples/observability/dashboards/README.md index e5f5010a42a4..29ec932cef24 100644 --- a/examples/observability/dashboards/README.md +++ b/examples/observability/dashboards/README.md @@ -43,7 +43,7 @@ Both platforms provide equivalent monitoring capabilities: First, navigate to this example's directory: ```bash -cd examples/online_serving/dashboards +cd examples/observability/dashboards ``` ### Grafana diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py deleted file mode 100644 index a512d8a31b53..000000000000 --- a/examples/online_serving/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from openai import APIConnectionError, OpenAI -from openai.pagination import SyncPage -from openai.types.model import Model - - -def get_first_model(client: OpenAI) -> str: - """ - Get the first model from the vLLM server. - """ - try: - models: SyncPage[Model] = client.models.list() - except APIConnectionError as e: - raise RuntimeError( - "Failed to get the list of models from the vLLM server at " - f"{client.base_url} with API key {client.api_key}. Check\n" - "1. the server is running\n" - "2. the server URL is correct\n" - "3. the API key is correct" - ) from e - - if len(models.data) == 0: - raise RuntimeError(f"No models found on the vLLM server at {client.base_url}") - - return models.data[0].id diff --git a/examples/pooling/embed/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh index 37a8b625b7f9..68950d04ee14 100644 --- a/examples/pooling/embed/openai_embedding_long_text/service.sh +++ b/examples/pooling/embed/openai_embedding_long_text/service.sh @@ -119,7 +119,7 @@ echo " - API Key: $API_KEY" echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN" echo "" echo "๐Ÿงช Test the server with:" -echo " python examples/online_serving/openai_embedding_long_text/client.py" +echo " python examples/pooling/embed/openai_embedding_long_text/client.py" echo "" echo "๐Ÿ“š Enhanced features enabled:" echo " โœ… Intelligent native pooling type detection" diff --git a/examples/speech_to_text/lid/openai_lid_client.py b/examples/speech_to_text/lid/openai_lid_client.py index 0ce0fbc92250..d91df3298c22 100644 --- a/examples/speech_to_text/lid/openai_lid_client.py +++ b/examples/speech_to_text/lid/openai_lid_client.py @@ -15,14 +15,14 @@ Then run this script: # Use the built-in sample audio - python examples/online_serving/openai_lid_client.py + python examples/speech_to_text/lid/openai_lid_client.py # Use your own audio file(s) - python examples/online_serving/openai_lid_client.py \ + python examples/speech_to_text/lid/openai_lid_client.py \ --audio_paths audio_en.wav audio_zh.wav audio_fr.wav # Batch-identify multiple files in one run - python examples/online_serving/openai_lid_client.py \ + python examples/speech_to_text/lid/openai_lid_client.py \ --audio_paths /path/to/dir/*.wav Requirements: diff --git a/requirements/common.txt b/requirements/common.txt index 68f5e165b923..843170325240 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -41,7 +41,7 @@ compressed-tensors == 0.15.0.1 # required for compressed-tensors depyf==0.20.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in 
model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files -python-json-logger # Used by logging as per examples/others/logging_configuration.md +python-json-logger # Used by logging as per examples/features/logging_configuration.md ninja # Required for xgrammar, rocm, tpu, xpu pybase64 # fast base64 implementation cbor2 # Required for cross-language serialization of hashable objects diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 483235ff5129..bcf66a29628f 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -184,7 +184,7 @@ def test_tp2_serialize_and_deserialize_lora( result = subprocess.run( [ sys.executable, - f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", + f"{VLLM_PATH}/examples/features/tensorize_vllm_model.py", "--model", MODEL_PATH, "--lora-path", diff --git a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py index 3b950c843c56..a15a624c905d 100644 --- a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py +++ b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py @@ -460,7 +460,7 @@ async def test_serialize_and_serve_entrypoints(tmp_path): result = subprocess.run( [ sys.executable, - f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", + f"{VLLM_PATH}/examples/features/tensorize_vllm_model.py", "--model", model_ref, "serialize", diff --git a/vllm/assets/video.py b/vllm/assets/video.py index f5e443db978f..9ec2e4d16770 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -139,6 +139,6 @@ def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray: """ Read audio data from the video asset, used in Qwen2.5-Omni examples. - See also: examples/offline_inference/qwen2_5_omni/only_thinker.py + See also: examples/generate/multimodal/qwen2_5_omni/only_thinker.py """ return load_audio_pyav(self.video_path, sr=sampling_rate)[0] diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 3e6ed248ff3a..37d37d55f543 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -211,7 +211,7 @@ class TensorizerConfig(MutableMapping): encryption_keyfile: File path to a binary file containing a binary key to use for decryption. `None` (the default) means no decryption. See the example script in - examples/others/tensorize_vllm_model.py. + examples/features/tensorize_vllm_model.py. s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable. s3_secret_access_key: The secret access key for the S3 bucket. Can also @@ -579,7 +579,7 @@ def tensorizer_weights_iterator( "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " "load times. See the " - "examples/others/tensorize_vllm_model.py example script " + "examples/features/tensorize_vllm_model.py example script " "for serializing vLLM models." ) diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index c5bff1312932..338f9eac072a 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -73,7 +73,7 @@ def _load_model_serialized_cpu( """Load a serialized model with tensorizer to the CPU. 
This is only necessary when the model isn't vLLM-tensorized (see - examples/others/tensorize_vllm_model.py) This should still + examples/features/tensorize_vllm_model.py) This should still be faster than default HuggingFace loading, but will be slower than loading a vLLM-tensorized model. """ @@ -104,7 +104,7 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: """Load serialized model weights with tensorizer. Expects a vLLM-tensorized model. See the - examples/others/tensorize_vllm_model.py example script + examples/features/tensorize_vllm_model.py example script for serializing vLLM models.""" if is_vllm_tensorized(self.tensorizer_config): tensorizer_config = self._patch_tensorizer_config(model_config)