diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml new file mode 100644 index 000000000000..56ec933c9cc0 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 +model_name: "HandH1998/QQQ-Llama-3-8b-g128" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.419 + - name: "exact_match,flexible-extract" + value: 0.416 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml new file mode 100644 index 000000000000..f10b93724997 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml @@ -0,0 +1,11 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.90 +limit: 100 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml new file mode 100644 index 000000000000..96eeed04a9dc --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.80 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index a2f235f48581..aa4fb9fa03d6 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 +# For vllm script, with -t option (tensor parallel size) +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 000000000000..5f3c31743e75 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 + +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.855 +limit: 2500 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt new file mode 100644 index 000000000000..4fb0b84bc4d8 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt new file mode 100644 index 000000000000..91e22b6459c1 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt new file mode 100644 index 000000000000..1097d220245f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt @@ -0,0 +1 @@ +Qwen2.5-VL-7B-Instruct.yaml \ No newline at end of file diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh new file mode 100755 index 000000000000..c8db951381b0 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on chartqa for vllm. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.9 + +usage() { + echo`` + echo "Runs lm eval harness on ChartQA using multimodal vllm." + echo "This pathway is intended to be used to create baselines for " + echo "our correctness tests in vllm's CI." 
+ echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:l:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm-vlm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ + --tasks chartqa \ + --batch_size auto \ + --apply_chat_template \ + --limit $LIMIT diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh old mode 100644 new mode 100755 diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh new file mode 100644 index 000000000000..d85a1721db9a --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on MMLUPRO for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] + +usage() { + echo`` + echo "Runs lm eval harness on MMLU Pro using huggingface transformers." 
+ echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ + --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size auto diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index ceea01166b7f..f10de82b1d8e 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -19,21 +19,27 @@ def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get("trust_remote_code", False) max_model_len = eval_config.get("max_model_len", 4096) + batch_size = eval_config.get("batch_size", "auto") + backend = eval_config.get("backend", "vllm") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," f"enforce_eager=true," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," - f"max_model_len={max_model_len}" + f"max_model_len={max_model_len}," ) results = lm_eval.simple_evaluate( - model="vllm", + model=backend, model_args=model_args, tasks=[task["name"] for task in eval_config["tasks"]], num_fewshot=eval_config["num_fewshot"], limit=eval_config["limit"], - batch_size="auto", + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text 
models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm. + apply_chat_template=backend == "vllm-vlm", + batch_size=batch_size, ) return results diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 94c0944c838c..a8a5bf3ad234 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -734,6 +734,16 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] optional: true diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake index 9aace7693077..5a59a409999a 100644 --- a/cmake/external_projects/qutlass.cmake +++ b/cmake/external_projects/qutlass.cmake @@ -22,10 +22,10 @@ else() CONFIGURE_COMMAND "" BUILD_COMMAND "" ) - FetchContent_Populate(qutlass) - set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}") endif() +FetchContent_Populate(qutlass) + if(NOT qutlass_SOURCE_DIR) message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") endif() diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 2b0654fa6d46..85906d23dee3 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better 
balance between inferenc ```python from vllm import LLM - from vllm.config import CompilationConfig, CompilationLevel + from vllm.config import CompilationConfig, CompilationMode llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # By default, it goes up to max_num_seqs cudagraph_capture_sizes=[1, 2, 4, 8, 16], ), diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index 315746b0ef67..c6d71589be98 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum): """NO CUDA Graphs support""" ``` -Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. +Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. 
The following table lists backends that support full CUDA Graphs at the time of writing. @@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG") import vllm from vllm.config import CUDAGraphMode -compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} +compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( model="meta-llama/Llama-3.1-8B-Instruct", dtype="auto", diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index 182127bc91cc..e77e8b5a1f41 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the from awq import AutoAWQForCausalLM from transformers import AutoTokenizer - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + model_path = "mistralai/Mistral-7B-Instruct-v0.2" + quant_path = "mistral-instruct-v0.2-awq" + quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"} # Load model model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} + model_path, + low_cpu_mem_usage=True, + use_cache=False, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 53b689ad53ff..c3a127657622 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -34,7 +34,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitblas" + quantization="bitblas", ) ``` @@ -53,6 +53,6 @@ llm = LLM( dtype=torch.float16, trust_remote_code=True, quantization="bitblas", - max_model_len=1024 + max_model_len=1024, ) ``` diff --git 
a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 3b15a6072d47..2348c7739c06 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit" llm = LLM( model=model_id, dtype=torch.bfloat16, - trust_remote_code=True + trust_remote_code=True, ) ``` @@ -43,7 +43,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitsandbytes" + quantization="bitsandbytes", ) ``` diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 834c03cbe05b..a54acdbb9622 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio # Configure the simple PTQ quantization recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + targets="Linear", + scheme="FP8_DYNAMIC", + ignore=["lm_head"], + ) # Apply the quantization algorithm. oneshot(model=model, recipe=recipe) diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 2a1c3bdd775f..2a731e9b7e03 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint: conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" 
+ "content": "Hello! How can I assist you today?", }, { "role": "user", @@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + llm = LLM( + model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.chat(conversation, sampling_params) diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 47cb2d65bae4..f14a931725da 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: calibration_dataset = load_dataset( "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", - split="train" + split="train", ).select(range(1024))["text"] quant_config = QuantizeConfig(bits=4, group_size=128) diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index d6fdac7b07f7..5d8e06ffb5d7 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y }, ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES, - dampening_frac=0.01 + dampening_frac=0.01, ) ``` diff --git 
a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index af3650e701ad..ee1de2146057 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 39ae03b1bdac..c48ccb719a79 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll from vllm import LLM, SamplingParams def main(): - model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" - # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint + + # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) sampling_params = SamplingParams(temperature=0.8, top_p=0.9) diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index b2b417309e92..e0585a88451d 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization: from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=0.7, top_p=0.8) - llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - calculate_kv_scales=True) + llm = LLM( + model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True, + ) prompt = "London is the capital of" out = llm.generate(prompt, 
sampling_params)[0].outputs[0].text print(out) diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index 85b7d8ec84ed..f0cd20b7335c 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -48,7 +48,9 @@ to fetch model and tokenizer. MAX_SEQ_LEN = 512 model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) model.eval() @@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") text_data = dataset["text"][:NUM_CALIBRATION_DATA] - tokenized_outputs = tokenizer(text_data, return_tensors="pt", - padding=True, truncation=True, max_length=MAX_SEQ_LEN) - calib_dataloader = DataLoader(tokenized_outputs['input_ids'], - batch_size=BATCH_SIZE, drop_last=True) + tokenized_outputs = tokenizer( + text_data, + return_tensors="pt", + padding=True, + truncation=True, + max_length=MAX_SEQ_LEN, + ) + calib_dataloader = DataLoader( + tokenized_outputs['input_ids'], + batch_size=BATCH_SIZE, + drop_last=True, + ) ``` ### 3. Set the Quantization Configuration @@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. load_quant_algo_config_from_file) # Define fp8/per-tensor/static spec. - FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", - is_dynamic=False).to_quantization_spec() + FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec( + observer_method="min_max", + is_dynamic=False, + ).to_quantization_spec() # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. 
- global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, - weight=FP8_PER_TENSOR_SPEC) + global_quant_config = QuantizationConfig( + input_tensors=FP8_PER_TENSOR_SPEC, + weight=FP8_PER_TENSOR_SPEC, + ) # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] - kv_cache_quant_config = {name : - QuantizationConfig(input_tensors=global_quant_config.input_tensors, - weight=global_quant_config.weight, - output_tensors=KV_CACHE_SPEC) - for name in kv_cache_layer_names_for_llama} + kv_cache_quant_config = { + name: QuantizationConfig( + input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC, + ) + for name in kv_cache_layer_names_for_llama + } layer_quant_config = kv_cache_quant_config.copy() # Define algorithm config by config file. - LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = - 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' + LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json" algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) EXCLUDE_LAYERS = ["lm_head"] @@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. layer_quant_config=layer_quant_config, kv_cache_quant_config=kv_cache_quant_config, exclude=EXCLUDE_LAYERS, - algo_config=algo_config) + algo_config=algo_config, + ) ``` ### 4. Quantize the Model and Export @@ -165,8 +182,11 @@ for more exporting format details. 
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) with torch.no_grad(): - exporter.export_safetensors_model(freezed_model, - quant_config=quant_config, tokenizer=tokenizer) + exporter.export_safetensors_model( + freezed_model, + quant_config=quant_config, + tokenizer=tokenizer, + ) ``` ### 5. Evaluation in vLLM @@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", - kv_cache_dtype='fp8',quantization='quark') + llm = LLM( + model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype="fp8", + quantization="quark", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index ab04a1efcc08..0b00b8805bb2 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ | | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | @@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models: | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | !!! note - IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. + IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. + DeepSeek-V3.1 tool calling is supported in non-thinking mode. 
## Quickstart diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index 15fce69b4487..9cae9ed1a212 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] ---8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information" +First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run: + +```bash +sudo apt-get update -y +sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +Second, clone the vLLM project: + +```bash +git clone https://github.com/vllm-project/vllm.git vllm_source +cd vllm_source +``` + +Third, install required dependencies: + +```bash +uv pip install -r requirements/cpu-build.txt --torch-backend cpu +uv pip install -r requirements/cpu.txt --torch-backend cpu +``` + +??? console "pip" + ```bash + pip install --upgrade pip + pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +Finally, build and install vLLM: + +```bash +VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation +``` + +If you want to develop vLLM, install it in editable mode instead. + +```bash +VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation +``` Testing has been conducted on AWS Graviton3 instances for compatibility. 
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md deleted file mode 100644 index f99497128fd3..000000000000 --- a/docs/getting_started/installation/cpu/build.inc.md +++ /dev/null @@ -1,44 +0,0 @@ -# --8<-- [start:extra-information] - -First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - -```bash -sudo apt-get update -y -sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof -sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -``` - -Second, clone the vLLM project: - -```bash -git clone https://github.com/vllm-project/vllm.git vllm_source -cd vllm_source -``` - -Third, install required dependencies: - -```bash -uv pip install -r requirements/cpu-build.txt --torch-backend cpu -uv pip install -r requirements/cpu.txt --torch-backend cpu -``` - -??? console "pip" - ```bash - pip install --upgrade pip - pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - ``` - -Finally, build and install vLLM: - -```bash -VLLM_TARGET_DEVICE=cpu python setup.py install -``` - -If you want to develop vLLM, install it in editable mode instead. 
- -```bash -VLLM_TARGET_DEVICE=cpu python setup.py develop -``` - -# --8<-- [end:extra-information] diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 49e1f6fac715..1cba21cf5f6d 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") + completion = client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a", + ) print("Completion result:", completion) ``` @@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package: messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, - ] + ], ) print("Chat response:", chat_response) ``` diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index f70ab0c6f4e5..3df80d5af6c4 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -60,7 +60,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", - enable_lora=True + enable_lora=True, ) ``` @@ -97,6 +97,6 @@ llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", enable_lora=True, - model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} + model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}, ) ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 05f8d16cc4ca..9ea32ed61645 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc conversation = [ { "role": "system", - "content": "You are a 
helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" + "content": "Hello! How can I assist you today?", }, { "role": "user", diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 50982d3d0d0f..45bfba2cbf59 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u from vllm import LLM llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") -(output,) = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") +(output,) = llm.score( + "What is the capital of France?", + "The capital of Brazil is Brasilia.", +) score = output.outputs.score print(f"Score: {score}") @@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please Here is an example to serve a model with Matryoshka Embeddings enabled. 
-```text +```bash vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' ``` @@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -llm = LLM(model="jinaai/jina-embeddings-v3", - runner="pooling", - trust_remote_code=True) -outputs = llm.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM( + model="jinaai/jina-embeddings-v3", + runner="pooling", + trust_remote_code=True, +) +outputs = llm.embed( + ["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32), +) print(outputs[0].outputs) ``` @@ -234,13 +240,13 @@ A code example can be found here: ```python import os -os.environ['http_proxy'] = 'http://your.proxy.server:port' -os.environ['https_proxy'] = 'http://your.proxy.server:port' +os.environ["http_proxy"] = "http://your.proxy.server:port" +os.environ["https_proxy"] = "http://your.proxy.server:port" ``` ### ModelScope diff --git a/docs/serving/context_parallel_deployment.md b/docs/serving/context_parallel_deployment.md new file mode 100644 index 000000000000..dacdf312ee55 --- /dev/null +++ b/docs/serving/context_parallel_deployment.md @@ -0,0 +1,47 @@ +# Context Parallel Deployment + +Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are: + +- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens. +- For long context decode, we need more space for KV cache to increase the batchsize (and hence the throughput). 
+ +## Prefill Context Parallel + +During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors. + +Depending on the use case, there're two possible strategies: + +1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk. +2. Partial query, partial key/value: If the request token length is too long, we cannot afford holding the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk. + +Both approaches are under active development. + +## Decode Context Parallel + +Due to the auto-regressive nature of decoding, every decoding step needs to compute a small amount of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs. + +For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache. + +1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed. +2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension, that's the plain tensor parallel sharding. It's as simple as adding `-tp <size>` to the command line. +3.
Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With larger dcp size, the KV cache duplication is reduced, but the communication overhead increases. + +Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what we should do with the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size. + +Note that kv cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that kv cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in detail in [this paper](http://arxiv.org/abs/2507.07120). + +Case study: + +For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication. + +For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x.
We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node. + +For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication. + +In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication. + +Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase. + +## Technical Discussions + +The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/). 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index cd6515dde75e..f1dfb05ea5d4 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -243,10 +243,10 @@ try: "remote_engine_id": None, # Will be populated by vLLM "remote_block_ids": None, # Will be populated by vLLM "remote_host": None, # Will be populated by vLLM - "remote_port": None # Will be populated by vLLM + "remote_port": None, # Will be populated by vLLM } }, - extra_headers={"X-Request-Id": request_id} + extra_headers={"X-Request-Id": request_id}, ) print("-" * 50) @@ -262,7 +262,7 @@ try: extra_body={ "kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info }, - extra_headers={"X-Request-Id": request_id} # Same request ID + extra_headers={"X-Request-Id": request_id}, # Same request ID ) print("-" * 50) diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 47074f411ac9..192a61ea5b90 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` ```python from langchain_community.llms import VLLM - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... 
# for distributed inference + llm = VLLM( + model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # for distributed inference + # tensor_parallel_size=..., ) print(llm("What is the capital of France ?")) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index fe0e1e3df378..215c7bf0ced3 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Hello!"} - ] + {"role": "user", "content": "Hello!"}, + ], ) print(completion.choices[0].message) @@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below: completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] + { + "role": "user", + "content": [ + {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}, + ], + }, + ], ) ``` @@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_body={ - "structured_outputs": {"choice": ["positive", "negative"]} - } + "structured_outputs": {"choice": ["positive", "negative"]}, + }, ) ``` @@ -149,11 +154,11 @@ with `--enable-request-id-headers`. 
completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_headers={ "x-request-id": "sentiment-classification-00001", - } + }, ) print(completion._request_id) @@ -162,7 +167,7 @@ with `--enable-request-id-headers`. prompt="A robot may not injure a human being", extra_headers={ "x-request-id": "completion-test", - } + }, ) print(completion._request_id) ``` @@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi model="openai/whisper-large-v3-turbo", file=audio_file, language="en", - response_format="verbose_json" + response_format="verbose_json", ) print(transcription.text) @@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including "model": "jinaai/jina-reranker-m0", "text_1": "slm markdown", "text_2": { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" - }, - }, - ] - } + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ], }, + }, ) response.raise_for_status() response_json = response.json() diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 0076d4d30ee8..a3e671a0f4cc 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -95,7 
+95,7 @@ def parse_args(): parser.add_argument( "--compilation-config", type=int, - help=("Compilation optimization (O) level 0-3."), + help=("Compilation optimization (O) mode 0-3."), ) parser.add_argument( "--quantization", diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 3c6f6c7a6c58..7d5a1af8f5a4 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ """ try: url = s3_client.generate_presigned_url( - ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in + ClientMethod=client_method, + Params=method_parameters, + ExpiresIn=expires_in, ) except ClientError: raise @@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ s3_client = boto3.client("s3") input_url = generate_presigned_url( - s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600 + s3_client, + "get_object", + {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, + expires_in=3600, ) output_url = generate_presigned_url( - s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 + s3_client, + "put_object", + {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, + expires_in=3600, ) print(f"{input_url=}") print(f"{output_url=}") diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 79afbd9cfac4..7c535e91afac 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py python examples/offline_inference/pooling/embed_matryoshka_fy.py ``` +## Multi vector retrieval usage + +```bash +python examples/offline_inference/pooling/multi_vector_retrieval.py +``` + 
## Named Entity Recognition (NER) usage ```bash diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py new file mode 100644 index 000000000000..8b8892117d37 --- /dev/null +++ b/examples/offline_inference/pooling/multi_vector_retrieval.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="BAAI/bge-m3", + runner="pooling", + enforce_eager=True, + ) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create an LLM. + # You should pass runner="pooling" for embedding models + llm = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = llm.embed(prompts) + + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + print(len(embeds)) + + # Generate embedding for each token. The output is a list of PoolingRequestOutput. + outputs = llm.encode(prompts, pooling_task="token_embed") + + # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + multi_vector = output.outputs.data + print(multi_vector.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py index 418c40645f9f..6c47b5715438 100644 --- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py +++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py @@ -40,7 +40,7 @@ def main(): model_impl="terratorch", ) - pooling_params = PoolingParams(task="encode", softmax=False) + pooling_params = PoolingParams(task="token_classify", activation=False) pooler_output = llm.encode( img_prompt, pooling_params=pooling_params, diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index ac4e40221edf..91345e0ae778 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py python examples/online_serving/pooling/jinaai_rerank_client.py ``` +## Multi vector retrieval usage + +```bash +python examples/online_serving/pooling/multi_vector_retrieval_client.py +``` + ## Named Entity Recognition (NER) usage ```bash diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/online_serving/pooling/multi_vector_retrieval_client.py new file mode 100644 index 000000000000..ef8c4745aa53 --- /dev/null +++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Example online usage of Pooling API for multi vector retrieval. + +Run `vllm serve <model> --runner pooling` +to start up the server in vLLM. e.g.
+ +vllm serve BAAI/bge-m3 +""" + +import argparse + +import requests +import torch + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="BAAI/bge-m3") + + return parser.parse_args() + + +def main(args): + api_url = f"http://{args.host}:{args.port}/pooling" + model_name = args.model + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + prompt = {"model": model_name, "input": prompts} + + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + for output in pooling_response.json()["data"]: + multi_vector = torch.tensor(output["data"]) + print(multi_vector.shape) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index acbfd8cda489..2601c9eff971 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -84,7 +84,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", - load_format="tensorizer" + load_format="tensorizer", ) ``` diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index 0d265bc59638..d1f741479acf 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -14,7 +14,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ 
-199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): outputs = [] - # piecewise compile + # vllmcompile compile vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): # no compile or cudagraph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + mode=CompilationMode.NONE, ) ) cudagraph_runtime_mode = CUDAGraphMode.NONE @@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): # piecewise compile without CUDA graph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=False, splitting_ops=["silly::attention"], use_inductor_graph_partition=use_inductor_graph_partition, diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index bc65e3da0ae7..f61a0a4eb740 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -13,7 +13,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -61,7 +61,7 @@ def _run_simple_model( ): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, use_inductor=use_inductor, splitting_ops=splitting_ops, diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 7ab610fa7811..75a89d692fa8 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ 
b/tests/compile/piecewise/test_toy_llama.py @@ -21,7 +21,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -356,13 +356,13 @@ def test_toy_llama( ) compile_config_no_compile = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + level=CompilationMode.NONE, cudagraph_mode=CUDAGraphMode.NONE, backend="eager", ) compile_config_no_split = CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_mode=CUDAGraphMode.PIECEWISE, backend=backend, @@ -458,14 +458,14 @@ def benchmark(): for piecewise in [False, True]: if piecewise: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_capture_sizes=cudagraph_sizes, ) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 08f79d90cd36..1701d85fe84e 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -10,7 +10,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, VllmConfig, set_current_vllm_config, ) @@ -38,7 +38,7 @@ def forward(self, x: torch.Tensor): def make_vllm_config() -> VllmConfig: return VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, ) ) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 102a929bf240..60856f5a5806 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ 
-10,6 +10,7 @@ from vllm.compilation.collective_fusion import AsyncTPPass from vllm.config import ( CompilationConfig, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -400,7 +401,7 @@ def test_async_tp_pass_correctness( common_args.append("--enforce-eager") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [2, 4, 8], "splitting_ops": [], "pass_config": {"enable_async_tp": async_tp_enabled}, diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index ab6a17e149fc..954774a8e398 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -4,7 +4,7 @@ import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings @@ -21,7 +21,7 @@ class TestSetting: # we cannot afford testing the full Cartesian product -# of all models and all levels +# of all models and all modes @pytest.mark.parametrize( "test_setting", [ @@ -121,15 +121,13 @@ def test_compile_correctness( all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] - for comp_level in [ - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - for level in [CompilationLevel.NO_COMPILATION, comp_level]: - all_args.append( - final_args + [f"-O.level={level}", "-O.backend=inductor"] - ) + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"]) # inductor will change the output, so we only compare if the output # is close, not exactly the same. 
@@ -142,13 +140,13 @@ def test_compile_correctness( all_envs.clear() all_args.clear() - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"]) all_envs.append({}) all_envs.append({}) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index ae8b0b226c31..7f51c763da73 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.compilation import CompilationLevel +from vllm.config.compilation import CompilationMode from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer @@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked -def test_dynamo_as_is(vllm_runner, monkeypatch): +def test_stock_torch_compile(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(dynamo_as_is_count=1), + compilation_counter.expect(stock_torch_compile_count=1), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 1}, + compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE}, gpu_memory_utilization=0.4, ) as _, ): @@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process 
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 0}, + compilation_config={"mode": CompilationMode.NONE}, gpu_memory_utilization=0.4, ) as _, ): @@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4 @@ -151,7 +151,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, splitting_ops=["vllm::unified_attention"], ) @@ -163,7 +163,7 @@ def test_splitting_ops_dynamic(): # When attn_fusion pass enabled, splitting_ops now default to attention ops. 
config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, @@ -178,7 +178,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 63cb266094a1..e459bc539f2b 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest import torch from torch import nn @@ -8,12 +9,13 @@ from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from . 
import silly_attention # noqa: F401 @@ -65,19 +67,40 @@ def run_model( return output.cpu() -def test_ignore_torch_compile_decorator(): +@pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch): + # disable compile cache so that we can count the number of compilations + # appropriately + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + # piecewise vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], - use_inductor_graph_partition=False, # TODO test both? + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + expected_num_graphs_seen = 1 + expected_num_cudagraph_captured = ( + 4 # num_cudagraph_sizes * num cudagraphs to capture + ) + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 1 + expected_num_piecewise_capturable_graphs_seen = 1 + expected_num_backend_compilations = 1 + else: + expected_num_piecewise_graphs_seen = 3 + expected_num_piecewise_capturable_graphs_seen = 2 + expected_num_backend_compilations = 2 + @support_torch_compile class A(nn.Module): def __init__( @@ -104,12 +127,11 @@ class C(B): ... 
# A has support_torch_compile with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=expected_num_graphs_seen, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) @@ -131,12 +153,11 @@ class C(B): ... # C's support_torch_compile should override B's ignore_torch_compile with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_graphs_seen=expected_num_graphs_seen, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ): run_model(vllm_config, mod_C, cudagraph_runtime_mode) @@ -179,17 +200,25 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def test_conditional_compile_enable_if(): +@pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch): + # disable compile cache so that we can count the number of compilations + # appropriately + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + vllm_config = VllmConfig( 
cache_config=CacheConfig( kv_sharing_fast_prefill=True, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], - use_inductor_graph_partition=False, # TODO test both + use_inductor_graph_partition=use_inductor_graph_partition, ), ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -197,17 +226,26 @@ def test_conditional_compile_enable_if(): with set_current_vllm_config(vllm_config): mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 2 + expected_num_piecewise_capturable_graphs_seen = 2 + expected_num_backend_compilations = 2 + else: + expected_num_piecewise_graphs_seen = 6 + expected_num_piecewise_capturable_graphs_seen = 4 + expected_num_backend_compilations = 4 + # A has support_torch_compile but enable_if fn returns False # enalbe_if will be True for B, so we expect mod1 and mod2 # to be compiled with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, # 3 piecewise graphs per instance of B() - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + # num_cudagraph_sizes * num cudagraphable graphs to capture ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) @@ -218,24 +256,34 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=False, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], - use_inductor_graph_partition=False, # 
TODO test both? + use_inductor_graph_partition=use_inductor_graph_partition, ), ) with set_current_vllm_config(vllm_config): mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda() + if use_inductor_graph_partition: + expected_num_piecewise_graphs_seen = 1 + expected_num_piecewise_capturable_graphs_seen = 1 + expected_num_backend_compilations = 1 + else: + # 3 attn ops and 4 non-attn ops + expected_num_piecewise_graphs_seen = 7 + expected_num_piecewise_capturable_graphs_seen = 4 + expected_num_backend_compilations = 4 + with compilation_counter.expect( num_graphs_seen=1, - num_piecewise_graphs_seen=7, + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, # 3 attn ops and 4 non-attn ops - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + # num_cudagraph_sizes * num cudagraphable graphs to capture ): run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 2f3794c90b20..2d290771f9ad 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -12,7 +12,7 @@ from vllm import LLM, SamplingParams from vllm.attention.backends.registry import _Backend from vllm.attention.selector import global_force_attn_backend_context_manager -from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig +from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer @@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], + 
"compilation_mode", + [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE], ) @pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, model_info: tuple[str, dict[str, Any]], - optimization_level: int, + compilation_mode: int, ): model, model_kwargs = model_info with monkeypatch.context(): print(f"MODEL={model}") - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) # TODO(luka) add other supported compilation config scenarios here @@ -104,7 +104,7 @@ def test_full_graph( [ # additional compile sizes, only some of the models ( - CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), + CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]), model, ) for model in models_list(all=False) @@ -113,7 +113,7 @@ def test_full_graph( # RMSNorm + quant fusion, only 8-bit quant models ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ), @@ -125,7 +125,8 @@ def test_full_graph( # Test depyf integration works ( CompilationConfig( - level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() + mode=CompilationMode.VLLM_COMPILE, + debug_dump_path=tempfile.gettempdir(), ), ("facebook/opt-125m", {}), ), @@ -134,7 +135,7 @@ def test_full_graph( # graph inductor partition ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # inductor graph partition uses # torch._C.Tag.cudagraph_unsafe to specify splitting ops use_inductor_graph_partition=True, @@ -164,10 +165,10 @@ def test_custom_compile_config( @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.NONE, CompilationMode.VLLM_COMPILE], ) -def 
test_fp8_kv_scale_compile(optimization_level: int): +def test_fp8_kv_scale_compile(compilation_mode: int): model = "Qwen/Qwen2-0.5B" model_kwargs = { "quantization": "fp8", @@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int): "calculate_kv_scales": True, "max_model_len": 512, } - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) def test_inductor_graph_partition_attn_fusion(caplog_vllm): @@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 7c2233643229..1a5eaf2639b3 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -13,7 +13,7 @@ ) from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 455d1bb03905..fbcd6c71fb72 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -12,7 +12,7 @@ from 
vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"] ) ) vllm_config.compilation_config.pass_config = PassConfig( diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index d1ab85cfb875..a8d78daa32a1 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -19,7 +19,7 @@ from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, ModelConfig, PassConfig, SchedulerConfig, @@ -321,7 +321,7 @@ def test_attention_quant_pattern( ), scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+quant_fp8"], use_inductor_graph_partition=use_inductor_graph_partition, ), diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py index 188f4514dda5..0ccc1a016162 100644 --- a/tests/compile/test_noop_elimination.py +++ b/tests/compile/test_noop_elimination.py @@ -6,7 +6,7 @@ import vllm from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from .backend import TestBackend @@ -50,7 +50,7 @@ def forward(self, x): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) @@ -98,7 +98,7 @@ def forward(self, x): vllm_config = VllmConfig( 
compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index b2fff822bbbb..da0afd9eaa49 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -5,7 +5,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel +from vllm.config import CompilationMode class MyMod(torch.nn.Module): @@ -20,7 +20,7 @@ def __init__(self, model): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") super().__init__( - compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE + compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE ) def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): diff --git a/tests/conftest.py b/tests/conftest.py index 2fde7f97836d..9126b3d668b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1011,8 +1011,12 @@ def embed( req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] - def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.llm.encode(prompts) + def token_embed(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.encode(prompts, pooling_task="token_embed") + return [req_output.outputs.data for req_output in req_outputs] + + def token_classify(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.encode(prompts, pooling_task="token_classify") return [req_output.outputs.data for req_output in req_outputs] def reward(self, prompts: list[str]) -> list[list[float]]: diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index a431bf30fc89..362e9daf5ae0 100644 --- a/tests/distributed/test_sequence_parallel.py +++ 
b/tests/distributed/test_sequence_parallel.py @@ -15,6 +15,7 @@ import pytest +from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger @@ -234,7 +235,7 @@ def _compare_sp( common_args.append("--skip-tokenizer-init") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 78928a53942f..c73083b0b5ef 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -226,30 +226,30 @@ def test_compilation_config(): # set to O3 args = parser.parse_args(["-O0"]) - assert args.compilation_config.level == 0 + assert args.compilation_config.mode == 0 # set to O 3 (space) args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.level == 1 + assert args.compilation_config.mode == 1 # set to O 3 (equals) args = parser.parse_args(["-O=2"]) - assert args.compilation_config.level == 2 + assert args.compilation_config.mode == 2 - # set to O.level 3 - args = parser.parse_args(["-O.level", "3"]) - assert args.compilation_config.level == 3 + # set to O.mode 3 + args = parser.parse_args(["-O.mode", "3"]) + assert args.compilation_config.mode == 3 # set to string form of a dict args = parser.parse_args( [ "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": false}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and not args.compilation_config.use_inductor ) @@ -258,12 +258,12 @@ def test_compilation_config(): args = parser.parse_args( [ "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": true}', ] ) assert ( - 
args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.use_inductor ) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a96f0134c2ff..a2d8993441fc 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -53,22 +53,35 @@ def base64_encoded_audio() -> dict[str, str]: } -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) -async def test_single_chat_session_audio( - client: openai.AsyncOpenAI, model_name: str, audio_url: str +def dummy_messages_from_audio_url( + audio_urls: str | list[str], + content_text: str = "What's happening in this audio?", ): - messages = [ + if isinstance(audio_urls, str): + audio_urls = [audio_urls] + + return [ { "role": "user", "content": [ - {"type": "audio_url", "audio_url": {"url": audio_url}}, - {"type": "text", "text": "What's happening in this audio?"}, + *( + {"type": "audio_url", "audio_url": {"url": audio_url}} + for audio_url in audio_urls + ), + {"type": "text", "text": content_text}, ], } ] + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) +async def test_single_chat_session_audio( + client: openai.AsyncOpenAI, model_name: str, audio_url: str +): + messages = dummy_messages_from_audio_url(audio_url) + # test single completion chat_completion = await client.chat.completions.create( model=model_name, @@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded( audio_url: str, base64_encoded_audio: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "audio_url", - "audio_url": { - "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's happening 
in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url( + f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" + ) # test single completion chat_completion = await client.chat.completions.create( @@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio( async def test_chat_streaming_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "audio_url", "audio_url": {"url": audio_url}}, - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_url) # test single completion chat_completion = await client.chat.completions.create( @@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio( async def test_multi_audio_input( client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "audio_url", "audio_url": {"url": audio_url}} - for audio_url in audio_urls - ), - {"type": "text", "text": "What's happening in this audio?"}, - ], - } - ] + messages = dummy_messages_from_audio_url(audio_urls) if len(audio_urls) > MAXIMUM_AUDIOS: with pytest.raises(openai.BadRequestError): # test multi-audio input diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 4c7d1c14ca17..7ecdac518f97 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -55,22 +55,35 @@ def base64_encoded_video() -> dict[str, str]: } -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) -async def test_single_chat_session_video( - client: openai.AsyncOpenAI, model_name: str, video_url: str +def dummy_messages_from_video_url( + video_urls: str | list[str], + content_text: str = "What's in this video?", ): - messages = [ + if isinstance(video_urls, str): + video_urls = [video_urls] + + return [ 
{ "role": "user", "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, + *( + {"type": "video_url", "video_url": {"url": video_url}} + for video_url in video_urls + ), + {"type": "text", "text": content_text}, ], } ] + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video( + client: openai.AsyncOpenAI, model_name: str, video_url: str +): + messages = dummy_messages_from_video_url(video_url) + # test single completion chat_completion = await client.chat.completions.create( model=model_name, @@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type( async def test_single_chat_session_video_beamsearch( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) chat_completion = await client.chat.completions.create( model=model_name, @@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded( video_url: str, base64_encoded_video: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "video_url", - "video_url": { - "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url( + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + ) # test single completion chat_completion = await client.chat.completions.create( @@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch( video_url: str, base64_encoded_video: dict[str, str], ): - messages = [ - { - "role": "user", - "content": [ - { - "type": "video_url", - "video_url": { - 
"url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url( + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + ) + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch( async def test_chat_streaming_video( client: openai.AsyncOpenAI, model_name: str, video_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "video_url", "video_url": {"url": video_url}}, - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_url) # test single completion chat_completion = await client.chat.completions.create( @@ -318,18 +294,7 @@ async def test_chat_streaming_video( async def test_multi_video_input( client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "video_url", "video_url": {"url": video_url}} - for video_url in video_urls - ), - {"type": "text", "text": "What's in this video?"}, - ], - } - ] + messages = dummy_messages_from_video_url(video_urls) if len(video_urls) > MAXIMUM_VIDEOS: with pytest.raises(openai.BadRequestError): # test multi-video input diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 5a15a352f45c..09bd0dabb799 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]: } +def dummy_messages_from_image_url( + image_urls: str | list[str], + content_text: str = "What's in this image?", +): + if isinstance(image_urls, str): + image_urls = [image_urls] + + return [ + { + "role": "user", + "content": [ + *( + {"type": "image_url", "image_url": {"url": 
image_url}} + for image_url in image_urls + ), + {"type": "text", "text": content_text}, + ], + } + ] + + def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 @@ -107,15 +128,7 @@ async def test_single_chat_session_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): content_text = "What's in this image?" - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": content_text}, - ], - } - ] + messages = dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 # test single completion @@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type( async def test_single_chat_session_image_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + content_text = "What's in this image?" + messages = dummy_messages_from_image_url(image_url, content_text) chat_completion = await client.chat.completions.create( model=model_name, @@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded( base64_encoded_image: dict[str, str], ): content_text = "What's in this image?" 
- messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": content_text}, - ], - } - ] + messages = dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", + content_text, + ) max_completion_tokens = 10 # test single completion @@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch( raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" # noqa: E501 - }, - }, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url( + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" + ) + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, @@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( async def test_chat_streaming_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = dummy_messages_from_image_url(image_url) # test single completion chat_completion = await client.chat.completions.create( @@ -381,18 +359,7 @@ async def test_chat_streaming_image( async def test_multi_image_input( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] ): - messages = [ - { - "role": "user", - "content": [ - *( - {"type": "image_url", "image_url": {"url": image_url}} - for image_url in image_urls - ), - {"type": "text", "text": "What's in this image?"}, - ], - } - ] + messages = 
dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 488c82c9fe7f..96f634ee0a8c 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -63,7 +63,7 @@ def test_encode_api(llm: LLM): # chunked prefill does not support all pooling err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): - llm.encode(prompts, use_tqdm=False) + llm.encode(prompts, pooling_task="token_classify", use_tqdm=False) def test_score_api(llm: LLM): diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index c53941390bd1..5455b5f91fc0 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -35,6 +35,13 @@ def llm(): cleanup_dist_env_and_memory() +@pytest.mark.skip_global_cleanup +def test_encode_api(llm: LLM): + outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False) + multi_vector = outputs[0].outputs.data + assert multi_vector.shape == (11, 384) + + def test_pooling_params(llm: LLM): def get_outputs(normalize): outputs = llm.embed( diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index 9ba380334e5a..ca85d2758fce 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ b/tests/entrypoints/pooling/llm/test_encode.py @@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM): ] # Multiple PoolingParams should be matched with each prompt - outputs = llm.encode(PROMPTS, pooling_params=pooling_params) + outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed") assert len(PROMPTS) == len(outputs) # Exception raised, if the size of params does not match the size of prompts with 
pytest.raises(ValueError): - outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3]) + outputs = llm.encode( + PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed" + ) # Single PoolingParams should be applied to every prompt single_pooling_params = PoolingParams() - outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params) + outputs = llm.encode( + PROMPTS, pooling_params=single_pooling_params, pooling_task="embed" + ) assert len(PROMPTS) == len(outputs) # pooling_params is None, default params should be applied - outputs = llm.encode(PROMPTS, pooling_params=None) + outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed") assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 8312ff180b36..81058dbad891 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -36,22 +36,23 @@ def llm(): cleanup_dist_env_and_memory() -@pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(softmax): + def get_outputs(activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False + prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(softmax=None) - w_softmax = get_outputs(softmax=True) - wo_softmax = get_outputs(softmax=False) + default = get_outputs(activation=None) + w_activation = get_outputs(activation=True) + wo_activation = get_outputs(activation=False) - assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax." - assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), ( - "wo_softmax should not use softmax." + assert torch.allclose(default, w_activation, atol=1e-2), ( + "Default should use activation." 
) - assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), ( - "w_softmax should be close to softmax(wo_softmax)." + assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( + "wo_activation should not use activation." + ) + assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), ( + "w_activation should be close to activation(wo_activation)." ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index 8a3d298a48e2..ab8ca9d68e0e 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -17,6 +17,7 @@ from vllm.entrypoints.openai.protocol import ( EMBED_DTYPE_TO_TORCH_DTYPE, EmbeddingResponse, + PoolingResponse, ) from vllm.transformers_utils.tokenizer import get_tokenizer @@ -509,3 +510,20 @@ async def get_outputs(normalize): assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), ( "w_normal should be close to normal(wo_normal)." 
) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={"model": model_name, "input": input_text, "encoding_format": "float"}, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 11 + assert len(poolings.data[0].data[0]) == 384 diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 9980fcff16c1..e43148d25fee 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import RerankResponse +from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" @@ -159,3 +159,20 @@ async def get_outputs(activation): assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( "w_activation should be close to activation(wo_activation)." 
) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={"model": model_name, "input": input_text, "encoding_format": "float"}, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 11 + assert len(poolings.data[0].data[0]) == 1 diff --git a/tests/models/language/pooling/test_multi_vector_retrieval.py b/tests/models/language/pooling/test_multi_vector_retrieval.py new file mode 100644 index 000000000000..302f2df13557 --- /dev/null +++ b/tests/models/language/pooling/test_multi_vector_retrieval.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModel + +from tests.models.utils import check_embeddings_close + + +@pytest.mark.parametrize( + "model", + ["BAAI/bge-m3"], +) +@pytest.mark.parametrize("dtype", ["half"]) +@torch.inference_mode +def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str): + with vllm_runner( + model, + runner="pooling", + max_model_len=None, + ) as vllm_model: + vllm_outputs = vllm_model.token_embed(example_prompts) + + with hf_runner( + model, + auto_cls=AutoModel, + ) as hf_model: + tokenizer = hf_model.tokenizer + hf_outputs = [] + for prompt in example_prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = hf_model.wrap_device(inputs) + output = hf_model.model(**inputs) + embedding = output.last_hidden_state[0].float() + # normal + hf_outputs.append(embedding.cpu()) + + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + check_embeddings_close( + embeddings_0_lst=hf_output, + embeddings_1_lst=vllm_output, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) 
diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 674bf02b7b98..55663ee3f1b4 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -93,7 +93,7 @@ def test_embed_models_using_normalize( ], ) @pytest.mark.parametrize("dtype", ["half"]) -def test_reward_models_using_softmax( +def test_reward_models_using_activation( hf_runner, vllm_runner, example_prompts, @@ -104,22 +104,64 @@ def test_reward_models_using_softmax( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(softmax=False), + pooler_config=PoolerConfig(activation=False), ) as vllm_model: - wo_softmax = vllm_model.encode(example_prompts) + wo_activation = vllm_model.reward(example_prompts) with vllm_runner( - model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True) + model, + max_model_len=1024, + dtype=dtype, + pooler_config=PoolerConfig(activation=True), ) as vllm_model: - w_softmax = vllm_model.encode(example_prompts) + w_activation = vllm_model.reward(example_prompts) - for wo, w in zip(wo_softmax, w_softmax): + for wo, w in zip(wo_activation, w_activation): wo = torch.tensor(wo) w = torch.tensor(w) assert not torch.allclose(wo, w, atol=1e-2), ( - "pooler_config softmax is not working" + "pooler_config activation is not working" ) assert torch.allclose(softmax(wo), w, atol=1e-2), ( - "w_softmax should be close to softmax(wo_softmax)." + "w_activation should be close to activation(wo_activation)." 
+ ) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/multilingual-e5-small", + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_multi_vector_retrieval_models_using_normalize( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=False), + ) as vllm_model: + wo_normalize = vllm_model.token_embed(example_prompts) + + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + pooler_config=PoolerConfig(normalize=True), + ) as vllm_model: + w_normalize = vllm_model.token_embed(example_prompts) + + for wo, w in zip(wo_normalize, w_normalize): + assert not torch.allclose(wo, w, atol=1e-2), ( + "pooler_config normalize is not working" + ) + assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), ( + "w_normal should be close to normal(wo_normal)." ) diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py index 784d9fc31267..2dfc0072126b 100644 --- a/tests/models/language/pooling/test_token_classification.py +++ b/tests/models/language/pooling/test_token_classification.py @@ -19,7 +19,7 @@ def test_bert_models( dtype: str, ) -> None: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.token_classify(example_prompts) with hf_runner( model, dtype=dtype, auto_cls=AutoModelForTokenClassification @@ -50,7 +50,7 @@ def test_modernbert_models( dtype: str, ) -> None: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.token_classify(example_prompts) with hf_runner( model, dtype=dtype, auto_cls=AutoModelForTokenClassification diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py 
b/tests/models/multimodal/pooling/test_prithvi_mae.py index abf4150a9132..62154b083487 100644 --- a/tests/models/multimodal/pooling/test_prithvi_mae.py +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -39,7 +39,7 @@ def _run_test( max_num_seqs=32, default_torch_num_threads=1, ) as vllm_model: - vllm_model.encode(prompt) + vllm_model.llm.encode(prompt, pooling_task="token_classify") MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index d1dae587d38e..98245cdf0c98 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -30,7 +30,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), } ) diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py index 912b32755e80..936f27fb69bc 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): out_data_format="b64_json", ) - pooling_params = PoolingParams(task="encode", softmax=False) + pooling_params = PoolingParams(activation=False) with vllm_runner( model_name, @@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): io_processor_plugin="prithvi_to_tiff", ) as llm_runner: pooler_output = llm_runner.get_llm().encode( - img_prompt, - pooling_params=pooling_params, + img_prompt, pooling_params=pooling_params, pooling_task="token_classify" ) output = pooler_output[0].outputs diff --git 
a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py new file mode 100644 index 000000000000..3d12f3e5b30e --- /dev/null +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.reasoning import ( + DeepSeekR1ReasoningParser, + DeepSeekV3ReasoningParser, + IdentityReasoningParser, +) + +REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1" + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +@pytest.mark.parametrize( + "thinking,expected_parser_type", + [ + (True, DeepSeekR1ReasoningParser), + (False, IdentityReasoningParser), + ], +) +def test_parser_selection(tokenizer, thinking, expected_parser_type): + parser = DeepSeekV3ReasoningParser( + tokenizer, chat_template_kwargs={"thinking": thinking} + ) + + assert isinstance(parser._parser, expected_parser_type) + + +def test_identity_reasoning_parser_basic(tokenizer): + parser = IdentityReasoningParser(tokenizer) + + # Test is_reasoning_end always returns True + input_text = "This is some output" + input_tokens = tokenizer.tokenize(input_text) + input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + assert parser.is_reasoning_end(input_ids) is True + + # Test extract_content_ids returns all input_ids + assert parser.extract_content_ids(input_ids) == input_ids + + # Test extract_reasoning_content returns (None, model_output) + request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0) + reasoning, content = parser.extract_reasoning_content(input_text, request) + assert reasoning is None + assert content == input_text + + # Test extract_reasoning_content_streaming returns DeltaMessage or None + 
result = parser.extract_reasoning_content_streaming( + previous_text="", + current_text="Hello world", + delta_text="Hello world", + previous_token_ids=[], + current_token_ids=input_ids, + delta_token_ids=input_ids, + ) + assert isinstance(result, DeltaMessage) + assert result.content == "Hello world" + + # If delta_text is empty, should return None + result_none = parser.extract_reasoning_content_streaming( + previous_text="Hello world", + current_text="Hello world", + delta_text="", + previous_token_ids=input_ids, + current_token_ids=input_ids, + delta_token_ids=[], + ) + assert result_none is None diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db2..d3f6d4097615 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,9 +8,16 @@ import pytest from vllm.compilation.backends import VllmBackend -from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config +from vllm.config import ( + CompilationConfig, + ModelConfig, + PoolerConfig, + VllmConfig, + update_config, +) from vllm.config.load import LoadConfig from vllm.config.utils import get_field +from vllm.config.vllm import OptimizationLevel from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -235,6 +242,32 @@ def test_default_pooling_type(model_id, default_pooling_type, pooling_type): assert model_config.pooler_config.pooling_type == pooling_type +@pytest.mark.parametrize( + ("model_id", "expected_is_moe_model"), + [ + ("Qwen/Qwen1.5-7B", False), + ("deepseek-ai/DeepSeek-V2-Lite", True), + ], +) +def test_moe_model_detection(model_id, expected_is_moe_model): + model_config = ModelConfig(model_id) + # Check that is_model_moe() matches the expected value for this model + assert model_config.is_model_moe() == expected_is_moe_model + + +@pytest.mark.parametrize( + ("model_id", "quantized"), + [ + ("jerryzh168/Qwen3-8B-INT4", True), + ("deepseek-ai/DeepSeek-V2-Lite", False), + ], +) +def test_is_quantized(model_id, quantized): 
+ model_config = ModelConfig(model_id) + # Check that is_quantized() matches the expected value for this model + assert model_config.is_quantized() == quantized + + @pytest.mark.skipif( current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." ) @@ -549,3 +582,74 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) assert os.path.exists(config2.model) and os.path.isdir(config2.model) assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) + + +def test_vllm_config_defaults_are_none(): + """Verify that all fields that are set by default based on optimization + level are set to None if user does not set them explicitly.""" + # Construct VllmConfig without __post_init__. + config = object.__new__(VllmConfig) + # Construct CompilationConfig with __post_init__. + config.compilation_config = CompilationConfig() + default_config = config._build_defaults() + # Apply optimization level default if not set by user. 
+ for k, v in default_config["general"].items(): + if k == "pass_config": + for pass_k, pass_v in default_config["general"]["pass_config"].items(): + assert getattr(config.compilation_config.pass_config, pass_k) is None + else: + assert getattr(config.compilation_config, k) is None + + for k, v in default_config["is_quantized"]["pass_config"].items(): + assert getattr(config.compilation_config.pass_config, k) is None + + for k, v in default_config["is_sequential"]["pass_config"].items(): + assert getattr(config.compilation_config.pass_config, k) is None + + +@pytest.mark.parametrize( + ("model_id", "optimization_level"), + [ + (None, OptimizationLevel.O0), + (None, OptimizationLevel.O1), + (None, OptimizationLevel.O2), + (None, OptimizationLevel.O3), + ("Qwen/Qwen1.5-7B", OptimizationLevel.O0), + ("Qwen/Qwen1.5-7B", OptimizationLevel.O1), + ("Qwen/Qwen1.5-7B", OptimizationLevel.O2), + ("Qwen/Qwen1.5-7B", OptimizationLevel.O3), + ("deepseek-ai/DeepSeek-V2-Lite", OptimizationLevel.O0), + ("deepseek-ai/DeepSeek-V2-Lite", OptimizationLevel.O1), + ("deepseek-ai/DeepSeek-V2-Lite", OptimizationLevel.O2), + ("deepseek-ai/DeepSeek-V2-Lite", OptimizationLevel.O3), + ("jerryzh168/Qwen3-8B-INT4", OptimizationLevel.O0), + ("jerryzh168/Qwen3-8B-INT4", OptimizationLevel.O1), + ("jerryzh168/Qwen3-8B-INT4", OptimizationLevel.O2), + ("jerryzh168/Qwen3-8B-INT4", OptimizationLevel.O3), + ], +) +def test_vllm_conifg_defaults(model_id, optimization_level): + if model_id is not None: + model_config = ModelConfig(model_id) + vllm_config = VllmConfig( + model_config=model_config, optimization_level=optimization_level + ) + else: + vllm_config = VllmConfig(optimization_level=optimization_level) + + default_config = vllm_config._build_defaults() + for k, v in default_config["general"].items(): + if k == "pass_config": + for pass_k, pass_v in default_config["general"]["pass_config"].items(): + assert ( + getattr(vllm_config.compilation_config.pass_config, pass_k) + == pass_v + ) + else: + 
assert getattr(vllm_config.compilation_config, k) == v + + for k, v in default_config["is_quantized"]["pass_config"].items(): + assert getattr(vllm_config.compilation_config.pass_config, k) == v + + for k, v in default_config["is_sequential"]["pass_config"].items(): + assert getattr(vllm_config.compilation_config.pass_config, k) == v diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e3561ac3a577..e73d7efc1483 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + import pytest from tests.models.utils import EmbedModelInfo from vllm import PoolingParams -from vllm.config import ModelConfig +from vllm.config import ModelConfig, PoolerConfig EMBEDDING_MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), @@ -15,6 +17,15 @@ ), ] +classify_parameters = ["activation"] +embed_parameters = ["dimensions", "normalize"] +step_pooling_parameters = ["step_tag_id", "returned_token_ids"] + + +@dataclass() +class MockModelConfig: + pooler_config: PoolerConfig + def test_task(): pooling_params = PoolingParams() @@ -24,25 +35,27 @@ def test_task(): pooling_params.verify(task="score") with pytest.raises(ValueError): - pooling_params.verify(task="encode") + pooling_params.verify(task="classify") def test_embed(): task = "embed" + model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) + pooling_params = PoolingParams(normalize=None) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(normalize=True) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(normalize=False) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) - 
invalid_parameters = ["activation", "softmax"] + invalid_parameters = classify_parameters + step_pooling_parameters for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @@ -73,35 +86,71 @@ def test_embed_dimensions(model_info: EmbedModelInfo): @pytest.mark.parametrize("task", ["score", "classify"]) def test_classify(task): + model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) + pooling_params = PoolingParams(activation=None) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(activation=True) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) pooling_params = PoolingParams(activation=False) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) + + invalid_parameters = embed_parameters + step_pooling_parameters + for p in invalid_parameters: + with pytest.raises(ValueError): + pooling_params = PoolingParams(**{p: True}) + pooling_params.verify(task=task, model_config=model_config) + + +@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"]) +def test_token_embed(pooling_type: str): + task = "token_embed" + model_config = MockModelConfig( + pooler_config=PoolerConfig(pooling_type=pooling_type) + ) + + pooling_params = PoolingParams(normalize=None) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(normalize=True) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(normalize=False) + pooling_params.verify(task=task, model_config=model_config) + + invalid_parameters = classify_parameters + if pooling_type != "STEP": + invalid_parameters = classify_parameters + step_pooling_parameters - 
invalid_parameters = ["dimensions", "normalize", "softmax"] for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) -def test_encode(): - task = "encode" - pooling_params = PoolingParams(softmax=None) - pooling_params.verify(task=task) +@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"]) +def test_token_classify(pooling_type: str): + task = "token_classify" + model_config = MockModelConfig( + pooler_config=PoolerConfig(pooling_type=pooling_type) + ) - pooling_params = PoolingParams(softmax=True) - pooling_params.verify(task=task) + pooling_params = PoolingParams(activation=None) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(activation=True) + pooling_params.verify(task=task, model_config=model_config) + + pooling_params = PoolingParams(activation=False) + pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(softmax=False) - pooling_params.verify(task=task) + invalid_parameters = embed_parameters + if pooling_type != "STEP": + invalid_parameters = embed_parameters + step_pooling_parameters - invalid_parameters = ["dimensions", "normalize", "activation"] for p in invalid_parameters: with pytest.raises(ValueError): pooling_params = PoolingParams(**{p: True}) - pooling_params.verify(task=task) + pooling_params.verify(task=task, model_config=model_config) diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index b4f0989b1b19..93ef1049fc07 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -40,7 +40,7 @@ def qwen3_xml_tool_parser(qwen3_tokenizer): return Qwen3XMLToolParser(qwen3_tokenizer) -@pytest.fixture(params=["original", "xml"]) +@pytest.fixture(params=["xml"]) def 
qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request): """Parameterized fixture that provides both parser types for testing""" if request.param == "original": @@ -664,6 +664,9 @@ def test_extract_tool_calls_streaming( # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len( + expected_tool_calls + ) # Verify each tool call for idx, expected_tool in enumerate(expected_tool_calls): @@ -780,9 +783,10 @@ def test_extract_tool_calls_streaming_missing_closing_tag( # Verify content was streamed assert "Let me check the weather for you:" in other_content - # Verify we got the tool call assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + state = tool_states[0] assert state["id"] is not None assert state["type"] == "function" @@ -892,3 +896,83 @@ def test_extract_tool_calls_complex_type_with_single_quote( args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) assert args["obj_param"] == {"key": "value"} + + +def test_extract_tool_calls_streaming_missing_opening_tag( + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools +): + """Test streaming with missing opening tag + + This tests that the streaming parser correctly handles + tool calls that start directly with + """ + model_output = """I'll check the weather for you. 
+ + + +Dallas + + +TX + + +fahrenheit + + +""" + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) + + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "I'll check the weather for you." 
in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing opening tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 102e5ddf16d6..cf455ff3edbd 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from ..utils import compare_two_settings @@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", + f"-O{CompilationMode.DYNAMO_TRACE_ONCE}", ], arg2=[ "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_AS_IS}", + f"-O{CompilationMode.STOCK_TORCH_COMPILE}", ], env1={}, env2={}, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 308629ab0583..af5fc758f2c2 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -299,7 +299,7 @@ def test_dict_args(parser): "val2", "--hf-overrides.key2.key4", "val3", - # Test compile config and compilation level + # Test compile config and compilation mode "-O.use_inductor=true", "-O.backend", "custom", @@ -352,7 +352,7 @@ def test_dict_args(parser): }, } assert parsed_args.compilation_config == { - "level": 1, + "mode": 1, "use_inductor": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -367,7 +367,7 @@ def 
test_duplicate_dict_args(caplog_vllm, parser): "--hf-overrides.key1", "val2", "-O1", - "-O.level", + "-O.mode", "2", "-O3", ] @@ -375,12 +375,12 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"level": 3} + assert parsed_args.compilation_config == {"mode": 3} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.level" in caplog_vllm.text + assert "-O.mode" in caplog_vllm.text @pytest.mark.parametrize( diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 59841a446db3..02fa27e3f05f 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -11,7 +11,7 @@ from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, ParallelConfig, SchedulerConfig, @@ -42,7 +42,7 @@ def _create_vllm_config( mock_config.parallel_config = ParallelConfig() # Mimic the behavior of VllmConfig.__post_init__() - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() return mock_config @@ -50,23 +50,23 @@ def _create_vllm_config( class TestCudagraphDispatcher: @pytest.mark.parametrize( - "case_id,cudagraph_mode_str,compilation_level", + "case_id,cudagraph_mode_str,compilation_mode", [ # Test case 0: Full CG for mixed batches, no separate routine - (0, "FULL", CompilationLevel.NO_COMPILATION), + (0, "FULL", CompilationMode.NONE), # Test case 1: Full CG for uniform batches, piecewise for mixed - (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION), + (1, "FULL_AND_PIECEWISE", CompilationMode.NONE), # Test case 2: 
Full CG for uniform batches, no CG for mixed - (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION), - # Test case 3: Piecewise for all - (3, "PIECEWISE", CompilationLevel.PIECEWISE), + (2, "FULL_DECODE_ONLY", CompilationMode.NONE), + # Test case 3: PIECEWISE for all + (3, "PIECEWISE", CompilationMode.VLLM_COMPILE), ], ) - def test_dispatcher(self, cudagraph_mode_str, compilation_level): + def test_dispatcher(self, cudagraph_mode_str, compilation_mode): # Setup dispatcher comp_config = CompilationConfig( cudagraph_mode=cudagraph_mode_str, - level=compilation_level, + mode=compilation_mode, cudagraph_capture_sizes=[1, 8], ) @@ -242,7 +242,7 @@ class TestCudagraphIntegration: def setup_method(self): # only FULL mode for non-uniform batches self.comp_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode="FULL", cudagraph_capture_sizes=[10, 20], ) diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 8c8148ae2094..818ae1d7ba67 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -10,7 +10,7 @@ from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, CompilationMode from vllm.platforms import current_platform @@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=3, cudagraph_mode=cudagraph_mode + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) @@ -90,32 +90,27 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ) -# test cudagraph_mode with different compilation level. 
-# (backend_name, cudagraph_mode, compilation_level, supported) +# test cudagraph_mode with different compilation mode. +# (backend_name, cudagraph_mode, compilation_mode, supported) combo_cases_2 = [ - ("FA2", "FULL", 0, True), # no compilation + full cudagraph - ("FA2", "FULL", 3, True), # piecewise compilation + full cudagraph - ("FA2", "PIECEWISE", 0, False), # no compilation + piecewise cudagraph - ("FA2", "PIECEWISE", 3, True), # piecewise compilation + piecewise cudagraph - ( - "FA2", - "FULL_AND_PIECEWISE", - 0, - False, - ), # piecewise cudagraph not supported without piecewise compilation - ("FA2", "FULL_AND_PIECEWISE", 3, True), - ("FA2", "FULL_DECODE_ONLY", 0, True), - ("FA2", "FULL_DECODE_ONLY", 3, True), - ("FA2", "NONE", 0, True), # no compilation + no cudagraph - ("FA2", "NONE", 3, True), # piecewise compilation + no cudagraph + ("FA2", "FULL", CompilationMode.NONE, True), + ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), + ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("FA2", "NONE", CompilationMode.NONE, True), + ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), ] @pytest.mark.parametrize( - "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2 + "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2 ) def test_cudagraph_compilation_combo(combo_case): - backend_name, cudagraph_mode, compilation_level, supported = combo_case + backend_name, cudagraph_mode, compilation_mode, supported = combo_case env_vars = backend_configs[backend_name].env_vars @@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case): gpu_memory_utilization=0.45, max_model_len=1024, 
compilation_config=CompilationConfig( - level=compilation_level, cudagraph_mode=cudagraph_mode + mode=compilation_mode, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 89e5f26ac627..f2c6d1c1fd1a 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -7,7 +7,7 @@ import torch from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationMode from vllm.distributed import cleanup_dist_env_and_memory from ...utils import fork_new_process_for_each_test @@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill( # This allows vLLM compilation backend to handle allocating and # managing buffers for cudagraph cudagraph_copy_inputs=True, - level=CompilationLevel.PIECEWISE + mode=CompilationMode.VLLM_COMPILE if not enforce_eager - else CompilationLevel.NO_COMPILATION, + else CompilationMode.NONE, ) with monkeypatch.context() as m: diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 7fdfdb37a0c0..a3aa54634725 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -28,6 +28,7 @@ "vllm/assets", "vllm/distributed", "vllm/entrypoints", + "vllm/executor", "vllm/inputs", "vllm/logging_utils", "vllm/multimodal", @@ -44,7 +45,6 @@ "vllm/attention", "vllm/compilation", "vllm/engine", - "vllm/executor", "vllm/inputs", "vllm/lora", "vllm/model_executor", diff --git a/vllm/assets/video.py b/vllm/assets/video.py index a4e67ca0b63e..277c8ea1bf0d 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -5,7 +5,6 @@ from functools import lru_cache from typing import Any, ClassVar, Literal -import cv2 import numpy as np import numpy.typing as npt from huggingface_hub import hf_hub_download @@ -43,6 +42,8 @@ def download_video_asset(filename: str) -> str: def 
video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: + import cv2 + cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") @@ -78,6 +79,8 @@ def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Imag def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: + import cv2 + cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8b5b87cba404..16c5799f7d0b 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -587,6 +587,7 @@ def __init__( prefix: str = "", use_sparse: bool = False, indexer: object | None = None, + **extra_impl_args, ): super().__init__() self.num_heads = num_heads @@ -639,6 +640,7 @@ def __init__( v_head_dim=self.v_head_dim, kv_b_proj=kv_b_proj, indexer=indexer, + **extra_impl_args, ) self.use_direct_call = not current_platform.opaque_attention_op() diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index d610389ddb6b..20a15bbc31e3 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -2979,13 +2979,14 @@ def _generate_exact_length_tokens(target_length: int) -> list[int]: requests = [] token_mismatch_total = 0 for _ in range(num_prefixes): - prefix_tokens = _generate_exact_length_tokens(prefix_len) + prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len) + token_mismatch_total += prefix_mismatch for _ in range(prompts_per_prefix): - suffix_tokens, token_mistmatch = _generate_exact_length_tokens( + suffix_tokens, suffix_mismatch = _generate_exact_length_tokens( suffix_len ) - token_mismatch_total += token_mistmatch + token_mismatch_total += suffix_mismatch combined_tokens = prefix_tokens + suffix_tokens prompt = tokenizer.decode(combined_tokens) prompt_len = len(combined_tokens) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 
c52e384a4002..3c85a1e8fdd9 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser): "the ready check will be skipped.", ) + parser.add_argument( + "--extra-body", + help="A JSON string representing extra body parameters to include " + "in each request." + 'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'', + type=json.loads, + default=None, + ) + def main(args: argparse.Namespace) -> dict[str, Any]: return asyncio.run(main_async(args)) @@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: else: sampling_params = {} + extra_body = args.extra_body or {} + extra_body = {**sampling_params, **extra_body} + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_headers=headers, - extra_body=sampling_params, + extra_body=extra_body, ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 46c433fe6aef..91be7e85af51 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -56,7 +56,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: return InductorAdaptor() else: assert compilation_config.backend == "eager", ( - "Custom backends not supported with CompilationLevel.PIECEWISE" + "Custom backends not supported with CompilationMode.VLLM_COMPILE" ) logger.debug("Using EagerAdaptor") @@ -481,7 +481,7 @@ def set_model_tag(tag: str): class VllmBackend: """The compilation backend for `torch.compile` with vLLM. 
- It is used for compilation level of `CompilationLevel.PIECEWISE`, + It is used for compilation mode of `CompilationMode.VLLM_COMPILE`, where we customize the compilation. The major work of this backend is to split the graph into diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 4553007027e3..e2369a635ad1 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -575,7 +575,7 @@ def metrics_context(self) -> contextlib.AbstractContextManager: Because it is re-entrant, we always set it (even if entering via Dynamo and the context was already entered). We might want to revisit if it - should be set at a different level of compilation. + should be set at a different mode of compilation. This is likely a bug in PyTorch: public APIs should not rely on manually setting up internal contexts. But we also rely on non-public diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 9e8de831bcb2..20918099f169 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -27,8 +27,8 @@ class CompilationCounter: num_cache_entries_updated: int = 0 # The number of standalone_compile compiled artifacts saved num_compiled_artifacts_saved: int = 0 - # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS - dynamo_as_is_count: int = 0 + # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE + stock_torch_compile_count: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index fe19d4e85129..20d4681e2c78 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -18,7 +18,7 @@ import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel, VllmConfig, 
set_current_vllm_config +from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import resolve_obj_by_qualname, supports_dynamo @@ -233,11 +233,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config enable_compile = enable_if is None or enable_if(vllm_config) - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner # will handle the compilation, so we don't need to do anything here. self.do_not_compile = ( - vllm_config.compilation_config.level - in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS] + vllm_config.compilation_config.mode + in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE] or not supports_dynamo() or _should_ignore_torch_compile(self.__class__) or not enable_compile @@ -247,7 +247,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs): compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_level=vllm_config.compilation_config.level + self, compilation_mode=vllm_config.compilation_config.mode ) cls.__init__ = __init__ diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index d3c437795fab..1e6d0e79228b 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -3,7 +3,7 @@ import time -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config path = vllm_config.compile_debug_dump_path() - 
if compilation_config.level == CompilationLevel.PIECEWISE and path: + if compilation_config.mode == CompilationMode.VLLM_COMPILE and path: import depyf path.mkdir(parents=True, exist_ok=True) @@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): def end_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info( "torch.compile takes %.2f s in total", compilation_config.compilation_time ) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b4a0d89af0d6..4b10c85209f6 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -11,7 +11,7 @@ import torch import vllm.envs as envs -from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config +from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config from vllm.logger import init_logger logger = init_logger(__name__) @@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher: """ def __init__( - self, compiled_callable: Callable | None = None, compilation_level: int = 0 + self, compiled_callable: Callable | None = None, compilation_mode: int = 0 ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config @@ -72,7 +72,7 @@ def __init__( # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = ( - compilation_level >= CompilationLevel.DYNAMO_ONCE + compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE ) def aot_compile(self, *args, **kwargs): @@ -85,7 +85,7 @@ def aot_compile(self, *args, **kwargs): return self.compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile level. 
+ """Implement the dispatch logic here, beyond the torch.compile mode. NOTE: this function can have additional arguments beyond the forward method, for directly dispatching to the compiled code. """ diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6a0197d044dc..7f1cc5202420 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,7 +4,7 @@ from vllm.config.cache import CacheConfig from vllm.config.compilation import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, PassConfig, ) @@ -49,7 +49,7 @@ "CacheConfig", # From vllm.config.compilation "CompilationConfig", - "CompilationLevel", + "CompilationMode", "CUDAGraphMode", "PassConfig", # From vllm.config.device diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index fb80835ba48a..708ce30939c3 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -26,12 +26,20 @@ logger = init_logger(__name__) -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 +class CompilationMode: + """The compilation approach used for torch.compile-based compilation of the + model.""" + + NONE = 0 + """No torch.compile compilation is applied, model runs in fully eager pytorch mode. 
+ The model runs as-is.""" + STOCK_TORCH_COMPILE = 1 + """The standard `torch.compile` compilation pipeline.""" + DYNAMO_TRACE_ONCE = 2 + """Single Dynamo trace through the model, avoiding recompilation.""" + VLLM_COMPILE = 3 + """Custom vLLM Inductor-based backend with caching, piecewise compilation, + shape specialization, and custom passes.""" class CUDAGraphMode(enum.Enum): @@ -89,17 +97,17 @@ class PassConfig: don't all have access to full configuration - that would create a cycle as the `PassManager` is set as a property of config.""" - enable_fusion: bool = False + enable_fusion: bool | None = None """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" - enable_attn_fusion: bool = False + enable_attn_fusion: bool | None = None """Whether to enable the custom attention+quant fusion pass.""" - enable_noop: bool = False + enable_noop: bool | None = None """Whether to enable the custom no-op elimination pass.""" - enable_sequence_parallelism: bool = False + enable_sequence_parallelism: bool | None = None """Whether to enable sequence parallelism.""" - enable_async_tp: bool = False + enable_async_tp: bool | None = None """Whether to enable async TP.""" - enable_fi_allreduce_fusion: bool = False + enable_fi_allreduce_fusion: bool | None = None """Whether to enable flashinfer allreduce fusion.""" fi_allreduce_fusion_max_token_num: int = 16384 """Max number of tokens to used in flashinfer allreduce fusion.""" @@ -134,7 +142,7 @@ class CompilationConfig: """Configuration for compilation. 
It has three parts: - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] + - [`mode`][vllm.config.CompilationConfig.mode] - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - [`backend`][vllm.config.CompilationConfig.backend] @@ -171,14 +179,26 @@ class CompilationConfig: # Top-level Compilation control level: int | None = None - """The level of compilation: - - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. - - 1: dynamo as is. - - 2: dynamo once. - - 3: piecewise compilation.""" + """ + Level is deprecated and will be removed in the next release, + either 0.12.0 or 0.11.2 whichever is soonest. + Please use mode. Currently all levels are mapped to mode. + """ + # Top-level Compilation control + mode: int | None = None + """The compilation approach used for torch.compile-based compilation of the + model. + + - None: If None, we will select the default compilation mode. + For V1 engine this is 3. + - 0: NONE: No torch.compile compilation is applied, model runs in fully + eager pytorch mode. The model runs as-is. + - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline. + - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding + recompilation by removing guards. + Requires no dynamic-shape-dependent control-flow. + - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, + piecewise compilation, shape specialization, and custom passes.""" debug_dump_path: Path | None = None """The path to dump the debug information.""" cache_dir: str = "" @@ -195,11 +215,11 @@ class CompilationConfig: backend function. We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is + distributed setting. 
When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation + compilation mode is 3, the backend is used for the piecewise compilation (it sees a part of the graph). The backend can not be custom for compilation - level 3, i.e. the backend must be either eager or inductor. Furthermore, + mode 3, i.e. the backend must be either eager or inductor. Furthermore, compilation is only piecewise if splitting ops is set accordingly and use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. @@ -214,7 +234,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -249,7 +269,7 @@ class CompilationConfig: One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - This setting is ignored if level str: the final hidden states. """ factors: list[Any] = [] - factors.append(self.level) + factors.append(self.mode) factors.append(self.backend) factors.append(self.custom_ops) factors.append(self.splitting_ops) @@ -477,6 +497,17 @@ def validate_cudagraph_mode_before(cls, value: Any) -> Any: return value def __post_init__(self) -> None: + if self.level is not None: + logger.warning( + "Level is deprecated and will be removed in the next release," + "either 0.12.0 or 0.11.2 whichever is soonest." + "Use mode instead." 
+ "If both level and mode are given," + "only mode will be used." + ) + if self.mode is None: + self.mode = self.level + count_none = self.custom_ops.count("none") count_all = self.custom_ops.count("all") assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" @@ -574,7 +605,7 @@ def __post_init__(self) -> None: # Currently only eager and inductor backend are supported. # for piecewise compilation. Custom backends are not suppported for # piecewise compilation. Update when more backends are supported. - if self.level == CompilationLevel.PIECEWISE and self.backend not in [ + if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [ "", "eager", "inductor", @@ -602,24 +633,27 @@ def init_backend(self, vllm_config: "VllmConfig") -> str | Callable: Returns: The backend for the compilation config. """ - if self.level is None: + if self.mode is None: raise ValueError( - "No compilation level is set. This method should only be \ + "No compilation mode is set. This method should only be \ called via vllm config where the level is set if none is \ provided." 
) - if self.level == CompilationLevel.NO_COMPILATION: - raise ValueError("No compilation level is set.") + if self.mode == CompilationMode.NONE: + raise ValueError("No compilation mode is set.") from torch._dynamo.backends.registry import list_backends torch_backends = list_backends(exclude_tags=tuple()) - if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: + if self.mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + ]: if self.backend in torch_backends: return self.backend return resolve_obj_by_qualname(self.backend) - assert self.level == CompilationLevel.PIECEWISE + assert self.mode == CompilationMode.VLLM_COMPILE if self.backend not in ["eager", "inductor"]: raise ValueError( f"Invalid backend for piecewise compilation: {self.backend}" @@ -684,11 +718,11 @@ def init_with_cudagraph_sizes(self, cudagraph_capture_sizes: list[int]) -> None: self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size def set_splitting_ops_for_v1(self): - # NOTE: this function needs to be called only when level is - # CompilationLevel.PIECEWISE - assert self.level == CompilationLevel.PIECEWISE, ( + # NOTE: this function needs to be called only when mode is + # CompilationMode.VLLM_COMPILE + assert self.mode == CompilationMode.VLLM_COMPILE, ( "set_splitting_ops_for_v1 should only be called when " - "level is CompilationLevel.PIECEWISE" + "mode is CompilationMode.VLLM_COMPILE" ) if self.use_inductor_graph_partition: @@ -769,12 +803,10 @@ def is_attention_compiled_piecewise(self) -> bool: if not self.use_inductor_graph_partition: # Dynamo-level FX split case - return self.level == CompilationLevel.PIECEWISE + return self.mode == CompilationMode.VLLM_COMPILE # Inductor partition case - return ( - self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION - ) + return self.backend == "inductor" and self.mode > CompilationMode.NONE def custom_op_log_check(self): """ diff --git 
a/vllm/config/model.py b/vllm/config/model.py index 2be939eb654d..3ac865827f0e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -408,7 +408,7 @@ def _apply_dict_overrides( def __post_init__( self, # Multimodal config init vars - limit_mm_per_prompt: dict[str, int] | None, + limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, media_io_kwargs: dict[str, dict[str, Any]] | None, mm_processor_kwargs: dict[str, Any] | None, mm_processor_cache_gb: float | None, @@ -428,7 +428,7 @@ def __post_init__( # doesn't affect the user process. However, without a consistent seed, # different tensor parallel workers would sample different tokens, # leading to inconsistent results. - if envs.VLLM_USE_V1 and self.seed is None: + if self.seed is None: self.seed = 0 if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( @@ -1734,6 +1734,51 @@ def get_and_verify_max_len(self, max_model_len: int): logger.info("Using max model len %s", max_model_len) return max_model_len + def is_model_moe( + self, + ) -> bool: + """ + Parse model configuration to determine if sequence parallel is needed. + + Returns: + True if model is MOE. + False otherwise. + """ + # Get text config (handles multimodal models) + assert hasattr(self, "hf_config") + + text_config = self.hf_config.get_text_config() + + # Check for MoE (Mixture of Experts) indicators + num_expert_names = [ + "num_experts", # Jamba + "moe_num_experts", # Dbrx + "n_routed_experts", # DeepSeek + "num_local_experts", # Mixtral + ] + + num_experts = getattr_iter(text_config, num_expert_names, 0) + + # Handle list case (e.g., Ernie VL) + is_moe_model = False + if isinstance(num_experts, list): + is_moe_model = max(num_experts) > 1 + else: + is_moe_model = num_experts > 1 + + return is_moe_model + + def is_quantized(self) -> bool: + """ + Check if a PretrainedConfig is quantized. + + Returns: + True if model is quantized. + False otherwise. 
+ """ + assert hasattr(self, "hf_config") + return getattr(self.hf_config, "quantization_config", None) is not None + def get_served_model_name(model: str, served_model_name: str | list[str] | None): """ diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index b7ef0fef6833..944a1e8666f4 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -334,7 +334,7 @@ def stateless_init_dp_group(self) -> ProcessGroup: self.get_next_dp_init_port(), self.data_parallel_rank, self.data_parallel_size, - backend="gloo", + backend=current_platform.dist_backend, ) except DistNetworkError as e: # We only want to retry when the root cause is EADDRINUSE. diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index b0ed12894065..17743a7584a8 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -8,6 +8,7 @@ import time from contextlib import contextmanager from dataclasses import replace +from enum import Enum from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar @@ -22,7 +23,7 @@ from vllm.utils import random_uuid from .cache import CacheConfig -from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode +from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode from .device import DeviceConfig from .kv_events import KVEventsConfig from .kv_transfer import KVTransferConfig @@ -48,6 +49,22 @@ logger = init_logger(__name__) +class OptimizationLevel(Enum): + """Optimization level enum.""" + + O0 = 0 + """O0: No optimization. No compilation, no cudagraphs, no other + optimization, just starting up immediately""" + O1 = 1 + """O1: Quick optimizations. Dynamo+Inductor compilation but no + cudagraphs""" + O2 = 2 + """O2: Full optimizations. -O1 as well as cudagraphs.""" + O3 = 3 + """O3: Full (auto)tuning. -O2 as well as max-autotune, compiling for + additional static sizes, etc.
- any other time-consuming optimizations.""" + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: @@ -84,17 +101,11 @@ class VllmConfig: compilation_config: CompilationConfig = Field(default_factory=CompilationConfig) """`torch.compile` and cudagraph capture configuration for the model. - As a shorthand, `-O` can be used to directly specify the compilation - level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`). - Currently, -O and -O= are supported as well but this will likely be - removed in favor of clearer -O syntax in the future. - - NOTE: level 0 is the default level without any optimization. level 1 and 2 - are for internal testing only. level 3 is the recommended level for - production, also default in V1. + As a shorthand, one can append compilation arguments via + -O.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`). You can specify the full compilation config like so: - `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` + `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` """ kv_transfer_config: KVTransferConfig | None = None """The configurations for distributed KV cache transfer.""" @@ -109,6 +120,11 @@ class VllmConfig: you are using. Contents must be hashable.""" instance_id: str = "" """The ID of the vLLM instance.""" + optimization_level: OptimizationLevel | None = OptimizationLevel.O2 + """The optimization level. These levels trade startup time cost for + performance, with -O0 having the best startup time and -O3 having the best + performance. -O2 is used by default.
See OptimizationLevel for full + description.""" def compute_hash(self) -> str: """ @@ -268,6 +284,128 @@ def with_hf_config( return replace(self, model_config=model_config) + def _build_defaults(self): + is_quantized = False + is_sequential = False + if self.model_config is not None: + is_quantized = self.model_config.is_quantized() + is_sequential = not self.model_config.is_model_moe() + optimization_level_00 = { + "general": { + "pass_config": { + "enable_noop": False, + "enable_fusion": False, + "enable_fi_allreduce_fusion": False, + }, + "mode": CompilationMode.NONE, + "cudagraph_mode": CUDAGraphMode.NONE, + "use_inductor_graph_partition": False, + }, + "is_quantized": {"pass_config": {"enable_attn_fusion": False}}, + "is_sequential": { + "pass_config": { + "enable_sequence_parallelism": False, + "enable_async_tp": False, + } + }, + } + optimization_level_01 = { + "general": { + "pass_config": { + "enable_noop": True, + "enable_fusion": True, + "enable_fi_allreduce_fusion": False, + }, + "mode": CompilationMode.VLLM_COMPILE, + "cudagraph_mode": CUDAGraphMode.PIECEWISE, + "use_inductor_graph_partition": False, + }, + "is_quantized": {"pass_config": {"enable_attn_fusion": False}}, + "is_sequential": { + "pass_config": { + "enable_sequence_parallelism": False, + "enable_async_tp": False, + } + }, + } + optimization_level_02 = { + "general": { + "pass_config": { + "enable_noop": True, + "enable_fusion": True, + "enable_fi_allreduce_fusion": True, + }, + "mode": CompilationMode.VLLM_COMPILE, + "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, + "use_inductor_graph_partition": True, + }, + "is_quantized": {"pass_config": {"enable_attn_fusion": is_quantized}}, + "is_sequential": { + "pass_config": { + "enable_sequence_parallelism": is_sequential, + "enable_async_tp": is_sequential, + } + }, + } + optimization_level_03 = { + "general": { + "pass_config": { + "enable_noop": True, + "enable_fusion": True, + "enable_fi_allreduce_fusion": True, + }, + "mode": 
CompilationMode.VLLM_COMPILE, + "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, + "use_inductor_graph_partition": True, + }, + "is_quantized": {"pass_config": {"enable_attn_fusion": is_quantized}}, + "is_sequential": { + "pass_config": { + "enable_sequence_parallelism": is_sequential, + "enable_async_tp": is_sequential, + } + }, + } + optimization_level_to_config = { + OptimizationLevel.O0: optimization_level_00, + OptimizationLevel.O1: optimization_level_01, + OptimizationLevel.O2: optimization_level_02, + OptimizationLevel.O3: optimization_level_03, + } + return optimization_level_to_config[self.optimization_level] + + def _apply_optimization_level_defaults(self, default_config: dict) -> None: + """Apply optimization level-specific default configurations. + + Configures defaults for -O0 through -O3 based on the optimization level. + Only sets values not explicitly configured by the user. + + Optimization Levels: + - O0: No optimization, fast startup, eager execution + - O1: Fast compilation + - O2: Full optimization + - O3: Maximum optimization with autotuning + """ + # Apply optimization level default if not set by user.
+ for k, v in default_config["general"].items(): + if k == "pass_config": + for pass_k, pass_v in default_config["general"]["pass_config"].items(): + if getattr(self.compilation_config.pass_config, pass_k) is None: + setattr(self.compilation_config.pass_config, pass_k, pass_v) + else: + if getattr(self.compilation_config, k) is None: + setattr(self.compilation_config, k, v) + + assert self.optimization_level is not None + + for k, v in default_config["is_quantized"]["pass_config"].items(): + if getattr(self.compilation_config.pass_config, k) is None: + setattr(self.compilation_config.pass_config, k, v) + + for k, v in default_config["is_sequential"]["pass_config"].items(): + if getattr(self.compilation_config.pass_config, k) is None: + setattr(self.compilation_config.pass_config, k, v) + def __post_init__(self): """Verify configs are valid & consistent with each other.""" @@ -305,33 +443,18 @@ def __post_init__(self): "precision for chunked prefill triton kernels." ) - # If the user does not explicitly set a compilation level, then - # we use the default level. The default level depends on other - # settings (see the below code). - if self.compilation_config.level is None: - if envs.VLLM_USE_V1: - if ( - self.model_config is not None - and not self.model_config.enforce_eager - ): - self.compilation_config.level = CompilationLevel.PIECEWISE - else: - self.compilation_config.level = CompilationLevel.NO_COMPILATION - - else: - # NB: Passing both --enforce-eager and a compilation level - # in V0 means the compilation level wins out. 
- self.compilation_config.level = CompilationLevel.NO_COMPILATION - else: - assert self.compilation_config.level >= CompilationLevel.NO_COMPILATION - assert self.compilation_config.level <= CompilationLevel.PIECEWISE + # Apply optimization level-specific defaults + default_config = self._build_defaults() + self._apply_optimization_level_defaults(default_config) + assert self.compilation_config.mode >= CompilationMode.NONE + assert self.compilation_config.mode <= CompilationMode.VLLM_COMPILE # If user does not set custom ops via none or all set it here based on - # compilation level and backend. + # compilation mode and backend. if all(s not in self.compilation_config.custom_ops for s in ("all", "none")): if ( self.compilation_config.backend == "inductor" - and self.compilation_config.level > CompilationLevel.NO_COMPILATION + and self.compilation_config.mode > CompilationMode.NONE ): self.compilation_config.custom_ops.append("none") else: @@ -350,7 +473,7 @@ def __post_init__(self): if self.compilation_config.cudagraph_mode is None: if ( envs.VLLM_USE_V1 - and self.compilation_config.level == CompilationLevel.PIECEWISE + and self.compilation_config.mode == CompilationMode.VLLM_COMPILE ): # default to full and piecewise for most models self.compilation_config.cudagraph_mode = ( @@ -486,10 +609,10 @@ def __post_init__(self): ) current_platform.check_and_update_config(self) - # Do this after all the updates to compilation_config.level + # Do this after all the updates to compilation_config.mode if ( envs.VLLM_USE_V1 - and self.compilation_config.level == CompilationLevel.PIECEWISE + and self.compilation_config.mode == CompilationMode.VLLM_COMPILE ): self.compilation_config.set_splitting_ops_for_v1() @@ -508,8 +631,8 @@ def __post_init__(self): ) if self.compilation_config.cudagraph_mode.requires_piecewise_compilation(): - assert self.compilation_config.level == CompilationLevel.PIECEWISE, ( - "Compilation level should be CompilationLevel.PIECEWISE " + assert 
self.compilation_config.mode == CompilationMode.VLLM_COMPILE, ( + "Compilation mode should be CompilationMode.VLLM_COMPILE " "when cudagraph_mode piecewise cudagraphs is used, " f"cudagraph_mode={self.compilation_config.cudagraph_mode}" ) @@ -837,7 +960,7 @@ def set_current_vllm_config( if ( check_compile - and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE + and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE and compilation_counter.num_models_seen == num_models_seen ): # If the model supports compilation, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 490f209373db..6a2434ddce8b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -241,7 +241,8 @@ def get_block_ids_with_load_errors(self) -> set[int]: return self.connector_worker.get_block_ids_with_load_errors() def get_kv_connector_stats(self) -> KVConnectorStats | None: - assert self.connector_worker is not None + if self.connector_worker is None: + return None return self.connector_worker.get_kv_connector_stats() @classmethod diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 0a1e04ec10f9..a3d9dbe83a12 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -415,7 +415,6 @@ def create( def init_gloo_process_group( - backend: Backend, prefix_store: PrefixStore, group_rank: int, group_size: int, @@ -432,7 +431,7 @@ def init_gloo_process_group( group_size, ) else: - options = ProcessGroup.Options(backend=backend) + options = ProcessGroup.Options(backend="gloo") pg = ProcessGroup( prefix_store, group_rank, @@ -504,24 +503,25 @@ def stateless_init_torch_distributed_process_group( # Use a PrefixStore to avoid accidental overrides of keys used by # different systems (e.g. RPC) in case the store is multi-tenant. 
prefix_store = PrefixStore(init_method, store) + try: + from vllm.platforms import current_platform - if backend == "gloo": - return init_gloo_process_group( + return current_platform.stateless_init_device_torch_dist_pg( backend=backend, prefix_store=prefix_store, group_rank=group_rank, group_size=group_size, timeout=timeout, ) - from vllm.platforms import current_platform - - return current_platform.stateless_init_device_torch_dist_pg( - backend=backend, - prefix_store=prefix_store, - group_rank=group_rank, - group_size=group_size, - timeout=timeout, - ) + except NotImplementedError: + # If platform doesn't implement stateless_init_device_torch_dist_pg, it + # will raise a NotImplementedError. In this case, we fall back to gloo. + return init_gloo_process_group( + prefix_store=prefix_store, + group_rank=group_rank, + group_size=group_size, + timeout=timeout, + ) def stateless_destroy_torch_distributed_process_group(pg: ProcessGroup) -> None: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 668344fdcc34..e2db9d049a75 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -176,7 +176,7 @@ class LLM: argument is deprecated and will be removed in v0.12.0 or v1.0.0, whichever is sooner. compilation_config: Either an integer or a dictionary. If it is an - integer, it is used as the level of compilation optimization. If it + integer, it is used as the mode of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs]. 
@@ -257,9 +257,7 @@ def __init__( if compilation_config is not None: if isinstance(compilation_config, int): - compilation_config_instance = CompilationConfig( - level=compilation_config - ) + compilation_config_instance = CompilationConfig(mode=compilation_config) elif isinstance(compilation_config, dict): compilation_config_instance = CompilationConfig( **{ @@ -953,7 +951,7 @@ def encode( truncate_prompt_tokens: int | None = None, use_tqdm: bool | Callable[..., tqdm] = True, lora_request: list[LoRARequest] | LoRARequest | None = None, - pooling_task: PoolingTask = "encode", + pooling_task: PoolingTask | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> list[PoolingRequestOutput]: """Apply pooling to the hidden states corresponding to the input @@ -988,25 +986,24 @@ def encode( instead pass them via the `inputs` parameter. """ - if self.supported_tasks == ["encode"] and pooling_task is None: - pooling_task = "encode" + error_str = ( + "pooling_task required for `LLM.encode`\n" + "Please use one of the more specific methods or set the " + "pooling_task when using `LLM.encode`:\n" + " - For embeddings, use `LLM.embed(...)` " + 'or `pooling_task="embed"`.\n' + " - For classification logits, use `LLM.classify(...)` " + 'or `pooling_task="classify"`.\n' + " - For similarity scores, use `LLM.score(...)`.\n" + " - For rewards, use `LLM.reward(...)` " + 'or `pooling_task="token_classify"`\n' + " - For token classification, " + 'use `pooling_task="token_classify"`\n' + ' - For multi-vector retrieval, use `pooling_task="token_embed"`' + ) if pooling_task is None: - pooling_task = "embed" if "embed" in self.supported_tasks else "encode" - - logger.warning_once( - "`LLM.encode` is currently using `pooling_task = %s`.\n" - "Please use one of the more specific methods or set the " - "task directly when using `LLM.encode`:\n" - " - For embeddings, use `LLM.embed(...)` " - 'or `pooling_task="embed"`.\n' - " - For classification logits, use `LLM.classify(...)` " 
- 'or `pooling_task="classify"`.\n' - " - For rewards, use `LLM.reward(...)` " - 'or `pooling_task="reward"`\n' - " - For similarity scores, use `LLM.score(...)`.", - pooling_task, - ) + raise ValueError(error_str) model_config = self.model_config runner_type = model_config.runner_type @@ -1208,7 +1205,7 @@ def reward( lora_request=lora_request, pooling_params=pooling_params, truncate_prompt_tokens=truncate_prompt_tokens, - pooling_task="encode", + pooling_task="token_classify", ) def _embedding_score( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fd80ba7a9afc..0ac035595690 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1748,16 +1748,19 @@ async def init_app_state( else None ) state.openai_serving_pooling = ( - OpenAIServingPooling( - engine_client, - state.openai_serving_models, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, - trust_request_chat_template=args.trust_request_chat_template, - log_error_stack=args.log_error_stack, + ( + OpenAIServingPooling( + engine_client, + state.openai_serving_models, + supported_tasks=supported_tasks, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) ) - if "encode" in supported_tasks + if ("token_embed" in supported_tasks or "token_classify" in supported_tasks) else None ) state.openai_serving_embedding = ( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 86e1e62ff437..5b8a118280da 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1682,7 +1682,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): When using plugins IOProcessor plugins, the actual 
input is processed by the plugin itself. Hence, we use a generic type for the request data """ - softmax: bool = True + activation: bool = False embed_dtype: str = Field( default="float32", @@ -1693,7 +1693,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): ) def to_pooling_params(self): - return PoolingParams(task="encode", softmax=self.softmax) + return PoolingParams(task="token_classify", activation=self.activation) class IOProcessorResponse(OpenAIBaseModel, Generic[T]): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 26027112eb58..5dc7f7859226 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -570,7 +570,10 @@ async def chat_completion_stream_generator( try: if self.reasoning_parser: - reasoning_parser = self.reasoning_parser(tokenizer) + reasoning_parser = self.reasoning_parser( + tokenizer, + chat_template_kwargs=request.chat_template_kwargs, # type: ignore + ) except RuntimeError as e: logger.exception("Error in reasoning parser creation.") data = self.create_streaming_error_response(str(e)) @@ -1335,7 +1338,10 @@ async def chat_completion_full_generator( if self.reasoning_parser: try: - reasoning_parser = self.reasoning_parser(tokenizer) + reasoning_parser = self.reasoning_parser( + tokenizer, + chat_template_kwargs=request.chat_template_kwargs, # type: ignore + ) except RuntimeError as e: logger.exception("Error in reasoning parser creation.") return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 3ed17abe0946..aa81a233b297 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -35,6 +35,7 @@ from vllm.entrypoints.utils import _validate_truncation_size from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput +from vllm.tasks import SupportedTask from vllm.utils 
import merge_async_iterators logger = init_logger(__name__) @@ -62,6 +63,7 @@ def __init__( engine_client: EngineClient, models: OpenAIServingModels, *, + supported_tasks: tuple[SupportedTask, ...], request_logger: RequestLogger | None, chat_template: str | None, chat_template_content_format: ChatTemplateContentFormatOption, @@ -75,6 +77,7 @@ def __init__( log_error_stack=log_error_stack, ) + self.supported_tasks = supported_tasks self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format self.trust_request_chat_template = trust_request_chat_template @@ -178,8 +181,17 @@ async def create_pooling( try: pooling_params = request.to_pooling_params() + if "token_embed" in self.supported_tasks: + pooling_task = "token_embed" + elif "token_classify" in self.supported_tasks: + pooling_task = "token_classify" + else: + return self.create_error_response( + f"pooling_task must be one of {self.supported_tasks}." + ) + try: - pooling_params.verify("encode", self.model_config) + pooling_params.verify(pooling_task, self.model_config) except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py index 2c5b0b6a85f7..9964d1ac25c4 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json -import uuid from collections.abc import Sequence from typing import Any from xml.parsers.expat import ParserCreate import regex as re +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, @@ -375,14 +375,21 @@ def _find_next_complete_element(self, start_pos: int) -> tuple[str | None, int]: return buffer[: tag_end2 + 1], 
start_pos + tag_end2 + 1 else: # If currently not parsing tool calls (entering a tool_call), - # check if starts with + # check if starts with or if buffer == ""[: len(buffer)]: # Might be start of , wait for more data return None, start_pos + elif ( + buffer.startswith(" str | None: """Extract function name from various formats""" if attrs and "name" in attrs: @@ -1168,6 +1171,10 @@ def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) self.parser = StreamingXMLToolCallParser() + # Add missing attributes for compatibility with serving_chat.py + self.prev_tool_call_arr: list[dict] = [] + self.streamed_args_for_tool: list[str] = [] + logger.info( "vLLM Successfully import tool parser %s !", self.__class__.__name__ ) @@ -1178,6 +1185,9 @@ def extract_tool_calls( request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new extraction + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] if request: self.parser.set_tools(request.tools) result = self.parser.parse_single_streaming_chunks(model_output) @@ -1201,6 +1211,34 @@ def extract_tool_calls( ), ) ) + + # Update tool call tracking arrays for compatibility + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool call information + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + self.prev_tool_call_arr[tool_index]["arguments"] = ( + tool_call.function.arguments + ) + + # Update streamed arguments + if tool_call.function.arguments: + self.streamed_args_for_tool[tool_index] = ( + tool_call.function.arguments + ) + return 
ExtractedToolCallInformation( tool_calls=tool_calls, tools_called=len(tool_calls) > 0, @@ -1219,6 +1257,9 @@ def extract_tool_calls_streaming( ) -> DeltaMessage | None: if not previous_text: self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new streaming session + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] if request: self.parser.set_tools(request.tools) @@ -1230,20 +1271,48 @@ def extract_tool_calls_streaming( open_calls = current_text.count( self.parser.tool_call_start_token ) - current_text.count(self.parser.tool_call_end_token) - if open_calls == 0 and self.parser.tool_call_index > 0: - # If current_call_id is None, use last_completed_call_id - call_id = ( - self.parser.current_call_id or self.parser.last_completed_call_id - ) - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.parser.tool_call_index - 1, - id=call_id, - function=DeltaFunctionCall(arguments=""), - type="function", + if ( + open_calls == 0 + and self.parser.tool_call_index > 0 + or not self.parser.tool_call_index + and current_text + ): + return DeltaMessage(content="") + return None + + # Parse the delta text and get the result + result = self.parser.parse_single_streaming_chunks(delta_text) + + # Update tool call tracking arrays based on incremental parsing results + if result and result.tool_calls: + for tool_call in result.tool_calls: + if tool_call.function: + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool name if provided + if tool_call.function.name: + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name ) - ] - ) - return 
self.parser.parse_single_streaming_chunks(delta_text) + # Update arguments incrementally + if tool_call.function.arguments is not None: + # Concatenate the incremental arguments + # to the existing streamed arguments + self.prev_tool_call_arr[tool_index]["arguments"] += ( + tool_call.function.arguments + ) + self.streamed_args_for_tool[tool_index] += ( + tool_call.function.arguments + ) + return result diff --git a/vllm/env_override.py b/vllm/env_override.py index 7f9054e73846..eb51dee1cf03 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -3,6 +3,7 @@ import os import torch +from packaging import version from vllm.logger import init_logger @@ -21,3 +22,120 @@ os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" # see https://github.com/vllm-project/vllm/issues/10619 torch._inductor.config.compile_threads = 1 + + +# ======================================== +# torch 2.9 Inductor Scheduler monkeypatch +# ======================================== +# This change monkeypatches a function in Inductor to work around the following +# bug: https://github.com/vllm-project/vllm/issues/26678 +# +# The bug occurs when `use_inductor_graph_partition` is turned on and there +# exists operators inside of `splitting_ops` that have an in-place mutation. In +# vllm, this specifically occurs on the operator +# vllm.unified_attention_with_output. In this case, inductor does not populate +# the inductor IR's `origin_node` field, causing an assertion error when trying +# to access the node's `origin_node` field. +# +# So, we will monkeypatch torch._inductor.scheduler.Scheduler.should_partition +# so that it does not access the inductor IR node's `origin_node` field and just +# returns True if a node is registered as having a custom partition function. +# This is ok for now since vllm's implementation of the custom partition +# functions just return True. 
+# ======================================== + + +def should_partition_patched(self, node, should_log: bool = False) -> bool: + # This is a patched version of + # torch._inductor.scheduler.Scheduler.should_partition that modifies + # the following piece of code so that we always return True: + # https://github.com/pytorch/pytorch/blob/ecb53078faf86ca1b33277df33b82985675bb011/torch/_inductor/scheduler.py#L4712-L4724 + """Return True if we should partition the inductor graph on this node""" + + import torch._inductor.ir as ir + from torch._inductor.scheduler import ( + BaseSchedulerNode, + FusedSchedulerNode, + _custom_should_partition_fns, + ) + from torch._inductor.utils import ( + _unstable_customized_partition_wrapper, + is_cudagraph_unsafe_op, + maybe_log_cudagraph_partition, + ) + + # Allow users to manually specify if a node should be partitioned + # Can only do this for FallbackKernels + ir_node = node.node + if isinstance(ir_node, ir.FallbackKernel): + operator = ir_node.op_overload + if operator is not None and operator in _custom_should_partition_fns: + return True + + # When not using cudagraphs, keep all kernels in the `call` function + # instead of graph partition functions, since graph partition only brings + # benefit to cudagraph + if ( + not torch._inductor.config.triton.cudagraphs + and _unstable_customized_partition_wrapper.wrapper is None + ): + return True + + # avoid duplicating logs when should_partition is called multiple times + # on the same node + def noop_log(msg: str, node: BaseSchedulerNode | None) -> None: + return + + log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log + + if isinstance(node, FusedSchedulerNode): + return any(self.should_partition(snode) for snode in node.snodes) + + assert node.node is not None + + if not node.is_gpu(): + log_partition_reason("non gpu ops", node=node) + + return True + + if isinstance(node.node, ir.DeviceCopy): + log_partition_reason("DeviceCopy ops", node=node) + return 
True + + if isinstance(node.node, ir.Conditional): + log_partition_reason("Conditional ops", node=node) + return True + + if getattr(node.node, "unbacked_bindings", None): + log_partition_reason("unbacked binding ops", node=node) + return True + + if is_cudagraph_unsafe_op(node.node): + log_partition_reason("CUDAGraph-unsafe custom ops", node=node) + return True + + return False + + +def _update_scheduler_patched(self) -> None: + # Copied from torch._inductor.graph.GrahLowering._update_scheduler. Patches + # this method so that we can patch Scheduler.should_partition with the + # function above + """ + (Re)initializes the scheduler member. When initializing the scheduler, no CUBIN + files should be generated (to avoid biasing any benchmarks and pessimizing + fusion decisions). + """ + import torch._inductor.config as config + from torch._inductor.scheduler import Scheduler + + Scheduler.should_partition = should_partition_patched + + with config.patch("triton.store_cubin", False): + self.scheduler = Scheduler(self.operations) + + +if version.parse(str(torch.__version__)) == version.parse("2.9.0"): + from torch._inductor.graph import GraphLowering + + GraphLowering._update_scheduler = _update_scheduler_patched diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 2c44422ba217..a5f83f904002 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -18,7 +18,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.tasks import SupportedTask from vllm.utils import make_async -from vllm.v1.outputs import PoolerOutput, SamplerOutput +from vllm.v1.outputs import SamplerOutput from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -54,7 +54,7 @@ def __init__( self._init_executor() self.is_sleeping = False self.sleeping_tags: set[str] = set() - self.kv_output_aggregator = None + self.kv_output_aggregator: KVOutputAggregator | None = None @abstractmethod def _init_executor(self) -> None: 
@@ -143,8 +143,9 @@ def supported_tasks(self) -> tuple[SupportedTask, ...]: def execute_model( self, execute_model_req: ExecuteModelRequest - ) -> list[SamplerOutput | PoolerOutput] | None: + ) -> list[SamplerOutput]: output = self.collective_rpc("execute_model", args=(execute_model_req,)) + assert output[0] is not None return output[0] def stop_remote_worker_execution_loop(self) -> None: diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 943c6a27f1e8..59e282ac92b6 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -217,7 +217,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwar num_gpus=num_gpus, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, rpc_rank=rank) + )(RayWorkerWrapper).remote( # type: ignore[attr-defined] + vllm_config=self.vllm_config, rpc_rank=rank + ) else: worker = ray.remote( num_cpus=0, @@ -225,7 +227,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwar resources={current_platform.ray_device_key: num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, rpc_rank=rank) + )(RayWorkerWrapper).remote( # type: ignore[attr-defined] + vllm_config=self.vllm_config, rpc_rank=rank + ) worker_metadata.append(RayWorkerMetaData(worker=worker, created_rank=rank)) worker_ips = ray.get( @@ -303,7 +307,7 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): continue worker_node_and_gpu_ids.append( ray.get(worker.get_node_and_gpu_ids.remote()) - ) # type: ignore + ) # type: ignore[attr-defined] node_workers = defaultdict(list) # node id -> list of worker ranks node_gpus = defaultdict(list) # node id -> list of gpu ids @@ -495,7 +499,9 @@ def _run_workers( if async_run_tensor_parallel_workers_only: ray_workers = self.non_driver_workers 
ray_worker_outputs = [ - worker.execute_method.remote(sent_method, *args, **kwargs) + worker.execute_method.remote( # type: ignore[attr-defined] + sent_method, *args, **kwargs + ) for worker in ray_workers ] @@ -715,7 +721,7 @@ async def _driver_execute_model_async( tasks.append( asyncio.create_task( _run_task_with_lock( - driver_worker.execute_method.remote, + driver_worker.execute_method.remote, # type: ignore[attr-defined] self.pp_locks[pp_rank], "execute_model", execute_model_req, @@ -733,7 +739,7 @@ async def _start_worker_execution_loop(self): "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1" ) coros = [ - worker.execute_method.remote("start_worker_execution_loop") + worker.execute_method.remote("start_worker_execution_loop") # type: ignore[attr-defined] for worker in self.non_driver_workers ] return await asyncio.gather(*coros) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index d12151bb9485..ef5a99659f30 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -90,14 +90,17 @@ def execute_model_spmd( execute_model_req = self.input_decoder.decode(serialized_req) + assert self.worker is not None, "Worker is not initialized" + # TODO(swang): This is needed right now because Ray Compiled Graph # executes on a background thread, so we need to reset torch's # current device. if not self.compiled_dag_cuda_device_set: + assert self.worker.device is not None current_platform.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True - output = self.worker._execute_model_spmd( + output = self.worker._execute_model_spmd( # type: ignore[attr-defined] execute_model_req, intermediate_tensors ) # Pipeline model request and output to the next pipeline stage. 
@@ -119,6 +122,7 @@ def setup_device_if_necessary(self): # Not needed pass else: + assert self.worker.device is not None current_platform.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True @@ -139,6 +143,7 @@ def execute_model_ray( scheduler_output, intermediate_tensors = scheduler_output else: scheduler_output, intermediate_tensors = scheduler_output, None + assert self.worker.model_runner is not None output = self.worker.model_runner.execute_model( scheduler_output, intermediate_tensors ) diff --git a/vllm/lora/ops/triton_ops/README_TUNING.md b/vllm/lora/ops/triton_ops/README_TUNING.md new file mode 100644 index 000000000000..fda95ea71891 --- /dev/null +++ b/vllm/lora/ops/triton_ops/README_TUNING.md @@ -0,0 +1,51 @@ +# Multi-LoRA Tuning + +**Note**: The LoRA configuration folder should be specified by exporting `VLLM_TUNED_CONFIG_FOLDER=/path/to/configs`. Without this, the shrink/expand kernels will use default configurations. + +## Tuning Process + +Multi-lora shrink/expand Triton kernel tuning follows a similar methodology from [Triton MoE tuning](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py). + +**Step 1** +Define the searching space. An example searching space: + +```python +block_m_range = [16, 32, 64, 128, 256] +block_n_range = [32, 64, 128, 256] +block_k_range = [32, 64, 128, 256] +num_warps_range = [4, 8] +num_stage_range = [2, 3, 4, 5] +num_ctas_range = [1] +split_k_range = [4, 8, 16, 32, 64] +``` + +**Step 2** +Get all hidden_state sizes and num_slices that the target model uses for a specific TP size. 
+ +For example, we can acquire this information by simply checking [add_lora_linear](https://github.com/li2haipeng/vllm/blob/multi_lora_v01011/vllm/lora/punica_wrapper/punica_gpu.py#L192): + +```python +print(f"x_shape: {x.view(-1, x.shape[-1]).shape}") +print(f"num_slices: {len(output_slices)}") +for i in range(len(output_slices)): + print(f"a{i} shape: {lora_a_stacked[i].shape}") + print(f"b{i} shape: {lora_b_stacked[i].shape}") +print("y_shape", y.shape) +``` + +**Step 3** +Benchmark the shrink/expand kernel runtime with different kernel configurations generated from the pre-defined search space by performing a grid search to find the optimal kernel configuration. vLLM's [benchmark_lora.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_lora.py) can be used to search for configurations for different shapes. + +## Config Files + +### File Name + +For `shrink`, the config file is named as `{gpu_name}_SHRINK.json`, e.g. `NVIDIA_H200_SHRINK.json`. + +For `expand`, the config file is named as `{gpu_name}_EXPAND_{add_input}.json`, e.g. `NVIDIA_H200_EXPAND_TRUE.json`. 
+ +The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()` + +### Json Structure + +Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n]` diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index a7a552b9903d..c8330455985a 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -10,7 +10,7 @@ import torch from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel -from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr +from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op @@ -201,12 +201,21 @@ def _lora_expand( NUM_SLICES = len(lora_b_weights) # Triton kernel configs. - BLOCK_M = 64 - BLOCK_N = 128 - BLOCK_K = 16 - NUM_WARPS = 4 - NUM_CTAS = 1 - NUM_STAGES = 2 + kernel_config = get_lora_op_configs( + op_type="expand", + max_loras=MAX_LORAS, + batch=M, + hidden_size=MAX_N, + rank=K, + num_slices=NUM_SLICES, + add_inputs=add_inputs, + ) + BLOCK_M = kernel_config["block_m"] + BLOCK_N = kernel_config["block_n"] + BLOCK_K = kernel_config["block_k"] + NUM_WARPS = kernel_config["num_warps"] + NUM_CTAS = kernel_config["num_ctas"] + NUM_STAGES = kernel_config["num_stages"] EVEN_K = K % BLOCK_K == 0 # type: ignore diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py index 1e7e43e30de7..9cba8f494448 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink_op.py +++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py @@ -10,7 +10,7 @@ import torch from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel -from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr +from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs from vllm.triton_utils import tl, triton from vllm.utils import 
direct_register_custom_op @@ -177,14 +177,21 @@ def _lora_shrink( MAX_LORAS = lora_ids.size(0) # Triton kernel configs - BLOCK_M = 32 - BLOCK_N = 16 - BLOCK_K = 256 if M < 128 else 32 - SPLIT_K = 64 if M < 128 else 8 - NUM_WARPS = 4 - NUM_CTAS = 1 - NUM_STAGES = 2 - + kernel_config = get_lora_op_configs( + "shrink", + max_loras=MAX_LORAS, + batch=M, + hidden_size=K, + rank=N, + num_slices=NUM_SLICES, + ) + BLOCK_M = kernel_config["block_m"] + BLOCK_N = kernel_config["block_n"] + BLOCK_K = kernel_config["block_k"] + SPLIT_K = kernel_config["split_k"] + NUM_WARPS = kernel_config["num_warps"] + NUM_STAGES = kernel_config["num_stages"] + NUM_CTAS = kernel_config["num_ctas"] EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 # type: ignore # TODO (varun): This grid formulation maximizes parallelization at the diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 3a3e8fc8931e..9ffb6dc3d85e 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,8 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +import json +from pathlib import Path +from typing import Any + import torch +from vllm import envs +from vllm.logger import init_logger + +logger = init_logger(__name__) + _LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} _LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} @@ -133,3 +143,108 @@ def _get_lora_b_ptr( MAX_N, ) return _LORA_B_PTR_DICT.get(key) + + +@functools.lru_cache +def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None: + user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER + if user_defined_config_folder is not None: + gpu_name = torch.cuda.get_device_name() + gpu_name = gpu_name.replace(" ", "_") + gpu_name = gpu_name.replace("-", "_") + + config_fname = None + if op_type == "shrink": + config_fname = f"{gpu_name}_{op_type.upper()}.json" + else: + 
assert op_type == "expand" + config_fname = ( + f"{gpu_name}_{op_type.upper()}_{str(add_inputs).upper()}.json" + ) + + config_path = Path(f"{user_defined_config_folder}/{config_fname}") + if not config_path.exists(): + logger.warning_once(f"No LoRA kernel configs founded in {config_path}") + return None + + # Load json + logger.info_once(f"Using tuned LoRA kernel configs from {config_path}.") + with open(str(config_path)) as f: + config_data = json.load(f) + else: + config_data = None + + return config_data + + +@functools.lru_cache +def get_lora_op_configs( + op_type: str, + max_loras: int, + batch: int, + hidden_size: int, + rank: int, + num_slices: int, + add_inputs: bool | None = None, +) -> dict[str, int | None]: + assert op_type in ["shrink", "expand"] + + # default config + default = {} + if op_type == "shrink": + default = { + "block_m": 32, + "block_n": 16, + "block_k": 256 if batch < 128 else 32, + "split_k": 64 if batch < 128 else 8, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": None, + } + else: + default = { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": None, + } + m = batch + + k, n = (hidden_size, rank) if op_type == "shrink" else (rank, hidden_size) + + config_data: Any + config_data = load_lora_op_config(op_type, add_inputs) + if not config_data: + logger.warning_once("Using default LoRA kernel configs") + return default + + # config is structured as config_data[max_loras][num_slices][m][k][n] = {} + # slice by max_loras + config_data = ( + config_data.get(str(max_loras)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - max_loras))] + ) + # slice by num_slices + config_data = config_data[str(num_slices)] + # slice by m + config_data = ( + config_data.get(str(m)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - m))] + ) + # slice by k + config_data = ( + config_data.get(str(k)) + or config_data[min(config_data.keys(), 
key=lambda x: abs(int(x) - k))] + ) + # slice by n + config_data = ( + config_data.get(str(n)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - n))] + ) + + assert config_data is not None + return config_data diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 010c607bcabf..84e176f0ea89 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -64,66 +64,6 @@ def apply(self, params: PoolingParams) -> None: params.requires_token_ids = self.requires_token_ids -class Pooler(nn.Module, ABC): - """The interface required for all poolers used in pooling models in vLLM.""" - - @staticmethod - def for_encode(pooler_config: PoolerConfig): - if pooler_config.pooling_type == "STEP": - return StepPooler() - - resolved_config = ResolvedPoolingConfig( - task="encode", pooling_type=PoolingType.ALL - ) - - return SimplePooler.from_config(resolved_config) - - @staticmethod - def for_embed(pooler_config: PoolerConfig): - resolved_config = ResolvedPoolingConfig.from_config( - task="embed", - pooler_config=pooler_config, - ) - - return SimplePooler.from_config(resolved_config) - - @staticmethod - def for_classify( - pooler_config: PoolerConfig, - classifier: ClassifierFn | None, - ): - resolved_config = ResolvedPoolingConfig.from_config( - task="classify", - pooler_config=pooler_config, - ) - - pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) - - return ClassifierPooler( - pooling=pooling, - classifier=classifier, - ) - - @abstractmethod - def get_supported_tasks(self) -> Set[PoolingTask]: - """Determine which pooling tasks are supported.""" - raise NotImplementedError - - def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: - """ - Construct the updated pooling parameters to use for a supported task. 
- """ - return PoolingParamsUpdate() - - @abstractmethod - def forward( - self, - hidden_states: list[torch.Tensor] | torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> PoolerOutput: - raise NotImplementedError - - def get_prompt_lens( hidden_states: torch.Tensor | list[torch.Tensor], pooling_metadata: PoolingMetadata, @@ -237,7 +177,7 @@ def forward( class CLSPool(PoolingMethod): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"encode", "embed", "classify", "score"} + return {"token_embed", "token_classify", "embed", "classify", "score"} def forward_all( self, @@ -253,7 +193,7 @@ def forward_all( class LastPool(PoolingMethod): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"encode", "embed", "classify", "score"} + return {"token_embed", "token_classify", "embed", "classify", "score"} def forward_all( self, @@ -265,7 +205,7 @@ def forward_all( class AllPool(PoolingMethod): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"encode"} + return {"token_embed", "token_classify"} def forward_all( self, @@ -284,7 +224,7 @@ def forward_all( class MeanPool(PoolingMethod): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"encode", "embed", "classify", "score"} + return {"token_embed", "token_classify", "embed", "classify", "score"} def forward_all( self, @@ -398,6 +338,82 @@ def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: return self.fn(pooled_data) +class Pooler(nn.Module, ABC): + """The interface required for all poolers used in pooling models in vLLM.""" + + @staticmethod + def for_token_embed(pooler_config: PoolerConfig): + head = TokenEmbeddingPoolerHead() + + if pooler_config.pooling_type == "STEP": + return StepPooler(head=head) + + return AllPooler(head=head) + + @staticmethod + def for_token_classify( + pooler_config: PoolerConfig, + classifier: ClassifierFn | None = None, + act_fn: PoolerActivation | str | None = None, + ): + head = TokenClassifierPoolerHead(classifier=classifier, 
act_fn=act_fn) + + if pooler_config.pooling_type == "STEP": + return StepPooler(head=head) + + return AllPooler(head=head) + + @staticmethod + def for_embed(pooler_config: PoolerConfig): + resolved_config = ResolvedPoolingConfig.from_config( + task="embed", + pooler_config=pooler_config, + ) + + pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) + head = EmbeddingPoolerHead() + + return SimplePooler(pooling=pooling, head=head) + + @staticmethod + def for_classify( + pooler_config: PoolerConfig, + classifier: ClassifierFn | None, + act_fn: PoolerActivation | str | None = None, + ): + resolved_config = ResolvedPoolingConfig.from_config( + task="classify", + pooler_config=pooler_config, + ) + + pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) + + return ClassifierPooler( + pooling=pooling, + classifier=classifier, + act_fn=act_fn, + ) + + @abstractmethod + def get_supported_tasks(self) -> Set[PoolingTask]: + """Determine which pooling tasks are supported.""" + raise NotImplementedError + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + """ + Construct the updated pooling parameters to use for a supported task. 
+ """ + return PoolingParamsUpdate() + + @abstractmethod + def forward( + self, + hidden_states: list[torch.Tensor] | torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + raise NotImplementedError + + class PoolerHead(nn.Module): def __init__(self, activation: PoolerActivation) -> None: super().__init__() @@ -416,7 +432,6 @@ def __init__(self) -> None: super().__init__(activation=PoolerNormalize()) # Load ST projector if available - vllm_config = get_current_vllm_config() self.projector: nn.Module | None = ( _load_st_projector(vllm_config.model_config) if vllm_config else None @@ -471,39 +486,6 @@ def forward( return pooled_data -class RewardPoolerHead(PoolerHead): - def __init__(self) -> None: - super().__init__(activation=PoolerClassify(static_num_labels=False)) - - vllm_config = get_current_vllm_config() - self.head_dtype = vllm_config.model_config.head_dtype - - def forward( - self, - pooled_data: list[torch.Tensor] | torch.Tensor, - pooling_metadata: PoolingMetadata, - ): - if isinstance(pooled_data, list): - pooled_data = [p.to(self.head_dtype) for p in pooled_data] - else: - pooled_data = pooled_data.to(self.head_dtype) - - pooling_params = get_pooling_params(pooling_metadata) - - # for softmax - flags = [p.softmax for p in pooling_params] - if len(set(flags)) == 1: - if flags[0]: - pooled_data = self.activation(pooled_data) - else: - pooled_data = [ - self.activation(vecs) if f else vecs - for vecs, f in zip(pooled_data, flags) - ] - - return pooled_data - - class SimplePooler(Pooler): """A layer that pools specific information from hidden states. @@ -513,20 +495,6 @@ class SimplePooler(Pooler): 3. Returns structured results as `PoolerOutput`. 
""" - @classmethod - def from_config( - cls, - pooler_config: ResolvedPoolingConfig, - ) -> "SimplePooler": - pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type) - if pooler_config.task == "embed": - head = EmbeddingPoolerHead() - elif pooler_config.task == "encode": - head = RewardPoolerHead() - else: - raise NotImplementedError(f"Unknown task: {pooler_config.task}") - return cls(pooling, head) - def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None: super().__init__() @@ -549,58 +517,6 @@ def forward( return pooled_data -class StepPooler(Pooler): - def __init__( - self, - ) -> None: - super().__init__() - - self.pooling = AllPool() - self.head = RewardPoolerHead() - - def extract_states( - self, - hidden_states: torch.Tensor | list[torch.Tensor], - pooling_metadata: PoolingMetadata, - ) -> list[torch.Tensor] | torch.Tensor: - pooled_data_lst = self.pooling(hidden_states, pooling_metadata) - prompt_token_ids = get_prompt_token_ids(pooling_metadata) - - pooled_data = list[torch.Tensor]() - - pooling_params = get_pooling_params(pooling_metadata) - - for data, token_id, pooling_param in zip( - pooled_data_lst, prompt_token_ids, pooling_params - ): - step_tag_id = pooling_param.step_tag_id - returned_token_ids = pooling_param.returned_token_ids - - if returned_token_ids is not None and len(returned_token_ids) > 0: - data = data[:, returned_token_ids] - - if step_tag_id is not None: - data = data[token_id == step_tag_id] - pooled_data.append(data) - - return pooled_data - - def get_supported_tasks(self) -> Set[PoolingTask]: - return {"encode"} - - def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: - return PoolingParamsUpdate(requires_token_ids=True) - - def forward( - self, - hidden_states: torch.Tensor | list[torch.Tensor], - pooling_metadata: PoolingMetadata, - ) -> PoolerOutput: - pooled_data = self.extract_states(hidden_states, pooling_metadata) - pooled_data = self.head(pooled_data, pooling_metadata) - return 
pooled_data - - class ClassifierPooler(Pooler): """A pooling layer for classification tasks. @@ -611,26 +527,46 @@ class ClassifierPooler(Pooler): """ @staticmethod - def act_fn_for_seq_cls(config: ModelConfig): - return get_classification_activation_function(config.hf_config) + def act_fn_for_seq_cls(model_config: ModelConfig): + return get_classification_activation_function(model_config.hf_config) + + @staticmethod + def act_fn_for_cross_encoder(model_config: ModelConfig): + return get_cross_encoder_activation_function(model_config.hf_config) @staticmethod - def act_fn_for_cross_encoder(config: ModelConfig): - return get_cross_encoder_activation_function(config.hf_config) + def resolve_act_fn( + model_config: ModelConfig, + static_num_labels: bool = True, + act_fn: PoolerActivation | str | None = None, + ): + if isinstance(act_fn, str): + if act_fn == "classify": + return ClassifierPooler.act_fn_for_seq_cls(model_config) + elif act_fn == "score": + return ClassifierPooler.act_fn_for_cross_encoder(model_config) + else: + raise ValueError(f"act_fn [{act_fn=}] not supported.") + elif act_fn is None: + return PoolerClassify(static_num_labels=static_num_labels) + else: + assert callable(act_fn) + return act_fn def __init__( self, pooling: PoolingFn, classifier: ClassifierFn | None, - act_fn: PoolerActivation | None = None, + act_fn: PoolerActivation | str | None = None, ) -> None: super().__init__() vllm_config = get_current_vllm_config() - self.pooling = pooling self.classifier = classifier - self.act_fn = act_fn or PoolerClassify() + self.act_fn = self.resolve_act_fn( + vllm_config.model_config, static_num_labels=True, act_fn=act_fn + ) self.logit_bias: float | None = ( vllm_config.model_config.pooler_config.logit_bias ) @@ -672,6 +608,150 @@ def forward( return scores +class TokenEmbeddingPoolerHead(EmbeddingPoolerHead): + def forward( + self, pooled_data: torch.Tensor, pooling_param: PoolingParams + ) -> torch.Tensor: + pooled_data = 
pooled_data.to(self.head_dtype) + # pooled_data shape: [n_tokens, hidden_dimension] + + # Apply ST projector + if self.projector is not None: + pooled_data = self.projector(pooled_data) + # pooled_data shape: [n_tokens, embedding_dimension] + + # for matryoshka representation + pooled_data = pooled_data[..., : pooling_param.dimensions] + + # for normalize + if pooling_param.normalize: + pooled_data = self.activation(pooled_data) + + # pooled_data shape: [n_tokens, embedding_dimension] + return pooled_data + + +class TokenClassifierPoolerHead(nn.Module): + def __init__( + self, + classifier: ClassifierFn | None, + act_fn: PoolerActivation | str | None = None, + ) -> None: + super().__init__() + vllm_config = get_current_vllm_config() + + self.classifier = classifier + self.act_fn = ClassifierPooler.resolve_act_fn( + vllm_config.model_config, static_num_labels=False, act_fn=act_fn + ) + self.logit_bias: float | None = ( + vllm_config.model_config.pooler_config.logit_bias + ) + self.head_dtype = vllm_config.model_config.head_dtype + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_classify"} + + def forward( + self, + hidden_states: torch.Tensor, + pooling_param: PoolingParams, + ) -> torch.Tensor: + hidden_states = hidden_states.to(self.head_dtype) + # hidden_states shape: [n_token, hidden_size] + + if self.classifier is not None: + scores = self.classifier(hidden_states) + else: + scores = hidden_states + # scores shape: [n_token, num_labels] + + if self.logit_bias is not None: + scores -= self.logit_bias + + if pooling_param.activation: + scores = self.act_fn(scores) + + # scores shape: [n_token, num_labels] + return scores + + +class AllPooler(Pooler): + def __init__(self, head: nn.Module | PoolerHead) -> None: + super().__init__() + + self.pooling = AllPool() + self.head = head + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_embed", "token_classify"} + + def forward( + self, + hidden_states: torch.Tensor, + 
pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.pooling(hidden_states, pooling_metadata) + pooling_params = get_pooling_params(pooling_metadata) + assert len(pooled_data) == len(pooling_params) + + pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)] + return pooled_data + + +class StepPooler(Pooler): + def __init__(self, head: nn.Module | PoolerHead) -> None: + super().__init__() + + self.pooling = AllPool() + self.head = head + + def extract_states( + self, + hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> torch.Tensor | list[torch.Tensor]: + pooled_data_lst = self.pooling(hidden_states, pooling_metadata) + prompt_token_ids = get_prompt_token_ids(pooling_metadata) + + pooled_data = list[torch.Tensor]() + + pooling_params = get_pooling_params(pooling_metadata) + + for data, token_id, pooling_param in zip( + pooled_data_lst, prompt_token_ids, pooling_params + ): + step_tag_id = pooling_param.step_tag_id + returned_token_ids = pooling_param.returned_token_ids + + if returned_token_ids is not None and len(returned_token_ids) > 0: + data = data[:, returned_token_ids] + + if step_tag_id is not None: + data = data[token_id == step_tag_id] + pooled_data.append(data) + + return pooled_data + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_embed", "token_classify"} + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate(requires_token_ids=True) + + def forward( + self, + hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.extract_states(hidden_states, pooling_metadata) + pooling_params = get_pooling_params(pooling_metadata) + assert len(pooled_data) == len(pooling_params) + + pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)] + return pooled_data + + class DispatchPooler(Pooler): """Dispatches calls to a 
sub-pooler based on the pooling task.""" diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 44feb24a1eef..4fda4d76a980 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -8,7 +8,7 @@ from vllm import _custom_ops as ops from vllm import envs -from vllm.config import CompilationLevel, get_current_vllm_config +from vllm.config import CompilationMode, get_current_vllm_config from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform @@ -419,7 +419,7 @@ def __init__( if pad_output is None: config = get_current_vllm_config().compilation_config pad_output = ( - config.level < CompilationLevel.PIECEWISE + config.mode < CompilationMode.VLLM_COMPILE and self.preferred_backend == "torch" ) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 1d3874b16484..5d51cd375741 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -250,7 +250,7 @@ def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), }, ) @@ -279,11 +279,8 @@ def as_seq_cls_model(cls: _T) -> _T: # Lazy import from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.pooler import ( - ClassifierPooler, DispatchPooler, Pooler, - PoolingMethod, - PoolingType, ) from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.sequence import IntermediateTensors @@ -302,42 +299,29 @@ def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): 
model_config.hidden_size, config.num_labels, bias=False, - params_dtype=torch.float32, + params_dtype=vllm_config.model_config.head_dtype, quant_config=quant_config, + return_bias=False, prefix=maybe_prefix(prefix, "score"), ) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - pooling_type_str = pooler_config.pooling_type - assert pooling_type_str is not None - pooling_type = PoolingType[pooling_type_str] - self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), - "classify": ClassifierPooler( - pooling=PoolingMethod.from_pooling_type(pooling_type), - classifier=self._classifier, - act_fn=ClassifierPooler.act_fn_for_seq_cls( - vllm_config.model_config - ), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.score + ), + "classify": Pooler.for_classify( + pooler_config, classifier=self.score, act_fn="classify" ), - "score": ClassifierPooler( - pooling=PoolingMethod.from_pooling_type(pooling_type), - classifier=self._classifier, - act_fn=ClassifierPooler.act_fn_for_cross_encoder( - vllm_config.model_config - ), + "score": Pooler.for_classify( + pooler_config, classifier=self.score, act_fn="score" ), } ) - def _classifier(self, x: torch.Tensor): - x, _ = self.score(x.float()) - return x - def forward( self, input_ids: torch.Tensor, @@ -393,7 +377,11 @@ def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): assert pooler_config is not None self.pooler = DispatchPooler( - {"encode": Pooler.for_encode(pooler_config)}, + { + "token_classify": Pooler.for_token_classify( + pooler_config=pooler_config + ) + } ) ModelForReward.__name__ = _get_pooling_model_name(cls.__name__, "ForReward") diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a7f3ebed644f..1549c653482f 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -86,13 +86,12 @@ def __init__( tp_size = 
get_tensor_model_parallel_world_size() assert self.total_num_heads % tp_size == 0 - assert self.total_kv_heads % tp_size == 0 assert self.total_num_heads >= self.total_kv_heads self.num_heads = self.total_num_heads // tp_size self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads) self.q_size_per_rank = self.head_dim * self.num_heads - self.num_kv_heads = self.total_kv_heads // tp_size + self.num_kv_heads = max(1, self.total_kv_heads // tp_size) self.kv_size_per_rank = self.num_kv_heads * self.head_dim self.scale = self.head_dim**-0.5 self.use_qk_norm = getattr(config, "use_qk_norm", False) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 6e81eb8dc91b..1c2334a78543 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -521,7 +521,7 @@ def _build_model(self, vllm_config: VllmConfig, prefix: str = "") -> BertModel: def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), } ) @@ -724,7 +724,7 @@ def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": SPLADESparsePooler( mlm_head=self.mlm_head, cls_token_id=cls_id, @@ -821,20 +821,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.classifier + ), "classify": ClassifierPooler( pooling=self.bert.pooler, classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_seq_cls( - vllm_config.model_config - ), + act_fn="classify", ), "score": ClassifierPooler( - pooling=self.bert.pooler, - classifier=self.classifier, - 
act_fn=ClassifierPooler.act_fn_for_cross_encoder( - vllm_config.model_config - ), + pooling=self.bert.pooler, classifier=self.classifier, act_fn="score" ), } ) @@ -891,7 +887,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config=pooler_config + ), } ) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 49111dd9ffab..31fdc4d21245 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -695,20 +695,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.classifier + ), "classify": ClassifierPooler( pooling=self.new.pooler, classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_seq_cls( - vllm_config.model_config - ), + act_fn="classify", ), "score": ClassifierPooler( - pooling=self.new.pooler, - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_cross_encoder( - vllm_config.model_config - ), + pooling=self.new.pooler, classifier=self.classifier, act_fn="score" ), } ) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 3d7b28af8bdb..27953c27188d 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -837,7 +837,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), } ) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index de8083313017..576977b00e61 100644 --- 
a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -17,9 +17,13 @@ VocabParallelEmbedding, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from .deepseek_v2 import DeepseekV2DecoderLayer, get_spec_layer_idx_from_weight_name +from .deepseek_v2 import ( + DeepseekV2DecoderLayer, + get_spec_layer_idx_from_weight_name, +) from .interfaces import SupportsPP from .utils import maybe_prefix @@ -56,6 +60,8 @@ def __init__(self, vllm_config: VllmConfig, prefix: str) -> None: self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False) + self.device = current_platform.device_type + self.is_v32 = hasattr(config, "index_topk") if self.is_v32: topk_tokens = config.index_topk @@ -63,7 +69,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str) -> None: vllm_config.scheduler_config.max_num_batched_tokens, topk_tokens, dtype=torch.int32, - device="cuda", + device=self.device, ) else: topk_indices_buffer = None diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 970fa80826ab..f33ed735f429 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -75,7 +75,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import cdiv, direct_register_custom_op +from vllm.utils import direct_register_custom_op from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits from vllm.v1.attention.backends.mla.indexer import ( DeepseekV32IndexerBackend, @@ -483,69 +483,6 @@ def get_attn_backend(self) -> AttentionBackend: return DeepseekV32IndexerBackend -@torch.inference_mode() -def 
cp_gather_indexer_k_quant_cache( - kv_cache, # [num_blocks, block_size, head_dim + 1] - dst_value, # [cu_seq_lens[-1], head_dim] - dst_scale, # [cu_seq_lens[-1], 4] - block_table, # [batch_size, num_blocks] - cu_seq_lens, # [batch_size + 1, ] - batch_size, -): - num_blocks, block_size, _ = kv_cache.shape - head_dim = dst_value.shape[-1] - kv_cache = kv_cache.view(num_blocks, -1) - - expected_value = [] - expected_scale = [] - for b in range(batch_size): - s = cu_seq_lens[b + 1] - cu_seq_lens[b] - if s == 0: - continue - tot = cdiv(s, block_size) - blocks = block_table[b, :tot] - - value = [] - scale = [] - full_block = torch.arange(tot - 1, device=kv_cache.device, dtype=torch.int32) - non_remaining_value = kv_cache[ - blocks[full_block], : block_size * head_dim - ].view(-1, head_dim) - non_remaining_scale = kv_cache[ - blocks[full_block], block_size * head_dim : - ].view(-1, 4) - - remaining = s - (tot - 1) * block_size - - value = torch.cat( - [ - non_remaining_value, - kv_cache[blocks[-1], : remaining * head_dim].view(-1, head_dim), - ], - dim=0, - ) - scale = torch.cat( - [ - non_remaining_scale, - kv_cache[ - blocks[-1], - block_size * head_dim : block_size * head_dim + remaining * 4, - ].view(-1, 4), - ], - dim=0, - ) - - expected_value.append(value) - expected_scale.append(scale) - - gather_value = torch.cat(expected_value, dim=0).view(-1, head_dim) - gather_scale = torch.cat(expected_scale, dim=0).view(-1, 4) - gather_value = gather_value.view(torch.float8_e4m3fn) - gather_scale = gather_scale.view(torch.float32) - dst_value.copy_(gather_value) - dst_scale.copy_(gather_scale) - - def sparse_attn_indexer( hidden_states: torch.Tensor, k_cache_prefix: str, @@ -605,19 +542,20 @@ def sparse_attn_indexer( dtype=torch.float8_e4m3fn, ) k_scale = torch.empty( - [chunk.total_seq_lens, 1], device=k.device, dtype=torch.float32 + [chunk.total_seq_lens, 4], + device=k.device, + dtype=torch.uint8, ) - cp_gather_indexer_k_quant_cache( + ops.cp_gather_indexer_k_quant_cache( 
kv_cache, k_fp8, k_scale, chunk.block_table, chunk.cu_seq_lens, - chunk.num_reqs, ) logits = fp8_mqa_logits( q_fp8[chunk.token_start : chunk.token_end], - (k_fp8, k_scale), + (k_fp8, k_scale.view(torch.float32)), weights[chunk.token_start : chunk.token_end], chunk.cu_seqlen_ks, chunk.cu_seqlen_ke, @@ -1165,6 +1103,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config + self.device = current_platform.device_type self.vocab_size = config.vocab_size self.is_v32 = hasattr(config, "index_topk") @@ -1174,7 +1113,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config.scheduler_config.max_num_batched_tokens, topk_tokens, dtype=torch.int32, - device="cuda", + device=self.device, ) else: topk_indices_buffer = None diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index ddd6e53b4a43..6d99d02a32be 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -353,8 +353,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), - "classify": Pooler.for_classify(pooler_config, classifier=self.score), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.score + ), + "classify": Pooler.for_classify( + pooler_config, classifier=self.score, act_fn="classify" + ), + "score": Pooler.for_classify( + pooler_config, classifier=self.score, act_fn="score" + ), } ) diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index ede3e34881b1..181c4ed2dca5 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -239,7 +239,7 @@ def __init__( if pooler_config is not None: self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": 
Pooler.for_token_embed(pooler_config), "embed": GritLMPooler(vllm_config.model_config), } ) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 8d83a1478dff..c5bbd5497a14 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -444,7 +444,7 @@ def __init__( assert pooler_config is not None self.pooler = DispatchPooler( - {"encode": Pooler.for_encode(pooler_config)}, + {"token_classify": Pooler.for_token_classify(pooler_config)} ) def forward( diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 49cb9311a786..f8a87cf6965f 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -604,10 +604,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.score + ), "classify": Pooler.for_classify( - pooler_config, - classifier=self.score, + pooler_config, classifier=self.score, act_fn="classify" + ), + "score": Pooler.for_classify( + pooler_config, classifier=self.score, act_fn="score" ), } ) diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index a9333155243d..05a40837954d 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -97,9 +97,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.score = JinaVLScorer(vllm_config.model_config) self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), - "classify": Pooler.for_classify(pooler_config, classifier=self.score), - "score": Pooler.for_classify(pooler_config, classifier=self.score), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.score + ), + "classify": Pooler.for_classify( + pooler_config, classifier=self.score, 
act_fn="classify" + ), + "score": Pooler.for_classify( + pooler_config, classifier=self.score, act_fn="score" + ), } ) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index ff9f6a41ab99..5dbf38c69086 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -322,20 +322,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.classifier + ), "classify": ClassifierPooler( - pooling=self.pooling, - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_seq_cls( - vllm_config.model_config - ), + pooling=self.pooling, classifier=self.classifier, act_fn="classify" ), "score": ClassifierPooler( - pooling=self.pooling, - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_cross_encoder( - vllm_config.model_config - ), + pooling=self.pooling, classifier=self.classifier, act_fn="score" ), } ) @@ -421,7 +415,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config=pooler_config + ), } ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 106aaf413e99..dce94d181c4c 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1264,13 +1264,16 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: processor = self.info.get_hf_processor() - # Apply the chat template to the tokens + # The chat template is already applied to the prompt tokens + # Use message_format="none" to avoid applying it again + # Prepend an empty space if `always_start_with_space` is True tokens = processor.processor.get_tokens_input( # type: ignore 
self.info.get_tokenizer().decode(prompt_tokens), - message_format=processor.message_format, + message_format="none", always_start_with_space=processor.always_start_with_space, ) + # Prepend a BOS token id to the tokens processed_data = self.info.ctx.call_hf_processor( processor, # type: ignore dict(tokens=tokens), diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index c2f2ba637f09..e2ba0e262cf7 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -107,7 +107,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): assert pooler_config is not None self.pooler = DispatchPooler( - {"encode": Pooler.for_encode(pooler_config)}, + {"token_classify": Pooler.for_token_classify(pooler_config)} ) @@ -120,4 +120,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)}) + self.pooler = DispatchPooler( + {"token_classify": Pooler.for_token_classify(pooler_config)} + ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index d565a0108432..d5a75e75aa43 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -30,7 +30,9 @@ import torch import torch.nn as nn import torch.nn.functional as F +from packaging.version import Version from transformers import PretrainedConfig +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.feature_extraction_utils import BatchFeature from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import ( Qwen3OmniMoeConfig, @@ -711,11 +713,12 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: return x # NOTE: WhisperFeatureExtractor cannot handle empty list of audios + 
feature_extractor = self.info.get_feature_extractor() + hop_length = feature_extractor.hop_length if audios: # NOTE: Qwen3-Omni processor accept "audio" # To make sure the cache works with padding=True, we pre-padded # the audio to multiple of hop_length. - hop_length = self.info.get_feature_extractor().hop_length mm_data["audio"] = [ pad_to_hop_length(audio, hop_length) if isinstance(audio, np.ndarray) @@ -725,6 +728,14 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: mm_kwargs = dict( **mm_kwargs, ) + # TODO(Isotr0py): Remove this patch after upstream fix PR + # released and Transformers version update: + # https://github.com/huggingface/transformers/pull/41473 + if ( + Version(TRANSFORMERS_VERSION) < Version("4.58.0") + and "truncation" not in mm_kwargs + ): + mm_kwargs["truncation"] = False hf_inputs = super()._call_hf_processor( prompt=prompt, @@ -738,7 +749,6 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: and "feature_attention_mask" in hf_inputs and (audios := mm_data.get("audio", [])) ): - hop_length = self.info.get_feature_extractor().hop_length audio_num_frames = [] for _, audio in enumerate(audios): audio_length = len(audio[0]) if isinstance(audio, tuple) else len(audio) @@ -747,6 +757,10 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: if audio_length % hop_length == 0 else (audio_length // hop_length - 1) ) + if mm_kwargs.get("truncation", False): + num_frame = min( + num_frame, feature_extractor.n_samples // hop_length + ) audio_num_frames.append(num_frame) hf_inputs["feature_attention_mask"] = [ torch.ones(num_frame) for num_frame in audio_num_frames diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 456226360b91..cfccb904f46c 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -105,15 +105,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @default_pooling_type("CLS") class 
RobertaEmbeddingModel(BertEmbeddingModel): - """A model that uses Roberta to provide embedding functionalities. - - This class encapsulates the BertModel and provides an interface for - embedding operations and customized pooling functions. - - Attributes: - model: An instance of BertModel used for forward operations. - _pooler: An instance of Pooler used for pooling operations. - """ + """A model that uses Roberta to provide embedding functionalities.""" def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) @@ -212,20 +204,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config=pooler_config, classifier=self.classifier + ), "classify": ClassifierPooler( - pooling=CLSPool(), - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_seq_cls( - vllm_config.model_config - ), + pooling=CLSPool(), classifier=self.classifier, act_fn="classify" ), "score": ClassifierPooler( - pooling=CLSPool(), - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_cross_encoder( - vllm_config.model_config - ), + pooling=CLSPool(), classifier=self.classifier, act_fn="score" ), } ) diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index e8506666db5b..0252705c62b1 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -250,7 +250,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): assert pooler_config is not None self.pooler = DispatchPooler( - {"encode": Pooler.for_encode(pooler_config)}, + {"token_classify": Pooler.for_token_classify(pooler_config)} ) def get_input_embeddings( diff --git a/vllm/model_executor/models/transformers_pooling.py b/vllm/model_executor/models/transformers_pooling.py index 411fb92e9460..7ddeb403da44 100644 --- 
a/vllm/model_executor/models/transformers_pooling.py +++ b/vllm/model_executor/models/transformers_pooling.py @@ -135,7 +135,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_embed": Pooler.for_token_embed(pooler_config), "embed": Pooler.for_embed(pooler_config), } ) @@ -190,20 +190,14 @@ def forward(self, *args, **kwargs): self.pooler = DispatchPooler( { - "encode": Pooler.for_encode(pooler_config), + "token_classify": Pooler.for_token_classify( + pooler_config, classifier=self.classifier + ), "classify": ClassifierPooler( - pooling=CLSPool(), - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_seq_cls( - vllm_config.model_config - ), + pooling=CLSPool(), classifier=self.classifier, act_fn="classify" ), "score": ClassifierPooler( - pooling=CLSPool(), - classifier=self.classifier, - act_fn=ClassifierPooler.act_fn_for_cross_encoder( - vllm_config.model_config - ), + pooling=CLSPool(), classifier=self.classifier, act_fn="score" ), } ) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index f6ef675aa7c2..a29da2a56afc 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -10,12 +10,12 @@ import torch from typing_extensions import override +import vllm.envs as envs from vllm.distributed.device_communicators.shm_object_storage import ( MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer, ) -from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME from vllm.logger import init_logger from vllm.utils import GiB_bytes, MiB_bytes from vllm.utils.cache import CacheInfo, LRUCache @@ -436,7 +436,7 @@ def __init__(self, vllm_config: "VllmConfig") -> None: ring_buffer = SingleWriterShmRingBuffer( data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), - name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, create=True, # sender is the writer ) self._shm_cache = 
SingleWriterShmObjectStorage( @@ -678,7 +678,7 @@ def __init__( ring_buffer = SingleWriterShmRingBuffer( data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), - name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, create=False, # Server is a reader ) self._shm_cache = SingleWriterShmObjectStorage( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 17d610ac16a3..1a34e9150ce7 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -247,12 +247,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.enable_dbo = False # Note: workaround for v1 gpu_model_runner - from vllm.config import CompilationLevel + from vllm.config import CompilationMode vllm_config.compilation_config.cudagraph_capture_sizes = [] compilation_config = vllm_config.compilation_config - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE: + if vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE: # Note: vLLM V1 is using PIECEWISE level compilation, which will # take time to compile kernels just-in-time with the inductor # backend. 
For CPU CI tests, most of them are executed fast and @@ -265,7 +265,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: backend = "inductor" - compilation_config.level = CompilationLevel.DYNAMO_ONCE + compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE compilation_config.backend = backend compilation_config.inductor_compile_config.update( { @@ -277,7 +277,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) if vllm_config.lora_config is not None: - compilation_config.level = CompilationLevel.NO_COMPILATION + compilation_config.mode = CompilationMode.NONE assert vllm_config.device_config.device_type == "cpu" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0252c3acb08c..04c2bbb43805 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -6,13 +6,10 @@ import os from collections.abc import Callable -from datetime import timedelta from functools import cache, wraps from typing import TYPE_CHECKING, TypeVar import torch -from torch.distributed import PrefixStore, ProcessGroup -from torch.distributed.distributed_c10d import is_nccl_available from typing_extensions import ParamSpec # import custom ops, trigger op registration @@ -455,37 +452,6 @@ def opaque_attention_op(cls) -> bool: def get_static_graph_wrapper_cls(cls) -> str: return "vllm.compilation.cuda_graph.CUDAGraphWrapper" - @classmethod - def stateless_init_device_torch_dist_pg( - cls, - backend: str, - prefix_store: PrefixStore, - group_rank: int, - group_size: int, - timeout: timedelta, - ) -> ProcessGroup: - assert is_nccl_available() - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - ) - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL( - prefix_store, group_rank, group_size, backend_options - ) - backend_type = ProcessGroup.BackendType.NCCL - device = 
torch.device("cuda") - pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - return pg - @classmethod def device_count(cls) -> int: return cuda_device_count_stateless() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 9b8d75ac22fe..f08e62a4aa9c 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -551,7 +551,7 @@ def stateless_init_device_torch_dist_pg( """ Init platform-specific torch distributed process group. """ - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + raise NotImplementedError @classmethod def is_kv_cache_dtype_supported( diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 81745257d0ae..8fa07b10d34a 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -2,13 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from datetime import timedelta from functools import cache, lru_cache, wraps from typing import TYPE_CHECKING import torch -from torch.distributed import PrefixStore, ProcessGroup -from torch.distributed.distributed_c10d import is_nccl_available import vllm.envs as envs from vllm.logger import init_logger @@ -476,37 +473,6 @@ def is_navi(cls) -> bool: def get_static_graph_wrapper_cls(cls) -> str: return "vllm.compilation.cuda_graph.CUDAGraphWrapper" - @classmethod - def stateless_init_device_torch_dist_pg( - cls, - backend: str, - prefix_store: PrefixStore, - group_rank: int, - group_size: int, - timeout: timedelta, - ) -> ProcessGroup: - assert is_nccl_available() - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - ) - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL( - prefix_store, group_rank, group_size, backend_options - ) - backend_type = 
ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - return pg - @classmethod def device_count(cls) -> int: return cuda_device_count_stateless() diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index dcd595cf9082..ed38f3bc3087 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -114,7 +114,7 @@ def inference_mode(cls): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - from vllm.config import CompilationLevel, CUDAGraphMode + from vllm.config import CompilationMode, CUDAGraphMode cache_config = vllm_config.cache_config # For v0, the default block size is 16. @@ -122,12 +122,13 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config.block_size = cast(BlockSize, 16) compilation_config = vllm_config.compilation_config - # TPU only supports DYNAMO_ONCE compilation level - if compilation_config.level != CompilationLevel.DYNAMO_ONCE: + # TPU only supports DYNAMO_TRACE_ONCE compilation mode + if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE: logger.info( - "[TPU] Forcing DYNAMO_ONCE compilation level, and disabling cudagraph." + "[TPU] Forcing DYNAMO_TRACE_ONCE compilation mode, and\ + disabling cudagraph." 
) - compilation_config.level = CompilationLevel.DYNAMO_ONCE + compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE if ( compilation_config.cudagraph_mode is None diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index dcfc970d3a83..4638e9fa3021 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -144,7 +144,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config.block_size = 64 # lazy import to avoid circular import - from vllm.config import CompilationLevel, CUDAGraphMode + from vllm.config import CompilationMode, CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.compile_sizes is None: @@ -155,7 +155,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) if vllm_config.lora_config is not None: - compilation_config.level = CompilationLevel.NO_COMPILATION + compilation_config.mode = CompilationMode.NONE # check and update parallel config parallel_config = vllm_config.parallel_config diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 175a4ac01b83..c6dff6e01c1d 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -10,7 +10,7 @@ from vllm.tasks import PoolingTask if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, PoolerConfig class PoolingParams( @@ -30,7 +30,6 @@ class PoolingParams( if model support matryoshka representation. activation: Whether to apply activation function to the classification outputs. - softmax: Whether to apply softmax to the reward outputs. 
""" # --8<-- [start:common-pooling-params] @@ -48,32 +47,19 @@ class PoolingParams( activation: bool | None = None # --8<-- [end:classification-pooling-params] - ## for reward models - softmax: bool | None = None + ## for step pooling models step_tag_id: int | None = None returned_token_ids: list[int] | None = None + ## Internal use only task: PoolingTask | None = None - """Internal use only.""" - requires_token_ids: bool = False - """Internal use only.""" - extra_kwargs: dict[str, Any] | None = None - """Internal use only.""" - output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY @property def all_parameters(self) -> list[str]: - return [ - "dimensions", - "normalize", - "activation", - "softmax", - "step_tag_id", - "returned_token_ids", - ] + return ["dimensions", "normalize", "activation"] @property def valid_parameters(self): @@ -81,7 +67,8 @@ def valid_parameters(self): "embed": ["dimensions", "normalize"], "classify": ["activation"], "score": ["activation"], - "encode": ["softmax", "step_tag_id", "returned_token_ids"], + "token_embed": ["dimensions", "normalize"], + "token_classify": ["activation"], } def clone(self) -> "PoolingParams": @@ -100,7 +87,6 @@ def verify( # NOTE: Task validation needs to done against the model instance, # which is not available in model config. 
So, it's not included # in this method - self._merge_default_parameters(model_config) self._set_default_parameters(model_config) self._verify_valid_parameters() @@ -125,8 +111,34 @@ def _merge_default_parameters( if getattr(self, k, None) is None: setattr(self, k, getattr(pooler_config, k)) + self._verify_step_pooling(pooler_config, valid_parameters) + + def _verify_step_pooling( + self, pooler_config: "PoolerConfig", valid_parameters: list[str] + ): + step_pooling_parameters = ["step_tag_id", "returned_token_ids"] + if pooler_config.pooling_type != "STEP": + invalid_parameters = [] + for k in step_pooling_parameters: + if getattr(self, k, None) is not None: + invalid_parameters.append(k) + + if invalid_parameters: + raise ValueError( + f"Task {self.task} only supports {valid_parameters} " + f"parameters, does not support " + f"{invalid_parameters} parameters" + ) + else: + for k in step_pooling_parameters: + if getattr(pooler_config, k, None) is None: + continue + + if getattr(self, k, None) is None: + setattr(self, k, getattr(pooler_config, k)) + def _set_default_parameters(self, model_config: Optional["ModelConfig"]): - if self.task == "embed": + if self.task in ["embed", "token_embed"]: if self.normalize is None: self.normalize = True @@ -150,13 +162,9 @@ def _set_default_parameters(self, model_config: Optional["ModelConfig"]): elif self.dimensions < 1: raise ValueError("Dimensions must be greater than 0") - elif self.task in ["classify", "score"]: + elif self.task in ["classify", "score", "token_classify"]: if self.activation is None: self.activation = True - - elif self.task == "encode": - if self.softmax is None: - self.softmax = True else: raise ValueError(f"Unknown pooling task: {self.task}") @@ -185,7 +193,6 @@ def __repr__(self) -> str: f"normalize={self.normalize}, " f"dimensions={self.dimensions}, " f"activation={self.activation}, " - f"softmax={self.softmax}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " 
f"requires_token_ids={self.requires_token_ids}, " diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 10c990f36132..ecee1af43902 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -4,11 +4,13 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .basic_parsers import BaseThinkingReasoningParser from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser from .ernie45_reasoning_parser import Ernie45ReasoningParser from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser from .gptoss_reasoning_parser import GptOssReasoningParser from .granite_reasoning_parser import GraniteReasoningParser from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser +from .identity_reasoning_parser import IdentityReasoningParser from .mistral_reasoning_parser import MistralReasoningParser from .olmo3_reasoning_parser import Olmo3ReasoningParser from .qwen3_reasoning_parser import Qwen3ReasoningParser @@ -20,6 +22,8 @@ "BaseThinkingReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser", + "IdentityReasoningParser", + "DeepSeekV3ReasoningParser", "Ernie45ReasoningParser", "GraniteReasoningParser", "HunyuanA13BReasoningParser", diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py new file mode 100644 index 000000000000..7116f90a1ac0 --- /dev/null +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.logger import init_logger +from vllm.reasoning import ( + DeepSeekR1ReasoningParser, + ReasoningParser, + ReasoningParserManager, +) + +from 
.identity_reasoning_parser import IdentityReasoningParser + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("deepseek_v3") +class DeepSeekV3ReasoningParser(ReasoningParser): + """ + V3 parser that delegates to either DeepSeekR1ReasoningParser or + IdentityReasoningParser based on `thinking` and `separate_reasoning`. + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): + super().__init__(tokenizer, *args, **kwargs) + + chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {} + thinking = bool(chat_kwargs.pop("thinking", False)) + + if thinking: + self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs) + else: + self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs) + + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + return self._parser.is_reasoning_end(input_ids) + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + return self._parser.extract_content_ids(input_ids) + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[str | None, str | None]: + return self._parser.extract_reasoning_content(model_output, request) + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> DeltaMessage | None: + return self._parser.extract_reasoning_content_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + ) diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py new file mode 100644 index 000000000000..f1d17a71be33 --- /dev/null +++ b/vllm/reasoning/identity_reasoning_parser.py @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import 
Sequence + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser + +logger = init_logger(__name__) + + +class IdentityReasoningParser(ReasoningParser): + """ + Identity reasoning parser. + + This parser does not attempt to parse or strip out reasoning tokens. + It treats the entire model output as content and ignores reasoning. + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): + super().__init__(tokenizer, *args, **kwargs) + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ReasoningParser " + "constructor during construction." + ) + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + # Always return True, since we never treat reasoning specially + return True + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + # Identity: return all tokens as content + return input_ids + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> DeltaMessage | None: + # Just wrap delta_text as content, ignore reasoning + if delta_text: + return DeltaMessage(content=delta_text) + return None + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[str | None, str | None]: + # No reasoning separation: return None for reasoning_content, + # and full model_output as content + return None, model_output diff --git a/vllm/tasks.py b/vllm/tasks.py index 85c5c6e43620..6551444d1710 100644 --- a/vllm/tasks.py +++ b/vllm/tasks.py @@ -5,7 +5,7 @@ GenerationTask = Literal["generate", "transcription"] GENERATION_TASKS = get_args(GenerationTask) -PoolingTask = Literal["encode", "embed", "classify", "score"] +PoolingTask 
= Literal["embed", "classify", "score", "token_embed", "token_classify"] POOLING_TASKS = get_args(PoolingTask) SupportedTask = Literal[GenerationTask, PoolingTask] diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index b87414d79df0..58c754dbd397 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -from vllm.envs import VLLM_MODEL_REDIRECT_PATH +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -86,7 +86,7 @@ def maybe_model_redirect(model: str) -> str: :return: maybe redirect to a local folder """ - model_redirect_path = VLLM_MODEL_REDIRECT_PATH + model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH if not model_redirect_path: return model diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index e1a509a303c5..f05bc555bfdc 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -98,3 +98,6 @@ def __init__(self): self.int64 = None self.int32 = None self.tensor = None + self.exp = None + self.log = None + self.log2 = None diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index c8da83047a40..bb5d3a688094 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1686,16 +1686,16 @@ def repl(match: re.Match) -> str: elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": # allow -O flag to be used without space, e.g. 
-O3 or -Odecode # -O.<...> handled later - # also handle -O= here - level = arg[3:] if arg[2] == "=" else arg[2:] - processed_args.append(f"-O.level={level}") + # also handle -O= here + mode = arg[3:] if arg[2] == "=" else arg[2:] + processed_args.append(f"-O.mode={mode}") elif ( arg == "-O" and i + 1 < len(args) and args[i + 1] in {"0", "1", "2", "3"} ): - # Convert -O to -O.level - processed_args.append("-O.level") + # Convert -O to -O.mode + processed_args.append("-O.mode") else: processed_args.append(arg) diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py index 99c19c9db28e..6894ccff11d9 100644 --- a/vllm/utils/gc_utils.py +++ b/vllm/utils/gc_utils.py @@ -7,7 +7,7 @@ from contextlib import suppress from typing import Any -from vllm.envs import VLLM_GC_DEBUG +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -36,7 +36,7 @@ def __init__(self, gc_debug_conf: str | None = None) -> None: self.top_objects = json_conf.get("top_objects", -1) except Exception: self.enabled = False - logger.error("Failed to parse VLLM_GC_DEBUG(%s)", VLLM_GC_DEBUG) + logger.error("Failed to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG) logger.info("GC Debug Config. %s", str(self)) def __repr__(self) -> str: @@ -93,7 +93,7 @@ def maybe_attach_gc_debug_callback() -> None: """ Attached a callback for GC debug when VLLM_GC_DEBUG is enabled. 
""" - config = GCDebugConfig(VLLM_GC_DEBUG) + config = GCDebugConfig(envs.VLLM_GC_DEBUG) if config.enabled: debugger: GCDebugger = GCDebugger(config) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index ff221048dbd1..74176e4b2051 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -219,7 +219,7 @@ def get_computed_blocks(self, request: Request) -> tuple[KVCacheBlocks, int]: self.prefix_cache_stats.queries += request.num_tokens self.prefix_cache_stats.hits += num_new_computed_tokens - return (self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens) + return self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens def allocate_slots( self, diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 9f071a0ddac2..a12704b664c3 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -43,12 +43,12 @@ def __init__(self, vllm_config: VllmConfig): not_use_piecewise_compilation or self.compilation_config.is_attention_compiled_piecewise() ), ( - "Compilation level should be CompilationLevel.PIECEWISE when " + "Compilation mode should be CompilationMode.VLLM_COMPILE when " "cudagraph_mode piecewise cudagraphs is used, " "and attention should be in splitting_ops or " "inductor splitting should be used. 
" f"cudagraph_mode={self.cudagraph_mode}, " - f"compilation_level={self.compilation_config.level}, " + f"compilation_mode={self.compilation_config.mode}, " f"splitting_ops={self.compilation_config.splitting_ops}" ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 39cd1d97c280..0ec153e23316 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,7 +16,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.utils import _validate_truncation_size -from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -483,12 +482,12 @@ async def output_handler(): # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. - if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: + if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: slices = (outputs.outputs,) else: slices = np.array_split( outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), + cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), ) for i, outputs_slice in enumerate(slices): diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index ed8bc55a3cf2..43a40bce6847 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -46,23 +46,15 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: "Falling back to default sampling implementation." ) self.forward = self.forward_native - elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False: - # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for - # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by - # default it is unused). 
For backward compatibility, we set - # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and - # interpret it differently in V0 and V1 samplers: In V0, - # None means False, while in V1, None means True. This is - # why we use the condition - # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here. + elif envs.VLLM_USE_FLASHINFER_SAMPLER: + # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1. logger.info_once("Using FlashInfer for top-p & top-k sampling.") self.forward = self.forward_cuda else: - logger.warning_once( - "FlashInfer is available, but it is not enabled. " - "Falling back to the PyTorch-native implementation of " - "top-p & top-k sampling. For the best performance, " - "please set VLLM_USE_FLASHINFER_SAMPLER=1." + logger.debug_once( + "FlashInfer top-p/top-k sampling is available but disabled " + "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in " + "after verifying accuracy for your workloads." ) self.forward = self.forward_native else: diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index ad504da55fd8..6d5d0b2614fa 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,7 +9,7 @@ import torch.nn as nn from vllm.config import ( - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config, @@ -86,7 +86,7 @@ def __init__( self.use_cuda_graph = False compilation_config = self.vllm_config.compilation_config - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: cudagraph_mode = compilation_config.cudagraph_mode if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode( CUDAGraphMode.PIECEWISE diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 72f8824e2005..9e394dbb592e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -25,7 +25,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper from 
vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config, @@ -1926,15 +1926,16 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: supported_tasks = list(model.pooler.get_supported_tasks()) - if ( - self.scheduler_config.chunked_prefill_enabled - and "encode" in supported_tasks - ): - supported_tasks.remove("encode") + if self.scheduler_config.chunked_prefill_enabled: + if "token_embed" in supported_tasks: + supported_tasks.remove("token_embed") + if "token_classify" in supported_tasks: + supported_tasks.remove("token_classify") logger.debug_once( "Chunked prefill is not supported with " - "encode task which using ALL pooling. " + "token_embed and token_classify tasks " + "which using ALL pooling. " "Please turn off chunked prefill by " "`--no-enable-chunked-prefill` before using it." ) @@ -2927,14 +2928,15 @@ def load_model(self, eep_scale_up: bool = False) -> None: ) if ( - self.vllm_config.compilation_config.level == CompilationLevel.DYNAMO_AS_IS + self.vllm_config.compilation_config.mode + == CompilationMode.STOCK_TORCH_COMPILE and supports_dynamo() ): backend = self.vllm_config.compilation_config.init_backend(self.vllm_config) - compilation_counter.dynamo_as_is_count += 1 + compilation_counter.stock_torch_compile_count += 1 self.model.compile(fullgraph=True, backend=backend) return - # for other compilation levels, cudagraph behavior is controlled by + # for other compilation modes, cudagraph behavior is controlled by # CudagraphWraper and CudagraphDispatcher of vllm. # wrap the model with full cudagraph wrapper if needed. @@ -3985,7 +3987,7 @@ def initialize_cudagraph_capture(self) -> None: # if not supported any full cudagraphs, just raise it. 
msg += ( "; please try cudagraph_mode=PIECEWISE, and " - "make sure compilation level is piecewise" + "make sure compilation mode is VLLM_COMPILE" ) raise ValueError(msg) @@ -4012,7 +4014,7 @@ def initialize_cudagraph_capture(self) -> None: f"with {min_cg_builder_name} backend (support: " f"{min_cg_support})" ) - if self.compilation_config.level == CompilationLevel.PIECEWISE and ( + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE and ( self.compilation_config.splitting_ops_contain_attention() or self.compilation_config.use_inductor_graph_partition ): @@ -4068,7 +4070,7 @@ def initialize_cudagraph_capture(self) -> None: f"supported with {min_cg_builder_name} backend (" f"support:{min_cg_support}) " "; please try cudagraph_mode=PIECEWISE, " - "and make sure compilation level is piecewise" + "and make sure compilation mode is VLLM_COMPILE" ) # Trigger cudagraph dispatching keys initialization here (after