From 002b4cf80b730f384dbc7d5f85db8a10b14af56a Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 5 Aug 2025 17:20:26 +0800 Subject: [PATCH 01/34] + default_pooling_type Signed-off-by: wang.yuqi --- tests/models/language/pooling/mteb_utils.py | 12 +- tests/models/language/pooling/test_baai.py | 117 +++++++++--------- .../pooling/test_bge_reranker_v2_gemma.py | 8 +- ...test_classify_auto_prefix_cache_support.py | 64 ++++++++++ .../pooling/test_classify_pooler_config.py | 48 +++++++ .../language/pooling/test_cross_encoder.py | 12 +- tests/models/language/pooling/test_gte.py | 86 ++++++------- .../models/language/pooling/test_intfloat.py | 44 +++---- tests/models/language/pooling/test_jina.py | 14 ++- .../language/pooling/test_mxbai_rerank.py | 15 +-- tests/models/utils.py | 18 +++ tests/test_config.py | 14 +++ vllm/config.py | 25 +++- vllm/model_executor/layers/pooler.py | 38 ++---- vllm/model_executor/models/adapters.py | 4 +- vllm/model_executor/models/bert.py | 8 +- vllm/model_executor/models/bert_with_rope.py | 3 +- vllm/model_executor/models/interfaces.py | 13 ++ vllm/model_executor/models/jamba.py | 3 +- vllm/model_executor/models/modernbert.py | 5 +- vllm/model_executor/models/qwen2_rm.py | 7 +- vllm/model_executor/models/registry.py | 4 +- vllm/model_executor/models/roberta.py | 4 +- 23 files changed, 375 insertions(+), 191 deletions(-) create mode 100644 tests/models/language/pooling/test_classify_auto_prefix_cache_support.py create mode 100644 tests/models/language/pooling/test_classify_pooler_config.py diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 8c93bbdc98c0..18de1078e627 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -176,9 +176,12 @@ def mteb_test_embed_models(hf_runner, max_model_len=None, **vllm_extra_kwargs) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + if model_info.architecture: - assert (model_info.architecture - in vllm_model.llm.llm_engine.model_config.architectures) + assert model_info.architecture in model_config.architectures + assert (model_config._model_info.default_pooling_type == + model_info.default_pooling_type) vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) @@ -285,7 +288,12 @@ def mteb_test_rerank_models(hf_runner, **vllm_extra_kwargs) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config + + if model_info.architecture: + assert (model_info.architecture in model_config.architectures) assert model_config.hf_config.num_labels == 1 + assert (model_config._model_info.default_pooling_type == + model_info.default_pooling_type) vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model), tasks=MTEB_RERANK_TASKS, diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index 64a8f25220da..6fbe0e82d7f8 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -2,73 +2,78 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import EmbedModelInfo, RerankModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, LASTPoolingEmbedModelInfo, + RerankModelInfo) from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel - EmbedModelInfo("BAAI/bge-base-en", - architecture="BertModel", - 
enable_test=True), - EmbedModelInfo("BAAI/bge-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-noinstruct", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-v1.5", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-en", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("BAAI/bge-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-en", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-en", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5", + architecture="BertModel", + enable_test=False), ########## XLMRobertaModel - EmbedModelInfo("BAAI/bge-m3", - architecture="XLMRobertaModel", - enable_test=True), + CLSPoolingEmbedModelInfo("BAAI/bge-m3", + architecture="XLMRobertaModel", + enable_test=True), ########## Qwen2Model - EmbedModelInfo("BAAI/bge-code-v1", - architecture="Qwen2Model", - dtype="float32", - enable_test=True), + LASTPoolingEmbedModelInfo("BAAI/bge-code-v1", + architecture="Qwen2Model", + dtype="float32", + enable_test=True), ] RERANK_MODELS = [ ########## XLMRobertaForSequenceClassification - RerankModelInfo("BAAI/bge-reranker-base", - architecture="XLMRobertaForSequenceClassification", - enable_test=True), - RerankModelInfo("BAAI/bge-reranker-large", - architecture="XLMRobertaForSequenceClassification", - enable_test=False), - RerankModelInfo("BAAI/bge-reranker-v2-m3", - architecture="XLMRobertaForSequenceClassification", - enable_test=False) + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-base", + architecture="XLMRobertaForSequenceClassification", + enable_test=True), + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-large", + architecture="XLMRobertaForSequenceClassification", + 
enable_test=False), + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-v2-m3", + architecture="XLMRobertaForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index 7fa9485dbc7f..206524d7caad 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -8,12 +8,12 @@ from tests.conftest import HfRunner -from .mteb_utils import (RerankModelInfo, VllmMtebEncoder, - mteb_test_rerank_models) +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("BAAI/bge-reranker-v2-gemma", - architecture="GemmaForSequenceClassification"), + LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", + architecture="GemmaForSequenceClassification"), ] PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501 diff --git a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py b/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py new file mode 100644 index 000000000000..e25084e77308 --- /dev/null +++ b/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: SIM117 +# Keep Decode-only SequenceClassification models support auto prefix cache +import pytest +import torch +from transformers import AutoModelForSequenceClassification + + +@pytest.mark.parametrize( + "model", + ["jason9693/Qwen2.5-1.5B-apeach"], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_decode_only_classify( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + monkeypatch, +) -> None: + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + enable_prefix_caching=True) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert cache_config.enable_prefix_caching + vllm_outputs = vllm_model.classify(example_prompts) + + with hf_runner(model, + dtype=dtype, + auto_cls=AutoModelForSequenceClassification) as hf_model: + hf_outputs = hf_model.classify(example_prompts) + + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + hf_output = torch.tensor(hf_output) + vllm_output = torch.tensor(vllm_output) + + assert torch.allclose(hf_output, vllm_output, + 1e-3 if dtype == "float" else 1e-2) + + +@pytest.mark.parametrize( + "model", ["intfloat/e5-small", "Alibaba-NLP/gte-Qwen2-1.5B-instruct"]) +@pytest.mark.parametrize("dtype", ["half"]) +def test_encode_only_classify( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + monkeypatch, +) -> None: + if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + monkeypatch.setenv("VLLM_USE_V1", "0") + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + enable_prefix_caching=True) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert not cache_config.enable_prefix_caching diff --git a/tests/models/language/pooling/test_classify_pooler_config.py b/tests/models/language/pooling/test_classify_pooler_config.py new file mode 100644 index 000000000000..19dfb8fbc746 --- /dev/null +++ b/tests/models/language/pooling/test_classify_pooler_config.py @@ -0,0 +1,48 @@ +# 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +import torch.nn.functional as F + +from vllm.config import PoolerConfig + + +@pytest.mark.parametrize( + "model", + [ + "jason9693/Qwen2.5-1.5B-apeach", + "papluca/xlm-roberta-base-language-detection" + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + override_pooler_config=PoolerConfig(softmax=False)) as vllm_model: + wo_softmax_out = vllm_model.classify(example_prompts) + + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + override_pooler_config=PoolerConfig(softmax=True)) as vllm_model: + w_softmax_out = vllm_model.classify(example_prompts) + + for wo_softmax, w_softmax in zip(wo_softmax_out, w_softmax_out): + wo_softmax = torch.tensor(wo_softmax) + w_softmax = torch.tensor(w_softmax) + + assert not torch.allclose( + wo_softmax, w_softmax, + atol=1e-2), "override_pooler_config is not working" + assert torch.allclose(F.softmax(wo_softmax, dim=-1), w_softmax, + 1e-3 if dtype == "float" else 1e-2) \ No newline at end of file diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py index 9a33063d7b46..8c1bc5779b8a 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling/test_cross_encoder.py @@ -2,13 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo, + RerankModelInfo) +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", - architecture="BertForSequenceClassification"), - RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", - architecture="Qwen3ForSequenceClassification") + CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", + architecture="BertForSequenceClassification"), + LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + architecture="Qwen3ForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 6d2eff709961..cf09c1bdccdc 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,56 +4,58 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo) +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("thenlper/gte-large", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("thenlper/gte-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small-zh", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large", + architecture="BertModel", + enable_test=True), + 
CLSPoolingEmbedModelInfo("thenlper/gte-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small-zh", + architecture="BertModel", + enable_test=False), ########### NewModel - EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", - architecture="GteNewModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", + architecture="GteNewModel", + enable_test=True), ########### Qwen2ForCausalLM - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=True), + LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + architecture="Qwen2ForCausalLM", + enable_test=True), ########## ModernBertModel - EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", - architecture="ModernBertModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base", + architecture="ModernBertModel", + enable_test=True), ########## Qwen3ForCausalLM - EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=True), - EmbedModelInfo("Qwen/Qwen3-Embedding-4B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=False), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index d899aaada262..e48bdbe940be 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -2,34 +2,34 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import EmbedModelInfo +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("intfloat/e5-small", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("intfloat/e5-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/e5-large", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-small", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/e5-small", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("intfloat/e5-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/e5-large", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small", + architecture="BertModel", + 
enable_test=False), ########## XLMRobertaModel - EmbedModelInfo("intfloat/multilingual-e5-base", - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("intfloat/multilingual-e5-large", - architecture="XLMRobertaModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-large-instruct", - architecture="XLMRobertaModel", - enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base", + architecture="XLMRobertaModel", + enable_test=True), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large", + architecture="XLMRobertaModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct", + architecture="XLMRobertaModel", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 59b634428cef..37c5bdc97dd9 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -6,20 +6,22 @@ from vllm import PoolingParams -from ...utils import EmbedModelInfo, RerankModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, RerankModelInfo) from .embed_utils import (check_embeddings_close, correctness_test_embed_models, matryoshka_fy) from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ - EmbedModelInfo("jinaai/jina-embeddings-v3", - architecture="XLMRobertaModel", - is_matryoshka=True) + CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3", + architecture="XLMRobertaModel", + is_matryoshka=True) ] RERANK_MODELS = [ - RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual", - architecture="XLMRobertaForSequenceClassification") + CLSPoolingRerankModelInfo( + "jinaai/jina-reranker-v2-base-multilingual", + architecture="XLMRobertaForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py index e74c58744dd2..480bd5e4567c 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -7,15 +7,16 @@ from tests.conftest import HfRunner -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=True), - RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=False) + LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=True), + LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/utils.py b/tests/models/utils.py index bda7ea3e3ad5..0a6b2b59e853 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -343,11 +343,29 @@ class EmbedModelInfo(NamedTuple): matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "LAST" + + class RerankModelInfo(NamedTuple): name: str architecture: str = "" dtype: str = "auto" + 
default_pooling_type: str = "" enable_test: bool = True + + +class CLSPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "LAST" diff --git a/tests/test_config.py b/tests/test_config.py index 441c07b99acf..4989ce31838f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -249,6 +249,20 @@ def test_get_pooling_config_from_args(): assert asdict(pooling_config) == asdict(override_pooler_config) +@pytest.mark.parametrize( + ("model_id", "default_pooling_type", "pooling_type"), + [ + ("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST", "LAST"), # LLM + ("intfloat/e5-small", "CLS", "MEAN"), # BertModel + ("Qwen/Qwen2.5-Math-RM-72B", "ALL", "ALL"), # reward + ("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP") # step reward + ]) +def test_default_pooling_type(model_id, default_pooling_type, pooling_type): + model_config = ModelConfig(model_id) + assert model_config._model_info.default_pooling_type == default_pooling_type + assert model_config.pooler_config.pooling_type == pooling_type + + @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_bert_tokenization_sentence_transformer_config(): diff --git a/vllm/config.py b/vllm/config.py index 899862bf541e..9b7f09a12ccd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -913,6 +913,9 @@ def _init_pooler_config(self) -> Optional["PoolerConfig"]: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) + if pooler_config.pooling_type is None: + pooler_config.pooling_type = self._model_info.default_pooling_type + return pooler_config return None @@ -1744,6 +1747,18 @@ def use_pad_token(self) -> bool: # `llm as reranker` models defaults to not using pad_token. return getattr(self.hf_config, "use_pad_token", True) + @property + def attn_type(self) -> Optional[str]: + if self.is_attention_free: + return None + if self.is_encoder_decoder: + return "encoder_decoder" + if self._model_info.default_pooling_type == "CLS" or not getattr( + self.hf_config, "is_causal", True): + return "encoder_only" + else: + return "decoder" + def get_and_verify_max_len(self, max_model_len: int): # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. 
@@ -4832,12 +4847,12 @@ def __post_init__(self): disable_chunked_prefill_reasons: list[str] = [] - if self.model_config and self.model_config.pooler_config: - pooling_type = self.model_config.pooler_config.pooling_type - if pooling_type is None or pooling_type.lower() != "last": + if self.model_config and self.model_config.runner_type == "pooling": + attn_type = self.model_config.attn_type + if attn_type != "decoder": disable_chunked_prefill_reasons.append( - "Only \"last\" pooling supports chunked " - "prefill and prefix caching; disabling both.") + "Chunked prefill and prefix caching are only available " + "with attn_type='decoder';disabling both.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 0f2e58eb9b5d..4f43a8681322 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -44,15 +44,14 @@ class ResolvedPoolingConfig: task: PoolingTask @classmethod - def from_config_with_defaults( + def from_config( cls, task: PoolingTask, pooler_config: PoolerConfig, - pooling_type: PoolingType, ) -> "ResolvedPoolingConfig": + assert pooler_config.pooling_type is not None return cls(task=task, - pooling_type=PoolingType[pooler_config.pooling_type] - if pooler_config.pooling_type is not None else pooling_type) + pooling_type=PoolingType[pooler_config.pooling_type]) @dataclass(frozen=True) @@ -68,32 +67,20 @@ class Pooler(nn.Module, ABC): """The interface required for all poolers used in pooling models in vLLM.""" @staticmethod - def for_encode( - pooler_config: PoolerConfig, - *, - default_pooling_type: PoolingType = PoolingType.ALL, - ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( - task="encode", - pooler_config=pooler_config, - pooling_type=default_pooling_type, - ) - - if resolved_config.pooling_type == PoolingType.STEP: + def for_encode(pooler_config: PoolerConfig, ): + if pooler_config.pooling_type == PoolingType.STEP: return StepPooler() + resolved_config = ResolvedPoolingConfig(task="encode", + pooling_type=PoolingType.ALL) + return SimplePooler.from_config(resolved_config) @staticmethod - def for_embed( - pooler_config: PoolerConfig, - *, - default_pooling_type: PoolingType = PoolingType.LAST, - ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + def for_embed(pooler_config: PoolerConfig, ): + resolved_config = ResolvedPoolingConfig.from_config( task="embed", pooler_config=pooler_config, - pooling_type=default_pooling_type, ) return SimplePooler.from_config(resolved_config) @@ -102,13 +89,10 @@ def for_embed( def for_classify( pooler_config: PoolerConfig, classifier: Optional[ClassifierFn], - *, - default_pooling_type: PoolingType = PoolingType.LAST, ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + resolved_config = ResolvedPoolingConfig.from_config( task="classify", pooler_config=pooler_config, - pooling_type=default_pooling_type, ) pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 867de2c68b4c..1dbe70f84a62 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -182,8 +182,8 @@ def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): assert pooler_config is not None pooling_type_str = pooler_config.pooling_type - pooling_type = (PoolingType.LAST if pooling_type_str is None 
else - PoolingType[pooling_type_str]) + assert pooling_type_str is not None + pooling_type = PoolingType[pooling_type_str] self.pooler = DispatchPooler({ "encode": diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 504621c8abd8..f8c28594fb67 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,7 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only +from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -411,6 +411,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params +@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -441,6 +442,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params +@default_pooling_type("CLS") class BertEmbeddingModel(nn.Module, SupportsQuant): """A model that uses Bert to provide embedding functionalities. @@ -502,12 +504,12 @@ def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: Pooler.for_encode(pooler_config), "embed": Pooler.for_embed( - pooler_config, - default_pooling_type=PoolingType.CLS, + pooler_config ), }) +@default_pooling_type("CLS") class BertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding, SupportsQuant): """A model that uses Bert to provide embedding functionalities. diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 59033cb74a33..21293bc347e8 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -25,7 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsQuant +from vllm.model_executor.models.interfaces import SupportsQuant, default_pooling_type from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -391,6 +391,7 @@ def forward( return hidden_states +@default_pooling_type("CLS") class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b6d9877cd01b..8c0e0efe8543 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -641,6 +641,19 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) +def default_pooling_type(pooling_type): + # set default_pooling_type decorator + def func(model): + model.default_pooling_type = pooling_type + return model + + return func + + +def get_default_pooling_type(model: Union[type[object], object]) -> str: + return getattr(model, "default_pooling_type", "LAST") + + class SupportsQuant: """The interface required for all models that support quantization.""" diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index ab21b7ce2c5f..89692e7a54e5 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -591,7 +591,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
"classify": Pooler.for_classify( pooler_config, - classifier=self.score, - default_pooling_type=PoolingType.LAST, + classifier=self.score ), }) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 4967032a244e..267a0c06b68c 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -25,7 +25,7 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import SupportsCrossEncoding, SupportsV0Only, default_pooling_type from .utils import WeightsMapper, maybe_prefix @@ -199,6 +199,7 @@ def forward( return hidden_states +@default_pooling_type("CLS") class ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) @@ -264,7 +265,6 @@ def __init__(self, config: ModernBertConfig): self.pooling = PoolingMethod.from_pooling_type(pooling_type) self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias) - self.pooling_type = config.classifier_pooling self.act = nn.GELU() self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, @@ -294,6 +294,7 @@ def forward( return pooled_output +@default_pooling_type("CLS") class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding): diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 9b6b70c75c34..fb408993fe02 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -19,7 +19,7 @@ PoolingType) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix @@ -90,6 +90,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loader.load_weights(weights) +@default_pooling_type("ALL") class Qwen2ForRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -103,6 +104,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): {"encode": Pooler.for_encode(pooler_config)}, ) +@default_pooling_type("STEP") class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -115,7 +117,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pooler = DispatchPooler({ "encode": Pooler.for_encode( - pooler_config, - default_pooling_type=PoolingType.STEP, + pooler_config ) }) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 9b6ab52d8680..049719818456 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -28,7 +28,7 @@ from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, - supports_pp, supports_transcription, supports_v0_only) + supports_pp, supports_transcription, supports_v0_only, get_default_pooling_type) from .interfaces_base import is_pooling_model, is_text_generation_model logger = init_logger(__name__) @@ -303,6 +303,7 @@ class _ModelInfo: architecture: str is_text_generation_model: bool is_pooling_model: bool + default_pooling_type: str supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input: bool @@ -321,6 
+322,7 @@ def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), is_pooling_model=is_pooling_model(model), + default_pooling_type=get_default_pooling_type(model), supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 77e072c79275..9b77052e960f 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -20,7 +20,7 @@ from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import SupportsCrossEncoding, SupportsV0Only, default_pooling_type class RobertaEmbedding(nn.Module): @@ -88,6 +88,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x +@default_pooling_type("CLS") class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. @@ -153,6 +154,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): return loader.load_weights(weights_list, mapper=mapper) +@default_pooling_type("CLS") class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsV0Only): """A model that uses Roberta to provide embedding functionalities. From 4d21759b08026a7d63eb24a8f4b60d87d10f5a40 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 5 Aug 2025 17:22:58 +0800 Subject: [PATCH 02/34] fix Signed-off-by: wang.yuqi --- ...test_classify_auto_prefix_cache_support.py | 6 +-- .../pooling/test_classify_pooler_config.py | 48 ------------------- 2 files changed, 1 insertion(+), 53 deletions(-) delete mode 100644 tests/models/language/pooling/test_classify_pooler_config.py diff --git a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py b/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py index e25084e77308..e26639280bc6 100644 --- a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py @@ -50,12 +50,8 @@ def test_encode_only_classify( vllm_runner, example_prompts, model: str, - dtype: str, - monkeypatch, + dtype: str ) -> None: - if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - monkeypatch.setenv("VLLM_USE_V1", "0") - with vllm_runner(model, max_model_len=512, dtype=dtype, diff --git a/tests/models/language/pooling/test_classify_pooler_config.py b/tests/models/language/pooling/test_classify_pooler_config.py deleted file mode 100644 index 19dfb8fbc746..000000000000 --- a/tests/models/language/pooling/test_classify_pooler_config.py +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest -import torch -import torch.nn.functional as F - -from vllm.config import PoolerConfig - - -@pytest.mark.parametrize( - "model", - [ - "jason9693/Qwen2.5-1.5B-apeach", - "papluca/xlm-roberta-base-language-detection" - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, -) -> None: - - with vllm_runner( - model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig(softmax=False)) as vllm_model: - wo_softmax_out = 
vllm_model.classify(example_prompts) - - with vllm_runner( - model, - max_model_len=512, - dtype=dtype, - override_pooler_config=PoolerConfig(softmax=True)) as vllm_model: - w_softmax_out = vllm_model.classify(example_prompts) - - for wo_softmax, w_softmax in zip(wo_softmax_out, w_softmax_out): - wo_softmax = torch.tensor(wo_softmax) - w_softmax = torch.tensor(w_softmax) - - assert not torch.allclose( - wo_softmax, w_softmax, - atol=1e-2), "override_pooler_config is not working" - assert torch.allclose(F.softmax(wo_softmax, dim=-1), w_softmax, - 1e-3 if dtype == "float" else 1e-2) \ No newline at end of file From 964560b085e372b34bf313150dc0e5a5630c5f2c Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 5 Aug 2025 18:25:53 +0800 Subject: [PATCH 03/34] fix Signed-off-by: wang.yuqi --- .../test_classify_auto_prefix_cache_support.py | 9 ++------- vllm/config.py | 9 +++++++-- vllm/model_executor/layers/pooler.py | 2 +- vllm/model_executor/models/bert.py | 11 ++++------- vllm/model_executor/models/bert_with_rope.py | 3 ++- vllm/model_executor/models/jamba.py | 8 ++------ vllm/model_executor/models/modernbert.py | 3 ++- vllm/model_executor/models/qwen2_rm.py | 11 +++-------- vllm/model_executor/models/registry.py | 6 +++--- vllm/model_executor/models/roberta.py | 3 ++- 10 files changed, 28 insertions(+), 37 deletions(-) diff --git a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py b/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py index e26639280bc6..5dcce92aad7c 100644 --- a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py @@ -45,13 +45,8 @@ def test_decode_only_classify( @pytest.mark.parametrize( "model", ["intfloat/e5-small", "Alibaba-NLP/gte-Qwen2-1.5B-instruct"]) @pytest.mark.parametrize("dtype", ["half"]) -def test_encode_only_classify( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str -) -> None: +def test_encode_only_classify(hf_runner, vllm_runner, example_prompts, + model: str, dtype: str) -> None: with vllm_runner(model, max_model_len=512, dtype=dtype, diff --git a/vllm/config.py b/vllm/config.py index 9b7f09a12ccd..88e9eda28908 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -913,8 +913,9 @@ def _init_pooler_config(self) -> Optional["PoolerConfig"]: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) + default_pooling_type = self._model_info.default_pooling_type if pooler_config.pooling_type is None: - pooler_config.pooling_type = self._model_info.default_pooling_type + pooler_config.pooling_type = default_pooling_type return pooler_config @@ -1756,8 +1757,12 @@ def attn_type(self) -> Optional[str]: if self._model_info.default_pooling_type == "CLS" or not getattr( self.hf_config, "is_causal", True): return "encoder_only" - else: + elif self._model_info.default_pooling_type == "LAST": return "decoder" + else: + # default_pooling_type == "ALL" and "STEP" + # is not supported temporarily + return None def get_and_verify_max_len(self, max_model_len: int): # Consider max_model_len in tokenizer_config only when diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 4f43a8681322..1b089f96ffef 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -68,7 +68,7 @@ class Pooler(nn.Module, ABC): @staticmethod def for_encode(pooler_config: PoolerConfig, ): - if pooler_config.pooling_type == PoolingType.STEP: + if 
pooler_config.pooling_type == "STEP": return StepPooler() resolved_config = ResolvedPoolingConfig(task="encode", diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index f8c28594fb67..11e28d8516ad 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,8 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only, default_pooling_type +from .interfaces import (SupportsCrossEncoding, SupportsQuant, SupportsV0Only, + default_pooling_type) from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -500,12 +501,8 @@ def _build_model(self, def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler({ - "encode": - Pooler.for_encode(pooler_config), - "embed": - Pooler.for_embed( - pooler_config - ), + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), }) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 21293bc347e8..8f5000bec9ea 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -25,7 +25,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsQuant, default_pooling_type +from vllm.model_executor.models.interfaces import (SupportsQuant, + default_pooling_type) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 89692e7a54e5..f1ba6cbc776d 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -19,8 +19,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -589,8 +588,5 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): "encode": Pooler.for_encode(pooler_config), "classify": - Pooler.for_classify( - pooler_config, - classifier=self.score - ), + Pooler.for_classify(pooler_config, classifier=self.score), }) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 267a0c06b68c..fa7245aa4af2 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -25,7 +25,8 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsV0Only, default_pooling_type +from .interfaces import (SupportsCrossEncoding, SupportsV0Only, + default_pooling_type) from .utils import WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index fb408993fe02..e0a30e04c602 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ 
b/vllm/model_executor/models/qwen2_rm.py @@ -15,8 +15,7 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type @@ -114,9 +113,5 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - self.pooler = DispatchPooler({ - "encode": - Pooler.for_encode( - pooler_config - ) - }) + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 049719818456..02fb7d47f36a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -25,10 +25,10 @@ from vllm.transformers_utils.dynamic_module import ( try_get_class_from_dynamic_module) -from .interfaces import (has_inner_state, has_noops, is_attention_free, - is_hybrid, supports_cross_encoding, +from .interfaces import (get_default_pooling_type, has_inner_state, has_noops, + is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, - supports_pp, supports_transcription, supports_v0_only, get_default_pooling_type) + supports_pp, supports_transcription, supports_v0_only) from .interfaces_base import is_pooling_model, is_text_generation_model logger = init_logger(__name__) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 9b77052e960f..dd79d89d8c68 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -20,7 +20,8 @@ from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, SupportsV0Only, default_pooling_type +from .interfaces import (SupportsCrossEncoding, SupportsV0Only, + default_pooling_type) class RobertaEmbedding(nn.Module): From 32411ce3150f792f9e02dec63e475e9f37a7183e Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 5 Aug 2025 18:33:08 +0800 Subject: [PATCH 04/34] fix pooling Signed-off-by: wang.yuqi --- vllm/model_executor/models/internlm2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d29779a35e5c..d0c4bf5450d6 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -401,6 +401,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params +@default_pooling_type("ALL") class InternLM2ForRewardModel(InternLM2ForCausalLM): is_pooling_model = True From a97a004a50d0085ecb693da73520a22b7b998bd2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 10:53:43 +0800 Subject: [PATCH 05/34] turn off encode Signed-off-by: wang.yuqi --- ...t.py => test_auto_prefix_cache_support.py} | 51 
++++++++++++-- tests/models/language/pooling/test_nomic.py | 27 ++++---- .../pooling/test_snowflake_arctic_embed.py | 67 ++++++++++--------- vllm/config.py | 30 +++------ vllm/entrypoints/llm.py | 6 ++ vllm/entrypoints/openai/api_server.py | 3 +- 6 files changed, 110 insertions(+), 74 deletions(-) rename tests/models/language/pooling/{test_classify_auto_prefix_cache_support.py => test_auto_prefix_cache_support.py} (56%) diff --git a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py similarity index 56% rename from tests/models/language/pooling/test_classify_auto_prefix_cache_support.py rename to tests/models/language/pooling/test_auto_prefix_cache_support.py index 5dcce92aad7c..5efed4f83bde 100644 --- a/tests/models/language/pooling/test_classify_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -1,26 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: SIM117 # Keep Decode-only SequenceClassification models support auto prefix cache import pytest import torch from transformers import AutoModelForSequenceClassification +from tests.models.language.pooling.embed_utils import ( + run_embedding_correctness_test) + @pytest.mark.parametrize( "model", ["jason9693/Qwen2.5-1.5B-apeach"], ) @pytest.mark.parametrize("dtype", ["half"]) -def test_decode_only_classify( +def test_classify_models( hf_runner, vllm_runner, example_prompts, model: str, dtype: str, - monkeypatch, ) -> None: + example_prompts = example_prompts * 2 + with vllm_runner(model, max_model_len=512, dtype=dtype, @@ -43,10 +46,46 @@ def test_decode_only_classify( @pytest.mark.parametrize( - "model", ["intfloat/e5-small", "Alibaba-NLP/gte-Qwen2-1.5B-instruct"]) + "model", + ["Qwen/Qwen3-Embedding-0.6B"], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_embed_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +): + example_prompts = [str(s).strip() for s in example_prompts] * 2 + + with vllm_runner( + model, + runner="pooling", + max_model_len=None, + enable_prefix_caching=True, + ) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert cache_config.enable_prefix_caching + vllm_outputs = vllm_model.embed(example_prompts) + + with hf_runner( + model, + is_sentence_transformer=True, + ) as hf_model: + run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/e5-small", + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False + "papluca/xlm-roberta-base-language-detection", + ]) @pytest.mark.parametrize("dtype", ["half"]) -def test_encode_only_classify(hf_runner, vllm_runner, example_prompts, - model: str, dtype: str) -> None: +def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str) -> None: with vllm_runner(model, max_model_len=512, dtype=dtype, diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index e16ec239a338..2d05958e9bcd 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -3,22 +3,23 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from .embed_utils import correctness_test_embed_models from .mteb_utils import 
mteb_test_embed_models MODELS = [ - EmbedModelInfo("nomic-ai/nomic-embed-text-v1", - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/CodeRankEmbed", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", - architecture="NomicBertModel", - enable_test=True) + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1", + architecture="NomicBertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", + architecture="NomicBertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed", + architecture="NomicBertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", + architecture="NomicBertModel", + enable_test=True) ] diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index d6b5dbd08372..585fa0e683da 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -3,42 +3,43 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", - is_matryoshka=False, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-s", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", - is_matryoshka=False, - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", - is_matryoshka=True, - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", - is_matryoshka=True, - architecture="GteModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", + is_matryoshka=False, + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", + is_matryoshka=False, + architecture="NomicBertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", + is_matryoshka=True, + architecture="XLMRobertaModel", + enable_test=True), + 
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", + is_matryoshka=True, + architecture="GteModel", + enable_test=True), ] diff --git a/vllm/config.py b/vllm/config.py index 88e9eda28908..1a248e773f58 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1748,22 +1748,6 @@ def use_pad_token(self) -> bool: # `llm as reranker` models defaults to not using pad_token. return getattr(self.hf_config, "use_pad_token", True) - @property - def attn_type(self) -> Optional[str]: - if self.is_attention_free: - return None - if self.is_encoder_decoder: - return "encoder_decoder" - if self._model_info.default_pooling_type == "CLS" or not getattr( - self.hf_config, "is_causal", True): - return "encoder_only" - elif self._model_info.default_pooling_type == "LAST": - return "decoder" - else: - # default_pooling_type == "ALL" and "STEP" - # is not supported temporarily - return None - def get_and_verify_max_len(self, max_model_len: int): # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. @@ -4852,12 +4836,16 @@ def __post_init__(self): disable_chunked_prefill_reasons: list[str] = [] - if self.model_config and self.model_config.runner_type == "pooling": - attn_type = self.model_config.attn_type - if attn_type != "decoder": + if self.model_config and self.model_config.pooler_config: + pooling_type = self.model_config.pooler_config.pooling_type + if pooling_type is None or pooling_type.lower() != "last": + disable_chunked_prefill_reasons.append( + "Only \"last\" pooling supports chunked " + "prefill and prefix caching; disabling both.") + elif not getattr(self.model_config.hf_config, "is_causal", True): disable_chunked_prefill_reasons.append( - "Chunked prefill and prefix caching are only available " - "with attn_type='decoder';disabling both.") + "Only models using causal attention supports chunked " + "prefill and prefix caching; disabling both.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index ca24b0c32b73..fac83e822cd8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1096,6 +1096,12 @@ def encode( "Try passing `--runner pooling` to use the model as a " "pooling model.") + if (pooling_task == "encode" + and self.llm_engine.cache_config.enable_prefix_caching): + raise ValueError("LLM.encode() uses ALL pooling, which does " + "not support prefix_caching. 
" + "Please turn off prefix_caching before using it.") + if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( prompts=cast(Optional[Union[str, list[str]]], prompts), diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9bf470232078..eba9adef1551 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1691,7 +1691,8 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if "encode" in supported_tasks else None + ) if ("encode" in supported_tasks + and not vllm_config.cache_config.enable_prefix_caching) else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, From 3f70b32189fb768be426a4df9e718189c1f42569 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 10:59:32 +0800 Subject: [PATCH 06/34] conflicts Signed-off-by: wang.yuqi --- tests/models/utils.py | 18 ------------------ vllm/model_executor/models/jamba.py | 11 +++++++++-- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index 0a6b2b59e853..bda7ea3e3ad5 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -343,29 +343,11 @@ class EmbedModelInfo(NamedTuple): matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" dtype: str = "auto" - default_pooling_type: str = "" enable_test: bool = True -class CLSPoolingEmbedModelInfo(EmbedModelInfo): - default_pooling_type: str = "CLS" - - -class LASTPoolingEmbedModelInfo(EmbedModelInfo): - default_pooling_type: str = "LAST" - - class RerankModelInfo(NamedTuple): name: str architecture: str = "" dtype: str = "auto" - default_pooling_type: str = "" enable_test: bool = True - - -class CLSPoolingRerankModelInfo(RerankModelInfo): - default_pooling_type: str = "CLS" - - -class LASTPoolingRerankModelInfo(RerankModelInfo): - default_pooling_type: str = "LAST" diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index f1ba6cbc776d..263f4c8379cf 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -19,7 +19,8 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer -from vllm.model_executor.layers.pooler import DispatchPooler, Pooler +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -588,5 +589,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): "encode": Pooler.for_encode(pooler_config), "classify": - Pooler.for_classify(pooler_config, classifier=self.score), + Pooler.for_classify( + pooler_config, + classifier=self.score, + default_pooling_type=PoolingType.LAST, + default_normalize=False, + default_softmax=False, + ), }) From 384f4061517ca8e6e910698426a7e6234868c5f2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 11:04:26 +0800 Subject: [PATCH 07/34] fix Signed-off-by: wang.yuqi --- tests/models/utils.py | 18 ++++++++++++++++++ vllm/model_executor/models/jamba.py | 6 +----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index 
1513db52209e..eff78535e371 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -344,16 +344,34 @@ class EmbedModelInfo(NamedTuple): matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "LAST" + + class RerankModelInfo(NamedTuple): name: str architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "LAST" + + def dummy_hf_overrides( hf_config: PretrainedConfig, model_arch: str, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 8a9efd4d7247..fbd310121ad4 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -22,8 +22,7 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateShapeCalculator) -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -604,8 +603,5 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): Pooler.for_classify( pooler_config, classifier=self.score, - default_pooling_type=PoolingType.LAST, - default_normalize=False, - default_softmax=False, ), }) From 8a94d1c5b2cb915c135f47c86d54fef894f943b2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 14:10:41 +0800 Subject: [PATCH 08/34] update Signed-off-by: wang.yuqi --- vllm/engine/arg_utils.py | 6 ++---- vllm/entrypoints/llm.py | 9 +++++---- vllm/entrypoints/openai/api_server.py | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3e2f03d56c40..728358031090 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1596,11 +1596,9 @@ def _set_default_args_v1(self, usage_context: UsageContext, else: pooling_type = model_config.pooler_config.pooling_type - - # TODO: when encoder models are supported we'll have to - # check for causal attention here. + is_causal = getattr(model_config.hf_config, "is_causal", True) incremental_prefill_supported = (pooling_type is not None and - pooling_type.lower() == "last") + pooling_type.lower() == "last" and is_causal) action = "Enabling" if \ incremental_prefill_supported else "Disabling" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fac83e822cd8..1a006a999263 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1096,11 +1096,12 @@ def encode( "Try passing `--runner pooling` to use the model as a " "pooling model.") - if (pooling_task == "encode" - and self.llm_engine.cache_config.enable_prefix_caching): + if (pooling_task == "encode" and self.llm_engine.vllm_config. + scheduler_config.chunked_prefill_enabled): raise ValueError("LLM.encode() uses ALL pooling, which does " - "not support prefix_caching. 
" - "Please turn off prefix_caching before using it.") + "not support chunked prefill. " + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fa0ecfb3872d..9f9a31e1ce21 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1698,8 +1698,8 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if ("encode" in supported_tasks - and not vllm_config.cache_config.enable_prefix_caching) else None + ) if ("encode" in supported_tasks and + not vllm_config.scheduler_config.chunked_prefill_enabled) else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, From f9d7017bf3086c532e833df323fc50e0f57b2fca Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 14:13:17 +0800 Subject: [PATCH 09/34] conflicts Signed-off-by: wang.yuqi --- tests/models/language/pooling/test_gte.py | 86 +++++++++++------------ 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index cf09c1bdccdc..6d2eff709961 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,58 +4,56 @@ import pytest -from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo) -from .embed_utils import correctness_test_embed_models +from .embed_utils import EmbedModelInfo, correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - CLSPoolingEmbedModelInfo("thenlper/gte-large", - architecture="BertModel", - enable_test=True), - CLSPoolingEmbedModelInfo("thenlper/gte-base", - architecture="BertModel", - enable_test=False), - CLSPoolingEmbedModelInfo("thenlper/gte-small", - architecture="BertModel", - enable_test=False), - CLSPoolingEmbedModelInfo("thenlper/gte-large-zh", - architecture="BertModel", - enable_test=False), - CLSPoolingEmbedModelInfo("thenlper/gte-base-zh", - architecture="BertModel", - enable_test=False), - CLSPoolingEmbedModelInfo("thenlper/gte-small-zh", - architecture="BertModel", - enable_test=False), + EmbedModelInfo("thenlper/gte-large", + architecture="BertModel", + enable_test=True), + EmbedModelInfo("thenlper/gte-base", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("thenlper/gte-small", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("thenlper/gte-large-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("thenlper/gte-base-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("thenlper/gte-small-zh", + architecture="BertModel", + enable_test=False), ########### NewModel - CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", - architecture="GteNewModel", - enable_test=True), - CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", - architecture="GteNewModel", - enable_test=True), - CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", - architecture="GteNewModel", - enable_test=True), + EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", + architecture="GteNewModel", + enable_test=True), + EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", + architecture="GteNewModel", + enable_test=True), + 
EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", + architecture="GteNewModel", + enable_test=True), ########### Qwen2ForCausalLM - LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=True), + EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + architecture="Qwen2ForCausalLM", + enable_test=True), ########## ModernBertModel - CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base", - architecture="ModernBertModel", - enable_test=True), + EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", + architecture="ModernBertModel", + enable_test=True), ########## Qwen3ForCausalLM - LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=True), - LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=False), + EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True), + EmbedModelInfo("Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False), ] From 6638ae05b94c451d81ce77df7be0c425670919df Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 14:17:25 +0800 Subject: [PATCH 10/34] fix Signed-off-by: wang.yuqi --- tests/models/language/pooling/test_gte.py | 87 ++++++++++++----------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 48a0cd64fec1..5a5fdfbb214c 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,57 +4,58 @@ import pytest -from ...utils import check_transformers_version -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, check_transformers_version) +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("thenlper/gte-large", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("thenlper/gte-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small-zh", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("thenlper/gte-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small-zh", + architecture="BertModel", + enable_test=False), ########### NewModel - EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", - architecture="GteNewModel", - 
enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", + architecture="GteNewModel", + enable_test=True), ########### Qwen2ForCausalLM - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=True), + LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + architecture="Qwen2ForCausalLM", + enable_test=True), ########## ModernBertModel - EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", - architecture="ModernBertModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base", + architecture="ModernBertModel", + enable_test=True), ########## Qwen3ForCausalLM - EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=True), - EmbedModelInfo("Qwen/Qwen3-Embedding-4B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=False), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False), ] From bf14fc43a7ffe8538599f63f2c33abf356b5879b Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 14:52:05 +0800 Subject: [PATCH 11/34] + tests Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 8 ++++++++ tests/entrypoints/openai/test_classification.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index abdce8935ea5..b81cedb27b90 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -65,3 +65,11 @@ def get_outputs(activation): assert torch.allclose( softmax(wo_activation), w_activation, atol=1e-2 ), "w_activation should be close to activation(wo_activation)." + + +@pytest.mark.skip_global_cleanup +def test_encode_api(llm: LLM): + err_msg = ("LLM.encode\(\) uses ALL pooling, which does " + "not support chunked prefill.+") + with pytest.raises(ValueError, match=err_msg): + llm.encode(prompts, use_tqdm=False) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 886267c21124..700203247f1a 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -211,3 +211,18 @@ async def get_outputs(activation): assert torch.allclose( F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2 ), "w_activation should be close to activation(wo_activation)." + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_pooling(server: RemoteOpenAIServer, model_name: str): + # pooling api uses ALL pooling, which does not support chunked prefill. 
+ response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float" + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" \ No newline at end of file From bc2753c252f53d1b9825899531add7a77d1abf44 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 15:15:13 +0800 Subject: [PATCH 12/34] fix Signed-off-by: wang.yuqi --- tests/entrypoints/openai/test_classification.py | 2 +- vllm/engine/arg_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 700203247f1a..30078fe90257 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -225,4 +225,4 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str): "encoding_format": "float" }, ) - assert response.json()["error"]["type"] == "BadRequestError" \ No newline at end of file + assert response.json()["error"]["type"] == "BadRequestError" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 728358031090..15745e030de1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1597,8 +1597,9 @@ def _set_default_args_v1(self, usage_context: UsageContext, pooling_type = model_config.pooler_config.pooling_type is_causal = getattr(model_config.hf_config, "is_causal", True) - incremental_prefill_supported = (pooling_type is not None and - pooling_type.lower() == "last" and is_causal) + incremental_prefill_supported = (pooling_type is not None + and pooling_type.lower() == "last" + and is_causal) action = "Enabling" if \ incremental_prefill_supported else "Disabling" From fef73eaf42fd4bce4c8e48dc53cfe5690d5fb886 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 16:42:44 +0800 Subject: [PATCH 13/34] fix Signed-off-by: wang.yuqi --- .../pooling/test_auto_prefix_cache_support.py | 1 - .../language/pooling/test_qwen3_reranker.py | 15 ++++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py index 5efed4f83bde..15e24c59d1dd 100644 --- a/tests/models/language/pooling/test_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Keep Decode-only SequenceClassification models support auto prefix cache import pytest import torch from transformers import AutoModelForSequenceClassification diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 68e96f32700c..37f5566a330d 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -8,15 +8,16 @@ from tests.conftest import HfRunner from tests.utils import multi_gpu_test -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("Qwen/Qwen3-Reranker-0.6B", - architecture="Qwen3ForSequenceClassification", - enable_test=True), - RerankModelInfo("Qwen/Qwen3-Reranker-4B", - architecture="Qwen3ForSequenceClassification", - enable_test=False) + 
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B", + architecture="Qwen3ForSequenceClassification", + enable_test=True), + LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B", + architecture="Qwen3ForSequenceClassification", + enable_test=False) ] From ec212d76363142d83c3a32f24e1d89dfbb07633f Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 15:41:16 +0800 Subject: [PATCH 14/34] supported_tasks.remove("encode") Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 3 +-- vllm/config.py | 10 ++++++++++ vllm/entrypoints/llm.py | 15 +++------------ vllm/entrypoints/openai/api_server.py | 10 ++-------- vllm/model_executor/layers/pooler.py | 4 ++-- 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index b81cedb27b90..afae8ed1ff2d 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -69,7 +69,6 @@ def get_outputs(activation): @pytest.mark.skip_global_cleanup def test_encode_api(llm: LLM): - err_msg = ("LLM.encode\(\) uses ALL pooling, which does " - "not support chunked prefill.+") + err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): llm.encode(prompts, use_tqdm=False) diff --git a/vllm/config.py b/vllm/config.py index a83dc0b98fd8..17c28b5fcc85 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4883,6 +4883,15 @@ def __post_init__(self): if self.cache_config is not None: self.cache_config.enable_prefix_caching = False + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in self.model_config.supported_tasks): + self.model_config.supported_tasks.remove("encode") + + logger.info("Chunked prefill is not supported with " + "encode task which using ALL pooling. " + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): @@ -5105,6 +5114,7 @@ def __str__(self): f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"use_async_output_proc={self.model_config.use_async_output_proc}, " f"pooler_config={self.model_config.pooler_config!r}, " + f"supported_tasks ={self.model_config.supported_tasks !r}, " f"compilation_config={self.compilation_config!r}") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1a006a999263..771bbf1859b7 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -281,12 +281,7 @@ def __init__( self.request_counter = Counter() self.default_sampling_params: Union[dict[str, Any], None] = None - if envs.VLLM_USE_V1: - supported_tasks = self.llm_engine \ - .get_supported_tasks() # type: ignore - else: - supported_tasks = self.llm_engine.model_config.supported_tasks - + supported_tasks = self.llm_engine.model_config.supported_tasks logger.info("Supported_tasks: %s", supported_tasks) self.supported_tasks = supported_tasks @@ -1096,12 +1091,8 @@ def encode( "Try passing `--runner pooling` to use the model as a " "pooling model.") - if (pooling_task == "encode" and self.llm_engine.vllm_config. - scheduler_config.chunked_prefill_enabled): - raise ValueError("LLM.encode() uses ALL pooling, which does " - "not support chunked prefill. 
" - "Please turn off chunked prefill by " - "`--no-enable-chunked-prefill` before using it.") + if pooling_task not in self.supported_tasks: + raise ValueError(f"pooling_task must be one of {self.supported_tasks}.") if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6d9c06d082ee..d19e15551012 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1600,12 +1600,7 @@ async def init_app_state( state.log_stats = not args.disable_log_stats state.vllm_config = vllm_config model_config = vllm_config.model_config - - if envs.VLLM_USE_V1: - supported_tasks = await engine_client \ - .get_supported_tasks() # type: ignore - else: - supported_tasks = model_config.supported_tasks + supported_tasks = model_config.supported_tasks logger.info("Supported_tasks: %s", supported_tasks) @@ -1710,8 +1705,7 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if ("encode" in supported_tasks and - not vllm_config.scheduler_config.chunked_prefill_enabled) else None + ) if "encode" in supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 1b089f96ffef..e2162e5cbf95 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -67,7 +67,7 @@ class Pooler(nn.Module, ABC): """The interface required for all poolers used in pooling models in vLLM.""" @staticmethod - def for_encode(pooler_config: PoolerConfig, ): + def for_encode(pooler_config: PoolerConfig): if pooler_config.pooling_type == "STEP": return StepPooler() @@ -77,7 +77,7 @@ def for_encode(pooler_config: PoolerConfig, ): return SimplePooler.from_config(resolved_config) @staticmethod - def for_embed(pooler_config: PoolerConfig, ): + def for_embed(pooler_config: PoolerConfig): resolved_config = ResolvedPoolingConfig.from_config( task="embed", pooler_config=pooler_config, From a8ed919ec9ebfbadaf2fbac0d241925031178238 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 16:16:28 +0800 Subject: [PATCH 15/34] fix Signed-off-by: wang.yuqi --- vllm/entrypoints/llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 771bbf1859b7..16142974b3b4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -13,7 +13,6 @@ from tqdm.auto import tqdm from typing_extensions import TypeVar, deprecated -import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) @@ -1092,7 +1091,8 @@ def encode( "pooling model.") if pooling_task not in self.supported_tasks: - raise ValueError(f"pooling_task must be one of {self.supported_tasks}.") + raise ValueError( + f"pooling_task must be one of {self.supported_tasks}.") if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( From 7b4277fd6cd509b283576da833fc72fc344ec5a9 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:21:17 +0800 Subject: [PATCH 16/34] set model_config.supported_tasks inside model runner Signed-off-by: wang.yuqi --- vllm/config.py | 9 +++++++++ vllm/entrypoints/llm.py | 8 +++++++- vllm/entrypoints/openai/api_server.py | 7 ++++++- 
vllm/v1/worker/gpu_model_runner.py | 8 +++++++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e67b9834f47c..9cf146a0c0ae 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4887,6 +4887,15 @@ def __post_init__(self): "Only models using causal attention supports chunked " "prefill and prefix caching; disabling both.") + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in self.model_config.supported_tasks): + self.model_config.supported_tasks.remove("encode") + + logger.info("Chunked prefill is not supported with " + "encode task which using ALL pooling. " + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: logger.info(reason) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 16142974b3b4..5c6889a88925 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -13,6 +13,7 @@ from tqdm.auto import tqdm from typing_extensions import TypeVar, deprecated +from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) @@ -280,7 +281,12 @@ def __init__( self.request_counter = Counter() self.default_sampling_params: Union[dict[str, Any], None] = None - supported_tasks = self.llm_engine.model_config.supported_tasks + if envs.VLLM_USE_V1: + supported_tasks = self.llm_engine \ + .get_supported_tasks() # type: ignore + else: + supported_tasks = self.llm_engine.model_config.supported_tasks + logger.info("Supported_tasks: %s", supported_tasks) self.supported_tasks = supported_tasks diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d19e15551012..c695ea8b5a0e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1600,7 +1600,12 @@ async def init_app_state( state.log_stats = not args.disable_log_stats state.vllm_config = vllm_config model_config = vllm_config.model_config - supported_tasks = model_config.supported_tasks + + if envs.VLLM_USE_V1: + supported_tasks = await engine_client \ + .get_supported_tasks() # type: ignore + else: + supported_tasks = model_config.supported_tasks logger.info("Supported_tasks: %s", supported_tasks) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 08b253dcdb35..1c944d4b753f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1235,7 +1235,13 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if not is_pooling_model(model): return [] - return list(model.pooler.get_supported_tasks()) + supported_tasks = list(model.pooler.get_supported_tasks()) + + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in supported_tasks): + supported_tasks.remove("encode") + + return supported_tasks def get_supported_tasks(self) -> tuple[SupportedTask, ...]: tasks = list[SupportedTask]() From 1253f07bb0fdd72b97fda76f2aa859360776e700 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:22:52 +0800 Subject: [PATCH 17/34] fix Signed-off-by: wang.yuqi --- vllm/entrypoints/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5c6889a88925..1bbf645c35b7 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -13,7 +13,7 @@ from tqdm.auto import tqdm from typing_extensions import TypeVar, deprecated -from 
vllm import envs +import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) From 0e29a79c618e9d7a5a2ab2bd305d0dbb7dad6df2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:24:57 +0800 Subject: [PATCH 18/34] fix Signed-off-by: wang.yuqi --- vllm/v1/worker/gpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1c944d4b753f..e859eaaf9038 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1239,6 +1239,8 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if (self.scheduler_config.chunked_prefill_enabled and "encode" in supported_tasks): + # Chunked prefill is not supported with the encode task + # which using ALL pooling. supported_tasks.remove("encode") return supported_tasks From b6933fc33050fc570482fdac0baf502d2f270cec Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:40:10 +0800 Subject: [PATCH 19/34] fix Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index afae8ed1ff2d..71e76abcb7d2 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -67,7 +67,6 @@ def get_outputs(activation): ), "w_activation should be close to activation(wo_activation)." -@pytest.mark.skip_global_cleanup def test_encode_api(llm: LLM): err_msg = "pooling_task must be one of.+" with pytest.raises(ValueError, match=err_msg): From 37b68270f5d7402dacd8f744075f07fd9d8cd0a0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:41:02 +0800 Subject: [PATCH 20/34] fix Signed-off-by: wang.yuqi --- vllm/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 9cf146a0c0ae..5689693e02db 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -5139,7 +5139,6 @@ def __str__(self): f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"use_async_output_proc={self.model_config.use_async_output_proc}, " f"pooler_config={self.model_config.pooler_config!r}, " - f"supported_tasks ={self.model_config.supported_tasks !r}, " f"compilation_config={self.compilation_config!r}") From 2d3fa37b3f6a9ae874d28873e25f2f34c072fd6b Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:42:51 +0800 Subject: [PATCH 21/34] fix Signed-off-by: wang.yuqi --- vllm/config.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5689693e02db..b56f5489f913 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4887,15 +4887,6 @@ def __post_init__(self): "Only models using causal attention supports chunked " "prefill and prefix caching; disabling both.") - if (self.scheduler_config.chunked_prefill_enabled - and "encode" in self.model_config.supported_tasks): - self.model_config.supported_tasks.remove("encode") - - logger.info("Chunked prefill is not supported with " - "encode task which using ALL pooling. 
" - "Please turn off chunked prefill by " - "`--no-enable-chunked-prefill` before using it.") - if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: logger.info(reason) @@ -4908,15 +4899,6 @@ def __post_init__(self): if self.cache_config is not None: self.cache_config.enable_prefix_caching = False - if (self.scheduler_config.chunked_prefill_enabled - and "encode" in self.model_config.supported_tasks): - self.model_config.supported_tasks.remove("encode") - - logger.info("Chunked prefill is not supported with " - "encode task which using ALL pooling. " - "Please turn off chunked prefill by " - "`--no-enable-chunked-prefill` before using it.") - if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): From f42ab135d33168f02c192fcc34cd37b1b3913d7f Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:43:46 +0800 Subject: [PATCH 22/34] fix Signed-off-by: wang.yuqi --- vllm/model_executor/models/interfaces.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8c0e0efe8543..132da07ac3ae 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -642,7 +642,7 @@ def supports_cross_encoding( def default_pooling_type(pooling_type): - # set default_pooling_type decorator + """Set default_pooling_type decorator. """ def func(model): model.default_pooling_type = pooling_type return model From d80582a756fb93fd9c07dbcfeccc7ca24ababa66 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:46:51 +0800 Subject: [PATCH 23/34] + logger.info Signed-off-by: wang.yuqi --- vllm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index b56f5489f913..e4b8921b77dd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4899,6 +4899,13 @@ def __post_init__(self): if self.cache_config is not None: self.cache_config.enable_prefix_caching = False + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in self.model_config.supported_tasks): + logger.info("Chunked prefill is not supported with " + "encode task which using ALL pooling. " + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): From 8db0205b55a223278f0d73fc9f50127af93707c0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 18:51:28 +0800 Subject: [PATCH 24/34] fix Signed-off-by: wang.yuqi --- vllm/model_executor/models/interfaces.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 132da07ac3ae..b8754d0ce89d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -641,9 +641,9 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -def default_pooling_type(pooling_type): +def default_pooling_type(pooling_type: str) -> object: """Set default_pooling_type decorator. 
""" - def func(model): + def func(model: object): model.default_pooling_type = pooling_type return model From 568ed63cd4d11319f9fe0e7803aa00657a2c433d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 19:00:09 +0800 Subject: [PATCH 25/34] fix Signed-off-by: wang.yuqi --- vllm/model_executor/models/interfaces.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b8754d0ce89d..46caf3fce404 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -643,6 +643,7 @@ def supports_cross_encoding( def default_pooling_type(pooling_type: str) -> object: """Set default_pooling_type decorator. """ + def func(model: object): model.default_pooling_type = pooling_type return model From 9cd466df0a668d1e0fe7c5f42b42fc0f2bb9d3c6 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 19:25:55 +0800 Subject: [PATCH 26/34] logger.info in runner Signed-off-by: wang.yuqi --- vllm/config.py | 7 ------- vllm/v1/worker/gpu_model_runner.py | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e4b8921b77dd..b56f5489f913 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4899,13 +4899,6 @@ def __post_init__(self): if self.cache_config is not None: self.cache_config.enable_prefix_caching = False - if (self.scheduler_config.chunked_prefill_enabled - and "encode" in self.model_config.supported_tasks): - logger.info("Chunked prefill is not supported with " - "encode task which using ALL pooling. " - "Please turn off chunked prefill by " - "`--no-enable-chunked-prefill` before using it.") - if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e859eaaf9038..65ad764f9a94 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1243,6 +1243,11 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: # which using ALL pooling. supported_tasks.remove("encode") + logger.info("Chunked prefill is not supported with " + "encode task which using ALL pooling. " + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + return supported_tasks def get_supported_tasks(self) -> tuple[SupportedTask, ...]: From e988353356ecb2f8d825523be378fff11b5c713d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 8 Aug 2025 19:28:04 +0800 Subject: [PATCH 27/34] logger.info in runner Signed-off-by: wang.yuqi --- vllm/v1/worker/gpu_model_runner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 65ad764f9a94..0aa1eb74b35b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1243,10 +1243,10 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: # which using ALL pooling. supported_tasks.remove("encode") - logger.info("Chunked prefill is not supported with " - "encode task which using ALL pooling. " - "Please turn off chunked prefill by " - "`--no-enable-chunked-prefill` before using it.") + logger.info_once("Chunked prefill is not supported with " + "encode task which using ALL pooling. 
" + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") return supported_tasks From 58038ab428c14528c6cfc1d4ce30a26cef4e15d7 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sat, 9 Aug 2025 13:44:27 +0800 Subject: [PATCH 28/34] conflicts Signed-off-by: wang.yuqi --- vllm/config.py | 8 -------- vllm/model_executor/models/bert_with_rope.py | 1 - vllm/model_executor/models/modernbert.py | 1 - 3 files changed, 10 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b56f5489f913..7147702eddde 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -918,10 +918,6 @@ def _init_pooler_config(self) -> Optional["PoolerConfig"]: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) - default_pooling_type = self._model_info.default_pooling_type - if pooler_config.pooling_type is None: - pooler_config.pooling_type = default_pooling_type - return pooler_config return None @@ -4882,10 +4878,6 @@ def __post_init__(self): disable_chunked_prefill_reasons.append( "Only \"last\" pooling supports chunked " "prefill and prefix caching; disabling both.") - elif not getattr(self.model_config.hf_config, "is_causal", True): - disable_chunked_prefill_reasons.append( - "Only models using causal attention supports chunked " - "prefill and prefix caching; disabling both.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 8f5000bec9ea..555f88a0b8ff 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -392,7 +392,6 @@ def forward( return hidden_states -@default_pooling_type("CLS") class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index fa7245aa4af2..33390d23bae0 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -295,7 +295,6 @@ def forward( return pooled_output -@default_pooling_type("CLS") class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding): From 998e9cbfd872d7327b696751db90f66c22c2ed71 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sat, 9 Aug 2025 13:45:43 +0800 Subject: [PATCH 29/34] conflicts Signed-off-by: wang.yuqi --- vllm/model_executor/models/modernbert.py | 1 - vllm/v1/worker/gpu_model_runner.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 33390d23bae0..906a83fb6b66 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -200,7 +200,6 @@ def forward( return hidden_states -@default_pooling_type("CLS") class ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0aa1eb74b35b..d60f38518a58 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1239,8 +1239,6 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if (self.scheduler_config.chunked_prefill_enabled and "encode" in supported_tasks): - # Chunked prefill is not supported with the encode task - # which using ALL pooling. 
supported_tasks.remove("encode") logger.info_once("Chunked prefill is not supported with " From 906824e1e78ff9d1ad1742fb150889c90d32c522 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sat, 9 Aug 2025 13:50:43 +0800 Subject: [PATCH 30/34] add back Signed-off-by: wang.yuqi --- vllm/config/__init__.py | 8 ++++++++ vllm/model_executor/models/bert_with_rope.py | 1 + vllm/model_executor/models/modernbert.py | 2 ++ 3 files changed, 11 insertions(+) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 69c05b75d3eb..878850ad431b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -898,6 +898,10 @@ def _init_pooler_config(self) -> Optional["PoolerConfig"]: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) + default_pooling_type = self._model_info.default_pooling_type + if pooler_config.pooling_type is None: + pooler_config.pooling_type = default_pooling_type + return pooler_config return None @@ -4434,6 +4438,10 @@ def __post_init__(self): disable_chunked_prefill_reasons.append( "Only \"last\" pooling supports chunked " "prefill and prefix caching; disabling both.") + elif not getattr(self.model_config.hf_config, "is_causal", True): + disable_chunked_prefill_reasons.append( + "Only models using causal attention supports chunked " + "prefill and prefix caching; disabling both.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 13c118d05bae..e18b7b7ffaba 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -402,6 +402,7 @@ def forward( @support_torch_compile +@default_pooling_type("CLS") class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 005b531f4fad..2c3bdd1c93ae 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -202,6 +202,7 @@ def forward( @support_torch_compile +@default_pooling_type("CLS") class ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) @@ -294,6 +295,7 @@ def forward( return pooled_output +@default_pooling_type("CLS") class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding): From cbda86d1b48b38c4dc6e70c2566508960ee64938 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 11 Aug 2025 15:03:35 +0800 Subject: [PATCH 31/34] conflicts Signed-off-by: wang.yuqi --- vllm/model_executor/models/bert.py | 3 --- vllm/model_executor/models/roberta.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index f9ee31bd86ae..82fddb56eb22 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -412,7 +412,6 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -443,7 +442,6 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -@default_pooling_type("CLS") class BertEmbeddingModel(nn.Module, SupportsQuant): """A model that uses Bert to provide embedding functionalities. 
@@ -506,7 +504,6 @@ def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: }) -@default_pooling_type("CLS") class BertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding, SupportsQuant): """A model that uses Bert to provide embedding functionalities. diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index b2f35a786089..742c01a89086 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -89,7 +89,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -@default_pooling_type("CLS") class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. @@ -155,7 +154,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): return loader.load_weights(weights_list, mapper=mapper) -@default_pooling_type("CLS") class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsV0Only): """A model that uses Roberta to provide embedding functionalities. From 63b03cc2ff7c27b90153050ebf1ae323ed16fe96 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 11 Aug 2025 15:06:37 +0800 Subject: [PATCH 32/34] conflicts Signed-off-by: wang.yuqi --- vllm/model_executor/models/bert.py | 12 ++++++++---- vllm/model_executor/models/roberta.py | 3 +-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 82fddb56eb22..8f988903f78c 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,8 +28,7 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import (SupportsCrossEncoding, SupportsQuant, SupportsV0Only, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -499,8 +498,13 @@ def _build_model(self, def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler({ - "encode": Pooler.for_encode(pooler_config), - "embed": Pooler.for_embed(pooler_config), + "encode": + Pooler.for_encode(pooler_config), + "embed": + Pooler.for_embed( + pooler_config, + default_pooling_type=PoolingType.CLS, + ), }) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 742c01a89086..61c8faed4065 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -20,8 +20,7 @@ from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import (SupportsCrossEncoding, SupportsV0Only, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, SupportsV0Only class RobertaEmbedding(nn.Module): From 135dffa27b4453c52d77e57f18731b5683b7508d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 11 Aug 2025 15:11:48 +0800 Subject: [PATCH 33/34] add back Signed-off-by: wang.yuqi --- vllm/model_executor/models/bert.py | 7 ++++++- vllm/model_executor/models/roberta.py | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 3d5d5d505b35..f311b4769eeb 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,8 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant +from 
.interfaces import (SupportsCrossEncoding, SupportsQuant, + default_pooling_type) from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -327,6 +328,7 @@ def forward(self, hidden_states: torch.Tensor, @support_torch_compile +@default_pooling_type("CLS") class BertModel(nn.Module, SupportsQuant): is_pooling_model = True @@ -401,6 +403,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params +@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -431,6 +434,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params +@default_pooling_type("CLS") class BertEmbeddingModel(nn.Module, SupportsQuant): """A model that uses Bert to provide embedding functionalities. @@ -541,6 +545,7 @@ def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor: return token_type_ids +@default_pooling_type("CLS") class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQuant): """A model that uses Bert to provide embedding functionalities. diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 005b9179827e..32a4a2c9a269 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -23,7 +23,7 @@ from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding +from .interfaces import SupportsCrossEncoding, default_pooling_type class RobertaEmbedding(nn.Module): @@ -86,6 +86,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x +@default_pooling_type("CLS") class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. @@ -149,6 +150,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): return loader.load_weights(weights_list, mapper=mapper) +@default_pooling_type("CLS") class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. From 9552a49d13dbb331134a82c51c00653e4e4f9ba9 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 11 Aug 2025 15:14:21 +0800 Subject: [PATCH 34/34] fix Signed-off-by: wang.yuqi --- vllm/model_executor/models/bert.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index f311b4769eeb..6638f06f9826 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -490,13 +490,8 @@ def _build_model(self, def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler({ - "encode": - Pooler.for_encode(pooler_config), - "embed": - Pooler.for_embed( - pooler_config, - default_pooling_type=PoolingType.CLS, - ), + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), })
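
As a quick reference for the pattern this series converges on, below is a minimal standalone sketch of the `default_pooling_type` class decorator and the config-side fallback it enables. `DummyEncoderModel`, `DummyDecoderModel`, and `resolve_pooling_type` are illustrative stand-ins for this sketch only, not vLLM code; the real fallback lives in `ModelConfig._init_pooler_config` as shown in the diffs above.

    # Minimal sketch (not vLLM code) of the default_pooling_type pattern:
    # a class decorator records a per-model default pooling type, and a
    # config layer falls back to it when no explicit pooling_type is set.
    from typing import Optional


    def default_pooling_type(pooling_type: str):
        """Class decorator that records a model's default pooling type."""

        def func(model: type) -> type:
            model.default_pooling_type = pooling_type
            return model

        return func


    @default_pooling_type("CLS")
    class DummyEncoderModel:
        """Stand-in for an encoder-style model such as BertModel."""


    @default_pooling_type("LAST")
    class DummyDecoderModel:
        """Stand-in for a decoder-style model such as Qwen2ForCausalLM."""


    def resolve_pooling_type(model_cls: type,
                             override: Optional[str] = None) -> str:
        # Mirrors the fallback: an explicit pooler_config.pooling_type wins,
        # otherwise the class-level default applies.
        return override or getattr(model_cls, "default_pooling_type", "ALL")


    if __name__ == "__main__":
        assert resolve_pooling_type(DummyEncoderModel) == "CLS"
        assert resolve_pooling_type(DummyDecoderModel) == "LAST"
        assert resolve_pooling_type(DummyDecoderModel, override="MEAN") == "MEAN"
        print("pooling type resolution works as sketched")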