Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
aa55dab
init
noooop Oct 11, 2025
4e09700
Merge branch 'main' into chunked_prefill_logic
noooop Nov 18, 2025
2deeb18
Merge branch 'main' into chunked_prefill_logic
noooop Nov 18, 2025
0c624ba
+ BoolWithReason
noooop Nov 20, 2025
0eb45a1
fix
noooop Nov 20, 2025
5588a84
Merge branch 'main' into chunked_prefill_logic
noooop Nov 20, 2025
f51e7d9
state -> value
noooop Nov 21, 2025
0397175
Merge branch 'main' into chunked_prefill_logic
noooop Nov 21, 2025
8174b3c
+ @attn_type("encoder_only")
noooop Nov 21, 2025
43a8616
fix clip
noooop Nov 21, 2025
e944168
fix
noooop Nov 21, 2025
824dd36
fix
noooop Nov 21, 2025
e6b2a3b
add AttnTypeStr & PoolingTypeStr
noooop Nov 21, 2025
ce19005
ruff
noooop Nov 21, 2025
909cceb
TYPE_CHECKING
noooop Nov 21, 2025
2aec810
fix
noooop Nov 21, 2025
f8757ca
fix
noooop Nov 21, 2025
0597911
Merge branch 'main' into chunked_prefill_logic
noooop Nov 21, 2025
2c567cd
+ BoolWithReasonGroup
noooop Nov 21, 2025
1705fbf
+ BoolWithReasonGroup
noooop Nov 21, 2025
7a6ac0c
fix
noooop Nov 21, 2025
862618e
constants as class variables
noooop Nov 21, 2025
4aad642
E501
noooop Nov 21, 2025
aa3fcc5
Merge branch 'main' into chunked_prefill_logic
noooop Nov 21, 2025
bf5578f
warning_if_false
noooop Nov 22, 2025
acda010
fix
noooop Nov 22, 2025
d2301ee
Merge branch 'main' into chunked_prefill_logic
noooop Nov 22, 2025
0b798e6
+ FIXME
noooop Nov 22, 2025
1375cfb
Merge branch 'main' into chunked_prefill_logic
noooop Nov 24, 2025
84665f2
update
noooop Nov 24, 2025
7b2ad12
update
noooop Nov 24, 2025
896e407
update
noooop Nov 24, 2025
f07f422
SIM102
noooop Nov 24, 2025
867ba67
SIM102
noooop Nov 24, 2025
c844a78
SIM102
noooop Nov 24, 2025
524ee08
cache_kwargs["enable_prefix_caching"]['default'] = None
noooop Nov 24, 2025
7c25dea
Merge branch 'main' into chunked_prefill_logic
noooop Nov 24, 2025
563c458
fix
noooop Nov 24, 2025
d1a9f7d
fix
noooop Nov 24, 2025
61c866f
fix
noooop Nov 24, 2025
f2a4a05
fix
noooop Nov 24, 2025
34d8dbe
fix
noooop Nov 24, 2025
4a0d865
fix
noooop Nov 24, 2025
953a5f2
Merge branch 'main' into chunked_prefill_logic
noooop Nov 26, 2025
a32ab75
+ attention_free
noooop Nov 26, 2025
d7172fc
fix
noooop Nov 26, 2025
ed25df5
fix
noooop Nov 26, 2025
bd0cbab
fix
noooop Nov 26, 2025
0928d57
fix
noooop Nov 26, 2025
0c80c9e
fix
noooop Nov 26, 2025
3a28ae1
+ attn_type_to_reason_map
noooop Nov 26, 2025
f3a317a
fix
noooop Nov 26, 2025
183b0f6
fix
noooop Nov 26, 2025
0a00f3f
E501
noooop Nov 26, 2025
7623684
Merge branch 'main' into chunked_prefill_logic
noooop Nov 26, 2025
dbf3ebf
- BoolWithReason
noooop Nov 27, 2025
ea5df7a
- Unnecessary modifications
noooop Nov 27, 2025
2fcc5e9
Merge branch 'main' into chunked_prefill_logic
noooop Nov 27, 2025
f210656
Merge branch 'main' into chunked_prefill_logic
DarkLight1337 Nov 27, 2025
eaa4c96
- Unnecessary modifications
noooop Nov 27, 2025
73212be
Merge branch 'main' into chunked_prefill_logic
noooop Nov 27, 2025
c620f45
update
noooop Nov 27, 2025
df8bba6
Merge branch 'main' into chunked_prefill_logic
DarkLight1337 Nov 27, 2025
084bc8f
+ all pooling and step pooling
noooop Nov 27, 2025
3abb7fa
SIM103
noooop Nov 27, 2025
466d36d
Merge branch 'main' into chunked_prefill_logic
noooop Nov 27, 2025
dad2bbd
fix
noooop Nov 27, 2025
4d816b4
fix
noooop Nov 27, 2025
ba80b42
fix
noooop Nov 27, 2025
792f33a
Merge branch 'main' into chunked_prefill_logic
noooop Nov 28, 2025
ac53016
fix
noooop Nov 28, 2025
08a2f8f
Merge branch 'main' into chunked_prefill_logic
noooop Nov 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,6 @@ def test_embed_models(
def test_non_causal_models(
hf_runner, vllm_runner, example_prompts, model: str, dtype: str
) -> None:
with vllm_runner(
model, max_model_len=512, dtype=dtype, enable_prefix_caching=True
) as vllm_model:
with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
cache_config = vllm_model.llm.llm_engine.cache_config
assert not cache_config.enable_prefix_caching
240 changes: 239 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import logging
import os
from dataclasses import MISSING, Field, asdict, dataclass, field
from unittest.mock import patch
Expand Down Expand Up @@ -602,6 +602,244 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)


# Each case pins three things at once: the attn_type classification derived
# from the HF config, the boolean support decision, and the exact debug-log
# reason emitted by ModelConfig.is_chunked_prefill_supported.
@pytest.mark.parametrize(
    ("model_id", "expected_attn_type", "expected_result", "reason"),
    [
        # pooling models
        (
            "jason9693/Qwen2.5-1.5B-apeach",
            "decoder",
            True,
            "Pooling models with causal attn and last pooling support chunked prefill.",
        ),
        (
            "Qwen/Qwen3-Embedding-0.6B",
            "decoder",
            True,
            "Pooling models with causal attn and last pooling support chunked prefill.",
        ),
        (
            "Qwen/Qwen2.5-Math-PRM-7B",
            "decoder",
            False,
            "Pooling models with step pooling does not support chunked prefill.",
        ),
        (
            "internlm/internlm2-1_8b-reward",
            "decoder",
            False,
            "Pooling models with all pooling does not support chunked prefill.",
        ),
        (
            "BAAI/bge-base-en",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support chunked prefill.",
        ),
        (
            "boltuix/NeuroBERT-NER",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support chunked prefill.",
        ),
        (
            "papluca/xlm-roberta-base-language-detection",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support chunked prefill.",
        ),
        (
            "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support chunked prefill.",
        ),
        (
            "intfloat/e5-small",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support chunked prefill.",
        ),
        # multimodal models
        (
            "openai/clip-vit-base-patch32",
            "decoder",
            True,
            "Pooling models with causal attn and last pooling support chunked prefill.",
        ),
        (
            "google/siglip-base-patch16-224",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support chunked prefill.",
        ),
        # generate models
        (
            "Qwen/Qwen3-0.6B",
            "decoder",
            True,
            "Generative models support chunked prefill.",
        ),
        (
            "Qwen/Qwen3-Next-80B-A3B-Instruct",
            "hybrid",
            True,
            "Generative models support chunked prefill.",
        ),
        (
            "ibm-granite/granite-4.0-h-small",
            "hybrid",
            True,
            "Generative models support chunked prefill.",
        ),
        (
            "state-spaces/mamba-130m-hf",
            "attention_free",
            True,
            "Generative models support chunked prefill.",
        ),
        # encoder_decoder models
        (
            "openai/whisper-small",
            "encoder_decoder",
            False,
            "Encoder decoder models does not support chunked prefill.",
        ),
    ],
)
def test_is_chunked_prefill_supported(
    model_id: str,
    expected_attn_type: str,
    expected_result: bool,
    reason: str,
    caplog_vllm,
):
    """Verify chunked-prefill support detection and its logged reason per model.

    Builds a ModelConfig for each model id, checks the derived ``attn_type``,
    then asserts both the boolean result of ``is_chunked_prefill_supported``
    and that the expected explanation is emitted at DEBUG level.
    """
    model_config = ModelConfig(model_id, trust_remote_code=True)
    assert model_config.attn_type == expected_attn_type
    # The reason string is only logged (at DEBUG) when the property is read,
    # so the assertion on the result must happen inside the caplog context.
    with caplog_vllm.at_level(level=logging.DEBUG):
        assert model_config.is_chunked_prefill_supported == expected_result
        assert reason in caplog_vllm.text


# Mirror of the chunked-prefill test above, but for prefix caching: hybrid and
# attention-free models flip to unsupported here because the feature is still
# experimental for them.
@pytest.mark.parametrize(
    ("model_id", "expected_attn_type", "expected_result", "reason"),
    [
        # pooling models
        (
            "jason9693/Qwen2.5-1.5B-apeach",
            "decoder",
            True,
            "Pooling models with causal attn and last pooling support prefix caching.",
        ),
        (
            "Qwen/Qwen3-Embedding-0.6B",
            "decoder",
            True,
            "Pooling models with causal attn and last pooling support prefix caching.",
        ),
        (
            "Qwen/Qwen2.5-Math-PRM-7B",
            "decoder",
            False,
            "Pooling models with step pooling does not support prefix caching.",
        ),
        (
            "internlm/internlm2-1_8b-reward",
            "decoder",
            False,
            "Pooling models with all pooling does not support prefix caching.",
        ),
        (
            "BAAI/bge-base-en",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support prefix caching.",
        ),
        (
            "boltuix/NeuroBERT-NER",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support prefix caching.",
        ),
        (
            "papluca/xlm-roberta-base-language-detection",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support prefix caching.",
        ),
        (
            "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support prefix caching.",
        ),
        (
            "intfloat/e5-small",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support prefix caching.",
        ),
        # multimodal models
        (
            "openai/clip-vit-base-patch32",
            "decoder",
            True,
            "Pooling models with causal attn and last pooling support prefix caching.",
        ),
        (
            "google/siglip-base-patch16-224",
            "encoder_only",
            False,
            "Pooling models with bidirectional attn does not support prefix caching.",
        ),
        # generate models
        (
            "Qwen/Qwen3-0.6B",
            "decoder",
            True,
            "Generative models support prefix caching.",
        ),
        (
            "Qwen/Qwen3-Next-80B-A3B-Instruct",
            "hybrid",
            False,
            "Hybrid models does not support prefix caching since the feature is still experimental.",  # noqa: E501
        ),
        (
            "ibm-granite/granite-4.0-h-small",
            "hybrid",
            False,
            "Hybrid models does not support prefix caching since the feature is still experimental.",  # noqa: E501
        ),
        (
            "state-spaces/mamba-130m-hf",
            "attention_free",
            False,
            "Attention free models does not support prefix caching since the feature is still experimental.",  # noqa: E501
        ),
        # encoder_decoder models
        (
            "openai/whisper-small",
            "encoder_decoder",
            False,
            "Encoder decoder models does not support prefix caching.",
        ),
    ],
)
def test_is_prefix_caching_supported(
    model_id: str,
    expected_attn_type: str,
    expected_result: bool,
    reason: str,
    caplog_vllm,
):
    """Verify prefix-caching support detection and its logged reason per model.

    Builds a ModelConfig for each model id, checks the derived ``attn_type``,
    then asserts both the boolean result of ``is_prefix_caching_supported``
    and that the expected explanation is emitted at DEBUG level.
    """
    model_config = ModelConfig(model_id, trust_remote_code=True)
    assert model_config.attn_type == expected_attn_type
    # The reason string is only logged (at DEBUG) when the property is read,
    # so the assertion on the result must happen inside the caplog context.
    with caplog_vllm.at_level(level=logging.DEBUG):
        assert model_config.is_prefix_caching_supported == expected_result
        assert reason in caplog_vllm.text


@pytest.mark.parametrize(
("backend", "custom_ops", "expected"),
[
Expand Down
109 changes: 109 additions & 0 deletions vllm/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@
"draft": [],
}

# Closed set of attention-architecture labels returned by
# ModelConfig.attn_type; used to drive chunked-prefill / prefix-caching
# support decisions.
AttnTypeStr = Literal[
    "decoder", "encoder", "encoder_only", "encoder_decoder", "attention_free", "hybrid"
]


@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
Expand Down Expand Up @@ -1752,6 +1756,111 @@ def get_and_verify_max_len(self, max_model_len: int):
logger.info("Using max model len %s", max_model_len)
return max_model_len

@property
def attn_type(self) -> AttnTypeStr:
    """Classify this model's attention architecture.

    Pooling models are resolved first: CLS pooling or a non-causal HF config
    marks the model as a bidirectional encoder; otherwise the registered
    attn type of the model is used. For non-pooling models the hybrid /
    attention-free / encoder-decoder flags are checked in that order, with
    plain decoder as the fallback.
    """
    if self.pooler_config is not None:
        # Default CLS pooling implies a BERT-style bidirectional encoder.
        if self._model_info.default_pooling_type.lower() == "cls":
            return "encoder_only"
        # A config explicitly marked non-causal also means bidirectional
        # attention; is_causal defaults to True when absent.
        if not getattr(self.hf_config, "is_causal", True):
            return "encoder_only"
        return self._model_info.attn_type
    if self.is_hybrid:
        return "hybrid"
    if self.is_attention_free:
        return "attention_free"
    if self.is_encoder_decoder:
        return "encoder_decoder"
    return "decoder"

@property
def is_chunked_prefill_supported(self) -> bool:
    """Whether this model can run with chunked prefill enabled.

    Logs the reason for the decision at DEBUG level. Generative models
    support chunked prefill except encoder-decoder ones; pooling models
    require causal attention and last-token pooling.
    """
    attn_type = self.attn_type
    if self.pooler_config is None:
        # Generative path: only encoder-decoder models are excluded.
        if attn_type == "encoder_decoder":
            logger.debug("Encoder decoder models does not support chunked prefill.")
            return False
        logger.debug("Generative models support chunked prefill.")
        return True

    # Pooling path.
    if attn_type == "encoder_only":
        logger.debug(
            "Pooling models with bidirectional attn does not support "
            "chunked prefill."
        )
        return False
    if attn_type == "decoder":
        pooling_type = self.pooler_config.pooling_type.lower()
        if pooling_type not in ("all", "mean", "step", "cls"):
            # pooling_type == "last"
            logger.debug(
                "Pooling models with causal attn and last pooling support "
                "chunked prefill."
            )
            return True
        logger.debug(
            "Pooling models with %s pooling does not "
            "support chunked prefill.",
            pooling_type,
        )
        return False
    # vllm currently does not have pooling models using hybrid,
    # attention_free or encoder_decoder attn types.
    return attn_type != "encoder_decoder"

@property
def is_prefix_caching_supported(self) -> bool:
    """Whether this model can run with prefix caching enabled.

    Logs the reason for the decision at DEBUG level. Plain decoder
    generative models support it; hybrid and attention-free models are
    excluded while the feature is experimental, encoder-decoder models
    are unsupported, and pooling models require causal attention with
    last-token pooling.
    """
    attn_type = self.attn_type
    if self.pooler_config is None:
        # Generative path.
        if attn_type == "hybrid":
            logger.debug(
                "Hybrid models does not support prefix caching since the feature "
                "is still experimental."
            )
            return False
        if attn_type == "attention_free":
            logger.debug(
                "Attention free models does not support prefix caching since the "
                "feature is still experimental."
            )
            return False
        if attn_type == "encoder_decoder":
            logger.debug("Encoder decoder models does not support prefix caching.")
            return False
        # attn_type == "decoder"
        logger.debug("Generative models support prefix caching.")
        return True

    # Pooling path.
    if attn_type == "encoder_only":
        logger.debug(
            "Pooling models with bidirectional attn does not "
            "support prefix caching."
        )
        return False
    if attn_type == "decoder":
        pooling_type = self.pooler_config.pooling_type.lower()
        if pooling_type in ("all", "mean", "step", "cls"):
            logger.debug(
                "Pooling models with %s pooling does not "
                "support prefix caching.",
                pooling_type,
            )
            return False
        # pooling_type == "last"
        logger.debug(
            "Pooling models with causal attn and last pooling support "
            "prefix caching."
        )
        return True
    # vllm currently does not have pooling models using hybrid,
    # attention_free or encoder_decoder attn types.
    return False

def is_model_moe(
self,
) -> bool:
Expand Down
Loading