1 change: 0 additions & 1 deletion tests/entrypoints/openai/test_chat_error.py
@@ -45,7 +45,6 @@ class MockModelConfig:
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
     hf_text_config = MockHFConfig()
-    logits_processor_pattern = None
     logits_processors: list[str] | None = None
     diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
1 change: 0 additions & 1 deletion tests/entrypoints/openai/test_completion_error.py
@@ -44,7 +44,6 @@ class MockModelConfig:
     tokenizer_revision = None
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
-    logits_processor_pattern = None
     logits_processors: list[str] | None = None
     diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
1 change: 0 additions & 1 deletion tests/entrypoints/openai/test_lora_resolvers.py
@@ -45,7 +45,6 @@ class MockModelConfig:
     multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
     hf_config: MockHFConfig = field(default_factory=MockHFConfig)
     logits_processors: list[str] | None = None
-    logits_processor_pattern: str | None = None
     diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
     allowed_media_domains: list[str] | None = None
1 change: 0 additions & 1 deletion tests/entrypoints/openai/test_serving_chat.py
@@ -521,7 +521,6 @@ class MockModelConfig:
     hf_config = MockHFConfig()
     hf_text_config = MockHFConfig()
     logits_processors: list[str] | None = None
-    logits_processor_pattern = None
     diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
     allowed_media_domains: list[str] | None = None
14 changes: 0 additions & 14 deletions tests/v1/sample/test_sampling_params_e2e.py
@@ -144,20 +144,6 @@ def contains_bad_word(text: str, tokens: list[int], bad_word: str) -> bool:
     assert not contains_bad_word(new_text, new_tokens, bad_words_2)
 
 
-def test_logits_processor(llm):
-    """Check that we reject logits processor."""
-
-    # This sample logits processor gives infinite score to the i-th token,
-    # where i is the length of the input sequence.
-    # We therefore expect the output token sequence to be [0, 1, 2, ...]
-    def pick_ith(token_ids, logits):
-        logits[len(token_ids)] = float("inf")
-        return logits
-
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
-
-
 def test_allowed_token_ids(llm):
     """Check that we can use allowed_token_ids."""
 
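The removed `pick_ith` processor explains the comment's claim that greedy output becomes [0, 1, 2, ...]: at step i the sequence holds i tokens, so token id i receives an infinite score and must win the argmax. A minimal standalone sketch of that behavior (plain `torch`, independent of vLLM; the vocabulary size of 32 is an arbitrary choice for illustration):

    import torch

    def pick_ith(token_ids: list[int], logits: torch.Tensor) -> torch.Tensor:
        # Give an infinite score to the token whose id equals the sequence length.
        logits[len(token_ids)] = float("inf")
        return logits

    token_ids: list[int] = []
    for _ in range(4):
        step_logits = pick_ith(token_ids, torch.zeros(32))  # fresh scores per step
        token_ids.append(int(torch.argmax(step_logits)))    # greedy sampling
    assert token_ids == [0, 1, 2, 3]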
5 changes: 0 additions & 5 deletions vllm/config/model.py
@@ -252,10 +252,6 @@ class ModelConfig:
     hf_overrides: HfOverrides = field(default_factory=dict)
     """If a dictionary, contains arguments to be forwarded to the Hugging Face
     config. If a callable, it is called to update the HuggingFace config."""
-    logits_processor_pattern: str | None = None
-    """Optional regex pattern specifying valid logits processor qualified names
-    that can be passed with the `logits_processors` extra completion argument.
-    Defaults to `None`, which allows no processors."""
     generation_config: str = "auto"
     """The folder path to the generation config. Defaults to `"auto"`, the
     generation config will be loaded from model path. If set to `"vllm"`, no
@@ -342,7 +338,6 @@ def compute_hash(self) -> str:
             "config_format",
             "hf_token",
             "hf_overrides",
-            "logits_processor_pattern",
             "override_attention_dtype",
             "logits_processors",
             "io_processor_plugin",
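The deleted docstring describes a regex allow-list: a request-supplied logits processor was accepted only if its qualified name matched `logits_processor_pattern`, and the `None` default admitted none. A rough sketch of that kind of gate (hypothetical helper, not the code this PR removes):

    import re

    def check_qualname(qualname: str, pattern: str | None) -> None:
        # None means no request-supplied processors are allowed at all.
        if pattern is None or re.fullmatch(pattern, qualname) is None:
            raise ValueError(f"logits processor {qualname!r} is not permitted")

    check_qualname("my_module.MyLogitsProcessor", r"my_module\..*")  # accepted
    # check_qualname("other.Processor", r"my_module\..*")  -> ValueError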
13 changes: 2 additions & 11 deletions vllm/engine/arg_utils.py
@@ -508,8 +508,6 @@ class EngineArgs:
     reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
     reasoning_parser_plugin: str | None = None
 
-    logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
-
     speculative_config: dict[str, Any] | None = None
 
     show_hidden_metrics_for_version: str | None = (
@@ -710,9 +708,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         )
         model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
         model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
-        model_group.add_argument(
-            "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
-        )
         model_group.add_argument(
             "--generation-config", **model_kwargs["generation_config"]
         )
@@ -1320,7 +1315,6 @@ def create_model_config(self) -> ModelConfig:
             mm_encoder_tp_mode=self.mm_encoder_tp_mode,
             mm_encoder_attn_backend=self.mm_encoder_attn_backend,
             pooler_config=self.pooler_config,
-            logits_processor_pattern=self.logits_processor_pattern,
             generation_config=self.generation_config,
             override_generation_config=self.override_generation_config,
             enable_sleep_mode=self.enable_sleep_mode,
@@ -1429,7 +1423,7 @@ def create_engine_config(
         self.model_weights = model_config.model_weights
         self.tokenizer = model_config.tokenizer
 
-        self._check_feature_supported(model_config)
+        self._check_feature_supported()
         self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
         self._set_default_max_num_seqs_and_batched_tokens_args(
             usage_context, model_config
@@ -1831,11 +1825,8 @@ def create_engine_config(
 
         return config
 
-    def _check_feature_supported(self, model_config: ModelConfig):
+    def _check_feature_supported(self):
         """Raise an error if the feature is not supported."""
-        if self.logits_processor_pattern != EngineArgs.logits_processor_pattern:
-            _raise_unsupported_error(feature_name="--logits-processor-pattern")
-
         # No Concurrent Partial Prefills so far.
         if (
             self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills
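Before this PR, `_check_feature_supported` had already reduced `--logits-processor-pattern` to a hard error; the deletion here retires the flag entirely. The removed guard uses a common idiom for deprecating dataclass-backed CLI options: compare the parsed value against the class-level default and raise if the user changed it. A generic sketch of that idiom (hypothetical names, not vLLM code):

    from dataclasses import dataclass

    @dataclass
    class Args:
        retired_flag: str | None = None  # default for a flag being retired

        def check_feature_supported(self) -> None:
            # A value different from the class default means the user set the flag.
            if self.retired_flag != Args.retired_flag:
                raise ValueError("--retired-flag is no longer supported")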
22 changes: 3 additions & 19 deletions vllm/entrypoints/openai/chat_completion/protocol.py
@@ -26,13 +26,11 @@
     FunctionCall,
     FunctionDefinition,
     LegacyStructuralTagResponseFormat,
-    LogitsProcessors,
     OpenAIBaseModel,
     StreamOptions,
     StructuralTagResponseFormat,
     ToolCall,
     UsageInfo,
-    get_logits_processors,
 )
 from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
@@ -293,19 +291,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    logits_processors: LogitsProcessors | None = Field(
-        default=None,
-        description=(
-            "A list of either qualified names of logits processors, or "
-            "constructor objects, to apply when sampling. A constructor is "
-            "a JSON object with a required 'qualname' field specifying the "
-            "qualified name of the processor class/factory, and optional "
-            "'args' and 'kwargs' fields containing positional and keyword "
-            "arguments. For example: {'qualname': "
-            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
-            "{'param': 'value'}}."
-        ),
-    )
+
     return_tokens_as_token_ids: bool | None = Field(
         default=None,
         description=(
@@ -324,6 +310,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "need to map generated text back to input tokens."
         ),
     )
+
     cache_salt: str | None = Field(
         default=None,
         description=(
@@ -335,6 +322,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "to 256 bit)."
         ),
     )
+
     kv_transfer_params: dict[str, Any] | None = Field(
         default=None,
         description="KVTransfer parameters used for disaggregated serving.",
@@ -417,7 +405,6 @@ def to_beam_search_params(
     def to_sampling_params(
         self,
         max_tokens: int,
-        logits_processor_pattern: str | None,
         default_sampling_params: dict,
     ) -> SamplingParams:
         # Default parameters
@@ -502,9 +489,6 @@ def to_sampling_params(
             min_tokens=self.min_tokens,
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
-            logits_processors=get_logits_processors(
-                self.logits_processors, logits_processor_pattern
-            ),
             include_stop_str_in_output=self.include_stop_str_in_output,
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA
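The deleted field description documents the request shape this PR retires: `logits_processors` entries could be plain qualified names or constructor objects carrying `qualname`, `args`, and `kwargs`. For reference, a chat request using the extra argument looked roughly like the sketch below (illustrative client call only; the model name, server URL, and `other_module.SomeLogitsProcessor` are placeholders, while the constructor object comes from the removed description's own example):

    import requests

    payload = {
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello"}],
        # Removed by this PR: per-request processors, as names or constructors.
        "logits_processors": [
            "other_module.SomeLogitsProcessor",
            {"qualname": "my_module.MyLogitsProcessor",
             "args": [1, 2],
             "kwargs": {"param": "value"}},
        ],
    }
    resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)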
9 changes: 0 additions & 9 deletions vllm/entrypoints/openai/chat_completion/serving.py
@@ -86,7 +86,6 @@
 from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.tool_parsers.utils import partial_json_loads
 from vllm.utils.collection_utils import as_list
-from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
 logger = init_logger(__name__)
 
@@ -130,9 +129,6 @@ def __init__(
         self.enable_log_outputs = enable_log_outputs
         self.enable_log_deltas = enable_log_deltas
 
-        # set up logits processors
-        self.logits_processors = self.model_config.logits_processors
-
         # set up reasoning parser
         self.reasoning_parser_cls = ParserManager.get_reasoning_parser(
             reasoning_parser_name=reasoning_parser
@@ -403,13 +399,8 @@ async def create_chat_completion(
             else:
                 sampling_params = request.to_sampling_params(
                     max_tokens,
-                    self.model_config.logits_processor_pattern,
                     self.default_sampling_params,
                 )
-                validate_logits_processors_parameters(
-                    self.logits_processors,
-                    sampling_params,
-                )
 
             self._log_inputs(
                 sub_request_id,
19 changes: 0 additions & 19 deletions vllm/entrypoints/openai/completion/protocol.py
@@ -15,12 +15,10 @@
 from vllm.entrypoints.openai.engine.protocol import (
     AnyResponseFormat,
     LegacyStructuralTagResponseFormat,
-    LogitsProcessors,
     OpenAIBaseModel,
     StreamOptions,
     StructuralTagResponseFormat,
     UsageInfo,
-    get_logits_processors,
 )
 from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
@@ -117,19 +115,6 @@ class CompletionRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    logits_processors: LogitsProcessors | None = Field(
-        default=None,
-        description=(
-            "A list of either qualified names of logits processors, or "
-            "constructor objects, to apply when sampling. A constructor is "
-            "a JSON object with a required 'qualname' field specifying the "
-            "qualified name of the processor class/factory, and optional "
-            "'args' and 'kwargs' fields containing positional and keyword "
-            "arguments. For example: {'qualname': "
-            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
-            "{'param': 'value'}}."
-        ),
-    )
 
     return_tokens_as_token_ids: bool | None = Field(
         default=None,
@@ -221,7 +206,6 @@ def to_beam_search_params(
     def to_sampling_params(
         self,
         max_tokens: int,
-        logits_processor_pattern: str | None,
         default_sampling_params: dict | None = None,
     ) -> SamplingParams:
         if default_sampling_params is None:
@@ -312,9 +296,6 @@ def to_sampling_params(
             skip_special_tokens=self.skip_special_tokens,
             spaces_between_special_tokens=self.spaces_between_special_tokens,
             include_stop_str_in_output=self.include_stop_str_in_output,
-            logits_processors=get_logits_processors(
-                self.logits_processors, logits_processor_pattern
-            ),
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA
             if self.stream
9 changes: 0 additions & 9 deletions vllm/entrypoints/openai/completion/serving.py
@@ -42,7 +42,6 @@
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.async_utils import merge_async_iterators
 from vllm.utils.collection_utils import as_list
-from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
 
 logger = init_logger(__name__)
 
@@ -67,9 +66,6 @@ def __init__(
             log_error_stack=log_error_stack,
         )
 
-        # set up logits processors
-        self.logits_processors = self.model_config.logits_processors
-
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
 
@@ -178,13 +174,8 @@ async def create_completion(
             else:
                 sampling_params = request.to_sampling_params(
                     max_tokens,
-                    self.model_config.logits_processor_pattern,
                     self.default_sampling_params,
                 )
-                validate_logits_processors_parameters(
-                    self.logits_processors,
-                    sampling_params,
-                )
 
                 request_id_item = f"{request_id}-{i}"
 