Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions tests/entrypoints/openai/test_chat_with_tool_reasoning.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,32 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
assert len(tool_calls.choices[0].message.reasoning_content) > 0
assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS

@pytest.mark.asyncio
async def test_stop_str_with_reasoning(client: openai.AsyncOpenAI):
    """Verify that a ``stop`` string truncates visible content, not reasoning.

    First request: with ``stop="9.8"`` the stop string may appear in
    ``reasoning_content`` (stop strings are not applied to the reasoning
    channel) but must be absent from ``content``, which is cut off before
    the stop string is emitted.

    Second request: without ``stop`` the string "9.8" appears in both
    fields, confirming the first request's truncation was caused by the
    stop string rather than the model simply never producing it.
    """
    # check that the response is correctly stopped at "9.8"
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{
            "role": "user",
            "content": "9.11 and 9.8, which is greater?"
        }],
        temperature=1.0,
        stop="9.8",
    )

    # Stop string must not truncate the reasoning channel...
    assert "9.8" in response.choices[0].message.reasoning_content
    # ...but must truncate the content channel before the stop string.
    assert "9.8" not in response.choices[0].message.content

    # check no stop string
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{
            "role": "user",
            "content": "9.11 and 9.8, which is greater?"
        }],
        temperature=1.0,
    )
    assert "9.8" in response.choices[0].message.reasoning_content
    # check that the response is not stopped at "9.8"
    assert "9.8" in response.choices[0].message.content
36 changes: 20 additions & 16 deletions tests/tokenization/test_detokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
Expand Down Expand Up @@ -60,25 +61,28 @@ def _run_incremental_decode(
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
request = EngineCoreRequest(
request_id="",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
)

request = EngineCoreRequest(request_id="",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None)
vllm_config = VllmConfig()
if fast is None:
detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
detokenizer = IncrementalDetokenizer.from_new_request(
vllm_config=vllm_config, tokenizer=tokenizer, request=request)
elif fast:
detokenizer = FastIncrementalDetokenizer(tokenizer, request)
detokenizer = FastIncrementalDetokenizer(vllm_config=vllm_config,
tokenizer=tokenizer,
request=request)
else:
detokenizer = SlowIncrementalDetokenizer(tokenizer, request)
detokenizer = SlowIncrementalDetokenizer(vllm_config=vllm_config,
tokenizer=tokenizer,
request=request)

output_text = ""
for i, token_id in enumerate(all_input_ids[starting_index:]):
Expand Down
6 changes: 4 additions & 2 deletions tests/v1/engine/test_fast_incdec_prefix_err.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from transformers import AutoTokenizer

from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
Expand All @@ -21,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
"""
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")

vllm_config = VllmConfig()
# Create a test request
prompt_token_ids = [107, 4606, 236787, 107]
params = SamplingParams(skip_special_tokens=True)
Expand All @@ -38,7 +39,8 @@ def test_fast_inc_detok_invalid_utf8_err_case():
data_parallel_rank=None,
)

detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
detokenizer = IncrementalDetokenizer.from_new_request(
vllm_config, tokenizer, request)

assert detokenizer.__class__.__name__ == "FastIncrementalDetokenizer", (
"Should use FastIncrementalDetokenizer by default"
Expand Down
75 changes: 47 additions & 28 deletions tests/v1/engine/test_output_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
MockEngineCore,
)
from vllm import PoolingParams
from vllm.config import VllmConfig, StructuredOutputsConfig
from vllm.config import DecodingConfig, VllmConfig
from vllm.logprobs import PromptLogprobs, SampleLogprobs
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
Expand All @@ -41,13 +43,17 @@ def _ref_convert_id_to_token(


@pytest.mark.parametrize(
"request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
def test_incremental_detokenization(
request_output_kind: RequestOutputKind, dummy_test_vectors
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind,
dummy_test_vectors):
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens)

# Make N requests.
requests = [
Expand Down Expand Up @@ -407,17 +413,21 @@ def _validate_logprobs(


@pytest.mark.parametrize(
"request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
@pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
@pytest.mark.parametrize("num_prompt_logprobs", [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_logprobs_processor(
request_output_kind: RequestOutputKind,
num_sample_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.parametrize("num_sample_logprobs",
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
@pytest.mark.parametrize("num_prompt_logprobs",
[None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_logprobs_processor(request_output_kind: RequestOutputKind,
num_sample_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
dummy_test_vectors):
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
Expand Down Expand Up @@ -588,8 +598,11 @@ def test_stop_token(
dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None
) # '<|end_of_text|>'
stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>'

output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
# Dummy engine core outputs, with control tokens suffixed to test stops
suffix_token = [eos_token_id] if is_eos_test else stop_token_ids
assert suffix_token is not None and isinstance(suffix_token[0], int)
Expand Down Expand Up @@ -693,13 +706,15 @@ def test_stop_token(


@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
@pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
def test_stop_string(
include_stop_str_in_output: bool,
num_sample_logprobs: Optional[int],
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
@pytest.mark.parametrize("num_sample_logprobs",
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
def test_stop_string(include_stop_str_in_output: bool,
num_sample_logprobs: Optional[int], dummy_test_vectors):
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
Expand Down Expand Up @@ -827,7 +842,11 @@ def test_stop_string(


def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()

Expand Down
6 changes: 3 additions & 3 deletions vllm/v1/engine/async_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@ def __init__(
)

# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(
self.tokenizer, log_stats=self.log_stats
)
self.output_processor = OutputProcessor(self.vllm_config,
self.tokenizer,
log_stats=self.log_stats)
if self.observability_config.otlp_traces_endpoint is not None:
tracer = init_tracer(
"vllm.llm_engine", self.observability_config.otlp_traces_endpoint
Expand Down
Loading
Loading