diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3bd5bd87fe6f..f5264e685a18 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -89,14 +89,10 @@ steps: torch_nightly: true source_file_dependencies: - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py + - tests/basic_correctness/ commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py + - pytest -v -s basic_correctness - label: Entrypoints Unit Tests # 5min timeout_in_minutes: 10 @@ -104,10 +100,9 @@ steps: fast_check: true source_file_dependencies: - vllm/entrypoints - - tests/entrypoints/ + - tests/entrypoints/unit commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/unit - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -121,12 +116,12 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min - timeout_in_minutes: 130 +- label: Entrypoints Integration Test (API Server) %N # 50min each + timeout_in_minutes: 65 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -134,12 +129,21 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - - pytest -v -s entrypoints/test_chat_utils.py + # PYTHONPATH is needed to import custom Worker extension + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s entrypoints/openai \ + --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py \ + --ignore=entrypoints/openai/test_oot_registration.py \ + --ignore=entrypoints/openai/test_tensorizer_entrypoint.py \ + --ignore=entrypoints/openai/correctness/ \ + --ignore=entrypoints/openai/test_collective_rpc.py \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 747676ac9567..d9bf99cb8d35 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -2,19 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -import torch from vllm import LLM from ...utils import create_new_process_for_each_test +pytestmark = pytest.mark.multi_gpu_test(num_gpus=2) + @pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("backend", ["mp", "ray"]) @create_new_process_for_each_test() def test_collective_rpc(tp_size, backend, monkeypatch): - if torch.cuda.device_count() < tp_size: - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") if tp_size == 1 and backend == "ray": pytest.skip("Skip duplicate test case") if tp_size == 1: diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/openai/test_chat_utils.py similarity index 99% rename from tests/entrypoints/test_chat_utils.py rename to tests/entrypoints/openai/test_chat_utils.py index ca87b3e76b3f..b48fd21eb132 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/openai/test_chat_utils.py @@ -31,8 +31,8 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer -from ..models.registry import HF_EXAMPLE_MODELS -from ..utils import VLLM_PATH +from ...models.registry import HF_EXAMPLE_MODELS +from ...utils import VLLM_PATH EXAMPLES_DIR = VLLM_PATH / "examples" diff --git a/tests/entrypoints/openai/tool_parsers/__init__.py b/tests/entrypoints/unit/__init__.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/__init__.py rename to tests/entrypoints/unit/__init__.py diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/unit/test_api_server_process_manager.py similarity index 100% rename from tests/entrypoints/test_api_server_process_manager.py rename to tests/entrypoints/unit/test_api_server_process_manager.py diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/unit/test_context.py similarity index 100% rename from tests/entrypoints/test_context.py rename to tests/entrypoints/unit/test_context.py diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/unit/test_harmony_utils.py similarity index 100% rename from tests/entrypoints/test_harmony_utils.py rename to tests/entrypoints/unit/test_harmony_utils.py diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/unit/test_renderer.py similarity index 100% rename from tests/entrypoints/test_renderer.py rename to tests/entrypoints/unit/test_renderer.py diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/unit/test_ssl_cert_refresher.py similarity index 100% rename from tests/entrypoints/test_ssl_cert_refresher.py rename to tests/entrypoints/unit/test_ssl_cert_refresher.py diff --git a/tests/entrypoints/unit/tool_parsers/__init__.py b/tests/entrypoints/unit/tool_parsers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/entrypoints/openai/tool_parsers/conftest.py b/tests/entrypoints/unit/tool_parsers/conftest.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/conftest.py rename to tests/entrypoints/unit/tool_parsers/conftest.py diff --git a/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py new file mode 100644 index 000000000000..14c9db4adbae --- /dev/null +++ b/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser +from vllm.transformers_utils.tokenizer import AnyTokenizer + + +@pytest.fixture +def qwen_tokenizer() -> AnyTokenizer: + from vllm.transformers_utils.tokenizer import get_tokenizer + + return get_tokenizer("Qwen/Qwen3-32B") + + +@pytest.fixture +def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser: + return Hermes2ProToolParser(qwen_tokenizer) + + +@pytest.fixture +def any_chat_request() -> ChatCompletionRequest: + return ChatCompletionRequest( + seed=42, + model="Qwen/Qwen3-32B", + messages=[], + ) + + +def test_hermes_parser_streaming_just_forward_text( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """This is some prior text that has nothing to do with tool calling.""" + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + delta_text = qwen_tokenizer.decode([token]) + current_text = previous_text + delta_text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + delta_messages.append(delta) + + for delta in delta_messages: + assert delta is not None + assert not delta.tool_calls + + print(delta_messages) + assert "".join([delta.content for delta in delta_messages]) == text + + +def test_hermes_parser_streaming_failure_case_bug_19056( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """ +{"name": "final_answer", "arguments": {"trigger": true}} +""" + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + text = qwen_tokenizer.decode([token]) + current_text = previous_text + text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + if delta is not None: + delta_messages.append(delta) + + assert delta_messages[0].tool_calls[0].function.name == "final_answer" + tool_call_args = "".join( + delta.tool_calls[0].function.arguments or "" for delta in delta_messages + ) + assert tool_call_args == '{"trigger": true}' + + +def test_hermes_parser_streaming( + qwen_tokenizer: AnyTokenizer, + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = '\ +{"name": "get_current_temperature",\ +"arguments": {"location":\ +"San Francisco, California, United States", "unit": "celsius"}}\ +' + + tokens = qwen_tokenizer.encode(text) + previous_text = "" + delta_messages = [] + for token in tokens: + text = qwen_tokenizer.decode([token]) + current_text = previous_text + text + delta = hermes_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=any_chat_request, + ) + previous_text = current_text + if delta is not None: + delta_messages.append(delta) + print(delta_messages) + assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature" + tool_call_args = "".join( + delta.tool_calls[0].function.arguments or "" for delta in delta_messages + ) + assert tool_call_args == ( + '{"location":"San Francisco, California, United States", "unit": "celsius"}' + ) + + +def test_hermes_parser_non_streaming_no_tool_call( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """This is not a tool call.""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert not tool_call.tools_called + + +def test_hermes_parser_non_streaming_tool_call_between_tags( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """ +{"name": "final_answer", "arguments": {"trigger": true}} +""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert tool_call.tools_called + assert tool_call.tool_calls[0].function.name == "final_answer" + assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' + + +def test_hermes_parser_non_streaming_tool_call_until_eos( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + text = """ +{"name": "final_answer", "arguments": {"trigger": true}}""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert tool_call.tools_called + assert tool_call.tool_calls[0].function.name == "final_answer" + assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' + + +def test_hermes_parser_non_streaming_tool_call_invalid_json( + hermes_parser: Hermes2ProToolParser, + any_chat_request: ChatCompletionRequest, +) -> None: + # Missing closing brace to trigger exception + text = """ +{"name": "final_answer", "arguments": {"trigger": true}""" + tool_call = hermes_parser.extract_tool_calls( + model_output=text, + request=any_chat_request, + ) + + assert tool_call is not None + assert not tool_call.tools_called diff --git a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_hunyuan_a13b_tool_parser.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py rename to tests/entrypoints/unit/tool_parsers/test_hunyuan_a13b_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_llama3_json_tool_parser.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py rename to tests/entrypoints/unit/tool_parsers/test_llama3_json_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_llama4_pythonic_tool_parser.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py rename to tests/entrypoints/unit/tool_parsers/test_llama4_pythonic_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_olmo3_tool_parser.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py rename to tests/entrypoints/unit/tool_parsers/test_olmo3_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_pythonic_tool_parser.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py rename to tests/entrypoints/unit/tool_parsers/test_pythonic_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/unit/tool_parsers/utils.py similarity index 100% rename from tests/entrypoints/openai/tool_parsers/utils.py rename to tests/entrypoints/unit/tool_parsers/utils.py diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/tool_use/test_hermes_tool_parser.py similarity index 57% rename from tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py rename to tests/tool_use/test_hermes_tool_parser.py index 38008dafe32b..e396ab5d8dbb 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/tool_use/test_hermes_tool_parser.py @@ -5,11 +5,7 @@ import pytest -from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser -from vllm.transformers_utils.tokenizer import AnyTokenizer - -from ....utils import RemoteOpenAIServer +from ..utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci" @@ -267,194 +263,3 @@ async def test_streaming_product_tool_call(): print("\n[Streaming Product Test Passed]") print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") print(f"Reconstructed Arguments: {arguments}") - - -@pytest.fixture -def qwen_tokenizer() -> AnyTokenizer: - from vllm.transformers_utils.tokenizer import get_tokenizer - - return get_tokenizer("Qwen/Qwen3-32B") - - -@pytest.fixture -def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser: - return Hermes2ProToolParser(qwen_tokenizer) - - -@pytest.fixture -def any_chat_request() -> ChatCompletionRequest: - return ChatCompletionRequest( - seed=42, - model="Qwen/Qwen3-32B", - messages=[], - ) - - -def test_hermes_parser_streaming_just_forward_text( - qwen_tokenizer: AnyTokenizer, - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - text = """This is some prior text that has nothing to do with tool calling.""" - tokens = qwen_tokenizer.encode(text) - previous_text = "" - delta_messages = [] - for token in tokens: - delta_text = qwen_tokenizer.decode([token]) - current_text = previous_text + delta_text - delta = hermes_parser.extract_tool_calls_streaming( - previous_text=previous_text, - current_text=current_text, - delta_text=delta_text, - previous_token_ids=[], - current_token_ids=[], - delta_token_ids=[], - request=any_chat_request, - ) - previous_text = current_text - delta_messages.append(delta) - - for delta in delta_messages: - assert delta is not None - assert not delta.tool_calls - - print(delta_messages) - assert "".join([delta.content for delta in delta_messages]) == text - - -def test_hermes_parser_streaming_failure_case_bug_19056( - qwen_tokenizer: AnyTokenizer, - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - text = """ -{"name": "final_answer", "arguments": {"trigger": true}} -""" - tokens = qwen_tokenizer.encode(text) - previous_text = "" - delta_messages = [] - for token in tokens: - text = qwen_tokenizer.decode([token]) - current_text = previous_text + text - delta = hermes_parser.extract_tool_calls_streaming( - previous_text=previous_text, - current_text=current_text, - delta_text=text, - previous_token_ids=[], - current_token_ids=[], - delta_token_ids=[], - request=any_chat_request, - ) - previous_text = current_text - if delta is not None: - delta_messages.append(delta) - - assert delta_messages[0].tool_calls[0].function.name == "final_answer" - tool_call_args = "".join( - delta.tool_calls[0].function.arguments or "" for delta in delta_messages - ) - assert tool_call_args == '{"trigger": true}' - - -def test_hermes_parser_streaming( - qwen_tokenizer: AnyTokenizer, - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - text = '\ -{"name": "get_current_temperature",\ -"arguments": {"location":\ -"San Francisco, California, United States", "unit": "celsius"}}\ -' - - tokens = qwen_tokenizer.encode(text) - previous_text = "" - delta_messages = [] - for token in tokens: - text = qwen_tokenizer.decode([token]) - current_text = previous_text + text - delta = hermes_parser.extract_tool_calls_streaming( - previous_text=previous_text, - current_text=current_text, - delta_text=text, - previous_token_ids=[], - current_token_ids=[], - delta_token_ids=[], - request=any_chat_request, - ) - previous_text = current_text - if delta is not None: - delta_messages.append(delta) - print(delta_messages) - assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature" - tool_call_args = "".join( - delta.tool_calls[0].function.arguments or "" for delta in delta_messages - ) - assert tool_call_args == ( - '{"location":"San Francisco, California, United States", "unit": "celsius"}' - ) - - -def test_hermes_parser_non_streaming_no_tool_call( - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - text = """This is not a tool call.""" - tool_call = hermes_parser.extract_tool_calls( - model_output=text, - request=any_chat_request, - ) - - assert tool_call is not None - assert not tool_call.tools_called - - -def test_hermes_parser_non_streaming_tool_call_between_tags( - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - text = """ -{"name": "final_answer", "arguments": {"trigger": true}} -""" - tool_call = hermes_parser.extract_tool_calls( - model_output=text, - request=any_chat_request, - ) - - assert tool_call is not None - assert tool_call.tools_called - assert tool_call.tool_calls[0].function.name == "final_answer" - assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' - - -def test_hermes_parser_non_streaming_tool_call_until_eos( - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - text = """ -{"name": "final_answer", "arguments": {"trigger": true}}""" - tool_call = hermes_parser.extract_tool_calls( - model_output=text, - request=any_chat_request, - ) - - assert tool_call is not None - assert tool_call.tools_called - assert tool_call.tool_calls[0].function.name == "final_answer" - assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}' - - -def test_hermes_parser_non_streaming_tool_call_invalid_json( - hermes_parser: Hermes2ProToolParser, - any_chat_request: ChatCompletionRequest, -) -> None: - # Missing closing brace to trigger exception - text = """ -{"name": "final_answer", "arguments": {"trigger": true}""" - tool_call = hermes_parser.extract_tool_calls( - model_output=text, - request=any_chat_request, - ) - - assert tool_call is not None - assert not tool_call.tools_called