diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 6ec6ab94ff08..1def2c4682b1 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -127,7 +127,7 @@ run_and_track_test() { # --- Actual Test Execution --- run_and_track_test 1 "test_struct_output_generate.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" run_and_track_test 2 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 3 "test_lora.py" \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 82e97bfbb1b2..d2f5c4f30bf1 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -483,19 +483,6 @@ steps: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: Entrypoints V1 # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - - - label: V1 Sample + Logits # TBD timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] @@ -1173,14 +1160,14 @@ steps: - vllm/v1/engine/ - vllm/v1/worker/ - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) # TBD @@ -1770,19 +1757,6 @@ steps: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -- label: Entrypoints V1 # 25.7m - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - - - label: V1 Spec Decode # TBD timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] @@ -2395,14 +2369,14 @@ steps: - vllm/v1/engine/ - vllm/v1/worker/ - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) # 56.1m diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 6cf8b43f57c4..4e1cd1433fe6 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -27,14 +27,14 @@ steps: - vllm/v1/engine/ - vllm/v1/worker/ - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/entrypoints/openai/test_multi_api_servers.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) timeout_in_minutes: 20 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index ac6be8e141f2..0d3bbc515488 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -34,7 +34,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/test_chat_utils.py mirror: amd: @@ -75,19 +75,6 @@ steps: commands: - pytest -v -s entrypoints/openai/responses -- label: Entrypoints V1 - timeout_in_minutes: 50 - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - - label: OpenAI API Correctness timeout_in_minutes: 30 source_file_dependencies: diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 85421399d1b8..238d5956a025 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -11,7 +11,7 @@ steps: - vllm/v1/attention/ - tests/v1/engine/test_llm_engine.py - tests/v1/e2e/ - - tests/v1/entrypoints/llm/test_struct_output_generate.py + - tests/entrypoints/llm/test_struct_output_generate.py commands: - set -x - export VLLM_USE_V2_MODEL_RUNNER=1 @@ -22,7 +22,7 @@ steps: - pytest -v -s v1/e2e/general/test_context_length.py - pytest -v -s v1/e2e/general/test_min_tokens.py # Temporary hack filter to exclude ngram spec decoding based tests. - - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" + - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" - label: Model Runner V2 Examples timeout_in_minutes: 45 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b0e49432775f..c0ceae044d25 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm +/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /tests/weight_loading @mgoin @youkaichao @yewentao256 diff --git a/.github/mergify.yml b/.github/mergify.yml index 1c6837277831..eace1f479035 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -260,7 +260,7 @@ pull_request_rules: - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/structured_outputs/structured_outputs.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_struct_output_generate.py + - files=tests/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: diff --git a/tests/conftest.py b/tests/conftest.py index 719bfa5ed1f0..f3b22d898903 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,9 +6,6 @@ from tblib import pickling_support -# Import fixture -from tests.v1.entrypoints.conftest import sample_json_schema # noqa - # ruff: noqa # Install support for pickling exceptions so that we can nicely propagate @@ -81,6 +78,55 @@ logger = init_logger(__name__) + +@pytest.fixture +def sample_json_schema(): + return { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + "skills": { + "type": "array", + "items": { + "type": "string", + }, + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$", + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": {"type": "string"}, + "duration": { + "type": "number", + "minimum": 0.0, + "maximum": 100.0, + }, + "position": {"type": "string"}, + }, + "required": ["company", "duration", "position"], + "additionalProperties": False, + }, + "minItems": 0, + "maxItems": 3, + }, + }, + "required": ["name", "age", "skills", "grade", "email", "work_history"], + "additionalProperties": False, + "minProperties": 1, + "maxProperties": 10, + } + + _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/entrypoints/llm/test_struct_output_generate.py similarity index 91% rename from tests/v1/entrypoints/llm/test_struct_output_generate.py rename to tests/entrypoints/llm/test_struct_output_generate.py index 70c6d250bc1b..3ece27234368 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/entrypoints/llm/test_struct_output_generate.py @@ -24,6 +24,108 @@ StructuredOutputsParams, ) +SAMPLE_REGEX = ( + r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +) + +# Note: Ensure this only uses attributes compatible with xgrammar +SAMPLE_JSON_SCHEMA = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + "skills": { + "type": "array", + "items": { + "type": "string", + }, + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$", # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": {"type": "string"}, + "duration": { + "type": "number", + "minimum": 0.0, + "maximum": 100.0, # Numeric range + }, + "position": {"type": "string"}, + }, + "required": ["company", "duration", "position"], + "additionalProperties": False, + }, + "minItems": 0, + "maxItems": 3, + }, + }, + "required": ["name", "age", "skills", "grade", "email", "work_history"], + "additionalProperties": False, + "minProperties": 1, + "maxProperties": 10, +} + +# A schema unsupported by xgrammar +UNSUPPORTED_JSON_SCHEMA = { + "type": "object", + "properties": { + "score": { + "type": "integer", + "multipleOf": 5, # Numeric multiple + }, + "tags": { + "type": "array", + "items": {"type": "string", "minLength": 10, "maxLength": 20}, + }, + }, + "required": ["score", "tags"], + "additionalProperties": False, + "patternProperties": { + "^score$": {"type": "integer"}, + }, +} + +SAMPLE_STRUCTURED_OUTPUTS_CHOICES = [ + "Python", + "Java", + "JavaScript", + "C++", + "C#", + "PHP", + "TypeScript", + "Ruby", + "Swift", + "Kotlin", +] + +SAMPLE_SQL_EBNF = """ +root ::= select_statement +select_statement ::= "SELECT" column "from" table "where" condition +column ::= "col_1" | "col_2" +table ::= "table_1" | "table_2" +condition ::= column "=" number +number ::= "1" | "2" +""" + +SAMPLE_SQL_LARK = """ +start: select_statement +select_statement: "SELECT" column "from" table "where" condition +column: "col_1" | "col_2" +table: "table_1" | "table_2" +condition: column "=" number +number: "1" | "2" +""" + NGRAM_SPEC_CONFIG = { "model": "[ngram]", "num_speculative_tokens": 5, @@ -110,17 +212,17 @@ class CarDescription(BaseModel): PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, ) def test_structured_output( - sample_json_schema: dict[str, Any], - unsupported_json_schema: dict[str, Any], - sample_sql_ebnf: str, - sample_sql_lark: str, - sample_regex: str, - sample_structured_outputs_choices: str, backend: str, tokenizer_mode: str, model_name: str, speculative_config: dict[str, Any], ): + sample_json_schema = SAMPLE_JSON_SCHEMA + unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA + sample_sql_ebnf = SAMPLE_SQL_EBNF + sample_sql_lark = SAMPLE_SQL_LARK + sample_regex = SAMPLE_REGEX + sample_structured_outputs_choices = SAMPLE_STRUCTURED_OUTPUTS_CHOICES if current_platform.is_tpu() and speculative_config: pytest.skip("TPU does not support speculative decoding") @@ -702,10 +804,10 @@ def test_structured_output_with_reasoning_matrices( @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( - unsupported_json_schema: dict[str, Any], model_name: str, tokenizer_mode: str, ): + unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA llm = LLM( model=model_name, max_model_len=1024, @@ -808,9 +910,9 @@ def generate_with_backend(backend): @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) def test_structured_output_batched_with_non_structured_outputs_requests( - sample_json_schema: dict[str, Any], backend: str, ): + sample_json_schema = SAMPLE_JSON_SCHEMA # Don't use eager execution on TPUs because we want to test for no # recompilation at runtime enforce_eager = bool(not current_platform.is_tpu()) diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/entrypoints/openai/chat_completion/test_chat_completion.py similarity index 100% rename from tests/v1/entrypoints/openai/test_chat_completion.py rename to tests/entrypoints/openai/chat_completion/test_chat_completion.py diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py similarity index 100% rename from tests/v1/entrypoints/openai/test_completion_with_image_embeds.py rename to tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/completion/test_completion.py similarity index 100% rename from tests/v1/entrypoints/openai/test_completion.py rename to tests/entrypoints/openai/completion/test_completion.py diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/entrypoints/openai/test_multi_api_servers.py similarity index 100% rename from tests/v1/entrypoints/openai/test_multi_api_servers.py rename to tests/entrypoints/openai/test_multi_api_servers.py diff --git a/tests/v1/entrypoints/__init__.py b/tests/v1/entrypoints/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py deleted file mode 100644 index bc9674ee86cf..000000000000 --- a/tests/v1/entrypoints/conftest.py +++ /dev/null @@ -1,173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - - -@pytest.fixture -def sample_prompts(): - return [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - -@pytest.fixture -def sample_token_ids(): - return [ - [0], - [0, 1], - [0, 2, 1], - [0, 3, 1, 2], - ] - - -@pytest.fixture -def sample_regex(): - return ( - r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" - ) - - -# Note: Ensure this only uses attributes compatible with xgrammar -@pytest.fixture -def sample_json_schema(): - return { - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - "skills": { - "type": "array", - "items": { - "type": "string", - }, - }, - "grade": { - "type": "string", - "pattern": "^[A-D]$", # Regex pattern - }, - "email": { - "type": "string", - "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", - }, - "work_history": { - "type": "array", - "items": { - "type": "object", - "properties": { - "company": {"type": "string"}, - "duration": { - "type": "number", - "minimum": 0.0, - "maximum": 100.0, # Numeric range - }, - "position": {"type": "string"}, - }, - "required": ["company", "duration", "position"], - "additionalProperties": False, - }, - "minItems": 0, - "maxItems": 3, - }, - }, - "required": ["name", "age", "skills", "grade", "email", "work_history"], - "additionalProperties": False, - "minProperties": 1, - "maxProperties": 10, - } - - -# A schema unsupported by xgrammar -@pytest.fixture -def unsupported_json_schema(): - return { - "type": "object", - "properties": { - "score": { - "type": "integer", - "multipleOf": 5, # Numeric multiple - }, - "tags": { - "type": "array", - "items": {"type": "string", "minLength": 10, "maxLength": 20}, - }, - }, - "required": ["score", "tags"], - "additionalProperties": False, - "patternProperties": { - "^score$": {"type": "integer"}, - }, - } - - -@pytest.fixture -def sample_definition_json_schema(): - return { - "$defs": { - "Step": { - "properties": { - "explanation": {"title": "Explanation", "type": "string"}, - "output": {"title": "Output", "type": "string"}, - }, - "required": ["explanation", "output"], - "title": "Step", - "type": "object", - } - }, - "properties": { - "steps": { - "items": {"$ref": "#/$defs/Step"}, - "title": "Steps", - "type": "array", - }, - "final_answer": {"title": "Final Answer", "type": "string"}, - }, - "required": ["steps", "final_answer"], - "title": "MathReasoning", - "type": "object", - "additionalProperties": False, - } - - -@pytest.fixture -def sample_structured_outputs_choices(): - return [ - "Python", - "Java", - "JavaScript", - "C++", - "C#", - "PHP", - "TypeScript", - "Ruby", - "Swift", - "Kotlin", - ] - - -@pytest.fixture -def sample_sql_ebnf(): - return """ -root ::= select_statement -select_statement ::= "SELECT" column "from" table "where" condition -column ::= "col_1" | "col_2" -table ::= "table_1" | "table_2" -condition ::= column "=" number -number ::= "1" | "2" -""" - - -@pytest.fixture -def sample_sql_lark(): - return """ -start: select_statement -select_statement: "SELECT" column "from" table "where" condition -column: "col_1" | "col_2" -table: "table_1" | "table_2" -condition: column "=" number -number: "1" | "2" -""" diff --git a/tests/v1/entrypoints/llm/__init__.py b/tests/v1/entrypoints/llm/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000