7 changes: 1 addition & 6 deletions .buildkite/test-merge.yml

@@ -164,18 +164,13 @@ steps:
     volumes:
       - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Benchmark & Engine Test"
+  - label: "Engine Test"
     depends_on: upload-merge-pipeline
     commands:
       - |
         timeout 15m bash -c '
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          set +e
-          pytest -s -v tests/benchmarks/test_serve_cli.py
-          EXIT1=$$?
           pytest -s -v tests/engine/test_async_omni_engine_abort.py
-          EXIT2=$$?
-          exit $$((EXIT1 | EXIT2))
         '
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
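The removed shell lines implemented a run-both-then-aggregate pattern: `set +e` let the script continue past a failing `pytest`, each suite's exit code was captured, and `exit $$((EXIT1 | EXIT2))` (the `$$` is Buildkite's escape for `$`) returned non-zero if either suite had failed. Now that the benchmark test is a CPU-only mock test (see the `test_serve_cli.py` diff below), the step runs a single suite and can rely on the shell's normal exit status. For reference, a minimal Python sketch of the removed aggregation pattern (not part of the PR; paths as in the old step):

```python
# Sketch of the removed CI pattern: run both suites unconditionally,
# then exit non-zero if either failed (bitwise OR of the return codes).
import subprocess
import sys

exit1 = subprocess.run(
    ["pytest", "-s", "-v", "tests/benchmarks/test_serve_cli.py"]
).returncode
exit2 = subprocess.run(
    ["pytest", "-s", "-v", "tests/engine/test_async_omni_engine_abort.py"]
).returncode
sys.exit(exit1 | exit2)  # 0 only if both suites passed
```

`.buildkite/test-ready.yml` below receives the identical change.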
7 changes: 1 addition & 6 deletions .buildkite/test-ready.yml

@@ -121,18 +121,13 @@ steps:
       - "/fsx/hf_cache:/fsx/hf_cache"
 
 
-  - label: "Benchmark & Engine Test"
+  - label: "Engine Test"
    depends_on: upload-ready-pipeline
    commands:
      - |
        timeout 15m bash -c '
          export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          set +e
-          pytest -s -v tests/benchmarks/test_serve_cli.py
-          EXIT1=$$?
          pytest -s -v tests/engine/test_async_omni_engine_abort.py
-          EXIT2=$$?
-          exit $$((EXIT1 | EXIT2))
        '
    agents:
      queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
140 changes: 114 additions & 26 deletions tests/benchmarks/test_serve_cli.py

@@ -1,55 +1,143 @@
+import json
+import os
 import subprocess
+import textwrap
 from pathlib import Path
 
 import pytest
 
-from tests.conftest import OmniServerParams
-from tests.utils import hardware_test
-from vllm_omni.platforms import current_omni_platform
 
-models = ["Qwen/Qwen2.5-Omni-7B"]
-stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen2_5_omni_ci.yaml")]
+@pytest.mark.core_model
+@pytest.mark.benchmark
+@pytest.mark.cpu
+def test_bench_serve_cli_mocks_http_request(tmp_path: Path):
+    num_prompts = 5
+    port = 18000
+    result_filename = "bench-result.json"
+    calls_filename = "http-post-calls.json"
+    result_path = tmp_path / result_filename
+    calls_path = tmp_path / calls_filename
 
-if current_omni_platform.is_xpu():
-    stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen2_5_omni_ci.yaml")]
-elif current_omni_platform.is_rocm():
-    stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")]
+    sitecustomize_path = tmp_path / "sitecustomize.py"
+    sitecustomize_path.write_text(
+        textwrap.dedent(
+            """
+            import atexit
+            import json
+            import os
 
-# Create parameter combinations for model and stage config
-test_params = [
-    OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs
-]
+            POST_CALLS = []
+            SSE_CHUNKS = [
+                b'data: {"choices":[{"delta":{"content":"hi"}}],"modality":"text"}\\n\\n',
+                b'data: {"choices":[{"delta":{"content":" there"}}],"modality":"text","metrics":{"num_tokens_out":4,"num_tokens_in":5}}\\n\\n',
+                b"data: [DONE]\\n\\n",
+            ]
 
+            class _Content:
+                def __init__(self, chunks):
+                    self._chunks = chunks
 
-@pytest.mark.core_model
-@pytest.mark.benchmark
-@hardware_test(res={"cuda": "L4", "xpu": "B60", "rocm": "MI325"}, num_cards=3)
-@pytest.mark.parametrize("omni_server", test_params, indirect=True)
-def test_bench_serve_chat(omni_server):
-    command = [
+                async def iter_any(self):
+                    for chunk in self._chunks:
+                        yield chunk
+
+            class MockResponse:
+                def __init__(self):
+                    self.status = 200
+                    self.reason = "OK"
+                    self.content = _Content(SSE_CHUNKS)
+
+                async def __aenter__(self):
+                    return self
+
+                async def __aexit__(self, exc_type, exc, tb):
+                    return False
+
+            class MockClientSession:
+                def __init__(self, *args, **kwargs):
+                    pass
+
+                def post(self, url=None, *args, **kwargs):
+                    if url is not None:
+                        POST_CALLS.append(url)
+                    return MockResponse()
+
+                async def close(self):
+                    return None
+
+            import vllm_omni.benchmarks.patch.patch as patch_mod
+
+            patch_mod.aiohttp.ClientSession = MockClientSession
+            patch_mod.aiohttp.TCPConnector = lambda *args, **kwargs: object()
+
+            calls_file = os.environ.get("VLLM_OMNI_TEST_POST_CALLS_FILE")
+
+            if calls_file:
+                def _write_calls():
+                    with open(calls_file, "w", encoding="utf-8") as f:
+                        json.dump({"requested_urls": POST_CALLS}, f)
+
+                atexit.register(_write_calls)
+            """
+        ),
+        encoding="utf-8",
+    )
+
+    env = os.environ.copy()
+    env["PYTHONPATH"] = str(tmp_path) + os.pathsep + env.get("PYTHONPATH", "")
+    env["VLLM_OMNI_TEST_POST_CALLS_FILE"] = str(calls_path)
+
+    cmd = [
         "vllm",
         "bench",
         "serve",
         "--omni",
         "--model",
-        omni_server.model,
+        "Qwen/Qwen2.5-Omni-7B",
         "--port",
-        str(omni_server.port),
+        str(port),
         "--dataset-name",
         "random",
         "--random-input-len",
         "32",
         "--random-output-len",
         "4",
         "--num-prompts",
-        "5",
+        str(num_prompts),
         "--endpoint",
         "/v1/chat/completions",
         "--backend",
         "openai-chat-omni",
         "--disable-tqdm",
+        "--num-warmups",
+        "0",
+        "--ready-check-timeout-sec",
+        "0",
+        "--save-result",
+        "--result-dir",
+        str(tmp_path),
+        "--result-filename",
+        result_filename,
     ]
-    result = subprocess.run(command, capture_output=True, text=True)
-    print(result.stdout)
-    print(result.stderr)
-    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
+    proc = subprocess.run(
+        cmd,
+        cwd=str(Path(__file__).resolve().parents[2]),
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+
+    assert proc.returncode == 0, f"CLI failed: stdout={proc.stdout}\nstderr={proc.stderr}"
+    print(f"CLI output: {proc.stdout}")
+    assert result_path.exists()
+    result = json.loads(result_path.read_text(encoding="utf-8"))
+    assert calls_path.exists()
+    calls = json.loads(calls_path.read_text(encoding="utf-8"))
+
+    expected_url = f"http://127.0.0.1:{port}/v1/chat/completions"
+    requested_urls = calls["requested_urls"]
+    sent_requests = len(requested_urls)
+
+    assert result["completed"] == sent_requests == num_prompts
+    assert requested_urls
+    assert all(url == expected_url for url in requested_urls), f"Unexpected target URLs: {requested_urls}"
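The rewritten test needs no GPUs at all: instead of launching a real server, it drops a `sitecustomize.py` into a temp directory and prepends that directory to `PYTHONPATH` before spawning `vllm bench serve`. Python's `site` module imports a module named `sitecustomize` at interpreter startup if one is importable (unless the interpreter is started with `-S`), so the mock `aiohttp.ClientSession` is installed inside the benchmark subprocess before any benchmark code runs, and an `atexit` hook writes the captured POST URLs to a JSON file the parent test asserts on. A minimal, self-contained demonstration of that startup hook (illustrative strings, not from the PR):

```python
# Demonstrates the sitecustomize mechanism the test relies on: a module
# named `sitecustomize` on PYTHONPATH is imported automatically at
# interpreter startup, before the subprocess's own code executes.
import os
import subprocess
import sys
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    Path(tmp, "sitecustomize.py").write_text("print('sitecustomize ran first')\n")
    env = os.environ.copy()
    env["PYTHONPATH"] = tmp + os.pathsep + env.get("PYTHONPATH", "")
    subprocess.run(
        [sys.executable, "-c", "print('main code ran second')"],
        env=env,
        check=True,
    )
    # stdout:
    #   sitecustomize ran first
    #   main code ran second
```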
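The mocked response body is shaped like the OpenAI-style SSE stream the `openai-chat-omni` backend consumes: each frame is a `data: <json>` line terminated by a blank line, a final `data: [DONE]` sentinel ends the stream, and the last content frame carries the token-count `metrics` the benchmark aggregates. A sketch of how such frames are typically decoded (generic SSE handling, not the PR's actual client code):

```python
# Generic decoding of the mock's SSE frames: split on blank lines, strip
# the "data: " prefix, stop at the [DONE] sentinel, accumulate the deltas.
import json

raw = (
    b'data: {"choices":[{"delta":{"content":"hi"}}],"modality":"text"}\n\n'
    b'data: {"choices":[{"delta":{"content":" there"}}],"modality":"text",'
    b'"metrics":{"num_tokens_out":4,"num_tokens_in":5}}\n\n'
    b"data: [DONE]\n\n"
)

pieces = []
for frame in raw.decode("utf-8").split("\n\n"):
    if not frame.startswith("data: "):
        continue  # skip the trailing empty split
    payload = frame[len("data: "):]
    if payload == "[DONE]":
        break
    event = json.loads(payload)
    pieces.append(event["choices"][0]["delta"]["content"])

assert "".join(pieces) == "hi there"
```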