vllm-project · robellliu-dev · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 9, 2026
@@ -37,6 +37,25 @@ async def test_empty_prompt():
             )
 
 
+@pytest.mark.asyncio
+async def test_empty_prompt_list():  # a completion request with prompt=[] must raise openai.BadRequestError
+    model_name = "gpt2"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:  # server is scoped to this `with` block
+        client = remote_server.get_async_client()
+
+        with pytest.raises(
+            openai.BadRequestError,
+            match="Either prompt or prompt_embeds must be provided and non-empty.",  # pytest treats `match` as a regex, so "." matches any character
+        ):
+            await client.completions.create(
+                model=model_name,
+                prompt=[],  # the case under test: an empty list rather than None or ""
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+
 @pytest.mark.asyncio
 async def test_out_of_vocab_token_ids():
     model_name = "gpt2"

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -26,6 +26,7 @@
 from vllm.config.vllm import (
     OPTIMIZATION_LEVEL_TO_CONFIG,
     OptimizationLevel,
+    enable_allreduce_rms_fusion,
 )
 from vllm.platforms import current_platform
 
@@ -58,6 +59,58 @@ def test_async_scheduling_with_pipeline_parallelism_is_allowed():
     assert cfg.scheduler_config.async_scheduling is True
 
 
+@pytest.mark.parametrize(
+    ("parallel_config", "should_be_enabled"),
+    [  # NOTE: these ParallelConfig objects are constructed when the decorator is evaluated, not once per test run
+        (
+            ParallelConfig(  # case "TP-only"
+                tensor_parallel_size=2,
+                pipeline_parallel_size=1,
+                data_parallel_size=1,
+            ),
+            True,  # the only case expected to report fusion as enabled
+        ),
+        (
+            ParallelConfig(  # case "No-TP"
+                tensor_parallel_size=1,
+                pipeline_parallel_size=1,
+                data_parallel_size=1,
+            ),
+            False,
+        ),
+        (
+            ParallelConfig(  # case "With-PP"
+                tensor_parallel_size=2,
+                pipeline_parallel_size=2,
+                data_parallel_size=1,
+            ),
+            False,
+        ),
+        (
+            ParallelConfig(  # case "With-DP"
+                tensor_parallel_size=2,
+                pipeline_parallel_size=1,
+                data_parallel_size=2,
+            ),
+            False,
+        ),
+    ],
+    ids=["TP-only", "No-TP", "With-PP", "With-DP"],  # ids are matched to the cases above by position
+)
+def test_enable_allreduce_rms_fusion_gating(  # checks enable_allreduce_rms_fusion(cfg) against the expected flag for each case
+    parallel_config: ParallelConfig,
+    should_be_enabled: bool,
+):
+    cfg = VllmConfig(parallel_config=parallel_config)  # built before the patches below become active
+
+    with (  # force the flashinfer and platform checks to return True; presumably so only parallel_config decides the result
+        patch("vllm.utils.flashinfer.has_flashinfer", return_value=True),  # NOTE(review): only effective if the function resolves has_flashinfer from vllm.utils.flashinfer at call time - confirm
+        patch.object(current_platform, "is_cuda", return_value=True),
+        patch.object(current_platform, "is_device_capability", return_value=True),
+    ):
+        assert enable_allreduce_rms_fusion(cfg) is should_be_enabled  # `is` requires a real bool, not merely a truthy or falsy value
+
+
 @dataclass
 class _TestConfigFields:
     a: int

@@ -421,7 +421,9 @@ def validate_prompt_and_prompt_embeds(cls, data):
         prompt = data.get("prompt")
         prompt_embeds = data.get("prompt_embeds")
 
-        prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
+        prompt_is_empty = prompt is None or (
+            isinstance(prompt, (str, list)) and len(prompt) == 0
+        )
         embeds_is_empty = prompt_embeds is None or (
             isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
         )