# Patch summary (free text before the first "diff --git" header; not part of any hunk).
#
# tests/entrypoints/openai/test_prompt_validation.py
#   Adds test_empty_prompt_list: calling completions.create with prompt=[] must
#   raise openai.BadRequestError whose message matches
#   "Either prompt or prompt_embeds must be provided and non-empty."
#
# tests/test_config.py
#   Imports enable_allreduce_rms_fusion and adds a parametrized test of it.
#   With has_flashinfer, is_cuda and is_device_capability all patched to return
#   True, the expected result is True only for TP=2, PP=1, DP=1 and False for
#   TP=1, for PP=2, and for DP=2.
#
# vllm/entrypoints/openai/completion/protocol.py
#   In validate_prompt_and_prompt_embeds, prompt_is_empty previously covered
#   None and ""; it now also covers an empty list (len(prompt) == 0 for str or
#   list).
#
# NOTE(review): line breaks and indentation were restored from a
# whitespace-collapsed copy. Hunk sizes agree with the @@ headers, but the
# indentation of context lines is inferred -- confirm against the target files
# before applying.
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index 5aff3b3c7bd9..9c5f12ecb5eb 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -37,6 +37,25 @@ async def test_empty_prompt():
             )
 
 
+@pytest.mark.asyncio
+async def test_empty_prompt_list():
+    model_name = "gpt2"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+
+        with pytest.raises(
+            openai.BadRequestError,
+            match="Either prompt or prompt_embeds must be provided and non-empty.",
+        ):
+            await client.completions.create(
+                model=model_name,
+                prompt=[],
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+
 @pytest.mark.asyncio
 async def test_out_of_vocab_token_ids():
     model_name = "gpt2"
diff --git a/tests/test_config.py b/tests/test_config.py
index f98b30f990cd..50cb5b01ec28 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -26,6 +26,7 @@
 from vllm.config.vllm import (
     OPTIMIZATION_LEVEL_TO_CONFIG,
     OptimizationLevel,
+    enable_allreduce_rms_fusion,
 )
 from vllm.platforms import current_platform
 
@@ -58,6 +59,58 @@ def test_async_scheduling_with_pipeline_parallelism_is_allowed():
     assert cfg.scheduler_config.async_scheduling is True
 
 
+@pytest.mark.parametrize(
+    ("parallel_config", "should_be_enabled"),
+    [
+        (
+            ParallelConfig(
+                tensor_parallel_size=2,
+                pipeline_parallel_size=1,
+                data_parallel_size=1,
+            ),
+            True,
+        ),
+        (
+            ParallelConfig(
+                tensor_parallel_size=1,
+                pipeline_parallel_size=1,
+                data_parallel_size=1,
+            ),
+            False,
+        ),
+        (
+            ParallelConfig(
+                tensor_parallel_size=2,
+                pipeline_parallel_size=2,
+                data_parallel_size=1,
+            ),
+            False,
+        ),
+        (
+            ParallelConfig(
+                tensor_parallel_size=2,
+                pipeline_parallel_size=1,
+                data_parallel_size=2,
+            ),
+            False,
+        ),
+    ],
+    ids=["TP-only", "No-TP", "With-PP", "With-DP"],
+)
+def test_enable_allreduce_rms_fusion_gating(
+    parallel_config: ParallelConfig,
+    should_be_enabled: bool,
+):
+    cfg = VllmConfig(parallel_config=parallel_config)
+
+    with (
+        patch("vllm.utils.flashinfer.has_flashinfer", return_value=True),
+        patch.object(current_platform, "is_cuda", return_value=True),
+        patch.object(current_platform, "is_device_capability", return_value=True),
+    ):
+        assert enable_allreduce_rms_fusion(cfg) is should_be_enabled
+
+
 @dataclass
 class _TestConfigFields:
     a: int
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index 73232ec3aa94..88a46f70a383 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -421,7 +421,9 @@ def validate_prompt_and_prompt_embeds(cls, data):
         prompt = data.get("prompt")
         prompt_embeds = data.get("prompt_embeds")
 
-        prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
+        prompt_is_empty = prompt is None or (
+            isinstance(prompt, (str, list)) and len(prompt) == 0
+        )
         embeds_is_empty = prompt_embeds is None or (
             isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
         )