Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions tests/distributed/test_pipeline_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,9 +319,6 @@ def _compare_tp(
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp":
pp_env = None
else:
Expand Down
34 changes: 3 additions & 31 deletions tests/entrypoints/instrumentator/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> @pytest.mark.parametrize(
>>> "server_args",
>>> [
>>> ["--disable-frontend-multiprocessing"],
>>> ["--max-model-len", "10100"],
>>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice",
Expand All @@ -40,7 +40,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> ...

This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing`
- `--max-model-len 10100`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.

"""
Expand Down Expand Up @@ -79,17 +79,6 @@ async def client(server):
yield async_client


@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
response = requests.get(server.url_for("version"))
Expand All @@ -98,17 +87,6 @@ async def test_show_version(server: RemoteOpenAIServer):
assert response.json() == {"version": VLLM_VERSION}


@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
response = requests.get(server.url_for("health"))
Expand All @@ -119,13 +97,7 @@ async def test_check_health(server: RemoteOpenAIServer):
@pytest.mark.parametrize(
"server_args",
[
pytest.param(
["--max-model-len", "10100"], id="default-frontend-multiprocessing"
),
pytest.param(
["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
id="disable-frontend-multiprocessing",
),
pytest.param(["--max-model-len", "10100"]),
],
indirect=True,
)
Expand Down
1 change: 0 additions & 1 deletion tests/entrypoints/instrumentator/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def default_server_args():
params=[
"",
"--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner):
return [_encode_embeds(item) for item in example_embeddings]


@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
if request.param:
default_server_args.append(request.param)

@pytest.fixture(scope="module")
def server_with_prompt_embeds(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server

Expand Down
1 change: 0 additions & 1 deletion tests/entrypoints/openai/completion/test_shutdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure():
"0.05",
"--max-num-seqs",
"2",
"--disable-frontend-multiprocessing",
],
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
# stdout/stderr pipes are enabled during ROCm GPU initialization.
Expand Down
13 changes: 3 additions & 10 deletions tests/v1/entrypoints/openai/test_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,12 @@ def default_server_args():
"128",
"--enforce-eager",
"--enable-prompt-tokens-details",
"--no-enable-prefix-caching",
]


@pytest.fixture(
scope="module",
params=[
["--no-enable-prefix-caching"],
["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
],
)
def server(default_server_args, request):
if request.param:
default_server_args = default_server_args + request.param
@pytest.fixture(scope="module")
def server(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server

Expand Down
9 changes: 0 additions & 9 deletions vllm/benchmarks/throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@ async def run_vllm_async(
n: int,
engine_args: AsyncEngineArgs,
do_profile: bool,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
from vllm import SamplingParams
Expand All @@ -191,7 +190,6 @@ async def run_vllm_async(

async with build_async_engine_client_from_engine_args(
engine_args,
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
) as llm:
model_config = llm.model_config
assert all(
Expand Down Expand Up @@ -757,12 +755,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=False,
help="Use vLLM async engine rather than LLM class.",
)
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
Expand Down Expand Up @@ -880,7 +872,6 @@ def main(args: argparse.Namespace):
requests,
args.n,
AsyncEngineArgs.from_cli_args(args),
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
)
Expand Down
9 changes: 0 additions & 9 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ async def build_async_engine_client(
args: Namespace,
*,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
disable_frontend_multiprocessing: bool | None = None,
client_config: dict[str, Any] | None = None,
) -> AsyncIterator[EngineClient]:
if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver":
Expand All @@ -98,13 +97,9 @@ async def build_async_engine_client(
engine_args._api_process_count = client_config.get("client_count", 1)
engine_args._api_process_rank = client_config.get("client_index", 0)

if disable_frontend_multiprocessing is None:
disable_frontend_multiprocessing = bool(args.disable_frontend_multiprocessing)

async with build_async_engine_client_from_engine_args(
engine_args,
usage_context=usage_context,
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
client_config=client_config,
) as engine:
yield engine
Expand All @@ -115,7 +110,6 @@ async def build_async_engine_client_from_engine_args(
engine_args: AsyncEngineArgs,
*,
usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
disable_frontend_multiprocessing: bool = False,
client_config: dict[str, Any] | None = None,
) -> AsyncIterator[EngineClient]:
"""
Expand All @@ -129,9 +123,6 @@ async def build_async_engine_client_from_engine_args(
# Create the EngineConfig (determines if we can use V1).
vllm_config = engine_args.create_engine_config(usage_context=usage_context)

if disable_frontend_multiprocessing:
logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.")

from vllm.v1.engine.async_llm import AsyncLLM

async_llm: AsyncLLM | None = None
Expand Down
3 changes: 0 additions & 3 deletions vllm/entrypoints/openai/cli_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,6 @@ class BaseFrontendArgs:
"""When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use."""
Expand Down
1 change: 0 additions & 1 deletion vllm/entrypoints/openai/run_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,6 @@ async def main(args: Namespace):
async with build_async_engine_client(
args,
usage_context=UsageContext.OPENAI_BATCH_RUNNER,
disable_frontend_multiprocessing=False,
) as engine_client:
await run_batch(engine_client, args)

Expand Down
Loading