189 changes: 0 additions & 189 deletions tests/serve/test_dynamo_serve.py
@@ -25,7 +25,6 @@
DeploymentGraph,
Payload,
chat_completions_response_handler,
completions_response_handler,
)
from tests.utils.managed_process import ManagedProcess

@@ -56,106 +55,7 @@
expected_response=["bus"],
)

text_payload = Payload(
payload_chat={
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
"role": "user",
"content": text_prompt, # Shorter prompt
}
],
"max_tokens": 150, # Reduced from 500
"temperature": 0.1,
# "seed": 0,
},
payload_completions={
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"prompt": text_prompt,
"max_tokens": 150,
"temperature": 0.1,
# "seed": 0,
},
repeat_count=10,
expected_log=[],
expected_response=["AI"],
)

deployment_graphs = {
"agg": (
DeploymentGraph(
module="graphs.agg:Frontend",
config="configs/agg.yaml",
directory="/workspace/examples/llm",
endpoints=["v1/chat/completions"],
response_handlers=[
chat_completions_response_handler,
],
marks=[pytest.mark.gpu_1, pytest.mark.vllm],
),
text_payload,
),
"sglang_agg": (
DeploymentGraph(
module="graphs.agg:Frontend",
config="configs/agg.yaml",
directory="/workspace/examples/sglang",
endpoints=["v1/chat/completions", "v1/completions"],
response_handlers=[
chat_completions_response_handler,
completions_response_handler,
],
marks=[pytest.mark.gpu_1, pytest.mark.sglang],
),
text_payload,
),
"disagg": (
DeploymentGraph(
module="graphs.disagg:Frontend",
config="configs/disagg.yaml",
directory="/workspace/examples/llm",
endpoints=["v1/chat/completions"],
response_handlers=[
chat_completions_response_handler,
],
marks=[pytest.mark.gpu_2, pytest.mark.vllm],
),
text_payload,
),
"agg_router": (
DeploymentGraph(
module="graphs.agg_router:Frontend",
config="configs/agg_router.yaml",
directory="/workspace/examples/llm",
endpoints=["v1/chat/completions"],
response_handlers=[
chat_completions_response_handler,
],
marks=[pytest.mark.gpu_1, pytest.mark.vllm],
# FIXME: This is a hack to allow deployments to start before sending any requests.
# When using KV-router, if all the endpoints are not registered, the service
# enters a non-recoverable state.
delayed_start=120,
),
text_payload,
),
"disagg_router": (
DeploymentGraph(
module="graphs.disagg_router:Frontend",
config="configs/disagg_router.yaml",
directory="/workspace/examples/llm",
endpoints=["v1/chat/completions"],
response_handlers=[
chat_completions_response_handler,
],
marks=[pytest.mark.gpu_2, pytest.mark.vllm],
# FIXME: This is a hack to allow deployments to start before sending any requests.
# When using KV-router, if all the endpoints are not registered, the service
# enters a non-recoverable state.
delayed_start=120,
),
text_payload,
),
"multimodal_agg": (
DeploymentGraph(
module="graphs.agg:Frontend",
@@ -169,84 +69,6 @@
),
multimodal_payload,
),
"vllm_v1_agg": (
DeploymentGraph(
module="graphs.agg:Frontend",
config="configs/agg.yaml",
directory="/workspace/examples/vllm_v1",
endpoints=["v1/chat/completions", "v1/completions"],
response_handlers=[
chat_completions_response_handler,
completions_response_handler,
],
marks=[pytest.mark.gpu_1, pytest.mark.vllm],
),
text_payload,
),
"trtllm_agg": (
DeploymentGraph(
module="graphs.agg:Frontend",
config="configs/agg.yaml",
directory="/workspace/examples/tensorrt_llm",
endpoints=["v1/chat/completions", "v1/completions"],
response_handlers=[
chat_completions_response_handler,
completions_response_handler,
],
marks=[pytest.mark.gpu_1, pytest.mark.tensorrtllm],
),
text_payload,
),
"trtllm_agg_router": (
DeploymentGraph(
module="graphs.agg:Frontend",
config="configs/agg_router.yaml",
directory="/workspace/examples/tensorrt_llm",
endpoints=["v1/chat/completions", "v1/completions"],
response_handlers=[
chat_completions_response_handler,
completions_response_handler,
],
marks=[pytest.mark.gpu_1, pytest.mark.tensorrtllm],
# FIXME: This is a hack to allow deployments to start before sending any requests.
# When using KV-router, if all the endpoints are not registered, the service
# enters a non-recoverable state.
delayed_start=120,
),
text_payload,
),
"trtllm_disagg": (
DeploymentGraph(
module="graphs.disagg:Frontend",
config="configs/disagg.yaml",
directory="/workspace/examples/tensorrt_llm",
endpoints=["v1/chat/completions", "v1/completions"],
response_handlers=[
chat_completions_response_handler,
completions_response_handler,
],
marks=[pytest.mark.gpu_2, pytest.mark.tensorrtllm],
),
text_payload,
),
"trtllm_disagg_router": (
DeploymentGraph(
module="graphs.disagg:Frontend",
config="configs/disagg_router.yaml",
directory="/workspace/examples/tensorrt_llm",
endpoints=["v1/chat/completions", "v1/completions"],
response_handlers=[
chat_completions_response_handler,
completions_response_handler,
],
marks=[pytest.mark.gpu_2, pytest.mark.tensorrtllm],
# FIXME: This is a hack to allow deployments to start before sending any requests.
# When using KV-router, if all the endpoints are not registered, the service
# enters a non-recoverable state.
delayed_start=120,
),
text_payload,
),
}


@@ -394,17 +216,6 @@ def wait_for_ready(self, payload, logger=logging.getLogger()):
@pytest.fixture(
params=[
pytest.param("multimodal_agg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]),
pytest.param("trtllm_agg", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_1]),
pytest.param(
"trtllm_agg_router", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_1]
),
pytest.param(
"trtllm_disagg", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_2]
),
pytest.param(
"trtllm_disagg_router", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_2]
),
# pytest.param("sglang", marks=[pytest.mark.sglang, pytest.mark.gpu_2]),
]
)
def deployment_graph_test(request):
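
For context on the removed `delayed_start=120` entries: the FIXME comments in the deleted configurations note that a KV-router deployment enters a non-recoverable state if requests arrive before all endpoints have registered. A minimal sketch of how a test harness might honor that field — the `delayed_start` attribute comes from the removed `DeploymentGraph` entries, but this helper itself is hypothetical, not the repository's actual implementation:

```python
import logging
import time


def wait_before_requests(graph, logger=logging.getLogger()):
    # Hypothetical helper: sleep for the graph's configured delayed_start
    # (seconds) so all endpoints can register with the KV-router before
    # the first request is sent, avoiding the non-recoverable state the
    # removed FIXME comments describe.
    delay = getattr(graph, "delayed_start", 0) or 0
    if delay:
        logger.info("Delaying requests by %ds so endpoints can register", delay)
        time.sleep(delay)
```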