Skip to content

Commit e831673

Browse files
authored
fix: timeout and broken pipe in disagg and worker tests (#5827)
Signed-off-by: zhengd-nv <[email protected]>
1 parent aeea5b3 commit e831673

File tree

2 files changed: +16 -11 lines changed

2 files changed: +16 -11 lines changed

tensorrt_llm/serve/openai_disagg_server.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ def __init__(self,
6868
async def lifespan(app: FastAPI):
6969
# Create a persistent aiohttp ClientSession
7070
self.session = aiohttp.ClientSession(
71-
connector=aiohttp.TCPConnector(limit=0, limit_per_host=0, keepalive_timeout=300),
71+
connector=aiohttp.TCPConnector(limit=0, limit_per_host=0, force_close=True),
7272
timeout=aiohttp.ClientTimeout(total=req_timeout_secs))
7373

7474
logger.info("Waiting for context and generation servers to be ready")

tests/integration/defs/disaggregated/test_workers.py

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -64,21 +64,26 @@ def run_disaggregated_workers(
6464
return workers_proc, ctx_servers, gen_servers
6565

6666

67+
DEFAULT_TIMEOUT_SERVER_START = 900
68+
DEFAULT_TIMEOUT_REQUEST = 180
69+
70+
6771
class BasicWorkerTester:
6872

6973
def __init__(self,
7074
ctx_servers: List[str],
7175
gen_servers: List[str],
72-
req_timeout_secs: int = 180,
73-
server_start_timeout_secs: int = 180):
76+
req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
77+
server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START):
7478
self.ctx_servers = ctx_servers
7579
self.gen_servers = gen_servers
7680
self.req_timeout_secs = req_timeout_secs
7781
self.server_start_timeout_secs = server_start_timeout_secs
7882

7983
async def new_session(self):
80-
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
81-
total=self.req_timeout_secs))
84+
session = aiohttp.ClientSession(
85+
connector=aiohttp.TCPConnector(force_close=True),
86+
timeout=aiohttp.ClientTimeout(total=self.req_timeout_secs))
8287
await OpenAIDisaggServer.wait_for_all_servers_ready(
8388
session, self.ctx_servers, self.gen_servers,
8489
self.server_start_timeout_secs)
@@ -146,8 +151,8 @@ class ConditionalWorkerTester(BasicWorkerTester):
146151
def __init__(self,
147152
ctx_servers: List[str],
148153
gen_servers: List[str],
149-
req_timeout_secs: int = 180,
150-
server_start_timeout_secs: int = 180,
154+
req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
155+
server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START,
151156
model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
152157
super().__init__(ctx_servers, gen_servers, req_timeout_secs,
153158
server_start_timeout_secs)
@@ -199,8 +204,8 @@ class KvCacheEventWorkerTester(BasicWorkerTester):
199204
def __init__(self,
200205
ctx_servers: List[str],
201206
gen_servers: List[str],
202-
req_timeout_secs: int = 180,
203-
server_start_timeout_secs: int = 240,
207+
req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
208+
server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START,
204209
model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
205210
model_path: Optional[str] = None):
206211
super().__init__(ctx_servers, gen_servers, req_timeout_secs,
@@ -316,8 +321,8 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
316321
def __init__(self,
317322
ctx_servers: List[str],
318323
gen_servers: List[str],
319-
req_timeout_secs: int = 180,
320-
server_start_timeout_secs: int = 180,
324+
req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
325+
server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START,
321326
model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
322327
tokens_per_block: int = 32):
323328
super().__init__(ctx_servers, gen_servers, req_timeout_secs,

0 commit comments

Comments
 (0)