diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 6f52c6e7dbe..bace0febe0a 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -146,6 +146,7 @@ def launch_disaggregated_llm(
 
     for i, port in enumerate(ctx_ports):
         env_ctx = os.environ.copy()
+        env_ctx["TRTLLM_USE_UCX_KVCACHE"] = "1"
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + ctx_total_gpus)
         env_ctx["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_range))
@@ -166,6 +167,7 @@ def launch_disaggregated_llm(
 
     for i, port in enumerate(gen_ports):
         env_gen = os.environ.copy()
+        env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1"
         gpu_range = range(current_gpu_offset,
                           current_gpu_offset + gen_total_gpus)
         env_gen["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_range))
@@ -1103,15 +1105,12 @@ def test_chunked_prefill(self):
             },
             "enable_chunked_prefill": True,
             "max_num_tokens": 256,
-            "max_batch_size":
-            1,  # max_batch_size=1 will stabilize the accuracy test result at a cost of speed
         }
         gen_server_config = {
             "cuda_graph_config": None,
             "cache_transceiver_config": {
                 "backend": "DEFAULT"
-            },
-            "max_batch_size": 1,
+            }
         }
         disaggregated_server_config = {
             "hostname": "localhost",