4 changes: 2 additions & 2 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -62,7 +62,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

wait_for_server 8100
wait_for_server 8200
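Reviewer note: the two launches in this hunk form a single transfer group — rank 0 is the KV producer, rank 1 the consumer, and kv_parallel_size=2 counts both ends. A minimal sketch of how the paired JSON payloads relate, using only keys visible in the diff (the helper function itself is hypothetical):

# Hypothetical helper showing that the two --kv-transfer-config payloads
# above differ only in kv_role and kv_rank; all keys come from the diff.
import json

def kv_transfer_config(role: str, rank: int) -> str:
    assert role in ("kv_producer", "kv_consumer")
    return json.dumps({
        "kv_connector": "P2pNcclConnector",
        "kv_role": role,
        "kv_rank": rank,            # producer: 0, consumer: 1
        "kv_parallel_size": 2,      # one producer + one consumer
        "kv_buffer_size": 5e9,
    })

producer_cfg = kv_transfer_config("kv_producer", 0)   # GPU 0 instance
consumer_cfg = kv_transfer_config("kv_consumer", 1)   # GPU 1 instance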
4 changes: 2 additions & 2 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -69,7 +69,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

wait_for_server 8100
wait_for_server 8200
12 changes: 6 additions & 6 deletions examples/offline_inference/disaggregated_prefill.py
@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

-# Using PyNcclConnector to transmit KV caches between vLLM instances.
+# Using P2pNcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
-# as required for PyNcclConnector.
+# as required for P2pNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_connector="P2pNcclConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2,
@@ -74,12 +74,12 @@ def run_decode(prefill_done):
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)

-# Using PyNcclConnector to transmit KV caches between vLLM instances.
+# Using P2pNcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
-# as required for PyNcclConnector.
+# as required for P2pNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_connector="P2pNcclConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2,
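Reviewer note: for context, the ktc objects built above are handed to the LLM constructor later in this example. A sketch of that wiring, assuming the kv_transfer_config keyword and the example's surrounding names (model string, prompts, sampling_params, prefill_done are placeholders from the file, not confirmed here):

# Sketch of the prefill side's remaining steps; assumes LLM(...) accepts
# kv_transfer_config, as in the example file this hunk modifies.
from vllm import LLM

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model
    kv_transfer_config=ktc,
)
llm.generate(prompts, sampling_params)  # max_tokens=1: prefill only
prefill_done.set()  # unblock the decode process once KV caches exist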
4 changes: 2 additions & 2 deletions examples/online_serving/disaggregated_prefill.sh
@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

# wait until prefill and decode instances are ready
wait_for_server 8100
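Reviewer note: once both servers are up, the disaggregated flow is prompt -> prefill instance (max_tokens=1, producing KV caches) -> decode instance (full generation). A hedged client sketch of that flow; the ports come from the script, while the model name and token counts are placeholders, and in practice a proxy performs these two hops:

# Illustrative two-hop client for the servers launched above; not part
# of the script, which normally fronts the pair with a proxy.
import requests

payload = {
    "model": "$MODEL_NAME",        # placeholder; match the served model
    "prompt": "San Francisco is a",
    "max_tokens": 1,               # prefill hop: produce KV caches only
}
requests.post("http://localhost:8100/v1/completions", json=payload)

payload["max_tokens"] = 64         # decode hop: generate the completion
resp = requests.post("http://localhost:8200/v1/completions", json=payload)
print(resp.json()["choices"][0]["text"])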
2 changes: 1 addition & 1 deletion tests/kv_transfer/test_lookup_buffer.py
@@ -128,7 +128,7 @@ def stress_test(my_rank, buf, device):
print(f"initialized! My rank is {my_rank}")

config = KVTransferConfig(
-kv_connector='PyNcclConnector',
+kv_connector='P2pNcclConnector',
kv_buffer_device='cuda',
kv_buffer_size=1e9,
kv_rank=my_rank,
2 changes: 1 addition & 1 deletion tests/kv_transfer/test_send_recv.py
@@ -137,7 +137,7 @@ def latency_test(my_rank, pipe, nelement, ntensor):
)

config = KVTransferConfig(
-kv_connector='PyNcclConnector',
+kv_connector='P2pNcclConnector',
kv_buffer_device='cuda',
kv_buffer_size=1e9,
kv_rank=my_rank,
2 changes: 1 addition & 1 deletion vllm/config/__init__.py
@@ -3247,7 +3247,7 @@ class KVTransferConfig:

kv_parallel_size: int = 1
"""The number of parallel instances for KV cache transfer. For
-PyNcclConnector, this should be 2."""
+P2pNcclConnector, this should be 2."""

kv_ip: str = "127.0.0.1"
"""The KV connector ip, used to build distributed connection."""
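Reviewer note: concretely, the updated docstring means that for this connector the only valid pairing under kv_parallel_size=2 is one producer at kv_rank 0 and one consumer at kv_rank 1. A sketch of the two configs, with keyword names taken from the tests and examples in this diff:

# The two KVTransferConfig instances implied by kv_parallel_size=2;
# field names mirror the tests/examples touched by this change.
from vllm.config import KVTransferConfig

producer = KVTransferConfig(
    kv_connector="P2pNcclConnector", kv_role="kv_producer",
    kv_rank=0, kv_parallel_size=2,
)
consumer = KVTransferConfig(
    kv_connector="P2pNcclConnector", kv_role="kv_consumer",
    kv_rank=1, kv_parallel_size=2,
)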