4 changes: 2 additions & 2 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -62,7 +62,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

wait_for_server 8100
wait_for_server 8200
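Reviewer note: the two launches in this hunk form a single transfer group — rank 0 is the KV producer, rank 1 the consumer, and kv_parallel_size=2 counts both ends. A minimal sketch of how the paired JSON payloads relate, using only keys visible in the diff (the helper function itself is hypothetical):

# Hypothetical helper showing that the two --kv-transfer-config payloads
# above differ only in kv_role and kv_rank; all keys come from the diff.
import json

def kv_transfer_config(role: str, rank: int) -> str:
    assert role in ("kv_producer", "kv_consumer")
    return json.dumps({
        "kv_connector": "P2pNcclConnector",
        "kv_role": role,
        "kv_rank": rank,            # producer: 0, consumer: 1
        "kv_parallel_size": 2,      # one producer + one consumer
        "kv_buffer_size": 5e9,
    })

producer_cfg = kv_transfer_config("kv_producer", 0)   # GPU 0 instance
consumer_cfg = kv_transfer_config("kv_consumer", 1)   # GPU 1 instance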
4 changes: 2 additions & 2 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -69,7 +69,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

wait_for_server 8100
wait_for_server 8200
12 changes: 6 additions & 6 deletions examples/offline_inference/disaggregated_prefill.py
@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

-# Using PyNcclConnector to transmit KV caches between vLLM instances.
+# Using P2pNcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
-# as required for PyNcclConnector.
+# as required for P2pNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_connector="P2pNcclConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2,
@@ -74,12 +74,12 @@ def run_decode(prefill_done):
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)

-# Using PyNcclConnector to transmit KV caches between vLLM instances.
+# Using P2pNcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
-# as required for PyNcclConnector.
+# as required for P2pNcclConnector.
ktc = KVTransferConfig(
kv_connector="PyNcclConnector",
kv_connector="P2pNcclConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2,
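Reviewer note: for context, the ktc objects built above are handed to the LLM constructor later in this example. A sketch of that wiring, assuming the kv_transfer_config keyword and the example's surrounding names (model string, prompts, sampling_params, prefill_done are placeholders from the file, not confirmed here):

# Sketch of the prefill side's remaining steps; assumes LLM(...) accepts
# kv_transfer_config, as in the example file this hunk modifies.
from vllm import LLM

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model
    kv_transfer_config=ktc,
)
llm.generate(prompts, sampling_params)  # max_tokens=1: prefill only
prefill_done.set()  # unblock the decode process once KV caches exist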
4 changes: 2 additions & 2 deletions examples/online_serving/disaggregated_prefill.sh
@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--trust-remote-code \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

# wait until prefill and decode instances are ready
wait_for_server 8100
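Reviewer note: once both servers are up, the disaggregated flow is prompt -> prefill instance (max_tokens=1, producing KV caches) -> decode instance (full generation). A hedged client sketch of that flow; the ports come from the script, while the model name and token counts are placeholders, and in practice a proxy performs these two hops:

# Illustrative two-hop client for the servers launched above; not part
# of the script, which normally fronts the pair with a proxy.
import requests

payload = {
    "model": "$MODEL_NAME",        # placeholder; match the served model
    "prompt": "San Francisco is a",
    "max_tokens": 1,               # prefill hop: produce KV caches only
}
requests.post("http://localhost:8100/v1/completions", json=payload)

payload["max_tokens"] = 64         # decode hop: generate the completion
resp = requests.post("http://localhost:8200/v1/completions", json=payload)
print(resp.json()["choices"][0]["text"])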
2 changes: 1 addition & 1 deletion tests/kv_transfer/test_lookup_buffer.py
@@ -128,7 +128,7 @@ def stress_test(my_rank, buf, device):
print(f"initialized! My rank is {my_rank}")

config = KVTransferConfig(
-kv_connector='PyNcclConnector',
+kv_connector='P2pNcclConnector',
kv_buffer_device='cuda',
kv_buffer_size=1e9,
kv_rank=my_rank,
2 changes: 1 addition & 1 deletion tests/kv_transfer/test_send_recv.py
@@ -137,7 +137,7 @@ def latency_test(my_rank, pipe, nelement, ntensor):
)

config = KVTransferConfig(
-kv_connector='PyNcclConnector',
+kv_connector='P2pNcclConnector',
kv_buffer_device='cuda',
kv_buffer_size=1e9,
kv_rank=my_rank,
2 changes: 1 addition & 1 deletion vllm/config/__init__.py
@@ -3247,7 +3247,7 @@ class KVTransferConfig:

kv_parallel_size: int = 1
"""The number of parallel instances for KV cache transfer. For
-PyNcclConnector, this should be 2."""
+P2pNcclConnector, this should be 2."""

kv_ip: str = "127.0.0.1"
"""The KV connector ip, used to build distributed connection."""
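Reviewer note: concretely, the updated docstring means that for this connector the only valid pairing under kv_parallel_size=2 is one producer at kv_rank 0 and one consumer at kv_rank 1. A sketch of the two configs, with keyword names taken from the tests and examples in this diff:

# The two KVTransferConfig instances implied by kv_parallel_size=2;
# field names mirror the tests/examples touched by this change.
from vllm.config import KVTransferConfig

producer = KVTransferConfig(
    kv_connector="P2pNcclConnector", kv_role="kv_producer",
    kv_rank=0, kv_parallel_size=2,
)
consumer = KVTransferConfig(
    kv_connector="P2pNcclConnector", kv_role="kv_consumer",
    kv_rank=1, kv_parallel_size=2,
)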