Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions python/sglang/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
remove_prefix,
set_ulimit,
)
from sglang.srt.utils.network import NetworkAddress

_ROUTING_KEY_HEADER = "X-SMG-Routing-Key"

Expand Down Expand Up @@ -1726,10 +1727,16 @@ def run_benchmark(args_: argparse.Namespace):
"truss": 8080,
}.get(args.backend, 30000)

# Build base URL with proper IPv6 bracket wrapping (only when base_url is not provided)
if not args.base_url:
_na = NetworkAddress(args.host, args.port)
_host_base = _na.to_url()
else:
_na = None
_host_base = None

model_url = (
f"{args.base_url}/v1/models"
if args.base_url
else f"http://{args.host}:{args.port}/v1/models"
f"{args.base_url}/v1/models" if args.base_url else f"{_host_base}/v1/models"
)

if args.backend == "sglang-embedding":
Expand All @@ -1740,43 +1747,39 @@ def run_benchmark(args_: argparse.Namespace):
)
elif args.backend in ["sglang", "sglang-native"]:
api_url = (
f"{args.base_url}/generate"
if args.base_url
else f"http://{args.host}:{args.port}/generate"
f"{args.base_url}/generate" if args.base_url else f"{_host_base}/generate"
)
elif args.backend in ["sglang-oai", "vllm", "lmdeploy"]:
api_url = (
f"{args.base_url}/v1/completions"
if args.base_url
else f"http://{args.host}:{args.port}/v1/completions"
else f"{_host_base}/v1/completions"
)
elif args.backend in ["sglang-oai-chat", "vllm-chat", "lmdeploy-chat"]:
api_url = (
f"{args.base_url}/v1/chat/completions"
if args.base_url
else f"http://{args.host}:{args.port}/v1/chat/completions"
else f"{_host_base}/v1/chat/completions"
)
elif args.backend == "trt":
api_url = (
f"{args.base_url}/v2/models/ensemble/generate_stream"
if args.base_url
else f"http://{args.host}:{args.port}/v2/models/ensemble/generate_stream"
else f"{_host_base}/v2/models/ensemble/generate_stream"
)
if args.model is None:
print("Please provide a model using `--model` when using `trt` backend.")
sys.exit(1)
elif args.backend == "gserver":
api_url = args.base_url if args.base_url else f"{args.host}:{args.port}"
api_url = args.base_url if args.base_url else _na.to_host_port_str()
args.model = args.model or "default"
elif args.backend == "truss":
api_url = (
f"{args.base_url}/v1/models/model:predict"
if args.base_url
else f"http://{args.host}:{args.port}/v1/models/model:predict"
else f"{_host_base}/v1/models/model:predict"
)
base_url = (
f"http://{args.host}:{args.port}" if args.base_url is None else args.base_url
)
base_url = _host_base if args.base_url is None else args.base_url

# Wait for server to be ready
if args.ready_check_timeout_sec > 0:
Expand Down
3 changes: 2 additions & 1 deletion python/sglang/multimodal_gen/benchmarks/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
init_logger,
)
from sglang.multimodal_gen.test.test_utils import print_divider, print_value_formatted
from sglang.srt.utils.network import NetworkAddress

logger = init_logger(__name__)

Expand Down Expand Up @@ -457,7 +458,7 @@ async def benchmark(args):

# Construct base_url if not provided
if args.base_url is None:
args.base_url = f"http://{args.host}:{args.port}"
args.base_url = NetworkAddress(args.host, args.port).to_url()

# Wait for service
wait_for_service(args.base_url)
Expand Down
3 changes: 2 additions & 1 deletion python/sglang/srt/disaggregation/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
set_schedule_time_batch,
set_time_batch,
)
from sglang.srt.utils.network import NetworkAddress
from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter

logger = logging.getLogger(__name__)
Expand All @@ -87,7 +88,7 @@ def _is_fake_transfer(req: Req, server_args: ServerArgs) -> bool:

def _bootstrap_addr(req: Req) -> str:
    """Return the bootstrap endpoint of ``req`` as a single address string.

    The address is produced by ``NetworkAddress.to_host_port_str()`` from
    ``req.bootstrap_host`` and ``req.bootstrap_port`` instead of joining the
    two values with ``":"`` by hand.

    NOTE(review): presumably ``to_host_port_str()`` wraps IPv6 literals in
    brackets (e.g. ``[::1]:8998``), which a plain ``host:port`` join would
    leave ambiguous -- confirm against ``sglang.srt.utils.network``.
    """
    # FIXME: make a property of a req
    return NetworkAddress(req.bootstrap_host, req.bootstrap_port).to_host_port_str()


class DecodeReqToTokenPool:
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/disaggregation/encode_grpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ async def serve_grpc_encoder(server_args: ServerArgs):
)
reflection.enable_server_reflection(SERVICE_NAMES, server)

listen_addr = f"{server_args.host}:{server_args.port}"
listen_addr = NetworkAddress(server_args.host, server_args.port).to_host_port_str()
server.add_insecure_port(listen_addr)

await server.start()
Expand Down
10 changes: 8 additions & 2 deletions python/sglang/srt/disaggregation/encode_receiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import ImageData
from sglang.srt.utils.hf_transformers_utils import get_processor
from sglang.srt.utils.network import get_local_ip_auto, get_zmq_socket_on_host
from sglang.srt.utils.network import (
NetworkAddress,
get_local_ip_auto,
get_zmq_socket_on_host,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -447,7 +451,9 @@ async def send_embedding_port(req_id, receive_count, host_name, embedding_port):
payload = {
"req_id": part_req_id, # use part_req_id to match encode request
"receive_count": receive_count,
"receive_url": f"{host_name}:{embedding_port}",
"receive_url": NetworkAddress(
host_name, embedding_port
).to_host_port_str(),
"modality": modality.name,
}
logger.info(
Expand Down
3 changes: 2 additions & 1 deletion python/sglang/srt/disaggregation/mooncake/conn.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,8 @@ def transfer_worker(
)
self.record_failure(
kv_chunk.room,
f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
f"Failed to send kv chunk of {kv_chunk.room} to "
f"{NetworkAddress(req.endpoint, req.dst_port).to_host_port_str()}",
)
self.update_status(kv_chunk.room, KVPoll.Failed)
self.sync_status_to_decode_endpoint(
Expand Down
8 changes: 6 additions & 2 deletions python/sglang/srt/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2380,6 +2380,8 @@ def launch_dummy_health_check_server(host, port, enable_metrics):
import uvicorn
from fastapi import FastAPI, Response

from sglang.srt.utils.network import NetworkAddress

app = FastAPI()

@app.get("/ping")
Expand Down Expand Up @@ -2422,14 +2424,16 @@ def run_server():
logger.error(f"Dummy health check server failed to start: {e}")
raise
finally:
logger.info(f"Dummy health check server stopped at {host}:{port}")
logger.info(
f"Dummy health check server stopped at {NetworkAddress(host, port).to_host_port_str()}"
)

thread = threading.Thread(
target=run_server, daemon=True, name="health-check-server"
)
thread.start()
logger.info(
f"Dummy health check server started in background thread at {host}:{port}"
f"Dummy health check server started in background thread at {NetworkAddress(host, port).to_host_port_str()}"
)


Expand Down
Loading