Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ tracing = [
"opentelemetry-sdk",
]

http2 = [
"granian>=2.6.0",
]

test = [
"accelerate",
"addict",
Expand All @@ -146,13 +150,15 @@ test = [
"diff-cover",
"sentence_transformers",
"tabulate",
"granian>=2.6.0",
]

dev = ["sglang[test]"]

all = [
"sglang[diffusion]",
"sglang[tracing]",
"sglang[http2]",
]

[tool.uv.extra-build-dependencies]
Expand Down
106 changes: 106 additions & 0 deletions python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,32 @@ def get_global_state() -> _GlobalState:
return _global_state


async def _init_granian_worker() -> ServerArgs:
    """Initialize a Granian worker process.

    Reads the init args (port args, server args, scheduler info) that the
    main process published to shared memory, builds a fresh TokenizerManager
    and TemplateManager for this worker, and installs them into the
    module-global state.

    Returns:
        The ServerArgs recovered from shared memory.
    """
    shm_name = f"multi_tokenizer_args_{get_main_process_id()}"
    port_args, server_args, scheduler_info = read_from_shared_memory(shm_name)

    manager = TokenizerManager(server_args, port_args)
    templates = TemplateManager()
    templates.initialize_templates(
        tokenizer_manager=manager,
        model_path=server_args.model_path,
        chat_template=server_args.chat_template,
        completion_template=server_args.completion_template,
    )
    # The scheduler knows the true input-length limit; propagate it.
    manager.max_req_input_len = scheduler_info["max_req_input_len"]

    set_global_state(
        _GlobalState(
            tokenizer_manager=manager,
            template_manager=templates,
            scheduler_info=scheduler_info,
        )
    )
    return server_args


async def init_multi_tokenizer() -> ServerArgs:
"""
Initialization function for multi-process tokenizer mode.
Expand Down Expand Up @@ -263,6 +289,10 @@ async def lifespan(fast_api_app: FastAPI):
server_args = fast_api_app.server_args
warmup_thread_kwargs = fast_api_app.warmup_thread_kwargs
thread_label = "Tokenizer"
elif envs.SGLANG_GRANIAN_PARENT_PID.get() is not None:
server_args = await _init_granian_worker()
warmup_thread_kwargs = dict(server_args=server_args)
thread_label = "Tokenizer"
else:
# Initialize multi-tokenizer support for worker processes
server_args = await init_multi_tokenizer()
Expand Down Expand Up @@ -1959,6 +1989,53 @@ def _wait_weights_ready():
)


def _close_main_process_sockets():
    """Close the main process's ZMQ sockets before spawning Granian workers.

    Granian workers create their own TokenizerManager with fresh ZMQ sockets.
    The main process must release its sockets first to avoid binding conflicts
    on the same IPC addresses.
    """
    if _global_state is None:
        return
    manager = _global_state.tokenizer_manager
    if manager is None:
        return

    for name in ("recv_from_detokenizer", "send_to_scheduler"):
        wrapper = getattr(manager, name, None)
        if wrapper is None:
            continue
        # Prefer closing the underlying raw socket when the attribute is a
        # wrapper object; otherwise fall back to the wrapper's own close().
        raw_socket = getattr(wrapper, "socket", None)
        if raw_socket is not None:
            raw_socket.close()
        elif hasattr(wrapper, "close"):
            wrapper.close()
        setattr(manager, name, None)


def _run_granian_server(server_args: ServerArgs):
    """Launch Granian with HTTP/2 support.

    Blocks until the server exits. TLS is enabled only when both a cert
    file and a key file are configured.
    """
    from granian import Granian
    from granian.constants import HTTPModes, Interfaces, Loops

    options = {
        "target": "sglang.srt.entrypoints.http_server:app",
        "address": server_args.host,
        "port": server_args.port,
        "interface": Interfaces.ASGI,
        # `auto` negotiates HTTP/1.1 vs HTTP/2 (including h2c) per client.
        "http": HTTPModes.auto,
        "loop": Loops.uvloop,
        "log_level": server_args.log_level_http or server_args.log_level or "info",
        "workers": 1,
    }

    if server_args.ssl_certfile and server_args.ssl_keyfile:
        options["ssl_cert"] = server_args.ssl_certfile
        options["ssl_key"] = server_args.ssl_keyfile

    Granian(**options).serve()


def _setup_and_run_http_server(
server_args: ServerArgs,
tokenizer_manager,
Expand Down Expand Up @@ -1989,6 +2066,35 @@ def _setup_and_run_http_server(
if server_args.enable_metrics:
add_prometheus_track_response_middleware(app)

# Use Granian for HTTP/2 server
if server_args.enable_http2:
# Reuse the multi-tokenizer shared memory mechanism to pass
# init args (port_args, server_args, scheduler_info) to
# Granian workers, which are independent processes.
multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
port_args, server_args, scheduler_infos[0]
)
try:
if server_args.ssl_certfile:
logger.info(
f"SSL enabled: certfile={server_args.ssl_certfile}, "
f"keyfile={server_args.ssl_keyfile}"
)
logger.info(
f"Starting Granian HTTP/2 server on "
f"{server_args.host}:{server_args.port}"
)
# Propagate the main process PID via os.environ so Granian
# workers (forked or spawned) can locate the shared memory
# segment created above.
envs.SGLANG_GRANIAN_PARENT_PID.set(os.getpid())
_close_main_process_sockets()
_run_granian_server(server_args)
finally:
if multi_tokenizer_args_shm is not None:
multi_tokenizer_args_shm.unlink()
return

# Pass additional arguments to the lifespan function.
# They will be used for additional initialization setups.
if server_args.tokenizer_worker_num == 1:
Expand Down
3 changes: 3 additions & 0 deletions python/sglang/srt/environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,9 @@ class Envs:
# HTTP Server
SGLANG_TIMEOUT_KEEP_ALIVE = EnvInt(5)

# HTTP/2 Server
SGLANG_GRANIAN_PARENT_PID = EnvInt(None)

# Health Check
SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION = EnvBool(True)

Expand Down
11 changes: 10 additions & 1 deletion python/sglang/srt/managers/multi_tokenizer_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,16 @@ async def print_exception_wrapper(func):


def get_main_process_id() -> int:
    """Get the main process ID.

    Supports override via SGLANG_GRANIAN_PARENT_PID for workers whose
    multiprocessing parent PID differs from the shared-memory owner.
    """
    from sglang.srt.environ import envs

    granian_parent = envs.SGLANG_GRANIAN_PARENT_PID.get()
    if granian_parent is None:
        # NOTE: relies on the private _parent_pid attribute of
        # multiprocessing.Process, which set for child processes.
        return multiprocessing.current_process()._parent_pid
    return granian_parent


Expand Down
29 changes: 29 additions & 0 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ class ServerArgs:
ssl_ca_certs: Optional[str] = None
ssl_keyfile_password: Optional[str] = None
enable_ssl_refresh: bool = False
enable_http2: bool = False

# Quantization and data type
dtype: str = "auto"
Expand Down Expand Up @@ -919,6 +920,26 @@ def _handle_ssl_validation(self):
"to be specified."
)

if self.enable_http2:
try:
import granian # noqa: F401
except ImportError:
raise ValueError(
"--enable-http2 requires the 'granian' package. "
'Install it with: pip install "sglang[http2]"'
)
if self.enable_ssl_refresh:
raise ValueError(
"--enable-ssl-refresh is not supported with --enable-http2. "
"Granian does not support SSL certificate hot-reloading. "
"Use Uvicorn (the default) or handle certificate rotation externally."
)
if self.tokenizer_worker_num > 1:
raise ValueError(
"--enable-http2 does not yet support --tokenizer-worker-num > 1. "
"Multi-worker HTTP/2 support will be added in a future release."
)

def _handle_deprecated_args(self):
# Handle deprecated tool call parsers
deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
Expand Down Expand Up @@ -3821,6 +3842,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Enable automatic SSL certificate hot-reloading when cert/key "
"files change on disk. Requires --ssl-certfile and --ssl-keyfile.",
)
parser.add_argument(
"--enable-http2",
action="store_true",
default=ServerArgs.enable_http2,
help="Use Granian instead of Uvicorn as the ASGI server, enabling HTTP/1.1 and "
"HTTP/2 auto-negotiation. Clients may use h2c (cleartext HTTP/2) or plain HTTP/1.1. "
"Requires 'pip install sglang[http2]'.",
)

# Quantization and data type
parser.add_argument(
Expand Down
112 changes: 112 additions & 0 deletions test/registered/openai_server/basic/test_http2_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
Test HTTP/2 server (Granian) with basic OpenAI-compatible endpoints.

Verifies that --enable-http2 launches successfully and serves requests
via both HTTP/1.1 and HTTP/2 (h2c).
"""

import subprocess
import unittest

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)

# Gate the suite on granian being importable; the HTTP/2 extra is optional.
_HAS_GRANIAN = True
try:
    import granian  # noqa: F401
except ImportError:
    _HAS_GRANIAN = False

register_cuda_ci(est_time=120, suite="stage-b-test-1-gpu-small")


@unittest.skipUnless(_HAS_GRANIAN, "granian not installed (pip install sglang[http2])")
class TestHTTP2Server(CustomTestCase):
    """Smoke tests for a server launched with --enable-http2 (Granian)."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--enable-http2"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_health(self):
        response = requests.get(f"{self.base_url}/health")
        self.assertEqual(response.status_code, 200)

    def test_get_model_info(self):
        response = requests.get(f"{self.base_url}/get_model_info")
        self.assertEqual(response.status_code, 200)
        self.assertIn("model_path", response.json())

    def test_completion(self):
        payload = {
            "model": self.model,
            "prompt": "The capital of France is",
            "max_tokens": 8,
            "temperature": 0,
        }
        response = requests.post(f"{self.base_url}/v1/completions", json=payload)
        self.assertEqual(response.status_code, 200)
        body = response.json()
        self.assertIn("choices", body)
        self.assertGreater(len(body["choices"][0]["text"]), 0)

    def test_chat_completion(self):
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": "Say hello"}],
            "max_tokens": 16,
            "temperature": 0,
        }
        response = requests.post(f"{self.base_url}/v1/chat/completions", json=payload)
        self.assertEqual(response.status_code, 200)
        body = response.json()
        self.assertIn("choices", body)
        self.assertGreater(len(body["choices"][0]["message"]["content"]), 0)

    def test_h2c_with_curl(self):
        """Verify the server actually speaks HTTP/2 via h2c."""
        # --http2-prior-knowledge forces cleartext HTTP/2 without an upgrade.
        command = [
            "curl",
            "--http2-prior-knowledge",
            "-s",
            "-o",
            "/dev/null",
            "-w",
            "%{http_version}",
            f"{self.base_url}/health",
        ]
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=10
        )
        self.assertEqual(
            result.stdout.strip(), "2", "Server should respond with HTTP/2"
        )


# Allow running this test module directly (outside the CI test runner).
if __name__ == "__main__":
    unittest.main(verbosity=3)
Loading