From 2d6c415a6d00f5d9efa2cb55c42f06d808553d13 Mon Sep 17 00:00:00 2001 From: ispobock Date: Mon, 30 Mar 2026 15:58:17 +0000 Subject: [PATCH 1/6] support http2 --- python/pyproject.toml | 5 + python/sglang/srt/entrypoints/http_server.py | 110 ++++++++++++++++++ python/sglang/srt/environ.py | 3 + .../srt/managers/multi_tokenizer_mixin.py | 11 +- python/sglang/srt/server_args.py | 24 ++++ 5 files changed, 152 insertions(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 483cf432bccd..5f6ee9bc083f 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -128,6 +128,10 @@ tracing = [ "opentelemetry-sdk", ] +http2 = [ + "granian>=2.6.0", +] + test = [ "accelerate", "addict", @@ -151,6 +155,7 @@ dev = ["sglang[test]"] all = [ "sglang[diffusion]", "sglang[tracing]", + "sglang[http2]", ] [tool.uv.extra-build-dependencies] diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index 9cba65bf7f08..e975e7a788b9 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -219,6 +219,39 @@ def get_global_state() -> _GlobalState: return _global_state +async def _init_granian_worker() -> ServerArgs: + """Initialize a Granian worker by creating a full TokenizerManager. + + Unlike init_multi_tokenizer (which creates a lightweight TokenizerWorker + that talks to a MultiTokenizerRouter), this creates a direct + TokenizerManager that connects to the scheduler, suitable for Granian's + single-tokenizer mode. + """ + main_pid = get_main_process_id() + port_args, server_args, scheduler_info = read_from_shared_memory( + f"multi_tokenizer_args_{main_pid}" + ) + + tokenizer_manager = TokenizerManager(server_args, port_args) + template_manager = TemplateManager() + template_manager.initialize_templates( + tokenizer_manager=tokenizer_manager, + model_path=server_args.model_path, + chat_template=server_args.chat_template, + completion_template=server_args.completion_template, + ) + tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"] + + set_global_state( + _GlobalState( + tokenizer_manager=tokenizer_manager, + template_manager=template_manager, + scheduler_info=scheduler_info, + ) + ) + return server_args + + async def init_multi_tokenizer() -> ServerArgs: """ Initialization function for multi-process tokenizer mode. @@ -276,6 +309,10 @@ async def lifespan(fast_api_app: FastAPI): server_args = fast_api_app.server_args warmup_thread_kwargs = fast_api_app.warmup_thread_kwargs thread_label = "Tokenizer" + elif envs.SGLANG_GRANIAN_PARENT_PID.get() is not None: + server_args = await _init_granian_worker() + warmup_thread_kwargs = dict(server_args=server_args) + thread_label = "Tokenizer" else: # Initialize multi-tokenizer support for worker processes server_args = await init_multi_tokenizer() @@ -1979,6 +2016,52 @@ def _wait_weights_ready(): ) +def _close_main_process_sockets(): + """Close the main process's ZMQ sockets before spawning Granian workers. + + Granian workers create their own TokenizerManager with fresh ZMQ sockets. + The main process must release its sockets first to avoid binding conflicts + on the same IPC addresses. + """ + if _global_state is None or _global_state.tokenizer_manager is None: + return + tm = _global_state.tokenizer_manager + for attr in ("recv_from_detokenizer", "send_to_scheduler"): + sock = getattr(tm, attr, None) + if sock is None: + continue + inner = getattr(sock, "socket", None) + if inner is not None: + inner.close() + elif hasattr(sock, "close"): + sock.close() + + +def _run_granian_server(server_args): + """Launch Granian with HTTP/2 support.""" + from granian import Granian + from granian.constants import HTTPModes, Interfaces, Loops + + granian_kwargs = dict( + target="sglang.srt.entrypoints.http_server:app", + address=server_args.host, + port=server_args.port, + interface=Interfaces.ASGI, + http=HTTPModes.auto, + loop=Loops.uvloop, + log_level=server_args.log_level_http or server_args.log_level or "info", + workers=server_args.tokenizer_worker_num, + ) + + ssl_enabled = server_args.ssl_certfile and server_args.ssl_keyfile + if ssl_enabled: + granian_kwargs["ssl_cert"] = server_args.ssl_certfile + granian_kwargs["ssl_key"] = server_args.ssl_keyfile + + server = Granian(**granian_kwargs) + server.serve() + + def _setup_and_run_http_server( server_args: ServerArgs, tokenizer_manager, @@ -2015,6 +2098,33 @@ def _setup_and_run_http_server( if server_args.enable_metrics: add_prometheus_track_response_middleware(app) + # Use Granian for HTTP/2 server + if server_args.enable_http2: + # Reuse the multi-tokenizer shared memory mechanism to pass + # init args (port_args, server_args, scheduler_info) to + # Granian workers, which are independent processes. + multi_tokenizer_args_shm = write_data_for_multi_tokenizer( + port_args, server_args, scheduler_infos[0] + ) + try: + if server_args.ssl_certfile: + logger.info( + f"SSL enabled: certfile={server_args.ssl_certfile}, " + f"keyfile={server_args.ssl_keyfile}" + ) + logger.info( + f"Starting Granian HTTP/2 server on " + f"{server_args.host}:{server_args.port}" + ) + # Set the main process PID for shared memory lookup + envs.SGLANG_GRANIAN_PARENT_PID.set(os.getpid()) + _close_main_process_sockets() + _run_granian_server(server_args) + finally: + if multi_tokenizer_args_shm is not None: + multi_tokenizer_args_shm.unlink() + return + # Pass additional arguments to the lifespan function. # They will be used for additional initialization setups. if server_args.tokenizer_worker_num == 1: diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py index 28b9ee4f4b86..ddf44933f0d8 100644 --- a/python/sglang/srt/environ.py +++ b/python/sglang/srt/environ.py @@ -484,6 +484,9 @@ class Envs: # HTTP Server SGLANG_TIMEOUT_KEEP_ALIVE = EnvInt(5) + # HTTP/2 Server + SGLANG_GRANIAN_PARENT_PID = EnvInt(None) + # Health Check SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION = EnvBool(True) diff --git a/python/sglang/srt/managers/multi_tokenizer_mixin.py b/python/sglang/srt/managers/multi_tokenizer_mixin.py index 4da03863068b..a16dea5eecd4 100644 --- a/python/sglang/srt/managers/multi_tokenizer_mixin.py +++ b/python/sglang/srt/managers/multi_tokenizer_mixin.py @@ -431,7 +431,16 @@ async def print_exception_wrapper(func): def get_main_process_id() -> int: - """Get the main process ID""" + """Get the main process ID. + + Supports override via SGLANG_GRANIAN_PARENT_PID for workers whose + multiprocessing parent PID differs from the shared-memory owner. + """ + from sglang.srt.environ import envs + + override = envs.SGLANG_GRANIAN_PARENT_PID.get() + if override is not None: + return override return multiprocessing.current_process()._parent_pid diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 77695d92d0aa..9f144542a41d 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -321,6 +321,7 @@ class ServerArgs: ssl_ca_certs: Optional[str] = None ssl_keyfile_password: Optional[str] = None enable_ssl_refresh: bool = False + enable_http2: bool = False # Quantization and data type dtype: str = "auto" @@ -902,6 +903,21 @@ def _handle_ssl_validation(self): "to be specified." ) + if self.enable_http2: + try: + import granian # noqa: F401 + except ImportError: + raise ValueError( + "--enable-http2 requires the 'granian' package. " + 'Install it with: pip install "sglang[http2]"' + ) + if self.enable_ssl_refresh: + raise ValueError( + "--enable-ssl-refresh is not supported with --enable-http2. " + "Granian does not support SSL certificate hot-reloading. " + "Use Uvicorn (the default) or handle certificate rotation externally." + ) + def _handle_deprecated_args(self): # Handle deprecated tool call parsers deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"} @@ -3768,6 +3784,14 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Enable automatic SSL certificate hot-reloading when cert/key " "files change on disk. Requires --ssl-certfile and --ssl-keyfile.", ) + parser.add_argument( + "--enable-http2", + action="store_true", + default=ServerArgs.enable_http2, + help="Use Granian (HTTP/2) instead of Uvicorn (HTTP/1.1) as the ASGI server. " + "Requires 'pip install sglang[http2]'. Serves h2c (cleartext HTTP/2) by default " + "and serves h2 over TLS when --ssl-certfile and --ssl-keyfile are provided.", + ) # Quantization and data type parser.add_argument( From 8a1783ee520dadeca5a5ceb7c4d369125841ae5e Mon Sep 17 00:00:00 2001 From: ispobock Date: Mon, 30 Mar 2026 16:13:10 +0000 Subject: [PATCH 2/6] add test --- python/pyproject.toml | 1 + .../openai_server/basic/test_http2_server.py | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 test/registered/openai_server/basic/test_http2_server.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 5f6ee9bc083f..baa2024799e2 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -148,6 +148,7 @@ test = [ "diff-cover", "sentence_transformers", "tabulate", + "granian>=2.6.0", ] dev = ["sglang[test]"] diff --git a/test/registered/openai_server/basic/test_http2_server.py b/test/registered/openai_server/basic/test_http2_server.py new file mode 100644 index 000000000000..988561143a1b --- /dev/null +++ b/test/registered/openai_server/basic/test_http2_server.py @@ -0,0 +1,112 @@ +""" +Test HTTP/2 server (Granian) with basic OpenAI-compatible endpoints. + +Verifies that --enable-http2 launches successfully and serves requests +via both HTTP/1.1 and HTTP/2 (h2c). +""" + +import subprocess +import unittest + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + +try: + import granian # noqa: F401 + + _HAS_GRANIAN = True +except ImportError: + _HAS_GRANIAN = False + +register_cuda_ci(est_time=120, suite="stage-b-test-small-1-gpu") + + +@unittest.skipUnless(_HAS_GRANIAN, "granian not installed (pip install sglang[http2])") +class TestHTTP2Server(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=["--enable-http2"], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_health(self): + resp = requests.get(f"{self.base_url}/health") + self.assertEqual(resp.status_code, 200) + + def test_get_model_info(self): + resp = requests.get(f"{self.base_url}/get_model_info") + self.assertEqual(resp.status_code, 200) + self.assertIn("model_path", resp.json()) + + def test_completion(self): + resp = requests.post( + f"{self.base_url}/v1/completions", + json={ + "model": self.model, + "prompt": "The capital of France is", + "max_tokens": 8, + "temperature": 0, + }, + ) + self.assertEqual(resp.status_code, 200) + data = resp.json() + self.assertIn("choices", data) + self.assertGreater(len(data["choices"][0]["text"]), 0) + + def test_chat_completion(self): + resp = requests.post( + f"{self.base_url}/v1/chat/completions", + json={ + "model": self.model, + "messages": [{"role": "user", "content": "Say hello"}], + "max_tokens": 16, + "temperature": 0, + }, + ) + self.assertEqual(resp.status_code, 200) + data = resp.json() + self.assertIn("choices", data) + self.assertGreater(len(data["choices"][0]["message"]["content"]), 0) + + def test_h2c_with_curl(self): + """Verify the server actually speaks HTTP/2 via h2c.""" + result = subprocess.run( + [ + "curl", + "--http2-prior-knowledge", + "-s", + "-o", + "/dev/null", + "-w", + "%{http_version}", + f"{self.base_url}/health", + ], + capture_output=True, + text=True, + timeout=10, + ) + self.assertEqual( + result.stdout.strip(), "2", "Server should respond with HTTP/2" + ) + + +if __name__ == "__main__": + unittest.main(verbosity=3) From 585c9af3392bc5d1d3c514df81c7d0c27901b43f Mon Sep 17 00:00:00 2001 From: Ke Bao Date: Tue, 31 Mar 2026 21:53:45 +0800 Subject: [PATCH 3/6] Update python/sglang/srt/entrypoints/http_server.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- python/sglang/srt/entrypoints/http_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index e975e7a788b9..decaa7a474a1 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -2037,7 +2037,7 @@ def _close_main_process_sockets(): sock.close() -def _run_granian_server(server_args): +def _run_granian_server(server_args: ServerArgs): """Launch Granian with HTTP/2 support.""" from granian import Granian from granian.constants import HTTPModes, Interfaces, Loops From 5bdec3342597815ee3ef3d877bb1f47bf259ef1c Mon Sep 17 00:00:00 2001 From: ispobock Date: Wed, 1 Apr 2026 14:58:45 +0000 Subject: [PATCH 4/6] update --- python/sglang/srt/entrypoints/http_server.py | 16 ++++++---------- python/sglang/srt/server_args.py | 11 ++++++++--- .../openai_server/basic/test_http2_server.py | 7 +++++++ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index 2b9084453156..ba9590d6c020 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -208,13 +208,6 @@ def get_global_state() -> _GlobalState: async def _init_granian_worker() -> ServerArgs: - """Initialize a Granian worker by creating a full TokenizerManager. - - Unlike init_multi_tokenizer (which creates a lightweight TokenizerWorker - that talks to a MultiTokenizerRouter), this creates a direct - TokenizerManager that connects to the scheduler, suitable for Granian's - single-tokenizer mode. - """ main_pid = get_main_process_id() port_args, server_args, scheduler_info = read_from_shared_memory( f"multi_tokenizer_args_{main_pid}" @@ -2036,10 +2029,11 @@ def _close_main_process_sockets(): inner.close() elif hasattr(sock, "close"): sock.close() + setattr(tm, attr, None) def _run_granian_server(server_args: ServerArgs): - """Launch Granian with HTTP/2 support.""" + """Launch Granian with HTTP/2 support""" from granian import Granian from granian.constants import HTTPModes, Interfaces, Loops @@ -2051,7 +2045,7 @@ def _run_granian_server(server_args: ServerArgs): http=HTTPModes.auto, loop=Loops.uvloop, log_level=server_args.log_level_http or server_args.log_level or "info", - workers=server_args.tokenizer_worker_num, + workers=1, ) ssl_enabled = server_args.ssl_certfile and server_args.ssl_keyfile @@ -2111,7 +2105,9 @@ def _setup_and_run_http_server( f"Starting Granian HTTP/2 server on " f"{server_args.host}:{server_args.port}" ) - # Set the main process PID for shared memory lookup + # Propagate the main process PID via os.environ so Granian + # workers (forked or spawned) can locate the shared memory + # segment created above. envs.SGLANG_GRANIAN_PARENT_PID.set(os.getpid()) _close_main_process_sockets() _run_granian_server(server_args) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index d44ff573b0b8..7245fd4d1529 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -920,6 +920,11 @@ def _handle_ssl_validation(self): "Granian does not support SSL certificate hot-reloading. " "Use Uvicorn (the default) or handle certificate rotation externally." ) + if self.tokenizer_worker_num > 1: + raise ValueError( + "--enable-http2 does not yet support --tokenizer-worker-num > 1. " + "Multi-worker HTTP/2 support will be added in a future release." + ) def _handle_deprecated_args(self): # Handle deprecated tool call parsers @@ -3825,9 +3830,9 @@ def add_cli_args(parser: argparse.ArgumentParser): "--enable-http2", action="store_true", default=ServerArgs.enable_http2, - help="Use Granian (HTTP/2) instead of Uvicorn (HTTP/1.1) as the ASGI server. " - "Requires 'pip install sglang[http2]'. Serves h2c (cleartext HTTP/2) by default " - "and serves h2 over TLS when --ssl-certfile and --ssl-keyfile are provided.", + help="Use Granian instead of Uvicorn as the ASGI server, enabling HTTP/1.1 and " + "HTTP/2 auto-negotiation. Clients may use h2c (cleartext HTTP/2) or plain HTTP/1.1. " + "Requires 'pip install sglang[http2]'.", ) # Quantization and data type diff --git a/test/registered/openai_server/basic/test_http2_server.py b/test/registered/openai_server/basic/test_http2_server.py index 988561143a1b..1a14a789c156 100644 --- a/test/registered/openai_server/basic/test_http2_server.py +++ b/test/registered/openai_server/basic/test_http2_server.py @@ -88,6 +88,13 @@ def test_chat_completion(self): def test_h2c_with_curl(self): """Verify the server actually speaks HTTP/2 via h2c.""" + probe = subprocess.run( + ["curl", "--http2-prior-knowledge", "-V"], + capture_output=True, + ) + if probe.returncode != 0: + self.skipTest("curl does not support --http2-prior-knowledge") + result = subprocess.run( [ "curl", From 1222a873b8aad2a98429b69a6dc92c73b63e7297 Mon Sep 17 00:00:00 2001 From: ispobock Date: Wed, 1 Apr 2026 15:12:50 +0000 Subject: [PATCH 5/6] update --- test/registered/openai_server/basic/test_http2_server.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/registered/openai_server/basic/test_http2_server.py b/test/registered/openai_server/basic/test_http2_server.py index 1a14a789c156..988561143a1b 100644 --- a/test/registered/openai_server/basic/test_http2_server.py +++ b/test/registered/openai_server/basic/test_http2_server.py @@ -88,13 +88,6 @@ def test_chat_completion(self): def test_h2c_with_curl(self): """Verify the server actually speaks HTTP/2 via h2c.""" - probe = subprocess.run( - ["curl", "--http2-prior-knowledge", "-V"], - capture_output=True, - ) - if probe.returncode != 0: - self.skipTest("curl does not support --http2-prior-knowledge") - result = subprocess.run( [ "curl", From 51db219a84d8cf52f25448562b8f16249f5d9a66 Mon Sep 17 00:00:00 2001 From: ispobock Date: Thu, 2 Apr 2026 10:43:02 +0000 Subject: [PATCH 6/6] update --- test/registered/openai_server/basic/test_http2_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/registered/openai_server/basic/test_http2_server.py b/test/registered/openai_server/basic/test_http2_server.py index 988561143a1b..6cfc3ee7e7e0 100644 --- a/test/registered/openai_server/basic/test_http2_server.py +++ b/test/registered/openai_server/basic/test_http2_server.py @@ -27,7 +27,7 @@ except ImportError: _HAS_GRANIAN = False -register_cuda_ci(est_time=120, suite="stage-b-test-small-1-gpu") +register_cuda_ci(est_time=120, suite="stage-b-test-1-gpu-small") @unittest.skipUnless(_HAS_GRANIAN, "granian not installed (pip install sglang[http2])")