From 80e46aa83f8529359fb37e0f8ccacca57caa4b2d Mon Sep 17 00:00:00 2001 From: David Cheung Date: Thu, 26 Mar 2026 07:54:24 +0000 Subject: [PATCH 1/8] doc update --- docs/advanced_features/server_arguments.md | 2 +- docs/advanced_features/sgl_model_gateway.md | 4 ++-- docs/basic_usage/native_api.ipynb | 4 ++-- docs/developer_guide/bench_serving.md | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index 61cfe91e07c6..f33808984e42 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -211,7 +211,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | Argument | Description | Defaults | Options | | --- | --- | --- | --- | | `--api-key` | Set API key of the server. It is also used in the OpenAI API compatible server. | `None` | Type: str | -| `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/get_server_info`). Endpoints marked as admin-only require `Authorization: Bearer ` when this is set. | `None` | Type: str | +| `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/server_info`). Endpoints marked as admin-only require `Authorization: Bearer ` when this is set. | `None` | Type: str | | `--served-model-name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | `None` | Type: str | | `--weight-version` | Version identifier for the model weights. Defaults to 'default' if not specified. | `default` | Type: str | | `--chat-template` | The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server. | `None` | Type: str | diff --git a/docs/advanced_features/sgl_model_gateway.md b/docs/advanced_features/sgl_model_gateway.md index 753743b0b0bb..a718b8a37359 100644 --- a/docs/advanced_features/sgl_model_gateway.md +++ b/docs/advanced_features/sgl_model_gateway.md @@ -77,7 +77,7 @@ SGLang Model Gateway is a high-performance model-routing gateway for large-scale ### Control Plane -- **Worker Manager** discovers capabilities (`/get_server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry. +- **Worker Manager** discovers capabilities (`/server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry. - **Job Queue** serializes add/remove requests and exposes status (`/workers/{worker_id}`) so clients can track onboarding progress. - **Load Monitor** feeds cache-aware and power-of-two policies with live worker load statistics. - **Health Checker** continuously probes workers and updates readiness, circuit breaker state, and router metrics. @@ -552,7 +552,7 @@ Response: | `GET` | `/engine_metrics` | Engine-level metrics from workers | | `GET` | `/v1/models` | List available models | | `GET` | `/get_model_info` | Get model information | -| `GET` | `/get_server_info` | Get server information | +| `GET` | `/server_info` | Get server information | | `POST` | `/flush_cache` | Clear all caches | | `GET` | `/get_loads` | Get all worker loads | | `POST` | `/wasm` | Upload WASM module | diff --git a/docs/basic_usage/native_api.ipynb b/docs/basic_usage/native_api.ipynb index 05f4f3688306..ba975b5d61ff 100644 --- a/docs/basic_usage/native_api.ipynb +++ b/docs/basic_usage/native_api.ipynb @@ -10,7 +10,7 @@ "\n", "- `/generate` (text generation model)\n", "- `/get_model_info`\n", - "- `/get_server_info`\n", + "- `/server_info`\n", "- `/health`\n", "- `/health_generate`\n", "- `/flush_cache`\n", @@ -140,7 +140,7 @@ "metadata": {}, "outputs": [], "source": [ - "url = f\"http://localhost:{port}/get_server_info\"\n", + "url = f\"http://localhost:{port}/server_info\"\n", "\n", "response = requests.get(url)\n", "print_highlight(response.text)" diff --git a/docs/developer_guide/bench_serving.md b/docs/developer_guide/bench_serving.md index 5a67723c8ab7..fee65a117735 100644 --- a/docs/developer_guide/bench_serving.md +++ b/docs/developer_guide/bench_serving.md @@ -352,4 +352,4 @@ python3 -m sglang.bench_serving \ ### Notes - The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections. -- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available. +- For sglang, `/server_info` is queried post-run to report speculative decoding accept length when available. From 88f897bb209b048be61e426b92717aa5d5ea2429 Mon Sep 17 00:00:00 2001 From: David Cheung Date: Thu, 26 Mar 2026 07:54:52 +0000 Subject: [PATCH 2/8] sgl-model-gateway update --- sgl-model-gateway/README.md | 2 +- .../bindings/python/src/sglang_router/mini_lb.py | 7 ++++--- .../src/core/steps/worker/local/discover_metadata.rs | 3 +++ sgl-model-gateway/src/routers/http/pd_router.rs | 2 +- sgl-model-gateway/src/routers/http/router.rs | 2 +- sgl-model-gateway/src/server.rs | 2 ++ sgl-model-gateway/tests/api/api_endpoints_test.rs | 4 ++-- sgl-model-gateway/tests/common/mock_worker.rs | 2 +- sgl-model-gateway/tests/common/tls_mock_worker.rs | 2 +- sgl-model-gateway/tests/routing/test_pd_routing.rs | 2 +- 10 files changed, 17 insertions(+), 11 deletions(-) diff --git a/sgl-model-gateway/README.md b/sgl-model-gateway/README.md index 4c4f92da0256..046cf352a14e 100644 --- a/sgl-model-gateway/README.md +++ b/sgl-model-gateway/README.md @@ -407,7 +407,7 @@ Use upstream SGLang binaries to start dedicated worker processes. ### Worker Lifecycle & Job Queue - `JobQueue` handles asynchronous add/remove operations to avoid blocking clients. -- `WorkerManager` inspects worker metadata (`/get_server_info`, `/get_model_info`), tracks load, and exposes `flush_cache` and `get_loads`. +- `WorkerManager` inspects worker metadata (`/server_info`, `/get_model_info`), tracks load, and exposes `flush_cache` and `get_loads`. - Per-worker circuit breakers and health probes keep the registry healthy; load monitor feeds metrics to cache-aware and power-of-two policies. ### Administrative & Worker APIs diff --git a/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py b/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py index f5fd0b5a7323..1e7793a3d08f 100644 --- a/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py +++ b/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py @@ -268,8 +268,9 @@ async def flush_cache(): await response return Response(status_code=200) - +# TODO: Remove `/get_server_info` alias after one release-cycle deprecation window. @app.get("/get_server_info") +@app.get("/server_info") async def get_server_info(): prefill_infos = [] decode_infos = [] @@ -277,10 +278,10 @@ async def get_server_info(): async with aiohttp.ClientSession() as session: for server in lb.prefill_urls: - server_info = await session.get(f"{server}/get_server_info") + server_info = await session.get(f"{server}/server_info") prefill_infos.append(await server_info.json()) for server in lb.decode_urls: - server_info = await session.get(f"{server}/get_server_info") + server_info = await session.get(f"{server}/server_info") info_json = await server_info.json() decode_infos.append(info_json) # Extract internal_states from decode servers diff --git a/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs b/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs index 13f91a3ef196..7557ed002844 100644 --- a/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs +++ b/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs @@ -96,6 +96,8 @@ async fn get_json_fallback( } /// Get server info from /server_info endpoint. +// TODO: Rename to `server_info` (or `fetch_server_info`) after removing +// `/get_server_info` fallback compatibility. pub async fn get_server_info(url: &str, api_key: Option<&str>) -> Result { let base_url = url.trim_end_matches('/'); let server_info_url = format!("{}/server_info", base_url); @@ -110,6 +112,7 @@ pub async fn get_server_info(url: &str, api_key: Option<&str>) -> Result) -> Response { // Get info from the first decode server to match sglang's server info format // Note: We use decode workers for server info to match expected format - self.proxy_to_first_prefill_worker("get_server_info", None) + self.proxy_to_first_prefill_worker("server_info", None) .await } diff --git a/sgl-model-gateway/src/routers/http/router.rs b/sgl-model-gateway/src/routers/http/router.rs index 0fbf2e422abb..b02f6638dbfb 100644 --- a/sgl-model-gateway/src/routers/http/router.rs +++ b/sgl-model-gateway/src/routers/http/router.rs @@ -724,7 +724,7 @@ impl RouterTrait for Router { } async fn get_server_info(&self, req: Request) -> Response { - self.proxy_get_request(req, "get_server_info").await + self.proxy_get_request(req, "server_info").await } async fn get_models(&self, req: Request) -> Response { diff --git a/sgl-model-gateway/src/server.rs b/sgl-model-gateway/src/server.rs index 4cea623de27f..db23d0aad39e 100644 --- a/sgl-model-gateway/src/server.rs +++ b/sgl-model-gateway/src/server.rs @@ -610,6 +610,8 @@ pub fn build_app( .route("/engine_metrics", get(engine_metrics)) .route("/v1/models", get(v1_models)) .route("/get_model_info", get(get_model_info)) + .route("/server_info", get(get_server_info)) + // TODO: Remove `/get_server_info` alias after one release-cycle deprecation window. .route("/get_server_info", get(get_server_info)); // Build admin routes with control plane auth if configured, otherwise use simple API key auth diff --git a/sgl-model-gateway/tests/api/api_endpoints_test.rs b/sgl-model-gateway/tests/api/api_endpoints_test.rs index 7a0d36676581..6e6ff125e9b2 100644 --- a/sgl-model-gateway/tests/api/api_endpoints_test.rs +++ b/sgl-model-gateway/tests/api/api_endpoints_test.rs @@ -314,7 +314,7 @@ mod model_info_tests { let req = Request::builder() .method("GET") - .uri("/get_server_info") + .uri("/server_info") .body(Body::empty()) .unwrap(); @@ -445,7 +445,7 @@ mod model_info_tests { let req = Request::builder() .method("GET") - .uri("/get_server_info") + .uri("/server_info") .body(Body::empty()) .unwrap(); let resp = app.clone().oneshot(req).await.unwrap(); diff --git a/sgl-model-gateway/tests/common/mock_worker.rs b/sgl-model-gateway/tests/common/mock_worker.rs index 23d6bb6f5d32..166fc9d8314b 100755 --- a/sgl-model-gateway/tests/common/mock_worker.rs +++ b/sgl-model-gateway/tests/common/mock_worker.rs @@ -82,7 +82,7 @@ impl MockWorker { let app = Router::new() .route("/health", get(health_handler)) .route("/health_generate", get(health_generate_handler)) - .route("/get_server_info", get(server_info_handler)) + .route("/server_info", get(server_info_handler)) .route("/get_model_info", get(model_info_handler)) .route("/generate", post(generate_handler)) .route("/v1/chat/completions", post(chat_completions_handler)) diff --git a/sgl-model-gateway/tests/common/tls_mock_worker.rs b/sgl-model-gateway/tests/common/tls_mock_worker.rs index 270866aa1130..36a6c542d877 100644 --- a/sgl-model-gateway/tests/common/tls_mock_worker.rs +++ b/sgl-model-gateway/tests/common/tls_mock_worker.rs @@ -101,7 +101,7 @@ impl TlsMockWorker { let app = Router::new() .route("/health", get(health_handler)) .route("/health_generate", get(health_generate_handler)) - .route("/get_server_info", get(server_info_handler)) + .route("/server_info", get(server_info_handler)) .route("/generate", post(generate_handler)) .route("/v1/chat/completions", post(chat_completions_handler)) .with_state(config); diff --git a/sgl-model-gateway/tests/routing/test_pd_routing.rs b/sgl-model-gateway/tests/routing/test_pd_routing.rs index 1853474723df..b6c69576f1c5 100644 --- a/sgl-model-gateway/tests/routing/test_pd_routing.rs +++ b/sgl-model-gateway/tests/routing/test_pd_routing.rs @@ -765,7 +765,7 @@ mod pd_routing_unit_tests { let implemented_endpoints = vec![ ("/health", "GET", true), ("/health_generate", "GET", true), // Note: Python uses POST, we use GET - ("/get_server_info", "GET", true), + ("/server_info", "GET", true), ("/v1/models", "GET", true), ("/get_model_info", "GET", true), ("/generate", "POST", true), From eb634b88a84a95397938676641e1074fae1e3107 Mon Sep 17 00:00:00 2001 From: David Cheung Date: Thu, 26 Mar 2026 07:55:08 +0000 Subject: [PATCH 3/8] test suites update --- test/registered/4-gpu-models/test_qwen35_models.py | 4 ++-- test/registered/8-gpu-models/test_deepseek_v32_mtp.py | 8 ++++---- test/registered/8-gpu-models/test_deepseek_v3_mtp.py | 2 +- .../amd/accuracy/mi30x/test_deepseek_v32_mtp_eval_amd.py | 2 +- .../accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py | 2 +- test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py | 2 +- test/registered/amd/test_deepseek_v32_mtp.py | 4 ++-- test/registered/amd/test_deepseek_v3_mtp.py | 2 +- test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py | 2 +- test/registered/amd/test_moriep_small.py | 6 +++--- test/registered/amd/test_qwen3_coder_next_8gpu.py | 2 +- .../ascend/basic_function/parameter/test_npu_warmups.py | 2 +- test/registered/attention/test_hybrid_attn_backend.py | 2 +- test/registered/core/test_srt_endpoint.py | 4 ++-- test/registered/distributed/test_data_parallelism.py | 4 ++-- test/registered/distributed/test_dp_attention.py | 2 +- test/registered/distributed/test_dp_attention_large.py | 2 +- test/registered/ep/test_deepep_large.py | 2 +- test/registered/ep/test_deepep_small.py | 6 +++--- test/registered/hicache/test_hicache_variants.py | 2 +- test/registered/mla/test_mla_deepseek_v3.py | 2 +- test/registered/mla/test_mla_flashinfer.py | 2 +- test/registered/mla/test_mla_int8_deepseek_v3.py | 4 ++-- test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py | 4 ++-- test/registered/quant/test_w4a8_deepseek_v3.py | 2 +- test/registered/spec/eagle/test_eagle_dp_attention.py | 2 +- .../spec/eagle/test_eagle_infer_beta_dp_attention.py | 2 +- .../spec/test_standalone_speculative_decoding.py | 4 ++-- 28 files changed, 42 insertions(+), 42 deletions(-) diff --git a/test/registered/4-gpu-models/test_qwen35_models.py b/test/registered/4-gpu-models/test_qwen35_models.py index f088c8242915..562d201f21e6 100644 --- a/test/registered/4-gpu-models/test_qwen35_models.py +++ b/test/registered/4-gpu-models/test_qwen35_models.py @@ -149,7 +149,7 @@ def test_gsm8k(self): print(f"{metrics=}") self.assertGreaterEqual(metrics["score"], ACC_THRESHOLDS[self.model]["gsm8k"]) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -226,7 +226,7 @@ def test_gsm8k(self): print(f"{metrics=}") self.assertGreaterEqual(metrics["score"], ACC_THRESHOLDS[self.model]["gsm8k"]) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/8-gpu-models/test_deepseek_v32_mtp.py b/test/registered/8-gpu-models/test_deepseek_v32_mtp.py index 75498322f264..cbccd8004515 100644 --- a/test/registered/8-gpu-models/test_deepseek_v32_mtp.py +++ b/test/registered/8-gpu-models/test_deepseek_v32_mtp.py @@ -75,7 +75,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -161,7 +161,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -243,7 +243,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -326,7 +326,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/8-gpu-models/test_deepseek_v3_mtp.py b/test/registered/8-gpu-models/test_deepseek_v3_mtp.py index 31e99ab0819c..667c350edcd6 100644 --- a/test/registered/8-gpu-models/test_deepseek_v3_mtp.py +++ b/test/registered/8-gpu-models/test_deepseek_v3_mtp.py @@ -72,7 +72,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/accuracy/mi30x/test_deepseek_v32_mtp_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_deepseek_v32_mtp_eval_amd.py index 6676ab612085..1ffa71a1f272 100644 --- a/test/registered/amd/accuracy/mi30x/test_deepseek_v32_mtp_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_deepseek_v32_mtp_eval_amd.py @@ -106,7 +106,7 @@ def test_a_gsm8k(self): metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py index 09a012043416..dad040a302d7 100644 --- a/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py @@ -108,7 +108,7 @@ def test_a_gsm8k(self): metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py b/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py index 28249e706b75..04d4f6efb7a7 100644 --- a/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py +++ b/test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py @@ -135,7 +135,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/test_deepseek_v32_mtp.py b/test/registered/amd/test_deepseek_v32_mtp.py index 87e4e6923b38..69587bdf6e05 100644 --- a/test/registered/amd/test_deepseek_v32_mtp.py +++ b/test/registered/amd/test_deepseek_v32_mtp.py @@ -87,7 +87,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -179,7 +179,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/test_deepseek_v3_mtp.py b/test/registered/amd/test_deepseek_v3_mtp.py index 29190947414b..0a9f94090e93 100644 --- a/test/registered/amd/test_deepseek_v3_mtp.py +++ b/test/registered/amd/test_deepseek_v3_mtp.py @@ -72,7 +72,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py b/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py index a62eadf7a587..949b743485e6 100644 --- a/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py +++ b/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py @@ -76,7 +76,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/test_moriep_small.py b/test/registered/amd/test_moriep_small.py index 3eca8ce279e6..76ccb42d63f6 100644 --- a/test/registered/amd/test_moriep_small.py +++ b/test/registered/amd/test_moriep_small.py @@ -145,7 +145,7 @@ def test_gsm8k( print(f"{metrics=}") self.assertGreaterEqual(metrics["accuracy"], 0.92) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -397,7 +397,7 @@ def test_gsm8k( print(f"{metrics=}") self.assertGreaterEqual(metrics["accuracy"], 0.92) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -457,7 +457,7 @@ def test_gsm8k( print(f"{metrics=}") self.assertGreaterEqual(metrics["accuracy"], 0.92) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/amd/test_qwen3_coder_next_8gpu.py b/test/registered/amd/test_qwen3_coder_next_8gpu.py index a8631af6e8c2..b4181273d0d8 100644 --- a/test/registered/amd/test_qwen3_coder_next_8gpu.py +++ b/test/registered/amd/test_qwen3_coder_next_8gpu.py @@ -146,7 +146,7 @@ def test_a_gsm8k(self): metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/ascend/basic_function/parameter/test_npu_warmups.py b/test/registered/ascend/basic_function/parameter/test_npu_warmups.py index 7b1df16af9db..da678037402b 100644 --- a/test/registered/ascend/basic_function/parameter/test_npu_warmups.py +++ b/test/registered/ascend/basic_function/parameter/test_npu_warmups.py @@ -64,7 +64,7 @@ def tearDownClass(cls): def test_warmups_with_voice_chat(self): # Call the get_server_info API to verify that the warmups parameter configuration takes effect. - response = requests.get(f"{DEFAULT_URL_FOR_TEST}/get_server_info") + response = requests.get(f"{DEFAULT_URL_FOR_TEST}/server_info") self.assertEqual(response.status_code, 200) self.assertEqual("voice_chat", response.json().get("warmups")) diff --git a/test/registered/attention/test_hybrid_attn_backend.py b/test/registered/attention/test_hybrid_attn_backend.py index 1c70ee03217d..33f669daaaf7 100644 --- a/test/registered/attention/test_hybrid_attn_backend.py +++ b/test/registered/attention/test_hybrid_attn_backend.py @@ -93,7 +93,7 @@ def test_gsm8k(self): self.assertGreater(metrics[metric_key], self.accuracy_threshold) if self.speculative_decode: - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/core/test_srt_endpoint.py b/test/registered/core/test_srt_endpoint.py index 46c5853a674e..22e3b468fff7 100644 --- a/test/registered/core/test_srt_endpoint.py +++ b/test/registered/core/test_srt_endpoint.py @@ -500,7 +500,7 @@ def send_and_check_cached_tokens(input_ids): self.assertEqual(send_and_check_cached_tokens(range(0, 11000)), 10000) def test_get_server_info(self): - response = requests.get(self.base_url + "/get_server_info") + response = requests.get(self.base_url + "/server_info") response_json = response.json() max_total_num_tokens = response_json["max_total_num_tokens"] @@ -630,7 +630,7 @@ def test_get_server_info_concurrent(self): tp = ThreadPoolExecutor(max_workers=30) def s(): - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") server_info.json() futures = [] diff --git a/test/registered/distributed/test_data_parallelism.py b/test/registered/distributed/test_data_parallelism.py index b93af4584d5b..db62d55b2f39 100644 --- a/test/registered/distributed/test_data_parallelism.py +++ b/test/registered/distributed/test_data_parallelism.py @@ -69,12 +69,12 @@ def test_update_weight(self): def test_get_memory_pool_size(self): # use `get_server_info` instead since `get_memory_pool_size` is merged into `get_server_info` - response = requests.get(self.base_url + "/get_server_info") + response = requests.get(self.base_url + "/server_info") assert response.status_code == 200 time.sleep(1) - response = requests.get(self.base_url + "/get_server_info") + response = requests.get(self.base_url + "/server_info") assert response.status_code == 200 diff --git a/test/registered/distributed/test_dp_attention.py b/test/registered/distributed/test_dp_attention.py index 0d7f1838c6e6..54251c0cc2d1 100644 --- a/test/registered/distributed/test_dp_attention.py +++ b/test/registered/distributed/test_dp_attention.py @@ -177,7 +177,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/distributed/test_dp_attention_large.py b/test/registered/distributed/test_dp_attention_large.py index 00e3f18e85b3..192882afa94d 100644 --- a/test/registered/distributed/test_dp_attention_large.py +++ b/test/registered/distributed/test_dp_attention_large.py @@ -128,7 +128,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/ep/test_deepep_large.py b/test/registered/ep/test_deepep_large.py index f107b4c40b3b..aef770b261c0 100644 --- a/test/registered/ep/test_deepep_large.py +++ b/test/registered/ep/test_deepep_large.py @@ -148,7 +148,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.92) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/ep/test_deepep_small.py b/test/registered/ep/test_deepep_small.py index 911915aaff8a..6f788fde0e72 100644 --- a/test/registered/ep/test_deepep_small.py +++ b/test/registered/ep/test_deepep_small.py @@ -412,7 +412,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -486,7 +486,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -562,7 +562,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/hicache/test_hicache_variants.py b/test/registered/hicache/test_hicache_variants.py index 6934a32fcc28..317c11bf3f0d 100644 --- a/test/registered/hicache/test_hicache_variants.py +++ b/test/registered/hicache/test_hicache_variants.py @@ -157,7 +157,7 @@ def test_mmlu(self): self.assertGreaterEqual(metrics["score"], self.expected_mmlu_score) # EAGLE-specific check - server_info = requests.get(self.base_url + "/get_server_info").json() + server_info = requests.get(self.base_url + "/server_info").json() avg_spec_accept_length = server_info["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/mla/test_mla_deepseek_v3.py b/test/registered/mla/test_mla_deepseek_v3.py index 392154e3d6ce..1847a47f0f57 100644 --- a/test/registered/mla/test_mla_deepseek_v3.py +++ b/test/registered/mla/test_mla_deepseek_v3.py @@ -199,7 +199,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/mla/test_mla_flashinfer.py b/test/registered/mla/test_mla_flashinfer.py index 555a54e5e768..29739f704980 100644 --- a/test/registered/mla/test_mla_flashinfer.py +++ b/test/registered/mla/test_mla_flashinfer.py @@ -115,7 +115,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info").json() + server_info = requests.get(self.base_url + "/server_info").json() avg_spec_accept_length = server_info["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/mla/test_mla_int8_deepseek_v3.py b/test/registered/mla/test_mla_int8_deepseek_v3.py index d7acb80403bb..51701798d198 100644 --- a/test/registered/mla/test_mla_int8_deepseek_v3.py +++ b/test/registered/mla/test_mla_int8_deepseek_v3.py @@ -117,7 +117,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -221,7 +221,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py b/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py index 3a99f6ad9f12..c2c72a28f55f 100644 --- a/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py +++ b/test/registered/quant/test_deepseek_v32_fp4_mtp_4gpu.py @@ -83,7 +83,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -173,7 +173,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/quant/test_w4a8_deepseek_v3.py b/test/registered/quant/test_w4a8_deepseek_v3.py index a6c33bea33de..b37a14698b6e 100644 --- a/test/registered/quant/test_w4a8_deepseek_v3.py +++ b/test/registered/quant/test_w4a8_deepseek_v3.py @@ -106,7 +106,7 @@ def test_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/spec/eagle/test_eagle_dp_attention.py b/test/registered/spec/eagle/test_eagle_dp_attention.py index a25edf588f7f..e136373ef763 100644 --- a/test/registered/spec/eagle/test_eagle_dp_attention.py +++ b/test/registered/spec/eagle/test_eagle_dp_attention.py @@ -87,7 +87,7 @@ def test_a_gsm8k(self): metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") server_data = server_info.json() # Try to get avg_spec_accept_length diff --git a/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py b/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py index 8a6e9779fb51..950d27e824e5 100644 --- a/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py +++ b/test/registered/spec/eagle/test_eagle_infer_beta_dp_attention.py @@ -33,7 +33,7 @@ def test_gsm8k(base_url: str): port=int(base_url.split(":")[-1]), ) metrics = run_eval_few_shot_gsm8k(args) - server_info = requests.get(base_url + "/get_server_info") + server_info = requests.get(base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/registered/spec/test_standalone_speculative_decoding.py b/test/registered/spec/test_standalone_speculative_decoding.py index 1a3cc0647e6a..ec411b1159b5 100644 --- a/test/registered/spec/test_standalone_speculative_decoding.py +++ b/test/registered/spec/test_standalone_speculative_decoding.py @@ -112,7 +112,7 @@ def test_gsm8k(self): metric_key = "accuracy" self.assertGreater(metrics[metric_key], self.accuracy_threshold) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -173,7 +173,7 @@ def test_gsm8k(self): metric_key = "accuracy" self.assertGreater(metrics[metric_key], self.accuracy_threshold) - server_info = requests.get(self.base_url + "/get_server_info") + server_info = requests.get(self.base_url + "/server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] From 1ae5a6923933dcfa1e91763393a5f611b51a3437 Mon Sep 17 00:00:00 2001 From: David Cheung Date: Thu, 26 Mar 2026 07:56:37 +0000 Subject: [PATCH 4/8] update remaining pointers --- python/sglang/bench_serving.py | 4 ++-- python/sglang/lang/backend/runtime_endpoint.py | 8 ++++++-- python/sglang/profiler.py | 2 +- python/sglang/test/bench_one_batch_server_internal.py | 4 ++-- python/sglang/test/kits/cache_hit_kit.py | 2 +- python/sglang/test/kl_test_utils.py | 4 ++-- python/sglang/test/nightly_utils.py | 2 +- scripts/playground/bench_speculative.py | 2 +- 8 files changed, 16 insertions(+), 12 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 4e79ece7e3bc..90058ac1fb72 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -1401,7 +1401,7 @@ async def limited_request_func(request_func_input, pbar): if "sglang" in backend: server_info = requests.get( - base_url + "/get_server_info", headers=get_auth_headers() + base_url + "/server_info", headers=get_auth_headers() ) if server_info.status_code == 200: server_info_json = server_info.json() @@ -1537,7 +1537,7 @@ async def limited_request_func(request_func_input, pbar): print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms)) print("=" * 50) - resp = requests.get(base_url + "/get_server_info", headers=get_auth_headers()) + resp = requests.get(base_url + "/server_info", headers=get_auth_headers()) server_info = resp.json() if resp.status_code == 200 else None if ( diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py index 41a48103123f..37002337c597 100644 --- a/python/sglang/lang/backend/runtime_endpoint.py +++ b/python/sglang/lang/backend/runtime_endpoint.py @@ -66,8 +66,10 @@ def flush_cache(self): self._assert_success(res) def get_server_info(self): + # TODO: Rename this method to `server_info` after the `/get_server_info` deprecation + # window ends, and keep a temporary alias for backward compatibility. res = http_request( - self.base_url + "/get_server_info", + self.base_url + "/server_info", api_key=self.api_key, verify=self.verify, ) @@ -530,8 +532,10 @@ def encode( return json.dumps(response.json()) async def get_server_info(self): + # TODO: Rename this method to `server_info` after the `/get_server_info` deprecation + # window ends, and keep a temporary alias for backward compatibility. async with aiohttp.ClientSession() as session: - async with session.get(f"{self.url}/get_server_info") as response: + async with session.get(f"{self.url}/server_info") as response: if response.status == 200: return await response.json() else: diff --git a/python/sglang/profiler.py b/python/sglang/profiler.py index ebc7a100e24b..8424e7f54bbe 100644 --- a/python/sglang/profiler.py +++ b/python/sglang/profiler.py @@ -42,7 +42,7 @@ def run_profile( # Dump server args. file_path = Path(output_dir) / "server_args.json" if not file_path.exists(): - response = requests.get(url + "/get_server_info") + response = requests.get(url + "/server_info") response.raise_for_status() server_args_data = response.json() with open(file_path, "w") as file: diff --git a/python/sglang/test/bench_one_batch_server_internal.py b/python/sglang/test/bench_one_batch_server_internal.py index 4585340da084..39e7ea4376ad 100644 --- a/python/sglang/test/bench_one_batch_server_internal.py +++ b/python/sglang/test/bench_one_batch_server_internal.py @@ -609,7 +609,7 @@ def run_one_case( last_gen_throughput = -1 acc_length = -1 else: - response = requests.get(url + "/get_server_info", timeout=DEFAULT_TIMEOUT) + response = requests.get(url + "/server_info", timeout=DEFAULT_TIMEOUT) response.raise_for_status() server_info = response.json() internal_state = server_info.get("internal_states", [{}]) @@ -793,7 +793,7 @@ def run_benchmark_internal( skip_max_running_requests_threshold = float("inf") else: model_name = None - response = requests.get(base_url + "/get_server_info", timeout=DEFAULT_TIMEOUT) + response = requests.get(base_url + "/server_info", timeout=DEFAULT_TIMEOUT) response.raise_for_status() server_info = response.json() if "tokenizer_path" in server_info: diff --git a/python/sglang/test/kits/cache_hit_kit.py b/python/sglang/test/kits/cache_hit_kit.py index 81895eff07c7..5e1c9172c29e 100644 --- a/python/sglang/test/kits/cache_hit_kit.py +++ b/python/sglang/test/kits/cache_hit_kit.py @@ -221,7 +221,7 @@ async def _send_one(payload): def _get_page_size(base_url: str) -> int: """Query server for page_size used by radix cache.""" try: - resp = requests.get(f"{base_url}/get_server_info", timeout=10) + resp = requests.get(f"{base_url}/server_info", timeout=10) resp.raise_for_status() info = resp.json() return info.get("page_size", 1) diff --git a/python/sglang/test/kl_test_utils.py b/python/sglang/test/kl_test_utils.py index 116f0ad7ee40..b3c90caaec97 100644 --- a/python/sglang/test/kl_test_utils.py +++ b/python/sglang/test/kl_test_utils.py @@ -208,7 +208,7 @@ def test_input_output_logprobs_match_helper( def test_input_output_logprobs_match_prefill_cache_hit_helper( base_url, ACC_THRESHOLDS, model_name, max_samples=None, max_new_tokens=8192 ): - server_info = requests.get(base_url + "/get_server_info").json() + server_info = requests.get(base_url + "/server_info").json() if server_info["disable_radix_cache"]: print("Radix cache is disabled, skipping test") return @@ -261,7 +261,7 @@ def test_input_output_logprobs_match_prefill_cache_hit_helper( def test_input_output_logprobs_match_decode_cache_hit_helper( base_url, ACC_THRESHOLDS, model_name, max_samples=None, max_new_tokens=8192 ): - server_info = requests.get(base_url + "/get_server_info").json() + server_info = requests.get(base_url + "/server_info").json() if server_info["disable_radix_cache"]: print("Radix cache is disabled, skipping test") return diff --git a/python/sglang/test/nightly_utils.py b/python/sglang/test/nightly_utils.py index ac69fabb7010..2a9d01f2e8ef 100644 --- a/python/sglang/test/nightly_utils.py +++ b/python/sglang/test/nightly_utils.py @@ -324,7 +324,7 @@ def _get_spec_accept_length(self) -> Optional[float]: The average speculative decoding accept length, or None if not available. """ try: - response = requests.get(f"{self.base_url}/get_server_info", timeout=10) + response = requests.get(f"{self.base_url}/server_info", timeout=10) if response.status_code == 200: server_info = response.json() internal_states = server_info.get("internal_states", []) diff --git a/scripts/playground/bench_speculative.py b/scripts/playground/bench_speculative.py index 806699f7121c..5373df5169e8 100644 --- a/scripts/playground/bench_speculative.py +++ b/scripts/playground/bench_speculative.py @@ -119,7 +119,7 @@ def send_one_batch(base_url, num_prompts, batch_size, processor, is_multimodal): acc_length = results["accept_length"] or 1.0 avg_output_token = results["total_output_tokens"] / results["completed"] - server_info = requests.get(base_url + "/get_server_info").json() + server_info = requests.get(base_url + "/server_info").json() # We use 20% percentile instead of median on purpose step_time = np.percentile( server_info["internal_states"][0]["step_time_dict"][str(batch_size)], 20 From 5e4717e995414460f6f0728c8fe9a992d1e6f190 Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Wed, 1 Apr 2026 20:24:58 -0700 Subject: [PATCH 5/8] fix lint: add missing blank line before function definition --- sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py b/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py index 1e7793a3d08f..79dfe425bde6 100644 --- a/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py +++ b/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py @@ -268,6 +268,7 @@ async def flush_cache(): await response return Response(status_code=200) + # TODO: Remove `/get_server_info` alias after one release-cycle deprecation window. @app.get("/get_server_info") @app.get("/server_info") From acbc15cf5673ebebca757e90ef7e7e4141029083 Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Wed, 1 Apr 2026 20:30:00 -0700 Subject: [PATCH 6/8] remove unnecessary TODO about method rename --- python/sglang/lang/backend/runtime_endpoint.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py index 37002337c597..8732e401fe71 100644 --- a/python/sglang/lang/backend/runtime_endpoint.py +++ b/python/sglang/lang/backend/runtime_endpoint.py @@ -66,8 +66,6 @@ def flush_cache(self): self._assert_success(res) def get_server_info(self): - # TODO: Rename this method to `server_info` after the `/get_server_info` deprecation - # window ends, and keep a temporary alias for backward compatibility. res = http_request( self.base_url + "/server_info", api_key=self.api_key, @@ -532,8 +530,6 @@ def encode( return json.dumps(response.json()) async def get_server_info(self): - # TODO: Rename this method to `server_info` after the `/get_server_info` deprecation - # window ends, and keep a temporary alias for backward compatibility. async with aiohttp.ClientSession() as session: async with session.get(f"{self.url}/server_info") as response: if response.status == 200: From 09efb87a231ca63ff4f57192ec46e355e3663c66 Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Wed, 1 Apr 2026 20:31:40 -0700 Subject: [PATCH 7/8] fix stale comment and decorator order --- sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py | 2 +- test/registered/distributed/test_data_parallelism.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py b/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py index 79dfe425bde6..11ef2f4c6b64 100644 --- a/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py +++ b/sgl-model-gateway/bindings/python/src/sglang_router/mini_lb.py @@ -270,8 +270,8 @@ async def flush_cache(): # TODO: Remove `/get_server_info` alias after one release-cycle deprecation window. -@app.get("/get_server_info") @app.get("/server_info") +@app.get("/get_server_info") async def get_server_info(): prefill_infos = [] decode_infos = [] diff --git a/test/registered/distributed/test_data_parallelism.py b/test/registered/distributed/test_data_parallelism.py index e94ee800d720..bb3eb29c74e0 100644 --- a/test/registered/distributed/test_data_parallelism.py +++ b/test/registered/distributed/test_data_parallelism.py @@ -57,7 +57,7 @@ def test_update_weight(self): assert response.status_code == 200 def test_get_memory_pool_size(self): - # use `get_server_info` instead since `get_memory_pool_size` is merged into `get_server_info` + # use `server_info` instead since `get_memory_pool_size` is merged into `server_info` response = requests.get(self.base_url + "/server_info") assert response.status_code == 200 From 7dab4038cd1afee4a3a030b7fcc76c366ac22622 Mon Sep 17 00:00:00 2001 From: hnyls2002 Date: Wed, 1 Apr 2026 20:35:52 -0700 Subject: [PATCH 8/8] remove unnecessary TODOs in discover_metadata.rs --- .../src/core/steps/worker/local/discover_metadata.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs b/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs index 7557ed002844..13f91a3ef196 100644 --- a/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs +++ b/sgl-model-gateway/src/core/steps/worker/local/discover_metadata.rs @@ -96,8 +96,6 @@ async fn get_json_fallback( } /// Get server info from /server_info endpoint. -// TODO: Rename to `server_info` (or `fetch_server_info`) after removing -// `/get_server_info` fallback compatibility. pub async fn get_server_info(url: &str, api_key: Option<&str>) -> Result { let base_url = url.trim_end_matches('/'); let server_info_url = format!("{}/server_info", base_url); @@ -112,7 +110,6 @@ pub async fn get_server_info(url: &str, api_key: Option<&str>) -> Result