Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/advanced_features/server_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| Argument | Description | Defaults | Options |
| --- | --- | --- | --- |
| `--api-key` | Set API key of the server. It is also used in the OpenAI API compatible server. | `None` | Type: str |
| `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/get_server_info`). Endpoints marked as admin-only require `Authorization: Bearer <admin_api_key>` when this is set. | `None` | Type: str |
| `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/server_info`). Endpoints marked as admin-only require `Authorization: Bearer <admin_api_key>` when this is set. | `None` | Type: str |
| `--served-model-name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | `None` | Type: str |
| `--weight-version` | Version identifier for the model weights. Defaults to 'default' if not specified. | `default` | Type: str |
| `--chat-template` | The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server. | `None` | Type: str |
Expand Down
4 changes: 2 additions & 2 deletions docs/advanced_features/sgl_model_gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ SGLang Model Gateway is a high-performance model-routing gateway for large-scale

### Control Plane

- **Worker Manager** discovers capabilities (`/get_server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry.
- **Worker Manager** discovers capabilities (`/server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry.
- **Job Queue** serializes add/remove requests and exposes status (`/workers/{worker_id}`) so clients can track onboarding progress.
- **Load Monitor** feeds cache-aware and power-of-two policies with live worker load statistics.
- **Health Checker** continuously probes workers and updates readiness, circuit breaker state, and router metrics.
Expand Down Expand Up @@ -552,7 +552,7 @@ Response:
| `GET` | `/engine_metrics` | Engine-level metrics from workers |
| `GET` | `/v1/models` | List available models |
| `GET` | `/get_model_info` | Get model information |
| `GET` | `/get_server_info` | Get server information |
| `GET` | `/server_info` | Get server information |
| `POST` | `/flush_cache` | Clear all caches |
| `GET` | `/get_loads` | Get all worker loads |
| `POST` | `/wasm` | Upload WASM module |
Expand Down
4 changes: 2 additions & 2 deletions docs/basic_usage/native_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"\n",
"- `/generate` (text generation model)\n",
"- `/get_model_info`\n",
"- `/get_server_info`\n",
"- `/server_info`\n",
"- `/health`\n",
"- `/health_generate`\n",
"- `/flush_cache`\n",
Expand Down Expand Up @@ -140,7 +140,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = f\"http://localhost:{port}/get_server_info\"\n",
"url = f\"http://localhost:{port}/server_info\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
Expand Down
2 changes: 1 addition & 1 deletion docs/developer_guide/bench_serving.md
Original file line number Diff line number Diff line change
Expand Up @@ -352,4 +352,4 @@ python3 -m sglang.bench_serving \
### Notes

- The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections.
- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available.
- For sglang, `/server_info` is queried post-run to report speculative decoding accept length when available.
4 changes: 2 additions & 2 deletions python/sglang/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -1402,7 +1402,7 @@ async def limited_request_func(request_func_input, pbar):

if "sglang" in backend:
server_info = requests.get(
base_url + "/get_server_info", headers=get_auth_headers()
base_url + "/server_info", headers=get_auth_headers()
)
if server_info.status_code == 200:
server_info_json = server_info.json()
Expand Down Expand Up @@ -1538,7 +1538,7 @@ async def limited_request_func(request_func_input, pbar):
print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
print("=" * 50)

resp = requests.get(base_url + "/get_server_info", headers=get_auth_headers())
resp = requests.get(base_url + "/server_info", headers=get_auth_headers())
server_info = resp.json() if resp.status_code == 200 else None

if (
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/lang/backend/runtime_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def flush_cache(self):

def get_server_info(self):
res = http_request(
self.base_url + "/get_server_info",
self.base_url + "/server_info",
api_key=self.api_key,
verify=self.verify,
)
Expand Down Expand Up @@ -531,7 +531,7 @@ def encode(

async def get_server_info(self):
async with aiohttp.ClientSession() as session:
async with session.get(f"{self.url}/get_server_info") as response:
async with session.get(f"{self.url}/server_info") as response:
if response.status == 200:
return await response.json()
else:
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def run_profile(
# Dump server args.
file_path = Path(output_dir) / "server_args.json"
if not file_path.exists():
response = requests.get(url + "/get_server_info")
response = requests.get(url + "/server_info")
response.raise_for_status()
server_args_data = response.json()
with open(file_path, "w") as file:
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/test/bench_one_batch_server_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ def run_one_case(
last_gen_throughput = -1
acc_length = -1
else:
response = requests.get(url + "/get_server_info", timeout=DEFAULT_TIMEOUT)
response = requests.get(url + "/server_info", timeout=DEFAULT_TIMEOUT)
response.raise_for_status()
server_info = response.json()
internal_state = server_info.get("internal_states", [{}])
Expand Down Expand Up @@ -793,7 +793,7 @@ def run_benchmark_internal(
skip_max_running_requests_threshold = float("inf")
else:
model_name = None
response = requests.get(base_url + "/get_server_info", timeout=DEFAULT_TIMEOUT)
response = requests.get(base_url + "/server_info", timeout=DEFAULT_TIMEOUT)
response.raise_for_status()
server_info = response.json()
if "tokenizer_path" in server_info:
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/test/kits/cache_hit_kit.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ async def _send_one(payload):
def _get_page_size(base_url: str) -> int:
"""Query server for page_size used by radix cache."""
try:
resp = requests.get(f"{base_url}/get_server_info", timeout=10)
resp = requests.get(f"{base_url}/server_info", timeout=10)
resp.raise_for_status()
info = resp.json()
return info.get("page_size", 1)
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/test/kl_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def test_input_output_logprobs_match_helper(
def test_input_output_logprobs_match_prefill_cache_hit_helper(
base_url, ACC_THRESHOLDS, model_name, max_samples=None, max_new_tokens=8192
):
server_info = requests.get(base_url + "/get_server_info").json()
server_info = requests.get(base_url + "/server_info").json()
if server_info["disable_radix_cache"]:
print("Radix cache is disabled, skipping test")
return
Expand Down Expand Up @@ -261,7 +261,7 @@ def test_input_output_logprobs_match_prefill_cache_hit_helper(
def test_input_output_logprobs_match_decode_cache_hit_helper(
base_url, ACC_THRESHOLDS, model_name, max_samples=None, max_new_tokens=8192
):
server_info = requests.get(base_url + "/get_server_info").json()
server_info = requests.get(base_url + "/server_info").json()
if server_info["disable_radix_cache"]:
print("Radix cache is disabled, skipping test")
return
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/test/nightly_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _get_spec_accept_length(self) -> Optional[float]:
The average speculative decoding accept length, or None if not available.
"""
try:
response = requests.get(f"{self.base_url}/get_server_info", timeout=10)
response = requests.get(f"{self.base_url}/server_info", timeout=10)
if response.status_code == 200:
server_info = response.json()
internal_states = server_info.get("internal_states", [])
Expand Down
2 changes: 1 addition & 1 deletion scripts/playground/bench_speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def send_one_batch(base_url, num_prompts, batch_size, processor, is_multimodal):
acc_length = results["accept_length"] or 1.0
avg_output_token = results["total_output_tokens"] / results["completed"]

server_info = requests.get(base_url + "/get_server_info").json()
server_info = requests.get(base_url + "/server_info").json()
# We use the 20th percentile instead of the median on purpose
step_time = np.percentile(
server_info["internal_states"][0]["step_time_dict"][str(batch_size)], 20
Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ Use upstream SGLang binaries to start dedicated worker processes.

### Worker Lifecycle & Job Queue
- `JobQueue` handles asynchronous add/remove operations to avoid blocking clients.
- `WorkerManager` inspects worker metadata (`/get_server_info`, `/get_model_info`), tracks load, and exposes `flush_cache` and `get_loads`.
- `WorkerManager` inspects worker metadata (`/server_info`, `/get_model_info`), tracks load, and exposes `flush_cache` and `get_loads`.
- Per-worker circuit breakers and health probes keep the registry healthy; load monitor feeds metrics to cache-aware and power-of-two policies.

### Administrative & Worker APIs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,8 @@ async def flush_cache():
return Response(status_code=200)


# TODO: Remove `/get_server_info` alias after one release-cycle deprecation window.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This TODO is a good reminder for future cleanup. It's important to ensure that the deprecation window is clearly defined and communicated to users to avoid breaking changes.

@app.get("/server_info")
@app.get("/get_server_info")
async def get_server_info():
prefill_infos = []
Expand All @@ -277,10 +279,10 @@ async def get_server_info():

async with aiohttp.ClientSession() as session:
for server in lb.prefill_urls:
server_info = await session.get(f"{server}/get_server_info")
server_info = await session.get(f"{server}/server_info")
prefill_infos.append(await server_info.json())
for server in lb.decode_urls:
server_info = await session.get(f"{server}/get_server_info")
server_info = await session.get(f"{server}/server_info")
info_json = await server_info.json()
decode_infos.append(info_json)
# Extract internal_states from decode servers
Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/src/routers/http/pd_router.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1223,7 +1223,7 @@ impl RouterTrait for PDRouter {
async fn get_server_info(&self, _req: Request<Body>) -> Response {
// Get info from the first prefill server to match sglang's server info format
// NOTE(review): this comment previously said "decode workers", but the code proxies to the first *prefill* worker — confirm which source is intended
self.proxy_to_first_prefill_worker("get_server_info", None)
self.proxy_to_first_prefill_worker("server_info", None)
.await
}

Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/src/routers/http/router.rs
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,7 @@ impl RouterTrait for Router {
}

async fn get_server_info(&self, req: Request<Body>) -> Response {
self.proxy_get_request(req, "get_server_info").await
self.proxy_get_request(req, "server_info").await
}

async fn get_models(&self, req: Request<Body>) -> Response {
Expand Down
2 changes: 2 additions & 0 deletions sgl-model-gateway/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,8 @@ pub fn build_app(
.route("/engine_metrics", get(engine_metrics))
.route("/v1/models", get(v1_models))
.route("/get_model_info", get(get_model_info))
.route("/server_info", get(get_server_info))
// TODO: Remove `/get_server_info` alias after one release-cycle deprecation window.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This TODO is a good reminder to remove the old /get_server_info alias after the deprecation window, ensuring a clean API surface in the long term.

.route("/get_server_info", get(get_server_info));

// Build admin routes with control plane auth if configured, otherwise use simple API key auth
Expand Down
4 changes: 2 additions & 2 deletions sgl-model-gateway/tests/api/api_endpoints_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ mod model_info_tests {

let req = Request::builder()
.method("GET")
.uri("/get_server_info")
.uri("/server_info")
.body(Body::empty())
.unwrap();

Expand Down Expand Up @@ -445,7 +445,7 @@ mod model_info_tests {

let req = Request::builder()
.method("GET")
.uri("/get_server_info")
.uri("/server_info")
.body(Body::empty())
.unwrap();
let resp = app.clone().oneshot(req).await.unwrap();
Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/tests/common/mock_worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ impl MockWorker {
let app = Router::new()
.route("/health", get(health_handler))
.route("/health_generate", get(health_generate_handler))
.route("/get_server_info", get(server_info_handler))
.route("/server_info", get(server_info_handler))
.route("/get_model_info", get(model_info_handler))
.route("/generate", post(generate_handler))
.route("/v1/chat/completions", post(chat_completions_handler))
Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/tests/common/tls_mock_worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ impl TlsMockWorker {
let app = Router::new()
.route("/health", get(health_handler))
.route("/health_generate", get(health_generate_handler))
.route("/get_server_info", get(server_info_handler))
.route("/server_info", get(server_info_handler))
.route("/generate", post(generate_handler))
.route("/v1/chat/completions", post(chat_completions_handler))
.with_state(config);
Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/tests/routing/test_pd_routing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ mod pd_routing_unit_tests {
let implemented_endpoints = vec![
("/health", "GET", true),
("/health_generate", "GET", true), // Note: Python uses POST, we use GET
("/get_server_info", "GET", true),
("/server_info", "GET", true),
("/v1/models", "GET", true),
("/get_model_info", "GET", true),
("/generate", "POST", true),
Expand Down
4 changes: 2 additions & 2 deletions test/registered/4-gpu-models/test_qwen35_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def test_gsm8k(self):
print(f"{metrics=}")
self.assertGreaterEqual(metrics["score"], ACC_THRESHOLDS[self.model]["gsm8k"])

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down Expand Up @@ -226,7 +226,7 @@ def test_gsm8k(self):
print(f"{metrics=}")
self.assertGreaterEqual(metrics["score"], ACC_THRESHOLDS[self.model]["gsm8k"])

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
8 changes: 4 additions & 4 deletions test/registered/8-gpu-models/test_deepseek_v32_mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_a_gsm8k(
metrics = run_eval(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down Expand Up @@ -163,7 +163,7 @@ def test_a_gsm8k(
metrics = run_eval(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down Expand Up @@ -246,7 +246,7 @@ def test_a_gsm8k(
metrics = run_eval(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down Expand Up @@ -330,7 +330,7 @@ def test_a_gsm8k(
metrics = run_eval(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
2 changes: 1 addition & 1 deletion test/registered/8-gpu-models/test_deepseek_v3_mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_a_gsm8k(
metrics = run_eval(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def test_a_gsm8k(self):
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_a_gsm8k(self):
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
2 changes: 1 addition & 1 deletion test/registered/amd/test_deepseek_r1_mxfp4_8gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def test_a_gsm8k(
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
4 changes: 2 additions & 2 deletions test/registered/amd/test_deepseek_v32_mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_a_gsm8k(
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down Expand Up @@ -179,7 +179,7 @@ def test_a_gsm8k(
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
2 changes: 1 addition & 1 deletion test/registered/amd/test_deepseek_v3_mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_a_gsm8k(
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
2 changes: 1 addition & 1 deletion test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_a_gsm8k(
metrics = run_eval_few_shot_gsm8k(args)
print(f"{metrics=}")

server_info = requests.get(self.base_url + "/get_server_info")
server_info = requests.get(self.base_url + "/server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
Expand Down
Loading
Loading