diff --git a/docs/advanced_features/sgl_model_gateway.md b/docs/advanced_features/sgl_model_gateway.md
index 5f965b9527c8..8aa1b2d1b71e 100644
--- a/docs/advanced_features/sgl_model_gateway.md
+++ b/docs/advanced_features/sgl_model_gateway.md
@@ -593,6 +593,14 @@ Response:
## Reliability and Flow Control
+### HTTP Client Pool
+
+Configure the idle timeout for pooled upstream HTTP connections:
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `--pool-idle-timeout-secs` | 50 | Idle timeout in seconds for pooled upstream HTTP connections. Can also be set with `SMG_POOL_IDLE_TIMEOUT_SECS`. |
+
### Retries
Configure exponential backoff retries:
diff --git a/docs_new/docs/advanced_features/sgl_model_gateway.mdx b/docs_new/docs/advanced_features/sgl_model_gateway.mdx
index 1789442dee27..f6867cda2edf 100644
--- a/docs_new/docs/advanced_features/sgl_model_gateway.mdx
+++ b/docs_new/docs/advanced_features/sgl_model_gateway.mdx
@@ -944,6 +944,32 @@ Response:
***
## Reliability and Flow Control
+### HTTP Client Pool
+
+Configure the idle timeout for pooled upstream HTTP connections:
+
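+<table>
+  <colgroup>
+    <col />
+    <col />
+    <col />
+  </colgroup>
+  <thead>
+    <tr>
+      <th>Parameter</th>
+      <th>Default</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>`--pool-idle-timeout-secs`</td>
+      <td>50</td>
+      <td>Idle timeout in seconds for pooled upstream HTTP connections. Can also be set with `SMG_POOL_IDLE_TIMEOUT_SECS`.</td>
+    </tr>
+  </tbody>
+</table>
+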
### Retries
Configure exponential backoff retries:
diff --git a/sgl-model-gateway/README.md b/sgl-model-gateway/README.md
index 046cf352a14e..c221a8f9052f 100644
--- a/sgl-model-gateway/README.md
+++ b/sgl-model-gateway/README.md
@@ -726,6 +726,7 @@ Router flags map to these values:
- `--redis-retention-days` (env: `REDIS_RETENTION_DAYS`). Set to `-1` for persistent storage (default: 30 days).
## Reliability & Flow Control
+- **HTTP Client Pool**: Upstream HTTP connection pool idle timeout defaults to 50 seconds. Configure via `--pool-idle-timeout-secs` or `SMG_POOL_IDLE_TIMEOUT_SECS`.
- **Retries**: Default max retries = 5 with exponential backoff (`--retry-max-retries`, `--retry-initial-backoff-ms`, `--retry-max-backoff-ms`, `--retry-backoff-multiplier`, `--retry-jitter-factor`). Retries trigger on 408/429/500/502/503/504.
- **Circuit Breakers**: Per worker thresholds (`--cb-failure-threshold`, `--cb-success-threshold`, `--cb-timeout-duration-secs`, `--cb-window-duration-secs`). Disable via `--disable-circuit-breaker`.
- **Rate Limiting**: Token bucket driven by `--max-concurrent-requests`. Set `--rate-limit-tokens-per-second` to override refill rate. Configure request queue via `--queue-size` and `--queue-timeout-secs`; queued requests observe FIFO order and respect cancellation.
diff --git a/sgl-model-gateway/src/app_context.rs b/sgl-model-gateway/src/app_context.rs
index cbb47c1e14f4..0254ff222c7c 100644
--- a/sgl-model-gateway/src/app_context.rs
+++ b/sgl-model-gateway/src/app_context.rs
@@ -329,7 +329,7 @@ impl AppContextBuilder {
let has_tls_config = config.client_identity.is_some() || !config.ca_certificates.is_empty();
let mut client_builder = Client::builder()
- .pool_idle_timeout(Some(Duration::from_secs(50)))
+ .pool_idle_timeout(Some(Duration::from_secs(config.pool_idle_timeout_secs)))
.pool_max_idle_per_host(500)
.timeout(Duration::from_secs(timeout_secs))
.connect_timeout(Duration::from_secs(10))
diff --git a/sgl-model-gateway/src/config/builder.rs b/sgl-model-gateway/src/config/builder.rs
index b103d8a672f5..70091180ab8c 100644
--- a/sgl-model-gateway/src/config/builder.rs
+++ b/sgl-model-gateway/src/config/builder.rs
@@ -187,6 +187,11 @@ impl RouterConfigBuilder {
self
}
+ /// Sets the idle timeout, in seconds, for pooled upstream HTTP connections.
+ ///
+ /// Stores `timeout` in `config.pool_idle_timeout_secs` and returns the
+ /// builder so that further setters can be chained.
+ pub fn pool_idle_timeout_secs(mut self, timeout: u64) -> Self {
+ self.config.pool_idle_timeout_secs = timeout;
+ self
+ }
+
// ==================== Rate Limiting ====================
pub fn max_concurrent_requests(mut self, max: i32) -> Self {
diff --git a/sgl-model-gateway/src/config/types.rs b/sgl-model-gateway/src/config/types.rs
index 39e0a1df4ad5..a8f93a1e7627 100644
--- a/sgl-model-gateway/src/config/types.rs
+++ b/sgl-model-gateway/src/config/types.rs
@@ -7,6 +7,8 @@ use serde::{Deserialize, Serialize};
use super::ConfigResult;
use crate::core::ConnectionMode;
+/// Default idle timeout, in seconds, for pooled upstream HTTP connections.
+pub const DEFAULT_POOL_IDLE_TIMEOUT_SECS: u64 = 50;
+
/// Main router configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RouterConfig {
@@ -28,6 +30,8 @@ pub struct RouterConfig {
pub log_dir: Option,
pub log_level: Option,
pub request_id_headers: Option>,
+ #[serde(default = "default_pool_idle_timeout_secs")]
+ pub pool_idle_timeout_secs: u64,
/// Set to -1 to disable rate limiting
pub max_concurrent_requests: i32,
pub queue_size: usize,
@@ -119,6 +123,10 @@ fn default_l1_max_memory() -> usize {
50 * 1024 * 1024 // 50MB
}
+/// Serde default for `RouterConfig::pool_idle_timeout_secs`, used when the
+/// field is absent from the input; returns `DEFAULT_POOL_IDLE_TIMEOUT_SECS`.
+fn default_pool_idle_timeout_secs() -> u64 {
+ DEFAULT_POOL_IDLE_TIMEOUT_SECS
+}
+
impl TokenizerCacheConfig {
/// Returns Some(self) if any caching is enabled, None otherwise.
/// Use this when passing cache config to tokenizer registration workflow.
@@ -492,6 +500,7 @@ impl Default for RouterConfig {
log_dir: None,
log_level: None,
request_id_headers: None,
+ pool_idle_timeout_secs: default_pool_idle_timeout_secs(),
max_concurrent_requests: -1,
queue_size: 100,
queue_timeout_secs: 60,
@@ -613,6 +622,10 @@ mod tests {
assert!(config.trace_config.is_none());
assert!(config.log_dir.is_none());
assert!(config.log_level.is_none());
+ assert_eq!(
+ config.pool_idle_timeout_secs,
+ DEFAULT_POOL_IDLE_TIMEOUT_SECS
+ );
}
#[test]
@@ -662,6 +675,22 @@ mod tests {
assert!(deserialized.trace_config.is_none());
}
+ #[test]
+ fn test_router_config_pool_idle_timeout_deserialization_default() {
+     // Serialize the default config and drop the field, so that deserialization
+     // has to fall back to the serde default for `pool_idle_timeout_secs`.
+     let mut serialized = serde_json::to_value(&RouterConfig::default()).unwrap();
+     let fields = serialized.as_object_mut().unwrap();
+     fields.remove("pool_idle_timeout_secs");
+
+     let restored: RouterConfig = serde_json::from_value(serialized).unwrap();
+     let expected = default_pool_idle_timeout_secs();
+
+     assert_eq!(restored.pool_idle_timeout_secs, expected);
+ }
+
#[test]
fn test_routing_mode_is_pd_mode() {
let regular = RoutingMode::Regular {
diff --git a/sgl-model-gateway/src/main.rs b/sgl-model-gateway/src/main.rs
index 3d8b9842f56a..6d61264031de 100644
--- a/sgl-model-gateway/src/main.rs
+++ b/sgl-model-gateway/src/main.rs
@@ -8,7 +8,7 @@ use smg::{
CircuitBreakerConfig, ConfigError, ConfigResult, DiscoveryConfig, HealthCheckConfig,
HistoryBackend, ManualAssignmentMode, MetricsConfig, OracleConfig, PolicyConfig,
PostgresConfig, RedisConfig, RetryConfig, RouterConfig, RoutingMode, TokenizerCacheConfig,
- TraceConfig,
+ TraceConfig, DEFAULT_POOL_IDLE_TIMEOUT_SECS,
},
core::ConnectionMode,
observability::{
@@ -298,6 +298,16 @@ struct CliArgs {
#[arg(long, num_args = 0.., help_heading = "Request Handling")]
cors_allowed_origins: Vec,
+ // ==================== HTTP Client Pool ====================
+ /// Idle timeout in seconds for pooled upstream HTTP connections
+ #[arg(
+ long,
+ env = "SMG_POOL_IDLE_TIMEOUT_SECS",
+ default_value_t = DEFAULT_POOL_IDLE_TIMEOUT_SECS,
+ help_heading = "HTTP Client Pool"
+ )]
+ pool_idle_timeout_secs: u64,
+
// ==================== Rate Limiting ====================
/// Maximum concurrent requests (-1 to disable)
#[arg(long, default_value_t = -1, help_heading = "Rate Limiting")]
@@ -972,6 +982,7 @@ impl CliArgs {
.request_timeout_secs(self.request_timeout_secs)
.worker_startup_timeout_secs(self.worker_startup_timeout_secs)
.worker_startup_check_interval_secs(self.worker_startup_check_interval)
+ .pool_idle_timeout_secs(self.pool_idle_timeout_secs)
.max_concurrent_requests(self.max_concurrent_requests)
.queue_size(self.queue_size)
.queue_timeout_secs(self.queue_timeout_secs)