Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/advanced_features/sgl_model_gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,14 @@ Response:

## Reliability and Flow Control

### HTTP Client Pool

Configure the idle timeout for pooled upstream HTTP connections:

| Parameter | Default | Description |
|-----------|---------|-------------|
| `--pool-idle-timeout-secs` | 50 | Idle timeout in seconds for pooled upstream HTTP connections. Can also be set with `SMG_POOL_IDLE_TIMEOUT_SECS`. |

### Retries

Configure exponential backoff retries:
Expand Down
26 changes: 26 additions & 0 deletions docs_new/docs/advanced_features/sgl_model_gateway.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,32 @@ Response:
***
## Reliability and Flow Control

### HTTP Client Pool

Configure the idle timeout for pooled upstream HTTP connections:

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
<colgroup>
<col style={{width: "34%"}} />
<col style={{width: "33%"}} />
<col style={{width: "33%"}} />
</colgroup>
<thead>
<tr style={{borderBottom: "2px solid #d55816"}}>
<th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Parameter</th>
<th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Default</th>
<th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--pool-idle-timeout-secs`</td>
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>50</td>
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>Idle timeout in seconds for pooled upstream HTTP connections. Can also be set with `SMG_POOL_IDLE_TIMEOUT_SECS`.</td>
</tr>
</tbody>
</table>

### Retries

Configure exponential backoff retries:
Expand Down
1 change: 1 addition & 0 deletions sgl-model-gateway/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,7 @@ Router flags map to these values:
- `--redis-retention-days` (env: `REDIS_RETENTION_DAYS`). Set to `-1` for persistent storage (default: 30 days).

## Reliability & Flow Control
- **HTTP Client Pool**: Upstream HTTP connection pool idle timeout defaults to 50 seconds. Configure via `--pool-idle-timeout-secs` or `SMG_POOL_IDLE_TIMEOUT_SECS`.
- **Retries**: Default max retries = 5 with exponential backoff (`--retry-max-retries`, `--retry-initial-backoff-ms`, `--retry-max-backoff-ms`, `--retry-backoff-multiplier`, `--retry-jitter-factor`). Retries trigger on 408/429/500/502/503/504.
- **Circuit Breakers**: Per worker thresholds (`--cb-failure-threshold`, `--cb-success-threshold`, `--cb-timeout-duration-secs`, `--cb-window-duration-secs`). Disable via `--disable-circuit-breaker`.
- **Rate Limiting**: Token bucket driven by `--max-concurrent-requests`. Set `--rate-limit-tokens-per-second` to override refill rate. Configure request queue via `--queue-size` and `--queue-timeout-secs`; queued requests observe FIFO order and respect cancellation.
Expand Down
2 changes: 1 addition & 1 deletion sgl-model-gateway/src/app_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ impl AppContextBuilder {
let has_tls_config = config.client_identity.is_some() || !config.ca_certificates.is_empty();

let mut client_builder = Client::builder()
.pool_idle_timeout(Some(Duration::from_secs(50)))
.pool_idle_timeout(Some(Duration::from_secs(config.pool_idle_timeout_secs)))
.pool_max_idle_per_host(500)
.timeout(Duration::from_secs(timeout_secs))
.connect_timeout(Duration::from_secs(10))
Expand Down
5 changes: 5 additions & 0 deletions sgl-model-gateway/src/config/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,11 @@ impl RouterConfigBuilder {
self
}

/// Sets the idle timeout, in seconds, for pooled upstream HTTP connections.
///
/// Stores `timeout` in `config.pool_idle_timeout_secs` and returns the
/// builder by value so further setters can be chained.
pub fn pool_idle_timeout_secs(mut self, timeout: u64) -> Self {
self.config.pool_idle_timeout_secs = timeout;
self
}

// ==================== Rate Limiting ====================

pub fn max_concurrent_requests(mut self, max: i32) -> Self {
Expand Down
29 changes: 29 additions & 0 deletions sgl-model-gateway/src/config/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use serde::{Deserialize, Serialize};
use super::ConfigResult;
use crate::core::ConnectionMode;

/// Default idle timeout, in seconds, for pooled upstream HTTP connections.
///
/// Used as the default for `RouterConfig::pool_idle_timeout_secs` and for the
/// `--pool-idle-timeout-secs` CLI flag.
pub const DEFAULT_POOL_IDLE_TIMEOUT_SECS: u64 = 50;

/// Main router configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RouterConfig {
Expand All @@ -28,6 +30,8 @@ pub struct RouterConfig {
pub log_dir: Option<String>,
pub log_level: Option<String>,
pub request_id_headers: Option<Vec<String>>,
#[serde(default = "default_pool_idle_timeout_secs")]
pub pool_idle_timeout_secs: u64,
/// Set to -1 to disable rate limiting
pub max_concurrent_requests: i32,
pub queue_size: usize,
Expand Down Expand Up @@ -119,6 +123,10 @@ fn default_l1_max_memory() -> usize {
50 * 1024 * 1024 // 50MB
}

/// Returns [`DEFAULT_POOL_IDLE_TIMEOUT_SECS`].
///
/// Exists as a function because `#[serde(default = "...")]` on
/// `RouterConfig::pool_idle_timeout_secs` names a function path, not a constant.
fn default_pool_idle_timeout_secs() -> u64 {
DEFAULT_POOL_IDLE_TIMEOUT_SECS
}

impl TokenizerCacheConfig {
/// Returns Some(self) if any caching is enabled, None otherwise.
/// Use this when passing cache config to tokenizer registration workflow.
Expand Down Expand Up @@ -492,6 +500,7 @@ impl Default for RouterConfig {
log_dir: None,
log_level: None,
request_id_headers: None,
pool_idle_timeout_secs: default_pool_idle_timeout_secs(),
max_concurrent_requests: -1,
queue_size: 100,
queue_timeout_secs: 60,
Expand Down Expand Up @@ -613,6 +622,10 @@ mod tests {
assert!(config.trace_config.is_none());
assert!(config.log_dir.is_none());
assert!(config.log_level.is_none());
assert_eq!(
config.pool_idle_timeout_secs,
DEFAULT_POOL_IDLE_TIMEOUT_SECS
);
}

#[test]
Expand Down Expand Up @@ -662,6 +675,22 @@ mod tests {
assert!(deserialized.trace_config.is_none());
}

/// Verifies that a serialized `RouterConfig` lacking `pool_idle_timeout_secs`
/// deserializes with the serde default for that field.
#[test]
fn test_router_config_pool_idle_timeout_deserialization_default() {
    // Start from a fully populated config so every other field is present.
    let config = RouterConfig::default();
    let mut json = serde_json::to_value(&config).unwrap();

    // Drop only the pool idle timeout key. Check that it was actually there:
    // if the key were ever renamed or skipped on serialization, the removal
    // would be a no-op and this test would pass without exercising the
    // missing-field path.
    let removed = json
        .as_object_mut()
        .unwrap()
        .remove("pool_idle_timeout_secs");
    assert!(
        removed.is_some(),
        "serialized RouterConfig should contain `pool_idle_timeout_secs`"
    );

    let deserialized: RouterConfig = serde_json::from_value(json).unwrap();

    assert_eq!(
        deserialized.pool_idle_timeout_secs,
        default_pool_idle_timeout_secs()
    );
}

#[test]
fn test_routing_mode_is_pd_mode() {
let regular = RoutingMode::Regular {
Expand Down
13 changes: 12 additions & 1 deletion sgl-model-gateway/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use smg::{
CircuitBreakerConfig, ConfigError, ConfigResult, DiscoveryConfig, HealthCheckConfig,
HistoryBackend, ManualAssignmentMode, MetricsConfig, OracleConfig, PolicyConfig,
PostgresConfig, RedisConfig, RetryConfig, RouterConfig, RoutingMode, TokenizerCacheConfig,
TraceConfig,
TraceConfig, DEFAULT_POOL_IDLE_TIMEOUT_SECS,
},
core::ConnectionMode,
observability::{
Expand Down Expand Up @@ -298,6 +298,16 @@ struct CliArgs {
#[arg(long, num_args = 0.., help_heading = "Request Handling")]
cors_allowed_origins: Vec<String>,

// ==================== HTTP Client Pool ====================
/// Idle timeout in seconds for pooled upstream HTTP connections
#[arg(
long,
env = "SMG_POOL_IDLE_TIMEOUT_SECS",
default_value_t = DEFAULT_POOL_IDLE_TIMEOUT_SECS,
help_heading = "HTTP Client Pool"
)]
pool_idle_timeout_secs: u64,

// ==================== Rate Limiting ====================
/// Maximum concurrent requests (-1 to disable)
#[arg(long, default_value_t = -1, help_heading = "Rate Limiting")]
Expand Down Expand Up @@ -972,6 +982,7 @@ impl CliArgs {
.request_timeout_secs(self.request_timeout_secs)
.worker_startup_timeout_secs(self.worker_startup_timeout_secs)
.worker_startup_check_interval_secs(self.worker_startup_check_interval)
.pool_idle_timeout_secs(self.pool_idle_timeout_secs)
.max_concurrent_requests(self.max_concurrent_requests)
.queue_size(self.queue_size)
.queue_timeout_secs(self.queue_timeout_secs)
Expand Down
Loading