@@ -0,0 +1,72 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_1K_1K"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1
slurm:
time_limit: "02:00:00"

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "1x2x4x8x16x32x64x128x256x512"
req_rate: "inf"

# DSR1 can take a long time to load weights across 2 nodes.
# Health timeout controls how long srtctl waits for the worker to become ready (independent of SLURM time_limit).
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
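
The health_check block above amounts to a bounded readiness poll: succeed as soon as the worker answers, fail after max_attempts * interval_seconds. A minimal sketch of that loop, assuming a hypothetical wait_until_healthy helper, the requests library, and SGLang's /health endpoint — not srtctl's actual implementation:

import time

import requests  # assumed HTTP client, not necessarily what srtctl uses


def wait_until_healthy(base_url: str, max_attempts: int, interval_seconds: int) -> bool:
    """Poll until the worker responds, up to max_attempts * interval_seconds."""
    for _ in range(max_attempts):
        try:
            # A 200 from /health means weights are loaded and the server is serving.
            if requests.get(f"{base_url}/health", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # worker still loading weights across the two nodes
        time.sleep(interval_seconds)
    return False  # budget exhausted: 720 * 10s = 7200s with the values above
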
@@ -0,0 +1,88 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_1K_1K_nsys"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1
slurm:
time_limit: "02:00:00"

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

# Nsight Systems (nsys) is not shipped in the SGLang runtime container.
# Mount the site-provided Nsight Systems CLI into the container and put it on PATH.
environment:
PATH: "/opt/nsight/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
LD_LIBRARY_PATH: "/opt/nsight/target-linux-x64:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/lib/x86_64-linux-gnu"

container_mounts:
"/lustre/fsw/portfolios/general/users/yeswanthk/nsight2025.6.1/opt/nvidia/nsight-systems-cli/2025.6.1": "/opt/nsight"

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

# Profiling and benchmarking are mutually exclusive. For nsys, set benchmark to manual.
benchmark:
type: "manual"

profiling:
type: "nsys"
isl: 1024
osl: 1024
# Keep this modest for profiling (nsys output size grows quickly with concurrency).
concurrency: 16
aggregated:
start_step: 10
stop_step: 30

# DSR1 can take a long time to load weights across 2 nodes.
# Health timeout controls how long srtctl waits for the worker to become ready (independent of SLURM time_limit).
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
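
For reference, a prefix like the nsys_prefix that build_worker_command accepts (see the sglang.py diff below) can be assembled from standard Nsight Systems CLI flags. A sketch under that assumption — the exact flags srtctl passes are not shown in this diff:

def build_nsys_prefix(output_path: str) -> list[str]:
    """Command prefix that wraps the worker launch in Nsight Systems.

    --capture-range=cudaProfilerApi defers capture until the program calls
    cudaProfilerStart, which is how start_step/stop_step-style windows are
    typically implemented; whether srtctl uses exactly these flags is an
    assumption.
    """
    return [
        "nsys", "profile",
        "--output", output_path,
        "--trace=cuda,nvtx",
        "--capture-range=cudaProfilerApi",
        "--capture-range-end=stop",
    ]
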


@@ -0,0 +1,73 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_1K_8K"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1

slurm:
time_limit: "03:00:00"

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

benchmark:
type: "sa-bench"
isl: 1024
osl: 8192
concurrencies: "1x2x4x8x16x32x64x128x176x256"
req_rate: "inf"

# DSR1 can take a long time to load weights across 2 nodes.
# Default health timeout (max_attempts * interval_seconds) was too short and caused premature job failure.
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
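
The concurrencies strings in these configs ("1x2x4x...") are 'x'-separated sweep lists. Parsing is one line; the helper name below is ours, not srtctl's:

def parse_concurrencies(spec: str) -> list[int]:
    """Split an 'x'-separated sweep spec like '1x2x4x8' into integers."""
    return [int(level) for level in spec.split("x")]


assert parse_concurrencies("1x2x4x8x16x32x64x128x176x256") == [
    1, 2, 4, 8, 16, 32, 64, 128, 176, 256,
]
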
@@ -0,0 +1,70 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_8K_1K"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "1x2x64"
req_rate: "inf"

# DSR1 can take a long time to load weights across 2 nodes.
# Health timeout controls how long srtctl waits for the worker to become ready (independent of SLURM time_limit).
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
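
The "TP*PP = 16 GPUs → 2 nodes" comment repeated in each config is plain arithmetic: one replica spans tensor-parallel-size * pipeline-parallel-size GPUs, data parallelism multiplies that, and the total is packed onto 8-GPU nodes. A sketch (helper name ours):

import math


def required_nodes(tp: int, pp: int, dp: int, gpus_per_node: int) -> int:
    """One model replica spans tp * pp GPUs; dp adds full replicas."""
    return math.ceil(tp * pp * dp / gpus_per_node)


# All four configs above: TP=8, PP=2, DP=1 on 8-GPU H100 nodes -> agg_nodes: 2
assert required_nodes(tp=8, pp=2, dp=1, gpus_per_node=8) == 2
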
9 changes: 5 additions & 4 deletions src/srtctl/backends/sglang.py
@@ -220,7 +220,7 @@ def build_worker_command(
         process: The process to start
         endpoint_processes: All processes for this endpoint (for multi-node)
         runtime: Runtime context with paths and settings
-        frontend_type: Frontend type - "sglang" uses sglang.launch_server, "dynamo" uses dynamo.sglang
+        frontend_type: Frontend type - "sglang"/"direct" use sglang.launch_server, "dynamo" uses dynamo.sglang
         profiling_enabled: Whether profiling is enabled (forces sglang.launch_server)
         nsys_prefix: Optional nsys profiling command prefix
         dump_config_path: Path to dump config JSON
@@ -240,7 +240,7 @@ def build_worker_command(
 
     # Choose Python module
     # When profiling is enabled, always use sglang.launch_server (not dynamo.sglang)
-    use_sglang = frontend_type == "sglang" or profiling_enabled
+    use_sglang = frontend_type in ("sglang", "direct", "none") or profiling_enabled
     python_module = "sglang.launch_server" if use_sglang else "dynamo.sglang"
 
     # Get served model name from config
@@ -289,8 +289,9 @@ def build_worker_command(
         ]
     )
 
-    # Add config dump path (not when using sglang frontend)
-    if dump_config_path and frontend_type != "sglang":
+    # Add config dump path (only for dynamo.sglang).
+    # sglang.launch_server does not support --dump-config-to.
+    if dump_config_path and frontend_type not in ("sglang", "direct", "none"):
         cmd.extend(["--dump-config-to", str(dump_config_path)])
 
     # Add kv-events-config if enabled for this mode and we have an allocated port
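
The net effect of the module-selection change, as a runnable sketch (the argument list is abbreviated and illustrative, not srtctl's full worker command):

for frontend_type in ("sglang", "direct", "none", "dynamo"):
    use_sglang = frontend_type in ("sglang", "direct", "none")
    module = "sglang.launch_server" if use_sglang else "dynamo.sglang"
    print(f"{frontend_type:>7} -> python -m {module} --model-path /model/")
# Only "dynamo" routes through dynamo.sglang; profiling_enabled would force
# sglang.launch_server even then.
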
29 changes: 22 additions & 7 deletions src/srtctl/cli/do_sweep.py
@@ -80,7 +80,16 @@ def endpoints(self) -> list[Endpoint]:
     @functools.cached_property
     def backend_processes(self) -> list[Process]:
         """Compute physical process topology from endpoints (cached)."""
-        return self.backend.endpoints_to_processes(self.endpoints)
+        # NOTE: On shared clusters, fixed DYN_SYSTEM_PORT ranges can collide across jobs
+        # and crash dynamo.sglang with "Address already in use". Use a job-specific base.
+        try:
+            return self.backend.endpoints_to_processes(
+                self.endpoints,
+                base_sys_port=self.runtime.sys_port_base,
+            )
+        except TypeError:
+            # Backends that don't accept base_sys_port keep their default behavior.
+            return self.backend.endpoints_to_processes(self.endpoints)
 
     def start_head_infrastructure(self, registry: ProcessRegistry) -> ManagedProcess:
         """Start NATS and etcd on the infra node.
@@ -130,14 +139,16 @@ def start_head_infrastructure(self, registry: ProcessRegistry) -> ManagedProcess
             critical=True,
         )
 
-        # 300s timeout to handle slow container imports on first run
+        # NOTE: Starting infra requires an `srun` into the container image.
+        # On busy clusters, `pyxis` image import can easily exceed 60s, so keep this
+        # timeout comfortably larger than the container startup overhead.
         logger.info("Waiting for NATS (port 4222) on %s...", infra_node)
-        if not wait_for_port(infra_node, 4222, timeout=300):
+        if not wait_for_port(self.runtime.infra_node_ip, 4222, timeout=300):
             raise RuntimeError("NATS failed to start")
         logger.info("NATS is ready")
 
         logger.info("Waiting for etcd (port 2379) on %s...", infra_node)
-        if not wait_for_port(infra_node, 2379, timeout=300):
+        if not wait_for_port(self.runtime.infra_node_ip, 2379, timeout=300):
             raise RuntimeError("etcd failed to start")
         logger.info("etcd is ready")
@@ -154,7 +165,10 @@ def _print_connection_info(self) -> None:
         logger.info("=" * 60)
         logger.info("Connection Commands")
         logger.info("=" * 60)
-        logger.info("Frontend URL: http://%s:8000", self.runtime.nodes.head)
+        if self.runtime.effective_frontend_type == "direct":
+            logger.info("Worker URL: http://%s:%d", self.runtime.nodes.head, self.runtime.frontend_port)
+        else:
+            logger.info("Frontend URL: http://%s:%d", self.runtime.nodes.head, self.runtime.frontend_port)
         logger.info("")
         logger.info("To connect to head node (%s):", self.runtime.nodes.head)
         logger.info(
@@ -211,8 +225,9 @@ def run(self) -> int:
         try:
             # Stage 1: Head infrastructure (NATS, etcd)
             reporter.report(JobStatus.STARTING, JobStage.HEAD_INFRASTRUCTURE, "Starting head infrastructure")
-            head_proc = self.start_head_infrastructure(registry)
-            registry.add_process(head_proc)
+            if self.runtime.effective_frontend_type != "direct":
+                head_proc = self.start_head_infrastructure(registry)
+                registry.add_process(head_proc)
 
             # Stage 2: Workers
             reporter.report(JobStatus.WORKERS, JobStage.WORKERS, "Starting workers")