@@ -0,0 +1,72 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_1K_1K"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1
slurm:
time_limit: "02:00:00"

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "1x2x4x8x16x32x64x128x256x512"
req_rate: "inf"

# DSR1 can take a long time to load weights across 2 nodes.
# Health timeout controls how long srtctl waits for the worker to become ready (independent of SLURM time_limit).
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
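
The health_check block above amounts to a bounded readiness poll: succeed as soon as the worker answers, fail after max_attempts * interval_seconds. A minimal sketch of that loop, assuming a hypothetical wait_until_healthy helper, the requests library, and SGLang's /health endpoint — not srtctl's actual implementation:

import time

import requests  # assumed HTTP client, not necessarily what srtctl uses


def wait_until_healthy(base_url: str, max_attempts: int, interval_seconds: int) -> bool:
    """Poll until the worker responds, up to max_attempts * interval_seconds."""
    for _ in range(max_attempts):
        try:
            # A 200 from /health means weights are loaded and the server is serving.
            if requests.get(f"{base_url}/health", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # worker still loading weights across the two nodes
        time.sleep(interval_seconds)
    return False  # budget exhausted: 720 * 10s = 7200s with the values above
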
@@ -0,0 +1,88 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_1K_1K_nsys"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1
slurm:
time_limit: "02:00:00"

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

# Nsight Systems (nsys) is not shipped in the SGLang runtime container.
# Mount the site-provided Nsight Systems CLI into the container and put it on PATH.
environment:
PATH: "/opt/nsight/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
LD_LIBRARY_PATH: "/opt/nsight/target-linux-x64:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/lib/x86_64-linux-gnu"

container_mounts:
"/lustre/fsw/portfolios/general/users/yeswanthk/nsight2025.6.1/opt/nvidia/nsight-systems-cli/2025.6.1": "/opt/nsight"

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

# Profiling and benchmarking are mutually exclusive. For nsys, set benchmark to manual.
benchmark:
type: "manual"

profiling:
type: "nsys"
isl: 1024
osl: 1024
# Keep this modest for profiling (nsys output size grows quickly with concurrency).
concurrency: 16
aggregated:
start_step: 10
stop_step: 30

# DSR1 can take a long time to load weights across 2 nodes.
# Health timeout controls how long srtctl waits for the worker to become ready (independent of SLURM time_limit).
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
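
For reference, a prefix like the nsys_prefix that build_worker_command accepts (see the sglang.py diff below) can be assembled from standard Nsight Systems CLI flags. A sketch under that assumption — the exact flags srtctl passes are not shown in this diff:

def build_nsys_prefix(output_path: str) -> list[str]:
    """Command prefix that wraps the worker launch in Nsight Systems.

    --capture-range=cudaProfilerApi defers capture until the program calls
    cudaProfilerStart, which is how start_step/stop_step-style windows are
    typically implemented; whether srtctl uses exactly these flags is an
    assumption.
    """
    return [
        "nsys", "profile",
        "--output", output_path,
        "--trace=cuda,nvtx",
        "--capture-range=cudaProfilerApi",
        "--capture-range-end=stop",
    ]
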


@@ -0,0 +1,73 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_1K_8K"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1

slurm:
time_limit: "03:00:00"

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

benchmark:
type: "sa-bench"
isl: 1024
osl: 8192
concurrencies: "1x2x4x8x16x32x64x128x176x256"
req_rate: "inf"

# DSR1 can take a long time to load weights across 2 nodes.
# Default health timeout (max_attempts * interval_seconds) was too short and caused premature job failure.
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
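
The concurrencies strings in these configs ("1x2x4x...") are 'x'-separated sweep lists. Parsing is one line; the helper name below is ours, not srtctl's:

def parse_concurrencies(spec: str) -> list[int]:
    """Split an 'x'-separated sweep spec like '1x2x4x8' into integers."""
    return [int(level) for level in spec.split("x")]


assert parse_concurrencies("1x2x4x8x16x32x64x128x176x256") == [
    1, 2, 4, 8, 16, 32, 64, 128, 176, 256,
]
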
@@ -0,0 +1,70 @@
name: "h100-dsr1-fp8-agg-workeronly-tp8-pp2_8K_1K"

model:
path: "dsr1-0528"
container: "docker://lmsysorg/sglang:v0.5.8-cu130-runtime"
precision: "fp8"

resources:
gpu_type: "h100"
gpus_per_node: 8
# TP*PP = 8*2 = 16 GPUs total → 2 nodes @ 8 GPUs each
agg_nodes: 2
agg_workers: 1

sbatch_directives:
# Prevent automatic cancellation during long model-load / warmup periods with idle GPUs.
# (Cluster reaper expects this JSON under --comment)
comment: >-
'{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"60","reason":"data_loading","description":"DeepSeek-R1 FP8 model load + warmup can keep some GPUs idle initially"}}'

frontend:
# Stock SGLang:
# - agg_workers=1 => worker-only (direct-to-worker) handled automatically under the hood
# - agg_workers>1 or disagg => router
type: sglang

backend:
type: sglang

aggregated_environment:
TORCH_CUDA_ARCH_LIST: "9.0"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"

sglang_config:
aggregated:
# srtctl mounts host model dir -> /model inside container
model-path: "/model/"
tokenizer-path: "/model/"
served-model-name: "deepseek-ai/DeepSeek-R1-0528"
trust-remote-code: true

tensor-parallel-size: 8
data-parallel-size: 1
pipeline-parallel-size: 2

disable-radix-cache: true
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 16000
max-prefill-tokens: 16000
mem-fraction-static: 0.70
kv-cache-dtype: "auto"
attention-backend: "flashinfer"
stream-interval: 10
decode-log-interval: 1

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "1x2x64"
req_rate: "inf"

# DSR1 can take a long time to load weights across 2 nodes.
# Health timeout controls how long srtctl waits for the worker to become ready (independent of SLURM time_limit).
health_check:
max_attempts: 720 # 720 * 10s = 7200s (matches 2:00:00 time limit)
interval_seconds: 10
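
The "TP*PP = 16 GPUs → 2 nodes" comment repeated in each config is plain arithmetic: one replica spans tensor-parallel-size * pipeline-parallel-size GPUs, data parallelism multiplies that, and the total is packed onto 8-GPU nodes. A sketch (helper name ours):

import math


def required_nodes(tp: int, pp: int, dp: int, gpus_per_node: int) -> int:
    """One model replica spans tp * pp GPUs; dp adds full replicas."""
    return math.ceil(tp * pp * dp / gpus_per_node)


# All four configs above: TP=8, PP=2, DP=1 on 8-GPU H100 nodes -> agg_nodes: 2
assert required_nodes(tp=8, pp=2, dp=1, gpus_per_node=8) == 2
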
9 changes: 5 additions & 4 deletions src/srtctl/backends/sglang.py
@@ -220,7 +220,7 @@ def build_worker_command(
         process: The process to start
         endpoint_processes: All processes for this endpoint (for multi-node)
         runtime: Runtime context with paths and settings
-        frontend_type: Frontend type - "sglang" uses sglang.launch_server, "dynamo" uses dynamo.sglang
+        frontend_type: Frontend type - "sglang"/"direct" use sglang.launch_server, "dynamo" uses dynamo.sglang
         profiling_enabled: Whether profiling is enabled (forces sglang.launch_server)
         nsys_prefix: Optional nsys profiling command prefix
         dump_config_path: Path to dump config JSON
@@ -240,7 +240,7 @@ def build_worker_command(
 
     # Choose Python module
     # When profiling is enabled, always use sglang.launch_server (not dynamo.sglang)
-    use_sglang = frontend_type == "sglang" or profiling_enabled
+    use_sglang = frontend_type in ("sglang", "direct", "none") or profiling_enabled
     python_module = "sglang.launch_server" if use_sglang else "dynamo.sglang"
 
     # Get served model name from config
@@ -289,8 +289,9 @@ def build_worker_command(
         ]
     )
 
-    # Add config dump path (not when using sglang frontend)
-    if dump_config_path and frontend_type != "sglang":
+    # Add config dump path (only for dynamo.sglang).
+    # sglang.launch_server does not support --dump-config-to.
+    if dump_config_path and frontend_type not in ("sglang", "direct", "none"):
         cmd.extend(["--dump-config-to", str(dump_config_path)])
 
     # Add kv-events-config if enabled for this mode and we have an allocated port
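
The net effect of the module-selection change, as a runnable sketch (the argument list is abbreviated and illustrative, not srtctl's full worker command):

for frontend_type in ("sglang", "direct", "none", "dynamo"):
    use_sglang = frontend_type in ("sglang", "direct", "none")
    module = "sglang.launch_server" if use_sglang else "dynamo.sglang"
    print(f"{frontend_type:>7} -> python -m {module} --model-path /model/")
# Only "dynamo" routes through dynamo.sglang; profiling_enabled would force
# sglang.launch_server even then.
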
29 changes: 22 additions & 7 deletions src/srtctl/cli/do_sweep.py
@@ -80,7 +80,16 @@ def endpoints(self) -> list[Endpoint]:
     @functools.cached_property
     def backend_processes(self) -> list[Process]:
         """Compute physical process topology from endpoints (cached)."""
-        return self.backend.endpoints_to_processes(self.endpoints)
+        # NOTE: On shared clusters, fixed DYN_SYSTEM_PORT ranges can collide across jobs
+        # and crash dynamo.sglang with "Address already in use". Use a job-specific base.
+        try:
+            return self.backend.endpoints_to_processes(
+                self.endpoints,
+                base_sys_port=self.runtime.sys_port_base,
+            )
+        except TypeError:
+            # Backends that don't accept base_sys_port keep their default behavior.
+            return self.backend.endpoints_to_processes(self.endpoints)
 
     def start_head_infrastructure(self, registry: ProcessRegistry) -> ManagedProcess:
         """Start NATS and etcd on the infra node.
@@ -130,14 +139,16 @@ def start_head_infrastructure(self, registry: ProcessRegistry) -> ManagedProcess
             critical=True,
         )
 
-        # 300s timeout to handle slow container imports on first run
+        # NOTE: Starting infra requires an `srun` into the container image.
+        # On busy clusters, `pyxis` image import can easily exceed 60s, so keep this
+        # timeout comfortably larger than the container startup overhead.
         logger.info("Waiting for NATS (port 4222) on %s...", infra_node)
-        if not wait_for_port(infra_node, 4222, timeout=300):
+        if not wait_for_port(self.runtime.infra_node_ip, 4222, timeout=300):
             raise RuntimeError("NATS failed to start")
         logger.info("NATS is ready")
 
         logger.info("Waiting for etcd (port 2379) on %s...", infra_node)
-        if not wait_for_port(infra_node, 2379, timeout=300):
+        if not wait_for_port(self.runtime.infra_node_ip, 2379, timeout=300):
             raise RuntimeError("etcd failed to start")
         logger.info("etcd is ready")
@@ -154,7 +165,10 @@ def _print_connection_info(self) -> None:
         logger.info("=" * 60)
         logger.info("Connection Commands")
         logger.info("=" * 60)
-        logger.info("Frontend URL: http://%s:8000", self.runtime.nodes.head)
+        if self.runtime.effective_frontend_type == "direct":
+            logger.info("Worker URL: http://%s:%d", self.runtime.nodes.head, self.runtime.frontend_port)
+        else:
+            logger.info("Frontend URL: http://%s:%d", self.runtime.nodes.head, self.runtime.frontend_port)
         logger.info("")
         logger.info("To connect to head node (%s):", self.runtime.nodes.head)
         logger.info(
@@ -211,8 +225,9 @@ def run(self) -> int:
         try:
             # Stage 1: Head infrastructure (NATS, etcd)
             reporter.report(JobStatus.STARTING, JobStage.HEAD_INFRASTRUCTURE, "Starting head infrastructure")
-            head_proc = self.start_head_infrastructure(registry)
-            registry.add_process(head_proc)
+            if self.runtime.effective_frontend_type != "direct":
+                head_proc = self.start_head_infrastructure(registry)
+                registry.add_process(head_proc)
 
             # Stage 2: Workers
             reporter.report(JobStatus.WORKERS, JobStage.WORKERS, "Starting workers")