ishandhanani · ishandhanani · Dec 5, 2025 · Nov 28, 2025 · Dec 3, 2025 · Dec 5, 2025
diff --git a/docs/installation.md b/docs/installation.md
@@ -151,6 +151,31 @@ benchmark:
   concurrencies: [256, 512]
 ```
 
+## Profiling (torch / nsys)
+
+You can enable profiling via a top-level `profiling` section in your job YAML. Profiling and benchmarking cannot run at the same time: when profiling is enabled, set `benchmark.type: "manual"` (or omit the `benchmark` section) to avoid conflicts.
+
+Example:
+
+```yaml
+profiling:
+  type: "nsys"   # one of: "none", "torch", "nsys"
+  prefill:
+    isl: 1024     # input sequence length
+    osl: 2        # output sequence length
+    concurrency: 24
+    start_step: 0
+    stop_step: 16
+  decode:
+    isl: 8
+    osl: 16
+    concurrency: 1024
+    start_step: 0
+    stop_step: 16
+```
+
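+When `type` is `"none"`, normal serving runs with `dynamo.sglang`. Otherwise (`"torch"` or `"nsys"`), serving uses `sglang.launch_server`. `isl` and `osl` are required for profiling, and the profiler captures `stop_step - start_step` steps beginning at `start_step`.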
+
 ## Validate with Dry Run
 
 Always validate before submitting:

diff --git a/scripts/profiling/profile.sh b/scripts/profiling/profile.sh
@@ -44,38 +44,68 @@ wait_until_ready() {
 }
 wait_until_ready "http://${head_node}:${head_port}"
 
-# Determine profiling parameters based on mode
-if [[ "${PROFILING_MODE}" == "prefill" ]]; then
-    # Prefill profiling: smaller batch, long input, short output
-    BATCH_SIZE=24
-    INPUT_LEN=1024
-    OUTPUT_LEN=2
-    PROFILE_STEPS_ARG=""
-    echo "Running prefill profiling with batch=${BATCH_SIZE}, input_len=${INPUT_LEN}, output_len=${OUTPUT_LEN}"
-else
-    # Decode profiling: large batch, short input, longer output with profile steps
-    BATCH_SIZE=1024
-    INPUT_LEN=8
-    OUTPUT_LEN=16
-    PROFILE_STEPS_ARG="--profile-steps 16"
-    echo "Running decode profiling with batch=${BATCH_SIZE}, input_len=${INPUT_LEN}, output_len=${OUTPUT_LEN}, profile_steps=16"
+# Determine profiling parameters strictly from environment variables
+PROFILE_STEPS_ARG=""
+CLI_ARGS=""
+[[ -n "${PROFILE_CONCURRENCY}" ]] && CLI_ARGS+=" --batch-size ${PROFILE_CONCURRENCY}"
+# Require ISL/OSL to be provided; do not pass them as CLI args here
+if [[ -z "${PROFILE_ISL}" || -z "${PROFILE_OSL}" ]]; then
+    echo "Error: isl and osl must be set for profiling."
+    exit 1
+fi
+
+# Configure profiling steps range; set defaults independently if missing
+if [[ -z "${PROFILE_START_STEP}" ]]; then
+    echo "Warning: PROFILE_START_STEP not set; defaulting to 0"
+    PROFILE_START_STEP=0
+fi
+if [[ -z "${PROFILE_STOP_STEP}" ]]; then
+    echo "Warning: PROFILE_STOP_STEP not set; defaulting to 50"
+    PROFILE_STOP_STEP=50
 fi
 
-# Create profiling output directory
-mkdir -p ${SGLANG_TORCH_PROFILER_DIR} 2>/dev/null || true
 
-echo "Running torch profiler..."
+echo "Running profiler..."
 echo "$(date '+%Y-%m-%d %H:%M:%S')"
 
+# Select the profiler activities and create the output directory: the torch profiler dir when provided, otherwise /logs/profiles
+ACTIVITIES=""
+if [[ -n "${SGLANG_TORCH_PROFILER_DIR}" ]]; then
+    ACTIVITIES='["GPU"]'
+    mkdir -p "${SGLANG_TORCH_PROFILER_DIR}" 2>/dev/null || true
+    export SGLANG_TORCH_PROFILER_DIR=${SGLANG_TORCH_PROFILER_DIR}
+else
+    ACTIVITIES='["CUDA_PROFILER"]'
+    mkdir -p "/logs/profiles" 2>/dev/null || true
+fi
-else
-    curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}"
-    mkdir -p "/logs/profiles" 2>/dev/null || true
-fi
+else
+    response=$(curl -s -w "\n%{http_code}" -X POST http://${head_node}:${head_port}/start_profile \
+        -H "Content-Type: application/json" \
+        -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}")
+    http_code=$(echo "$response" | tail -n1)
+    if [[ "$http_code" != "200" ]]; then
+        echo "Warning: Failed to start remote profiling (HTTP $http_code)"
+    fi
+    mkdir -p "/logs/profiles" 2>/dev/null || true
+fi
-else
-    curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}"
-    mkdir -p "/logs/profiles" 2>/dev/null || true
-fi
+else
+    response=$(curl -s -w "\n%{http_code}" -X POST http://${head_node}:${head_port}/start_profile \
+        -H "Content-Type: application/json" \
+        -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}")
+    http_code=$(echo "$response" | tail -n1)
+    if [[ "$http_code" != "200" ]]; then
+        echo "Warning: Failed to start remote profiling (HTTP $http_code)"
+    fi
+    mkdir -p "/logs/profiles" 2>/dev/null || true
+fi
+
 set -x
-python3 -m sglang.bench_one_batch_server \
-    --model ${model_name} \
-    --base-url http://${head_node}:${head_port} \
-    --batch-size ${BATCH_SIZE} \
-    --input-len ${INPUT_LEN} \
-    --output-len ${OUTPUT_LEN} \
-    ${PROFILE_STEPS_ARG} \
-    --profile
+
+curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"
+
+python3 -m sglang.bench_serving \
+--backend sglang \
+--model ${model_name} \
+--host ${head_node} --port ${head_port} \
+--dataset-name random \
+--max-concurrency $PROFILE_CONCURRENCY \
+--num-prompts 128 \
+--random-input-len $PROFILE_ISL \
+--random-output-len $PROFILE_OSL \
+--random-range-ratio 1 \
+--warmup-request 10
+
+pip install lm-eval tenacity
+python -m lm_eval \
+--model local-completions \
+--tasks gsm8k \
+--model_args \
+base_url=http://${head_node}:${head_port}/v1/completions,\
+model=${model_name},\
+tokenized_requests=False,tokenizer_backend=None,\
+num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1 \
+--limit 10
+
 exit_code=$?
 set +x
 

diff --git a/scripts/templates/job_script_template_agg.j2 b/scripts/templates/job_script_template_agg.j2
@@ -189,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip
 WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
 {% endraw %}
 {% endif %}
-{% if sglang_torch_profiler %}
 {% raw %}
-# Enable torch profiling mode
-WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler"
+# Set profiler mode from config
+WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}"
 {% endraw %}
-{% endif %}
 {% raw %}
 # Add SGLang config path (mounted in container at /logs/)
 WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml"
@@ -213,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
 {% raw %}
 # Launch nginx on node 0
 echo "Launching nginx on ${NGINX_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
 {% endraw %}
@@ -222,7 +220,7 @@ $cmd &
 
 # Launch frontend on master node (node 1) - this will also start NATS/ETCD
 echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
 
@@ -236,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
         if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
             node=${nodes[$frontend_node_idx]}
             echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
-            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
+            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
             echo "$cmd"
             $cmd &
             frontend_idx=$((frontend_idx + 1))
@@ -280,7 +278,7 @@ for worker_idx in $(seq 0 $((AGG_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out --error=${LOG_DIR}/${node}_agg_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
+        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
         echo "$cmd"
         $cmd &
     done
@@ -318,11 +316,11 @@ echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bas
 {% raw %}
 BENCHMARK_TYPE={% endraw %}{{ benchmark_type }}{% raw %}
 BENCHMARK_ARGS="{% endraw %}{{ benchmark_arg }}{% raw %}"
-srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 
 {% raw %}
 # Torch profiling mode for aggregated workers
@@ -336,12 +334,12 @@ echo "Aggregated profiling will run on: $AGG_LEADER_NODE"
 # Run profiling on first aggregated worker's leader node
 # Use "decode" mode for aggregated since it profiles the full generation pipeline
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $AGG_LEADER_NODE \
-    --output=${LOG_DIR}/profile_aggregated.out --error=${LOG_DIR}/profile_aggregated.err --overlap \
-    bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_aggregated.out --overlap \
+    bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Wait for profiling script to complete
 echo "Waiting for profiling script to complete..."

diff --git a/scripts/templates/job_script_template_disagg.j2 b/scripts/templates/job_script_template_disagg.j2
@@ -9,7 +9,6 @@
 #SBATCH --account={{ account }}
 #SBATCH --time={{ time_limit }}
 #SBATCH --output={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.out
-#SBATCH --error={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.err
 #SBATCH --partition={{ partition }}
 
 # Constants
@@ -190,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip
 WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
 {% endraw %}
 {% endif %}
-{% if sglang_torch_profiler %}
 {% raw %}
-# Enable torch profiling mode
-WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler"
+# Set profiler mode from config
+WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}"
 {% endraw %}
-{% endif %}
 {% raw %}
 # Add SGLang config path (mounted in container at /logs/)
 WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml"
@@ -214,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
 {% raw %}
 # Launch nginx on node 0
 echo "Launching nginx on ${NGINX_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
 {% endraw %}
@@ -223,7 +220,7 @@ $cmd &
 
 # Launch frontend on master node (node 1) - this will also start NATS/ETCD
 echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
 
@@ -237,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
         if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
             node=${nodes[$frontend_node_idx]}
             echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
-            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
+            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
             echo "$cmd"
             $cmd &
             frontend_idx=$((frontend_idx + 1))
@@ -276,7 +273,7 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out --error=${LOG_DIR}/${node}_prefill_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
+        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
         echo "$cmd"
         $cmd &
     done
@@ -309,7 +306,7 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out --error=${LOG_DIR}/${node}_decode_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
+        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
         echo "$cmd"
         $cmd &
     done
@@ -346,14 +343,14 @@ BENCHMARK_ARGS="{{ benchmark_arg }}"
 
 {% if do_benchmark %}
 {% raw %}
-srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Torch profiling mode: run profiling on prefill and decode workers separately
-echo "Starting torch profiling..."
+echo "Starting profiler..."
 
 # Get leader nodes for first prefill and decode workers
 PREFILL_LEADER_NODE=${nodes[${prefill_leaders[0]}]}
@@ -364,17 +361,17 @@ echo "Decode profiling will run on: $DECODE_LEADER_NODE"
 
 # Run prefill profiling on first prefill worker's leader node
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PREFILL_LEADER_NODE \
-    --output=${LOG_DIR}/profile_prefill.out --error=${LOG_DIR}/profile_prefill.err --overlap \
-    bash -c "PROFILING_MODE=prefill SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_prefill.out --overlap \
+    bash -c "PROFILING_MODE=prefill {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
 
 # Run decode profiling on first decode worker's leader node
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $DECODE_LEADER_NODE \
-    --output=${LOG_DIR}/profile_decode.out --error=${LOG_DIR}/profile_decode.err --overlap \
-    bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_decode.out  --overlap \
+    bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Wait for all profiling scripts to complete (both prefill and decode)
 echo "Waiting for all profiling scripts to complete..."

diff --git a/scripts/worker_setup.py b/scripts/worker_setup.py
@@ -103,9 +103,11 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
     )
 
     parser.add_argument(
-        "--sglang-torch-profiler",
-        action="store_true",
-        help="Enable torch profiling mode: use sglang.launch_server and skip --disaggregation-mode",
+        "--profiler",
+        type=str,
+        choices=["none", "torch", "nsys"],
+        default="none",
+        help="Profiling method for workers",
     )
 
     parser.add_argument(
@@ -182,7 +184,7 @@ def main(input_args: list[str] | None = None):
             args.nodes_per_worker,
             args.gpu_type,
             args.multiple_frontends_enabled,
-            args.sglang_torch_profiler,
+            args.profiler,
             args.sglang_config_path,
             args.dump_config_path,
             args.setup_script,
@@ -195,7 +197,7 @@ def main(input_args: list[str] | None = None):
             args.master_ip,
             args.nodes_per_worker,
             args.gpu_type,
-            args.sglang_torch_profiler,
+            args.profiler,
             args.sglang_config_path,
             args.dump_config_path,
             args.setup_script,
@@ -209,7 +211,7 @@ def main(input_args: list[str] | None = None):
             args.nodes_per_worker,
             args.gpu_type,
             args.multiple_frontends_enabled,
-            args.sglang_torch_profiler,
+            args.profiler,
             args.sglang_config_path,
             args.dump_config_path,
             args.setup_script,