Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,31 @@ benchmark:
concurrencies: [256, 512]
```

## Profiling (torch / nsys)

You can enable profiling via a top-level `profiling` section in your job YAML. Profiling and benchmarking do not run at the same time; when profiling is enabled, set `benchmark.type: "manual"` (or omit the benchmark) to avoid conflicts.

Example:

```yaml
profiling:
type: "nsys" # one of: "none", "torch", "nsys"
prefill:
isl: 1024 # input sequence length
osl: 2 # output sequence length
concurrency: 24
start_step: 0
stop_step: 16
decode:
isl: 8
osl: 16
concurrency: 1024
start_step: 0
stop_step: 16
```

When `profiling.type` is `"none"`, serving runs normally with `dynamo.sglang`. For `"torch"` or `"nsys"`, serving uses `sglang.launch_server` instead.

## Validate with Dry Run

Always validate before submitting:
Expand Down
82 changes: 56 additions & 26 deletions scripts/profiling/profile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,38 +44,68 @@ wait_until_ready() {
}
wait_until_ready "http://${head_node}:${head_port}"

# Determine profiling parameters based on mode
if [[ "${PROFILING_MODE}" == "prefill" ]]; then
# Prefill profiling: smaller batch, long input, short output
BATCH_SIZE=24
INPUT_LEN=1024
OUTPUT_LEN=2
PROFILE_STEPS_ARG=""
echo "Running prefill profiling with batch=${BATCH_SIZE}, input_len=${INPUT_LEN}, output_len=${OUTPUT_LEN}"
else
# Decode profiling: large batch, short input, longer output with profile steps
BATCH_SIZE=1024
INPUT_LEN=8
OUTPUT_LEN=16
PROFILE_STEPS_ARG="--profile-steps 16"
echo "Running decode profiling with batch=${BATCH_SIZE}, input_len=${INPUT_LEN}, output_len=${OUTPUT_LEN}, profile_steps=16"
# Determine profiling parameters strictly from environment
PROFILE_STEPS_ARG=""
CLI_ARGS=""
[[ -n "${PROFILE_CONCURRENCY}" ]] && CLI_ARGS+=" --batch-size ${PROFILE_CONCURRENCY}"
# Require ISL/OSL to be provided; do not pass them as CLI args here
if [[ -z "${PROFILE_ISL}" || -z "${PROFILE_OSL}" ]]; then
echo "Error: isl and osl must be set for profiling."
exit 1
fi

# Configure profiling steps range; set defaults independently if missing
if [[ -z "${PROFILE_START_STEP}" ]]; then
echo "Warning: PROFILE_START_STEP not set; defaulting to 0"
PROFILE_START_STEP=0
fi
if [[ -z "${PROFILE_STOP_STEP}" ]]; then
echo "Warning: PROFILE_STOP_STEP not set; defaulting to 50"
PROFILE_STOP_STEP=50
fi

# Create profiling output directory
mkdir -p ${SGLANG_TORCH_PROFILER_DIR} 2>/dev/null || true

echo "Running torch profiler..."
echo "Running profiler..."
echo "$(date '+%Y-%m-%d %H:%M:%S')"

# Select the profiler activities and create the output directory: the torch profiler dir when provided, otherwise /logs/profiles
ACTIVITIES=""
if [[ -n "${SGLANG_TORCH_PROFILER_DIR}" ]]; then
ACTIVITIES='["GPU"]'
mkdir -p "${SGLANG_TORCH_PROFILER_DIR}" 2>/dev/null || true
export SGLANG_TORCH_PROFILER_DIR=${SGLANG_TORCH_PROFILER_DIR}
else
ACTIVITIES='["CUDA_PROFILER"]'
mkdir -p "/logs/profiles" 2>/dev/null || true
fi
Comment on lines +77 to +80
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Add error handling for the remote profiling start request.

If the curl request fails (network issue, server error), the script continues without profiling being enabled. Consider checking the response status.

 else
-    curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}"
+    response=$(curl -s -w "\n%{http_code}" -X POST http://${head_node}:${head_port}/start_profile \
+        -H "Content-Type: application/json" \
+        -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}")
+    http_code=$(echo "$response" | tail -n1)
+    if [[ "$http_code" != "200" ]]; then
+        echo "Warning: Failed to start remote profiling (HTTP $http_code)"
+    fi
     mkdir -p "/logs/profiles" 2>/dev/null || true
 fi
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
else
curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}"
mkdir -p "/logs/profiles" 2>/dev/null || true
fi
else
response=$(curl -s -w "\n%{http_code}" -X POST http://${head_node}:${head_port}/start_profile \
-H "Content-Type: application/json" \
-d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": [\"CUDA_PROFILER\"]}")
http_code=$(echo "$response" | tail -n1)
if [[ "$http_code" != "200" ]]; then
echo "Warning: Failed to start remote profiling (HTTP $http_code)"
fi
mkdir -p "/logs/profiles" 2>/dev/null || true
fi


set -x
python3 -m sglang.bench_one_batch_server \
--model ${model_name} \
--base-url http://${head_node}:${head_port} \
--batch-size ${BATCH_SIZE} \
--input-len ${INPUT_LEN} \
--output-len ${OUTPUT_LEN} \
${PROFILE_STEPS_ARG} \
--profile

curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"

python3 -m sglang.bench_serving \
--backend sglang \
--model ${model_name} \
--host ${head_node} --port ${head_port} \
--dataset-name random \
--max-concurrency $PROFILE_CONCURRENCY \
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Add validation or default for PROFILE_CONCURRENCY.

PROFILE_CONCURRENCY is used directly without validation, unlike PROFILE_ISL and PROFILE_OSL. If it is unset, `--max-concurrency` is passed without a value and the `bench_serving` command is invalid.

Consider adding validation similar to ISL/OSL:

if [[ -z "${PROFILE_CONCURRENCY}" ]]; then
    echo "Warning: PROFILE_CONCURRENCY not set; defaulting to 1"
    PROFILE_CONCURRENCY=1
fi
🤖 Prompt for AI Agents
In scripts/profiling/profile.sh around line 93, PROFILE_CONCURRENCY is used
directly and can be empty, producing an invalid command; add a validation block
like the existing PROFILE_ISL/PROFILE_OSL checks that verifies
PROFILE_CONCURRENCY is set and a positive integer, echoing a warning and
defaulting it to 1 when unset or invalid (e.g., if empty or non-numeric), so the
subsequent --max-concurrency argument always receives a valid value.

--num-prompts 128 \
--random-input-len $PROFILE_ISL \
--random-output-len $PROFILE_OSL \
--random-range-ratio 1 \
--warmup-request 10

pip install lm-eval tenacity
python -m lm_eval \
--model local-completions \
--tasks gsm8k \
--model_args \
base_url=http://${head_node}:${head_port}/v1/completions,\
model=${model_name},\
tokenized_requests=False,tokenizer_backend=None,\
num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1 \
--limit 10

exit_code=$?
set +x

Expand Down
24 changes: 11 additions & 13 deletions scripts/templates/job_script_template_agg.j2
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip
WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
{% endraw %}
{% endif %}
{% if sglang_torch_profiler %}
{% raw %}
# Enable torch profiling mode
WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler"
# Set profiler mode from config
WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}"
{% endraw %}
{% endif %}
{% raw %}
# Add SGLang config path (mounted in container at /logs/)
WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml"
Expand All @@ -213,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
{% raw %}
# Launch nginx on node 0
echo "Launching nginx on ${NGINX_NODE}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
echo "$cmd"
$cmd &
{% endraw %}
Expand All @@ -222,7 +220,7 @@ $cmd &

# Launch frontend on master node (node 1) - this will also start NATS/ETCD
echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
echo "$cmd"
$cmd &

Expand All @@ -236,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
node=${nodes[$frontend_node_idx]}
echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
echo "$cmd"
$cmd &
frontend_idx=$((frontend_idx + 1))
Expand Down Expand Up @@ -280,7 +278,7 @@ for worker_idx in $(seq 0 $((AGG_WORKERS - 1))); do
{% endraw %}
{% endif %}
{% raw %}
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out --error=${LOG_DIR}/${node}_agg_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
echo "$cmd"
$cmd &
done
Expand Down Expand Up @@ -318,11 +316,11 @@ echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bas
{% raw %}
BENCHMARK_TYPE={% endraw %}{{ benchmark_type }}{% raw %}
BENCHMARK_ARGS="{% endraw %}{{ benchmark_arg }}{% raw %}"
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
{% endraw %}
{% endif %}

{% if sglang_torch_profiler %}
{% if profiler != 'none' %}

{% raw %}
# Profiling mode (torch or nsys) for aggregated workers
Expand All @@ -336,12 +334,12 @@ echo "Aggregated profiling will run on: $AGG_LEADER_NODE"
# Run profiling on first aggregated worker's leader node
# Use "decode" mode for aggregated since it profiles the full generation pipeline
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $AGG_LEADER_NODE \
--output=${LOG_DIR}/profile_aggregated.out --error=${LOG_DIR}/profile_aggregated.err --overlap \
bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
--output=${LOG_DIR}/profile_aggregated.out --overlap \
bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
{% endraw %}
{% endif %}

{% if sglang_torch_profiler %}
{% if profiler != 'none' %}
{% raw %}
# Wait for profiling script to complete
echo "Waiting for profiling script to complete..."
Expand Down
33 changes: 15 additions & 18 deletions scripts/templates/job_script_template_disagg.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#SBATCH --account={{ account }}
#SBATCH --time={{ time_limit }}
#SBATCH --output={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.out
#SBATCH --error={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.err
#SBATCH --partition={{ partition }}

# Constants
Expand Down Expand Up @@ -190,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip
WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
{% endraw %}
{% endif %}
{% if sglang_torch_profiler %}
{% raw %}
# Enable torch profiling mode
WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler"
# Set profiler mode from config
WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}"
{% endraw %}
{% endif %}
{% raw %}
# Add SGLang config path (mounted in container at /logs/)
WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml"
Expand All @@ -214,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
{% raw %}
# Launch nginx on node 0
echo "Launching nginx on ${NGINX_NODE}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
echo "$cmd"
$cmd &
{% endraw %}
Expand All @@ -223,7 +220,7 @@ $cmd &

# Launch frontend on master node (node 1) - this will also start NATS/ETCD
echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
echo "$cmd"
$cmd &

Expand All @@ -237,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
node=${nodes[$frontend_node_idx]}
echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
echo "$cmd"
$cmd &
frontend_idx=$((frontend_idx + 1))
Expand Down Expand Up @@ -276,7 +273,7 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
{% endraw %}
{% endif %}
{% raw %}
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out --error=${LOG_DIR}/${node}_prefill_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
echo "$cmd"
$cmd &
done
Expand Down Expand Up @@ -309,7 +306,7 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
{% endraw %}
{% endif %}
{% raw %}
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out --error=${LOG_DIR}/${node}_decode_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
echo "$cmd"
$cmd &
done
Expand Down Expand Up @@ -346,14 +343,14 @@ BENCHMARK_ARGS="{{ benchmark_arg }}"

{% if do_benchmark %}
{% raw %}
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
{% endraw %}
{% endif %}

{% if sglang_torch_profiler %}
{% if profiler != 'none' %}
{% raw %}
# Profiling mode (torch or nsys): run profiling on prefill and decode workers separately
echo "Starting torch profiling..."
echo "Starting profiler..."

# Get leader nodes for first prefill and decode workers
PREFILL_LEADER_NODE=${nodes[${prefill_leaders[0]}]}
Expand All @@ -364,17 +361,17 @@ echo "Decode profiling will run on: $DECODE_LEADER_NODE"

# Run prefill profiling on first prefill worker's leader node
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PREFILL_LEADER_NODE \
--output=${LOG_DIR}/profile_prefill.out --error=${LOG_DIR}/profile_prefill.err --overlap \
bash -c "PROFILING_MODE=prefill SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
--output=${LOG_DIR}/profile_prefill.out --overlap \
bash -c "PROFILING_MODE=prefill {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &

# Run decode profiling on first decode worker's leader node
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $DECODE_LEADER_NODE \
--output=${LOG_DIR}/profile_decode.out --error=${LOG_DIR}/profile_decode.err --overlap \
bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
--output=${LOG_DIR}/profile_decode.out --overlap \
bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
{% endraw %}
{% endif %}

{% if sglang_torch_profiler %}
{% if profiler != 'none' %}
{% raw %}
# Wait for all profiling scripts to complete (both prefill and decode)
echo "Waiting for all profiling scripts to complete..."
Expand Down
14 changes: 8 additions & 6 deletions scripts/worker_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,11 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
)

parser.add_argument(
"--sglang-torch-profiler",
action="store_true",
help="Enable torch profiling mode: use sglang.launch_server and skip --disaggregation-mode",
"--profiler",
type=str,
choices=["none", "torch", "nsys"],
default="none",
help="Profiling method for workers",
)

parser.add_argument(
Expand Down Expand Up @@ -182,7 +184,7 @@ def main(input_args: list[str] | None = None):
args.nodes_per_worker,
args.gpu_type,
args.multiple_frontends_enabled,
args.sglang_torch_profiler,
args.profiler,
args.sglang_config_path,
args.dump_config_path,
args.setup_script,
Expand All @@ -195,7 +197,7 @@ def main(input_args: list[str] | None = None):
args.master_ip,
args.nodes_per_worker,
args.gpu_type,
args.sglang_torch_profiler,
args.profiler,
args.sglang_config_path,
args.dump_config_path,
args.setup_script,
Expand All @@ -209,7 +211,7 @@ def main(input_args: list[str] | None = None):
args.nodes_per_worker,
args.gpu_type,
args.multiple_frontends_enabled,
args.sglang_torch_profiler,
args.profiler,
args.sglang_config_path,
args.dump_config_path,
args.setup_script,
Expand Down
Loading