Merged
105 changes: 76 additions & 29 deletions scripts/profiling/profile.sh
@@ -6,8 +6,8 @@
# This script runs bench_one_batch_server with profiling enabled

model_name="deepseek-ai/DeepSeek-R1"
head_node="127.0.0.1"
head_port=30000
head_node="${HEAD_NODE:-127.0.0.1}"
head_port="${HEAD_PORT:-8000}"

# Parse arguments (same as sa-bench for consistency)
n_prefill=$1
@@ -26,7 +26,6 @@ echo " Decode GPUs: ${decode_gpus}"
echo " Total GPUs: ${total_gpus}"

# Wait for server to be ready using inline wait function
echo "Waiting for server at http://${head_node}:${head_port} to be ready..."
wait_until_ready() {
local SERVER_URL="$1"
while true; do
@@ -42,7 +41,36 @@ wait_until_ready() {
sleep 30
done
}
wait_until_ready "http://${head_node}:${head_port}"

# Parse prefill/decode leader IP lists from environment (comma-separated)
IFS=',' read -r -a PREFILL_IPS <<< "${PROFILE_PREFILL_IPS:-}"
IFS=',' read -r -a DECODE_IPS <<< "${PROFILE_DECODE_IPS:-}"

wait_all_workers_ready() {
local ips=("$@")
for ip in "${ips[@]}"; do
if [[ -z "${ip}" ]]; then
continue
fi
echo "Waiting for worker at http://${ip}:30000 to be ready..."
wait_until_ready "http://${ip}:30000"
done
}
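
As a usage sketch — the addresses below are hypothetical — the Slurm template is expected to export the comma-separated leader lists before this script runs:

# Hypothetical orchestrator environment; the real IPs come from the job
# template's get_node_ip calls.
export PROFILE_PREFILL_IPS="10.0.1.1,10.0.1.2"
export PROFILE_DECODE_IPS="10.0.2.1"
# The IFS=',' reads above then yield PREFILL_IPS=(10.0.1.1 10.0.1.2) and
# DECODE_IPS=(10.0.2.1), so wait_all_workers_ready polls each leader on
# port 30000 in turn.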

# For PD disaggregation, wait for router once and then wait for all worker servers
if [[ "${PROFILING_MODE}" == "prefill" ]]; then
echo "Waiting for router at http://${head_node}:${head_port} to be ready..."
wait_until_ready "http://${head_node}:${head_port}"
fi

if [[ "${#PREFILL_IPS[@]}" -gt 0 || "${#DECODE_IPS[@]}" -gt 0 ]]; then
echo "Waiting for all profiling workers to be ready..."
wait_all_workers_ready "${PREFILL_IPS[@]}" "${DECODE_IPS[@]}"
else
# Backward-compatible single-node behavior
echo "Waiting for local ${PROFILING_MODE} server at http://127.0.0.1:30000 to be ready..."
wait_until_ready "http://127.0.0.1:30000"
fi

# Determine profiling parameters strictly from environment
PROFILE_STEPS_ARG=""
@@ -71,7 +99,7 @@ echo "$(date '+%Y-%m-%d %H:%M:%S')"
# Create profiling output directory only when torch profiler dir is provided
ACTIVITIES=""
if [[ -n "${SGLANG_TORCH_PROFILER_DIR}" ]]; then
ACTIVITIES='["CPU", "GPU"]'
ACTIVITIES='["CPU", "GPU", "MEM"]'
mkdir -p "${SGLANG_TORCH_PROFILER_DIR}" 2>/dev/null || true
export SGLANG_TORCH_PROFILER_DIR=${SGLANG_TORCH_PROFILER_DIR}
else
@@ -81,30 +109,49 @@ fi

set -x

curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"

python3 -m sglang.bench_serving \
--backend sglang \
--model ${model_name} \
--host ${head_node} --port ${head_port} \
--dataset-name random \
--max-concurrency $PROFILE_CONCURRENCY \
--num-prompts 128 \
--random-input-len $PROFILE_ISL \
--random-output-len $PROFILE_OSL \
--random-range-ratio 1 \
--warmup-request 10

pip install lm-eval tenacity
python -m lm_eval \
--model local-completions \
--tasks gsm8k \
--model_args \
base_url=http://${head_node}:${head_port}/v1/completions,\
model=${model_name},\
tokenized_requests=False,tokenizer_backend=None,\
num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1 \
--limit 10
start_profile_on_worker() {
local ip="$1"
if [[ -z "${ip}" ]]; then
return
fi
echo "Starting profiling on http://${ip}:30000"
curl -X POST "http://${ip}:30000/start_profile" -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"
}

if [[ "${#PREFILL_IPS[@]}" -gt 0 || "${#DECODE_IPS[@]}" -gt 0 ]]; then
for ip in "${PREFILL_IPS[@]}"; do
start_profile_on_worker "${ip}"
done
for ip in "${DECODE_IPS[@]}"; do
start_profile_on_worker "${ip}"
done
else
# Fallback to local single-node profiling
echo "Starting profiling on local server http://127.0.0.1:30000"
curl -X POST http://127.0.0.1:30000/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"
fi
Comment on lines +112 to +132
⚠️ Potential issue | 🟡 Minor

Potential issue: start_step is passed as a string while num_steps is an integer.

In the JSON payload at line 118, start_step is quoted as a string (\"$PROFILE_START_STEP\") while num_steps is computed as an integer. This inconsistency may cause issues if the server expects consistent types.

-    curl -X POST "http://${ip}:30000/start_profile" -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"
+    curl -X POST "http://${ip}:30000/start_profile" -H "Content-Type: application/json" -d "{\"start_step\": $PROFILE_START_STEP, \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"

Also update line 131 for consistency:

-    curl -X POST http://127.0.0.1:30000/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"
+    curl -X POST http://127.0.0.1:30000/start_profile -H "Content-Type: application/json" -d "{\"start_step\": $PROFILE_START_STEP, \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}"
🤖 Prompt for AI Agents
In scripts/profiling/profile.sh around lines 112 to 132, the JSON payloads send
start_step as a quoted string while num_steps is an integer; change both curl -d
payloads (the one in start_profile_on_worker and the local fallback) to pass
start_step unquoted so it is a numeric value (i.e., remove the surrounding
quotes from $PROFILE_START_STEP) and keep num_steps computed as
$((PROFILE_STOP_STEP-PROFILE_START_STEP)); ensure ACTIVITIES remains valid JSON
(or pre-serialize it) so the overall payload types are consistent.
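
An alternative that sidesteps the quoting question altogether is to assemble the body with jq, which emits correctly typed JSON. A minimal sketch, assuming jq is available in the container image and ACTIVITIES holds a valid JSON array:

# Build the payload with jq: --argjson keeps numbers as numbers and the
# activities array as an array, with no hand-rolled escaping.
payload=$(jq -n \
  --argjson start "$PROFILE_START_STEP" \
  --argjson num "$((PROFILE_STOP_STEP - PROFILE_START_STEP))" \
  --argjson acts "$ACTIVITIES" \
  '{start_step: $start, num_steps: $num, activities: $acts}')
curl -X POST "http://${ip}:30000/start_profile" \
  -H "Content-Type: application/json" -d "$payload"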


# Only the prefill profiling job needs to generate traffic through the router.
if [[ "${PROFILING_MODE}" == "prefill" ]]; then
python3 -m sglang.bench_serving \
--backend sglang \
--model ${model_name} \
--host ${head_node} --port ${head_port} \
--dataset-name random \
--max-concurrency $PROFILE_CONCURRENCY \
--num-prompts 128 \
--random-input-len $PROFILE_ISL \
--random-output-len $PROFILE_OSL \
--random-range-ratio 1 \
--warmup-request 5

pip install lm-eval tenacity > /dev/null
python -m lm_eval \
--model local-completions \
--tasks gsm8k \
--model_args base_url=http://${head_node}:${head_port}/v1/completions,model=${model_name},tokenized_requests=False,tokenizer_backend=None,num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1 \
--limit 10
fi

exit_code=$?
set +x
88 changes: 68 additions & 20 deletions scripts/templates/job_script_template_disagg.j2
@@ -2,6 +2,7 @@
#SBATCH --job-name={{ job_name }}
#SBATCH --nodes={{ total_nodes }}
#SBATCH --ntasks={{ total_nodes }}
#SBATCH --segment={{ total_nodes }}
#SBATCH --ntasks-per-node=1
{% if use_gpus_per_node_directive %}
#SBATCH --gpus-per-node={{ gpus_per_node }}
@@ -57,7 +58,7 @@ for i in "${!nodes[@]}"; do
done

{% endraw %}
{% if enable_multiple_frontends %}
{% if enable_multiple_frontends and not use_sglang_router %}
{% raw %}
# Multiple frontend architecture
# Node 0: nginx only + prefill shard
@@ -142,7 +143,7 @@ echo "Master IP address: $MASTER_IP"

# Compute leader nodes for each worker
{% endraw %}
{% if enable_multiple_frontends %}
{% if enable_multiple_frontends and not use_sglang_router %}
{% raw %}
# With multiple frontends: keep offset 0; nginx coexists on node 0
WORKER_NODE_OFFSET=0
@@ -183,7 +184,12 @@ ENROOT_ARGS="\
{% raw %}
WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip ${MASTER_IP}"
{% endraw %}
{% if enable_multiple_frontends %}
{% if use_sglang_router %}
{% raw %}
WORKER_ARGS="$WORKER_ARGS --use-sglang-router"
{% endraw %}
{% endif %}
{% if enable_multiple_frontends and not use_sglang_router %}
{% raw %}
# Add multiple frontends flag for worker setup
WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
@@ -204,7 +210,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
{% raw %}

{% endraw %}
{% if enable_multiple_frontends %}
{% if enable_multiple_frontends and not use_sglang_router %}
{% raw %}
{% endraw %}
{% if total_nodes > 1 %}
@@ -314,7 +320,7 @@ done

echo ""
{% endraw %}
{% if enable_multiple_frontends %}
{% if enable_multiple_frontends and not use_sglang_router %}
{% raw %}
echo "Frontend available at: http://${NGINX_NODE}:8000"
echo "To connect to the nginx node:"
@@ -330,6 +336,38 @@ echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bash"
{% endif %}
{% raw %}

# Launch sglang router when enabled
{% endraw %}{% if use_sglang_router %}{% raw %}
echo "Launching sglang router on ${nodes[0]}"
# Collect leader IPs for prefill and decode
PREFILL_LEADER_IPS=()
for idx in "${prefill_leaders[@]}"; do
node_name=${nodes[$idx]}
ip=$(get_node_ip "$node_name" "$SLURM_JOB_ID" "$NETWORK_INTERFACE")
PREFILL_LEADER_IPS+=("$ip")
done
DECODE_LEADER_IPS=()
for idx in "${decode_leaders[@]}"; do
node_name=${nodes[$idx]}
ip=$(get_node_ip "$node_name" "$SLURM_JOB_ID" "$NETWORK_INTERFACE")
DECODE_LEADER_IPS+=("$ip")
done

ROUTER_ARGS="--pd-disaggregation"
for ip in "${PREFILL_LEADER_IPS[@]}"; do
ROUTER_ARGS="$ROUTER_ARGS --prefill http://${ip}:30000"
done
for ip in "${DECODE_LEADER_IPS[@]}"; do
ROUTER_ARGS="$ROUTER_ARGS --decode http://${ip}:30000"
done

ROUTER_NODE=${nodes[0]}
cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$ROUTER_NODE --output=${LOG_DIR}/${ROUTER_NODE}_router.out python -m sglang_router.launch_router $ROUTER_ARGS --host 0.0.0.0 --port 8000"
echo "$cmd"
$cmd &
{% endraw %}{% endif %}
{% raw %}

echo ""
echo "Make sure to cancel the job at the end:"
echo "scancel $SLURM_JOB_ID"
@@ -349,25 +387,35 @@ srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --ou

{% if profiler != 'none' %}
{% raw %}
# Torch profiling mode: run profiling on prefill and decode workers separately
echo "Starting profiler..."
# Torch/NSYS profiling mode: run a single orchestrator that profiles all prefill and decode workers.
echo "Starting unified profiler..."

# Collect leader IPs for prefill and decode workers
PREFILL_LEADER_IPS=()
for idx in "${prefill_leaders[@]}"; do
node_name=${nodes[$idx]}
ip=$(get_node_ip "$node_name" "$SLURM_JOB_ID" "$NETWORK_INTERFACE")
PREFILL_LEADER_IPS+=("$ip")
done

# Get leader nodes for first prefill and decode workers
PREFILL_LEADER_NODE=${nodes[${prefill_leaders[0]}]}
DECODE_LEADER_NODE=${nodes[${decode_leaders[0]}]}
DECODE_LEADER_IPS=()
for idx in "${decode_leaders[@]}"; do
node_name=${nodes[$idx]}
ip=$(get_node_ip "$node_name" "$SLURM_JOB_ID" "$NETWORK_INTERFACE")
DECODE_LEADER_IPS+=("$ip")
done

echo "Prefill profiling will run on: $PREFILL_LEADER_NODE"
echo "Decode profiling will run on: $DECODE_LEADER_NODE"
PREFILL_LEADER_IPS_STR=$(IFS=,; echo "${PREFILL_LEADER_IPS[*]}")
DECODE_LEADER_IPS_STR=$(IFS=,; echo "${DECODE_LEADER_IPS[*]}")

# Run prefill profiling on first prefill worker's leader node
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PREFILL_LEADER_NODE \
--output=${LOG_DIR}/profile_prefill.out --overlap \
bash -c "PROFILING_MODE=prefill {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
# Use the first prefill leader as the orchestrator node
PROFILE_ORCHESTRATOR_NODE=${nodes[${prefill_leaders[0]}]}
echo "Unified profiling will run on orchestrator node: $PROFILE_ORCHESTRATOR_NODE"

# Run decode profiling on first decode worker's leader node
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $DECODE_LEADER_NODE \
--output=${LOG_DIR}/profile_decode.out --overlap \
bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
# Run a single profiling orchestrator that coordinates profiling across all leaders
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PROFILE_ORCHESTRATOR_NODE \
--output=${LOG_DIR}/profile_all.out --overlap \
bash -c "PROFILING_MODE=prefill PROFILE_PREFILL_IPS=${PREFILL_LEADER_IPS_STR} PROFILE_DECODE_IPS=${DECODE_LEADER_IPS_STR} {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
Comment on lines +416 to +418
⚠️ Potential issue | 🟠 Major

Missing HEAD_NODE and HEAD_PORT environment variables for router-based profiling.

The profile.sh script reads HEAD_NODE and HEAD_PORT from the environment (lines 9-10), but this srun command doesn't set them. When the sglang router is in use, the profiling script needs to know the router endpoint in order to wait for readiness and send traffic.

-    bash -c "PROFILING_MODE=prefill PROFILE_PREFILL_IPS=${PREFILL_LEADER_IPS_STR} PROFILE_DECODE_IPS=${DECODE_LEADER_IPS_STR} {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    bash -c "PROFILING_MODE=prefill HEAD_NODE=${nodes[0]} HEAD_PORT=8000 PROFILE_PREFILL_IPS=${PREFILL_LEADER_IPS_STR} PROFILE_DECODE_IPS=${DECODE_LEADER_IPS_STR} {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &

Note: ${nodes[0]} is the router node. You may need to resolve its IP using get_node_ip or use the hostname directly if DNS resolution works in the container.

📝 Committable suggestion


Suggested change
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PROFILE_ORCHESTRATOR_NODE \
--output=${LOG_DIR}/profile_all.out --overlap \
bash -c "PROFILING_MODE=prefill PROFILE_PREFILL_IPS=${PREFILL_LEADER_IPS_STR} PROFILE_DECODE_IPS=${DECODE_LEADER_IPS_STR} {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PROFILE_ORCHESTRATOR_NODE \
--output=${LOG_DIR}/profile_all.out --overlap \
bash -c "PROFILING_MODE=prefill HEAD_NODE=${nodes[0]} HEAD_PORT=8000 PROFILE_PREFILL_IPS=${PREFILL_LEADER_IPS_STR} PROFILE_DECODE_IPS=${DECODE_LEADER_IPS_STR} {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &

{% endraw %}
{% endif %}

9 changes: 9 additions & 0 deletions scripts/worker_setup.py
@@ -95,6 +95,12 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespace:
help="Whether multiple frontend architecture is enabled (affects infrastructure setup)",
)

parser.add_argument(
"--use-sglang-router",
action="store_true",
help="Whether this job uses sglang router (PD disaggregation); skips NATS/ETCD/frontend bootstrap in workers.",
)

parser.add_argument(
"--dump-config-path",
type=str,
@@ -188,6 +194,7 @@ def main(input_args: list[str] | None = None):
args.sglang_config_path,
args.dump_config_path,
args.setup_script,
args.use_sglang_router,
)
elif args.worker_type == "decode":
setup_decode_worker(
@@ -201,6 +208,7 @@
args.sglang_config_path,
args.dump_config_path,
args.setup_script,
args.use_sglang_router,
)
elif args.worker_type == "aggregated":
setup_aggregated_worker(
@@ -215,6 +223,7 @@
args.sglang_config_path,
args.dump_config_path,
args.setup_script,
args.use_sglang_router,
)

logging.info(f"{args.worker_type.capitalize()} worker setup complete")
3 changes: 0 additions & 3 deletions scripts/worker_setup/command.py
@@ -73,9 +73,6 @@ def build_sglang_command_from_yaml(
# Add all SGLang flags from config
for key, value in sorted(mode_config.items()):
flag_name = key.replace("_", "-")
# Skip disaggregation-mode flag for profiling
if flag_name == "disaggregation-mode":
continue
if isinstance(value, bool):
if value:
cmd_parts.append(f"--{flag_name}")
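
The practical effect of removing this skip: a disaggregation_mode entry in the SGLang YAML is no longer filtered out, so the generic flag loop above forwards it to the launch command. A sketch of the mapping (config value hypothetical):

# YAML:   disaggregation_mode: prefill
# emits:  --disaggregation-mode prefill
# presumably so profiling jobs now launch real PD-disaggregated servers
# rather than silently dropping the mode.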