diff --git a/docs/installation.md b/docs/installation.md
index cc8c36a2..10b7e01d 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -151,6 +151,31 @@ benchmark:
   concurrencies: [256, 512]
 ```
 
+## Profiling (torch / nsys)
+
+You can enable profiling via a top-level `profiling` section in your job YAML. Profiling and benchmarking are mutually exclusive; when profiling is enabled, set `benchmark.type: "manual"` (or omit the benchmark section) to avoid conflicts.
+
+Example:
+
+```yaml
+profiling:
+  type: "nsys"       # one of: "none", "torch", "nsys"
+  prefill:
+    isl: 1024        # input sequence length
+    osl: 2           # output sequence length
+    concurrency: 24
+    start_step: 0
+    stop_step: 16
+  decode:
+    isl: 8
+    osl: 16
+    concurrency: 1024
+    start_step: 0
+    stop_step: 16
+```
+
+When `type` is `none`, normal serving runs with `dynamo.sglang`. Otherwise, serving uses `sglang.launch_server`.
+
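+For example, to profile without running an automated benchmark, pair the section above with `benchmark.type: "manual"` (values below are illustrative):
+
+```yaml
+benchmark:
+  type: "manual"
+profiling:
+  type: "torch"
+  decode:
+    isl: 8
+    osl: 16
+    concurrency: 1024
+```
+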
echo "$(date '+%Y-%m-%d %H:%M:%S')" +# Create profiling output directory only when torch profiler dir is provided +ACTIVITIES="" +if [[ -n "${SGLANG_TORCH_PROFILER_DIR}" ]]; then + ACTIVITIES='["GPU"]' + mkdir -p "${SGLANG_TORCH_PROFILER_DIR}" 2>/dev/null || true + export SGLANG_TORCH_PROFILER_DIR=${SGLANG_TORCH_PROFILER_DIR} +else + ACTIVITIES='["CUDA_PROFILER"]' + mkdir -p "/logs/profiles" 2>/dev/null || true +fi + set -x -python3 -m sglang.bench_one_batch_server \ - --model ${model_name} \ - --base-url http://${head_node}:${head_port} \ - --batch-size ${BATCH_SIZE} \ - --input-len ${INPUT_LEN} \ - --output-len ${OUTPUT_LEN} \ - ${PROFILE_STEPS_ARG} \ - --profile + +curl -X POST http://${head_node}:${head_port}/start_profile -H "Content-Type: application/json" -d "{\"start_step\": \"$PROFILE_START_STEP\", \"num_steps\": $((PROFILE_STOP_STEP-PROFILE_START_STEP)), \"activities\": $ACTIVITIES}" + +python3 -m sglang.bench_serving \ +--backend sglang \ +--model ${model_name} \ +--host ${head_node} --port ${head_port} \ +--dataset-name random \ +--max-concurrency $PROFILE_CONCURRENCY \ +--num-prompts 128 \ +--random-input-len $PROFILE_ISL \ +--random-output-len $PROFILE_OSL \ +--random-range-ratio 1 \ +--warmup-request 10 + +pip install lm-eval tenacity +python -m lm_eval \ +--model local-completions \ +--tasks gsm8k \ +--model_args \ +base_url=http://${head_node}:${head_port}/v1/completions,\ +model=${model_name},\ +tokenized_requests=False,tokenizer_backend=None,\ +num_concurrent=${PROFILE_CONCURRENCY},timeout=6000,max_retries=1 \ +--limit 10 + exit_code=$? set +x diff --git a/scripts/templates/job_script_template_agg.j2 b/scripts/templates/job_script_template_agg.j2 index 10efd556..6e5b7038 100755 --- a/scripts/templates/job_script_template_agg.j2 +++ b/scripts/templates/job_script_template_agg.j2 @@ -189,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled" {% endraw %} {% endif %} -{% if sglang_torch_profiler %} {% raw %} -# Enable torch profiling mode -WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler" +# Set profiler mode from config +WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}" {% endraw %} -{% endif %} {% raw %} # Add SGLang config path (mounted in container at /logs/) WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml" @@ -213,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}" {% raw %} # Launch nginx on node 0 echo "Launching nginx on ${NGINX_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" echo "$cmd" $cmd & {% endraw %} @@ -222,7 +220,7 @@ $cmd & # Launch frontend on master node (node 1) - this will also start NATS/ETCD echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS 
diff --git a/scripts/templates/job_script_template_agg.j2 b/scripts/templates/job_script_template_agg.j2
index 10efd556..6e5b7038 100755
--- a/scripts/templates/job_script_template_agg.j2
+++ b/scripts/templates/job_script_template_agg.j2
@@ -189,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip
 WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
 {% endraw %}
 {% endif %}
-{% if sglang_torch_profiler %}
 {% raw %}
-# Enable torch profiling mode
-WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler"
+# Set profiler mode from config
+WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}"
 {% endraw %}
-{% endif %}
 {% raw %}
 # Add SGLang config path (mounted in container at /logs/)
 WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml"
@@ -213,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}"
 {% raw %}
 # Launch nginx on node 0
 echo "Launching nginx on ${NGINX_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
 {% endraw %}
@@ -222,7 +220,7 @@ $cmd &
 
 # Launch frontend on master node (node 1) - this will also start NATS/ETCD
 echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
-cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
 echo "$cmd"
 $cmd &
@@ -236,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
         if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
             node=${nodes[$frontend_node_idx]}
             echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
-            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
+            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
             echo "$cmd"
             $cmd &
             frontend_idx=$((frontend_idx + 1))
@@ -280,7 +278,7 @@ for worker_idx in $(seq 0 $((AGG_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out --error=${LOG_DIR}/${node}_agg_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
+    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_agg_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${AGG_NODES_PER_WORKER} --worker_type aggregated ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
     echo "$cmd"
     $cmd &
 done
@@ -318,11 +316,11 @@ echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bas
 {% raw %}
 BENCHMARK_TYPE={% endraw %}{{ benchmark_type }}{% raw %}
 BENCHMARK_ARGS="{% endraw %}{{ benchmark_arg }}{% raw %}"
-srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $AGG_WORKERS 0 0 $DECODE_GPUS ${BENCHMARK_ARGS} &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Torch profiling mode for aggregated workers
@@ -336,12 +334,12 @@ echo "Aggregated profiling will run on: $AGG_LEADER_NODE"
 
 # Run profiling on first aggregated worker's leader node
 # Use "decode" mode for aggregated since it profiles the full generation pipeline
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $AGG_LEADER_NODE \
-    --output=${LOG_DIR}/profile_aggregated.out --error=${LOG_DIR}/profile_aggregated.err --overlap \
-    bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_aggregated.out --overlap \
+    bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Wait for profiling script to complete
 echo "Waiting for profiling script to complete..."
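For the aggregated template, rendering the profiling launch with the docs example (`type: "nsys"`, so no torch profiler dir is injected) yields roughly the following; the `$AGG_WORKERS`/`$DECODE_GPUS`/`$TOTAL_GPUS` variables are still resolved by the job script at run time:

```bash
bash -c "PROFILING_MODE=decode PROFILE_ISL=8 PROFILE_OSL=16 PROFILE_CONCURRENCY=1024 \
PROFILE_START_STEP=0 PROFILE_STOP_STEP=16 \
 /scripts/profiling/profile.sh 0 $AGG_WORKERS 0 $DECODE_GPUS $TOTAL_GPUS" &
```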
$TOTAL_GPUS" & {% endraw %} {% endif %} -{% if sglang_torch_profiler %} +{% if profiler != 'none' %} {% raw %} # Wait for profiling script to complete echo "Waiting for profiling script to complete..." diff --git a/scripts/templates/job_script_template_disagg.j2 b/scripts/templates/job_script_template_disagg.j2 index d2222cce..3e12acbb 100755 --- a/scripts/templates/job_script_template_disagg.j2 +++ b/scripts/templates/job_script_template_disagg.j2 @@ -9,7 +9,6 @@ #SBATCH --account={{ account }} #SBATCH --time={{ time_limit }} #SBATCH --output={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.out -#SBATCH --error={{ log_dir_prefix }}/%j_{{ prefill_workers }}P_{{ decode_workers }}D_{{ timestamp }}/log.err #SBATCH --partition={{ partition }} # Constants @@ -190,12 +189,10 @@ WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled" {% endraw %} {% endif %} -{% if sglang_torch_profiler %} {% raw %} -# Enable torch profiling mode -WORKER_ARGS="$WORKER_ARGS --sglang-torch-profiler" +# Set profiler mode from config +WORKER_ARGS="$WORKER_ARGS --profiler {% endraw %}{{ profiler }}{% raw %}" {% endraw %} -{% endif %} {% raw %} # Add SGLang config path (mounted in container at /logs/) WORKER_ARGS="$WORKER_ARGS --sglang-config-path /logs/sglang_config.yaml" @@ -214,7 +211,7 @@ WORKER_ARGS="$WORKER_ARGS --setup-script {{ setup_script }}" {% raw %} # Launch nginx on node 0 echo "Launching nginx on ${NGINX_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}" echo "$cmd" $cmd & {% endraw %} @@ -223,7 +220,7 @@ $cmd & # Launch frontend on master node (node 1) - this will also start NATS/ETCD echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}" -cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}" +cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}" echo "$cmd" $cmd & @@ -237,7 +234,7 @@ if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then if [ $frontend_node_idx -lt $TOTAL_NODES ]; then node=${nodes[$frontend_node_idx]} echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node" - cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}" + cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}" echo "$cmd" $cmd & frontend_idx=$((frontend_idx + 1)) @@ -276,7 +273,7 @@ for 
@@ -276,7 +273,7 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out --error=${LOG_DIR}/${node}_prefill_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
+    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill ${WORKER_ARGS} ${CONFIG_DUMP_ARG}"
     echo "$cmd"
     $cmd &
 done
@@ -309,7 +306,7 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
 {% endraw %}
 {% endif %}
 {% raw %}
-    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out --error=${LOG_DIR}/${node}_decode_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
+    cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode ${CONFIG_DUMP_ARG} ${WORKER_ARGS}"
     echo "$cmd"
     $cmd &
 done
@@ -346,14 +343,14 @@ BENCHMARK_ARGS="{{ benchmark_arg }}"
 
 {% if do_benchmark %}
 {% raw %}
-srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --error=${LOG_DIR}/benchmark.err --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/benchmark.out --overlap bash /scripts/benchmarks/${BENCHMARK_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS ${BENCHMARK_ARGS} &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Torch profiling mode: run profiling on prefill and decode workers separately
-echo "Starting torch profiling..."
+echo "Starting profiler..."
 
 # Get leader nodes for first prefill and decode workers
 PREFILL_LEADER_NODE=${nodes[${prefill_leaders[0]}]}
@@ -364,17 +361,17 @@ echo "Decode profiling will run on: $DECODE_LEADER_NODE"
 
 # Run prefill profiling on first prefill worker's leader node
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $PREFILL_LEADER_NODE \
-    --output=${LOG_DIR}/profile_prefill.out --error=${LOG_DIR}/profile_prefill.err --overlap \
-    bash -c "PROFILING_MODE=prefill SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_prefill.out --overlap \
+    bash -c "PROFILING_MODE=prefill {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/prefill {% endif %}{{ prefill_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
 
 # Run decode profiling on first decode worker's leader node
 srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w $DECODE_LEADER_NODE \
-    --output=${LOG_DIR}/profile_decode.out --error=${LOG_DIR}/profile_decode.err --overlap \
-    bash -c "PROFILING_MODE=decode SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
+    --output=${LOG_DIR}/profile_decode.out --overlap \
+    bash -c "PROFILING_MODE=decode {% endraw %}{% if profiler == 'torch' %}SGLANG_TORCH_PROFILER_DIR=/logs/profiles/decode {% endif %}{{ decode_profile_env }}{% raw %} /scripts/profiling/profile.sh $PREFILL_WORKERS $DECODE_WORKERS $PREFILL_GPUS $DECODE_GPUS $TOTAL_GPUS" &
 {% endraw %}
 {% endif %}
 
-{% if sglang_torch_profiler %}
+{% if profiler != 'none' %}
 {% raw %}
 # Wait for all profiling scripts to complete (both prefill and decode)
 echo "Waiting for all profiling scripts to complete..."
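In the disaggregated case the two phases receive independent env prefixes; with the docs example they render as follows (a sketch derived from `build_env_str` in the srtctl backend changes below):

```bash
# Prefill probe: long inputs, minimal generation
PROFILING_MODE=prefill PROFILE_ISL=1024 PROFILE_OSL=2 PROFILE_CONCURRENCY=24 PROFILE_START_STEP=0 PROFILE_STOP_STEP=16
# Decode probe: short inputs, large concurrent batch
PROFILING_MODE=decode PROFILE_ISL=8 PROFILE_OSL=16 PROFILE_CONCURRENCY=1024 PROFILE_START_STEP=0 PROFILE_STOP_STEP=16
```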
diff --git a/scripts/worker_setup.py b/scripts/worker_setup.py
index a6448962..8557634c 100644
--- a/scripts/worker_setup.py
+++ b/scripts/worker_setup.py
@@ -103,9 +103,11 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
     )
 
     parser.add_argument(
-        "--sglang-torch-profiler",
-        action="store_true",
-        help="Enable torch profiling mode: use sglang.launch_server and skip --disaggregation-mode",
+        "--profiler",
+        type=str,
+        choices=["none", "torch", "nsys"],
+        default="none",
+        help="Profiling method for workers",
     )
 
     parser.add_argument(
@@ -182,7 +184,7 @@ def main(input_args: list[str] | None = None):
         args.nodes_per_worker,
         args.gpu_type,
         args.multiple_frontends_enabled,
-        args.sglang_torch_profiler,
+        args.profiler,
         args.sglang_config_path,
         args.dump_config_path,
         args.setup_script,
@@ -195,7 +197,7 @@ def main(input_args: list[str] | None = None):
         args.master_ip,
         args.nodes_per_worker,
         args.gpu_type,
-        args.sglang_torch_profiler,
+        args.profiler,
         args.sglang_config_path,
         args.dump_config_path,
         args.setup_script,
@@ -209,7 +211,7 @@ def main(input_args: list[str] | None = None):
         args.nodes_per_worker,
         args.gpu_type,
         args.multiple_frontends_enabled,
-        args.sglang_torch_profiler,
+        args.profiler,
         args.sglang_config_path,
         args.dump_config_path,
         args.setup_script,
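As a usage sketch, each worker now receives the profiler choice through the new flag; an abridged decode-worker invocation might look like this (the IP is a placeholder and `...` elides the remaining `WORKER_ARGS`):

```bash
python /scripts/worker_setup.py --worker_type decode --worker_idx 0 --local_rank 0 \
  --leader_ip 10.0.0.2 --nodes_per_worker 1 \
  --profiler nsys --sglang-config-path /logs/sglang_config.yaml ...
```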
diff --git a/scripts/worker_setup/command.py b/scripts/worker_setup/command.py
index 1e24c8b2..4a2524eb 100644
--- a/scripts/worker_setup/command.py
+++ b/scripts/worker_setup/command.py
@@ -15,7 +15,7 @@ def build_sglang_command_from_yaml(
     port: int,
     total_nodes: int,
     rank: int,
-    use_profiling: bool = False,
+    profiler: str = "none",
     dump_config_path: str | None = None,
 ) -> str:
     """Build SGLang command using native YAML config support.
@@ -33,7 +33,7 @@ def build_sglang_command_from_yaml(
         port: Port for distributed coordination
         total_nodes: Total number of nodes
         rank: Node rank (0-indexed)
-        use_profiling: Whether to use sglang.launch_server (profiling mode)
+        profiler: Profiling method: "none", "torch", or "nsys"
 
     Returns:
         Full command string ready to execute
@@ -54,16 +54,21 @@ def build_sglang_command_from_yaml(
     env_exports = []
     for key, value in env_vars.items():
         env_exports.append(f"export {key}={value}")
-    if use_profiling:
+    if profiler == "torch":
         env_exports.append(f"export SGLANG_TORCH_PROFILER_DIR=/logs/profiles/{config_key}")
 
     # Determine Python module based on profiling mode
-    python_module = "sglang.launch_server" if use_profiling else "dynamo.sglang"
+    python_module = "sglang.launch_server" if profiler != "none" else "dynamo.sglang"
+    nsys_prefix = f"nsys profile -t cuda,nvtx --cuda-graph-trace=node -c cudaProfilerApi --capture-range-end stop --force-overwrite true -o /logs/profiles/{config_key}_{rank}"
 
-    if use_profiling:
+    if profiler != "none":
         # Profiling mode: inline all flags (sglang.launch_server doesn't support --config)
         mode_config = sglang_config.get(config_key, {})
-        cmd_parts = [f"python3 -m {python_module}"]
+        # Wrap with nsys on all ranks; outputs are isolated per rank
+        if profiler == "nsys":
+            cmd_parts = [f"{nsys_prefix} python3 -m {python_module}"]
+        else:
+            cmd_parts = [f"python3 -m {python_module}"]
 
         # Add all SGLang flags from config
         for key, value in sorted(mode_config.items()):
@@ -145,7 +150,7 @@ def get_gpu_command(
     port: int,
     total_nodes: int,
     rank: int,
-    use_profiling: bool = False,
+    profiler: str = "none",
     dump_config_path: str | None = None,
 ) -> str:
     """Generate command to run SGLang worker using YAML config.
@@ -157,7 +162,7 @@ def get_gpu_command(
         port: Port for distributed coordination
         total_nodes: Total number of nodes
         rank: Node rank (0-indexed)
-        use_profiling: Whether to use sglang.launch_server (profiling mode)
+        profiler: Profiling method: "none", "torch", or "nsys"
 
     Returns:
         Command string to execute
@@ -167,5 +172,5 @@ def get_gpu_command(
     logging.info(f"Building command from YAML config: {sglang_config_path}")
 
     return build_sglang_command_from_yaml(
-        worker_type, sglang_config_path, host_ip, port, total_nodes, rank, use_profiling, dump_config_path
+        worker_type, sglang_config_path, host_ip, port, total_nodes, rank, profiler, dump_config_path
     )
diff --git a/scripts/worker_setup/worker.py b/scripts/worker_setup/worker.py
index aee8762c..c93c43b2 100644
--- a/scripts/worker_setup/worker.py
+++ b/scripts/worker_setup/worker.py
@@ -60,7 +60,7 @@ def setup_prefill_worker(
     nodes_per_worker: int,
     gpu_type: str,
     multiple_frontends_enabled: bool = False,
-    sglang_torch_profiler: bool = False,
+    profiler: str = "none",
     sglang_config_path: str | None = None,
     dump_config_path: str | None = None,
     setup_script: str | None = None,
@@ -111,7 +111,7 @@ def setup_prefill_worker(
         port=DIST_INIT_PORT,
         total_nodes=nodes_per_worker,
         rank=local_rank,
-        use_profiling=sglang_torch_profiler,
+        profiler=profiler,
         dump_config_path=dump_config_path,
     )
     return run_command(cmd_to_run)
@@ -124,7 +124,7 @@ def setup_decode_worker(
     master_ip: str,
     nodes_per_worker: int,
     gpu_type: str,
-    sglang_torch_profiler: bool = False,
+    profiler: str = "none",
     sglang_config_path: str | None = None,
     dump_config_path: str | None = None,
     setup_script: str | None = None,
@@ -153,7 +153,7 @@ def setup_decode_worker(
         port=DIST_INIT_PORT,
         total_nodes=nodes_per_worker,
         rank=local_rank,
-        use_profiling=sglang_torch_profiler,
+        profiler=profiler,
         dump_config_path=dump_config_path,
     )
     return run_command(cmd_to_run)
@@ -167,7 +167,7 @@ def setup_aggregated_worker(
     nodes_per_worker: int,
     gpu_type: str,
     multiple_frontends_enabled: bool = False,
-    sglang_torch_profiler: bool = False,
+    profiler: str = "none",
     sglang_config_path: str | None = None,
     dump_config_path: str | None = None,
     setup_script: str | None = None,
@@ -218,7 +218,7 @@ def setup_aggregated_worker(
         port=DIST_INIT_PORT,
         total_nodes=nodes_per_worker,
         rank=local_rank,
-        use_profiling=sglang_torch_profiler,
+        profiler=profiler,
         dump_config_path=dump_config_path,
     )
     return run_command(cmd_to_run)
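Substituting `config_key="decode"` and `rank=0` into `nsys_prefix` above, an nsys-wrapped worker command begins as follows; the flags after the module come from the YAML config:

```bash
nsys profile -t cuda,nvtx --cuda-graph-trace=node -c cudaProfilerApi \
  --capture-range-end stop --force-overwrite true -o /logs/profiles/decode_0 \
  python3 -m sglang.launch_server ...
```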
diff --git a/src/srtctl/backends/sglang.py b/src/srtctl/backends/sglang.py
index 4cf6e301..91dd6fbd 100644
--- a/src/srtctl/backends/sglang.py
+++ b/src/srtctl/backends/sglang.py
@@ -101,9 +101,12 @@ def render_command(self, mode: str, config_path: Path = None) -> str:
         for key, val in env_vars.items():
             lines.append(f"{key}={val} \\")
 
-        # Python command - use sglang.launch_server for profiling, dynamo.sglang otherwise
-        is_profiling = self.backend_config.get("enable_profiling", False)
-        if is_profiling:
+        # Python command - use sglang.launch_server when profiler != none, dynamo.sglang otherwise
+        profiling_type = (self.config.get("profiling") or {}).get("type", "none")
+        nsys_prefix = "nsys profile -t cuda,nvtx --cuda-graph-trace=node -c cudaProfilerApi --capture-range-end stop --force-overwrite true"
+        if profiling_type == "nsys":
+            lines.append(f"{nsys_prefix} python3 -m sglang.launch_server \\")
+        elif profiling_type == "torch":
             lines.append("python3 -m sglang.launch_server \\")
         else:
             lines.append("python3 -m dynamo.sglang \\")
@@ -133,14 +136,14 @@ def _config_to_flags(self, config: dict) -> list[str]:
             List of flag strings with backslash continuations
         """
         lines = []
-        is_profiling = self.backend_config.get("enable_profiling", False)
+        profiling_type = (self.config.get("profiling") or {}).get("type", "none")
 
         for key, value in sorted(config.items()):
             # Convert underscores to hyphens
             flag_name = key.replace("_", "-")
 
             # Skip disaggregation-mode flag when profiling (sglang.launch_server doesn't accept it)
-            if is_profiling and flag_name == "disaggregation-mode":
+            if profiling_type in ("torch", "nsys") and flag_name == "disaggregation-mode":
                 continue
 
             if isinstance(value, bool):
@@ -188,7 +191,8 @@ def _get_enable_config_dump(self) -> bool:
         enable_config_dump = self.config.get("enable_config_dump", True)
 
         # Auto-disable when profiling is enabled (unless explicitly set to True)
-        if self.backend_config.get("enable_profiling", False):
+        profiling_type = (self.config.get("profiling") or {}).get("type", "none")
+        if profiling_type != "none":
             # When profiling, disable config dump by default
             # User can explicitly set enable_config_dump: true to override
             return False
@@ -277,6 +281,30 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None)
         config_dir_path = srtctl_root / "configs"
         log_dir_path = srtctl_root / "logs"
 
+        # Build profiling env injections
+        profiling_cfg = self.config.get("profiling") or {}
+        prefill_cfg = profiling_cfg.get("prefill") or {}
+        decode_cfg = profiling_cfg.get("decode") or {}
+
+        def build_env_str(cfg: dict) -> str:
+            parts: list[str] = []
+            if "isl" in cfg and cfg["isl"] is not None:
+                parts.append(f"PROFILE_ISL={cfg['isl']}")
+            if "osl" in cfg and cfg["osl"] is not None:
+                parts.append(f"PROFILE_OSL={cfg['osl']}")
+            if "concurrency" in cfg and cfg["concurrency"] is not None:
+                parts.append(f"PROFILE_CONCURRENCY={cfg['concurrency']}")
+            if "start_step" in cfg and cfg["start_step"] is not None:
+                parts.append(f"PROFILE_START_STEP={cfg['start_step']}")
+            if "stop_step" in cfg and cfg["stop_step"] is not None:
+                parts.append(f"PROFILE_STOP_STEP={cfg['stop_step']}")
+            return " ".join(parts)
+
+        prefill_profile_env = build_env_str(prefill_cfg)
+        decode_profile_env = build_env_str(decode_cfg)
+
+        profiler_mode = profiling_cfg.get("type", "none")
+
         # Template variables
         template_vars = {
             "job_name": job_name,
@@ -307,7 +335,9 @@ def generate_slurm_script(self, config_path: Path = None, timestamp: str = None)
             # Auto-disabled when profiling unless explicitly enabled
             "enable_config_dump": self._get_enable_config_dump(),
             "log_dir_prefix": str(log_dir_path),  # Absolute path to logs directory
-            "sglang_torch_profiler": self.backend_config.get("enable_profiling", False),
+            "profiler": profiler_mode,
+            "prefill_profile_env": prefill_profile_env,
+            "decode_profile_env": decode_profile_env,
             "setup_script": self.setup_script,
             "use_gpus_per_node_directive": get_srtslurm_setting("use_gpus_per_node_directive", True),
             "extra_container_mounts": ",".join(self.config.get("extra_mount") or []),
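To make the env-string contract concrete, here is a standalone, more compact rendering of `build_env_str` applied to the decode phase from the docs example; the key-to-variable mapping is the same, only the control flow differs:

```python
def build_env_str(cfg: dict) -> str:
    # Map YAML phase keys to the PROFILE_* env vars consumed by profile.sh
    env_names = {
        "isl": "PROFILE_ISL",
        "osl": "PROFILE_OSL",
        "concurrency": "PROFILE_CONCURRENCY",
        "start_step": "PROFILE_START_STEP",
        "stop_step": "PROFILE_STOP_STEP",
    }
    return " ".join(
        f"{env}={cfg[key]}" for key, env in env_names.items() if cfg.get(key) is not None
    )

# Decode phase from the docs example:
print(build_env_str({"isl": 8, "osl": 16, "concurrency": 1024, "start_step": 0, "stop_step": 16}))
# PROFILE_ISL=8 PROFILE_OSL=16 PROFILE_CONCURRENCY=1024 PROFILE_START_STEP=0 PROFILE_STOP_STEP=16
```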
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
index e721a53d..998b5dae 100644
--- a/src/srtctl/core/schema.py
+++ b/src/srtctl/core/schema.py
@@ -162,6 +162,32 @@ class BenchmarkConfig(BaseModel):
     req_rate: Optional[str] = Field("inf", description="Request rate")
 
 
+class ProfilingType(str, Enum):
+    """Supported profiling types."""
+
+    NSYS = "nsys"
+    TORCH = "torch"
+    NONE = "none"
+
+
+class ProfilingPhaseConfig(BaseModel):
+    """Per-phase profiling parameters."""
+
+    isl: Optional[int] = Field(None, description="Input sequence length")
+    osl: Optional[int] = Field(None, description="Output sequence length")
+    concurrency: Optional[int] = Field(None, description="Batch size / concurrency")
+    start_step: Optional[int] = Field(None, description="Profiling start step")
+    stop_step: Optional[int] = Field(None, description="Profiling stop step")
+
+
+class ProfilingConfig(BaseModel):
+    """Profiling configuration."""
+
+    type: ProfilingType = Field(ProfilingType.NONE, description="Profiling type")
+    prefill: Optional[ProfilingPhaseConfig] = None
+    decode: Optional[ProfilingPhaseConfig] = None
+
+
 class SGLangPrefillConfig(BaseModel):
     """SGLang prefill worker configuration.
 
@@ -217,12 +243,6 @@ class BackendConfig(BaseModel):
     enable_multiple_frontends: bool = True
     num_additional_frontends: int = 9
 
-    # Profiling settings
-    enable_profiling: bool = Field(
-        False,
-        description="Enable torch profiling mode (uses sglang.launch_server instead of dynamo.sglang)",
-    )
-
 
 class JobConfig(BaseModel):
     """Complete job configuration."""
@@ -235,6 +255,7 @@ class JobConfig(BaseModel):
     slurm: SlurmConfig = Field(default_factory=SlurmConfig)
     backend: Optional[BackendConfig] = None  # Auto-populated
     benchmark: BenchmarkConfig = Field(default_factory=BenchmarkConfig)
+    profiling: ProfilingConfig = Field(default_factory=ProfilingConfig)
 
     # Additional optional settings
     enable_config_dump: bool = True
@@ -275,7 +296,8 @@ def model_post_init(self, __context: Any) -> None:
 
     def _validate_profiling_mode(self) -> None:
         """Validate profiling mode constraints."""
-        if not self.backend or not self.backend.enable_profiling:
+        prof = getattr(self, "profiling", None)
+        if not prof or prof.type is ProfilingType.NONE:
             return
 
         # Auto-disable config dump when profiling (already handled in backend, but validate here too)
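A minimal sketch (assuming pydantic v2, as the `model_validate`/`model_post_init` usage in this repo suggests) of how the new schema parses the docs example:

```python
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class ProfilingType(str, Enum):
    NSYS = "nsys"
    TORCH = "torch"
    NONE = "none"


class ProfilingPhaseConfig(BaseModel):
    isl: Optional[int] = Field(None, description="Input sequence length")
    osl: Optional[int] = Field(None, description="Output sequence length")
    concurrency: Optional[int] = Field(None, description="Batch size / concurrency")
    start_step: Optional[int] = Field(None, description="Profiling start step")
    stop_step: Optional[int] = Field(None, description="Profiling stop step")


class ProfilingConfig(BaseModel):
    type: ProfilingType = Field(ProfilingType.NONE, description="Profiling type")
    prefill: Optional[ProfilingPhaseConfig] = None
    decode: Optional[ProfilingPhaseConfig] = None


# The string "nsys" is coerced into the enum; omitted phases stay None.
cfg = ProfilingConfig.model_validate(
    {"type": "nsys", "decode": {"isl": 8, "osl": 16, "concurrency": 1024}}
)
assert cfg.type is ProfilingType.NSYS
assert cfg.prefill is None and cfg.decode.concurrency == 1024
```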