Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
bbe48fa
[None][feat] Add DWDP (Distributed Weight Data Parallelism) support f…
tianyuz-nv Mar 19, 2026
7e4a855
Merge upstream/main into dwdp_productization
tianyuz-nv Mar 23, 2026
f76d026
Register DWDP accuracy test in CI test lists
tianyuz-nv Mar 23, 2026
f153099
Fix CI: remove forbidden from_dict, init helper, apply pre-commit for…
tianyuz-nv Mar 23, 2026
38d6e92
Fix CI and add env var config for disaggregated benchmark
tianyuz-nv Mar 24, 2026
d52a3cf
Merge remote-tracking branch 'upstream/main' into dwdp_productization
tianyuz-nv Mar 24, 2026
f52716d
Remove unused helper variable to fix ruff F841
tianyuz-nv Mar 24, 2026
64a7c7d
Merge branch 'main' into dwdp_productization
Kefeng-Duan Mar 28, 2026
47618f3
Merge branch 'main' into dwdp_productization
Kefeng-Duan Mar 29, 2026
b10dc89
Improve DwdpConfig docstring per review feedback
tianyuz-nv Mar 30, 2026
7040370
Decouple DWDP from mainline disagg scripts per reviewer feedback
tianyuz-nv Mar 30, 2026
28390e5
fix(moe): remove commented-out barrier in moeA2AInitializeOp
tianyuz-nv Mar 30, 2026
a4085ea
fix(dwdp): move record_compute_and_prefetch_next to per-layer level
tianyuz-nv Mar 30, 2026
a8a0a8f
style: fix pre-commit formatting for DWDP files
tianyuz-nv Mar 30, 2026
26e3e9f
Merge remote-tracking branch 'upstream/main' into dwdp_productization
tianyuz-nv Mar 31, 2026
85def53
refactor(dwdp): address reviewer feedback on DWDP config and lifecycle
tianyuz-nv Mar 31, 2026
afcd735
test(api_stability): align llm.yaml dwdp_config with Optional + proto…
tianyuz-nv Mar 31, 2026
be12482
Merge remote-tracking branch 'upstream/main' into dwdp_productization
tianyuz-nv Apr 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp
Comment thread
tianyuz-nv marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ torch::Tensor moeA2AInitializeOp(torch::Tensor const& workspace, int64_t epRank,

// Synchronize among ranks
cudaDeviceSynchronize();
tensorrt_llm::mpi::MpiComm::world().barrier();
tensorrt_llm::mpi::MpiComm::session().barrier();

return metainfo;
}
Expand Down
189 changes: 189 additions & 0 deletions examples/disaggregated/slurm/benchmark/disaggr_torch_dwdp.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/bin/bash
set -euo pipefail

# Consume "--flag value" pairs until the argument list is exhausted.
# Any flag outside the known set aborts the job submission immediately.
while (( $# > 0 )); do
    flag="$1"
    case "${flag}" in
        # Benchmark Configuration
        --benchmark-mode)     benchmark_mode="$2" ;;

        # Environment and paths
        --trtllm-repo)        trtllm_repo="$2" ;;
        --work-dir)           work_dir="$2" ;;
        --full-logdir)        full_logdir="$2" ;;
        --container-name)     container_name="$2" ;;
        --container-mount)    container_mount="$2" ;;
        --container-image)    container_image="$2" ;;
        --build-wheel)        build_wheel="$2" ;;
        --cuda-architectures) cuda_architectures="$2" ;;
        --trtllm-wheel-path)  trtllm_wheel_path="$2" ;;
        *)
            echo "Unknown argument: $1"
            exit 1
            ;;
    esac
    # Every recognized option takes exactly one value.
    shift 2
done

# Print all parsed arguments so the job log records the exact benchmark setup.
echo "Parsed arguments:"
echo
echo "Benchmark Configuration:"
echo " benchmark_mode: ${benchmark_mode}"
echo
echo "Environment Configuration:"
echo " trtllm_repo: ${trtllm_repo}"
echo " work_dir: ${work_dir}"
echo " full_logdir: ${full_logdir}"
# container_name was parsed above but previously missing from this dump.
echo " container_name: ${container_name}"
echo " container_mount: ${container_mount}"
echo " container_image: ${container_image}"
echo " build_wheel: ${build_wheel}"
echo " cuda_architectures: ${cuda_architectures}"
echo " trtllm_wheel_path: ${trtllm_wheel_path}"

# Set TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode.
# The worker-launch loop later in this script checks this variable to skip
# context-worker commands entirely.
if [ "${benchmark_mode}" = "gen_only_no_context" ]; then
    export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
    echo "Setting TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode"
fi

# Abort the whole run: report why, cancel every task in this SLURM job
# (server and workers included), and terminate this script with failure.
cleanup_on_failure() {
    local reason="$1"
    echo "Error: ${reason}"
    scancel ${SLURM_JOB_ID}
    exit 1
}

# Render a template file to a new path, substituting every
# "<nodeK_placeholder>" token with the K-th entry of a comma-separated
# node list. Used to turn pre-generated command/config templates into
# files that reference the hostnames SLURM actually allocated.
replace_placeholder() {
    local template="$1"
    local node_csv="$2"
    local rendered="$3"

    cp "${template}" "${rendered}"

    local nodes
    IFS=',' read -r -a nodes <<< "${node_csv}"

    local idx
    for idx in "${!nodes[@]}"; do
        local hostname_value="${nodes[$idx]}"
        local token="<node${idx}_placeholder>"

        # Use sed to replace the placeholder with the value in-place
        sed -i "s|${token}|${hostname_value}|g" "${rendered}"
        echo "Replaced ${token} with ${hostname_value} in ${rendered}"
    done
}

# Snapshot the full launch environment for post-mortem debugging.
env > ${full_logdir}/environment.txt

# Start container
# A trivial `echo` is srun'd so the container image is pulled and the named
# container is created on the nodes before any real work runs.
echo "Starting container..."
if ! srun -l --container-image=${container_image} \
    --container-name=${container_name} \
    --container-mounts=${container_mount} \
    --mpi=pmix \
    echo "Container up." &> ${full_logdir}/1_container_launch.log; then
    cleanup_on_failure "Failed to start container. Check ${full_logdir}/1_container_launch.log"
fi

# Install TensorRT-LLM
# Three strategies, in priority order:
#   1. pre-built wheel path given -> pip install the wheel on every node
#   2. repo checkout given        -> optionally build the wheel, then editable install
#   3. neither                    -> use the TensorRT-LLM already inside the container
if [ -n "${trtllm_wheel_path}" ]; then
    # Install from pre-built wheel if path is provided
    echo "Installing TensorRT-LLM from wheel: ${trtllm_wheel_path}..."
    if ! srun --container-name=${container_name} \
        --container-mounts=${container_mount} --no-container-mount-home \
        --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
        bash -c "pip install ${trtllm_wheel_path}[devel]" \
        &> ${full_logdir}/2_install.log; then
        cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM wheel installation completed successfully"
elif [ -d "${trtllm_repo}" ]; then
    # Build and install from repository if no wheel path provided
    echo "Installing TensorRT-LLM from ${trtllm_repo}..."
    TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
    echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"

    if [ "${build_wheel}" = "true" ]; then
        # The wheel only needs to be built once, so restrict the build to a
        # single node (-N 1); the install step below runs on all nodes.
        echo "Building TensorRT-LLM wheel on one node..."
        build_command="python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt --benchmarks --use_ccache --clean"
        if [ -n "${cuda_architectures:-}" ]; then
            build_command="${build_command} --cuda_architectures \"${cuda_architectures}\""
        fi
        if ! srun --container-name=${container_name} \
            --container-mounts=${container_mount} \
            --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
            bash -c "cd ${trtllm_repo} && ${build_command}" \
            &> ${full_logdir}/2_build.log; then
            cleanup_on_failure "TensorRT-LLM build failed. Check ${full_logdir}/2_build.log for details"
        fi
        echo "TensorRT-LLM build completed successfully"
    fi

    echo "Installing TensorRT-LLM..."
    # Editable install on every node so all ranks import the same source tree.
    if ! srun --container-name=${container_name} \
        --container-mounts=${container_mount} --no-container-mount-home \
        --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
        bash -c "cd ${trtllm_repo} && pip install -e .[devel]" \
        &> ${full_logdir}/2_install.log; then
        cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM installation completed successfully"
else
    echo "trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
    # get_env file is in the same directory as this script
    get_env_file=${work_dir}/get_env.py
    # Dump the container's TensorRT-LLM environment so the run is reproducible
    # even without a repo or wheel reference.
    if ! srun --container-name=${container_name} \
        --container-mounts=${container_mount} --no-container-mount-home \
        --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
        bash -c "python ${get_env_file} -e ${full_logdir}/env_vars.json" \
        &> ${full_logdir}/2_get_env.log; then
        cleanup_on_failure "Failed to get TensorRT-LLM environment variables. Check ${full_logdir}/2_get_env.log for details"
    fi
    echo "TensorRT-LLM environment variables saved to ${full_logdir}/env_vars.json"
fi

# Get node lists and replace the placeholder with the actual node names.
# submit.py wrote command/config templates containing "<nodeK_placeholder>"
# tokens; render them against the hostnames SLURM allocated to this job.
echo "SLURM_NODELIST: ${SLURM_NODELIST}"
all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
all_nodes_str=$(IFS=','; echo "${all_nodes[*]}")
echo "all_nodes_str: ${all_nodes_str}"

# Worker launch commands.
start_server_cmds_base_file=${full_logdir}/start_server_cmds_base.sh
start_server_cmds_file=${full_logdir}/start_server_cmds.sh
replace_placeholder "${start_server_cmds_base_file}" "${all_nodes_str}" "${start_server_cmds_file}"
# Disaggregated server configuration.
server_config_base_file=${full_logdir}/server_config_base.yaml
server_config_file=${full_logdir}/server_config.yaml
replace_placeholder "${server_config_base_file}" "${all_nodes_str}" "${server_config_file}"
# MPI worker configuration (optional: only rendered when the template exists).
mpi_worker_config_base_file=${full_logdir}/mpi_worker_config_base.yaml
mpi_worker_config_file=${full_logdir}/mpi_worker_config.yaml
if [ -f "${mpi_worker_config_base_file}" ]; then
    replace_placeholder "${mpi_worker_config_base_file}" "${all_nodes_str}" "${mpi_worker_config_file}"
fi
# Benchmark client commands.
client_cmds_base_file=${full_logdir}/client_cmds_base.sh
client_cmds_file=${full_logdir}/client_cmds.sh
replace_placeholder "${client_cmds_base_file}" "${all_nodes_str}" "${client_cmds_file}"

# start the servers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set).
# Commands are read on fd 3 (same pattern as the client loop below) so that an
# eval'd command reading stdin cannot swallow the remaining command lines,
# which could happen with `cat file | while read`.
echo "Starting worker commands from ${start_server_cmds_file}..."
while read -r cmd <&3; do
    # Skip ctx worker commands if in gen-only mode
    # CTX appears as argument to start_worker.sh and in log filename
    if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" = "1" ] && [[ "$cmd" == *"start_worker.sh CTX"* ]]; then
        echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}"
        continue
    fi
    echo "Executing command: ${cmd}"
    eval "${cmd}"
done 3< "${start_server_cmds_file}"
echo "Server is ready!"

# Start client commands, sequentially; the first failure cancels the whole job.
echo "Starting client commands from ${client_cmds_file}..."
while read -r cmd <&3; do
    echo "Starting client command: ${cmd}"
    # Run the eval inside the `if !` condition: under `set -e` a bare eval
    # followed by an `$? -ne 0` check is dead code (the script would already
    # have exited on failure, never reaching cleanup_on_failure).
    if ! eval "${cmd}"; then
        cleanup_on_failure "Command failed: ${cmd}."
    fi
done 3< "${client_cmds_file}"

echo "Job completed successfully, total runtime: $SECONDS seconds"

# try to kill the server and workers
scancel ${SLURM_JOB_ID}
61 changes: 61 additions & 0 deletions examples/disaggregated/slurm/benchmark/start_worker_dwdp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#! /bin/bash
set -u
set -e
set -x

# Positional arguments (supplied by the slurm benchmark launcher):
#   1: path to the disaggregated-worker YAML config
#   2: "true" to bind memory allocations to NUMA nodes 0,1
#   3: directory for log / nsys output
#   4: "true" to wrap the worker in an nsys profiling session
#   5: profile iteration range for CTX ranks
#   6: profile iteration range for GEN ranks
#   7: number of GPUs devoted to context workers (rank threshold)
#   8: whitespace-separated KEY=VALUE list exported on CTX ranks
#   9: whitespace-separated KEY=VALUE list exported on GEN ranks
worker_config=${1}
bind_numa=${2}
output_dir=${3}
use_nsys=${4}
ctx_range=${5}
gen_range=${6}
ctx_gpu_count=${7}
ctx_env_list=${8}
gen_env_list=${9}

# Drop any UCX transport settings inherited from the launch environment.
unset UCX_NET_DEVICES
unset UCX_TLS

echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname)"

# Ranks below the context GPU count act as context (prefill) workers;
# every remaining rank acts as a generation (decode) worker.
if [ "${SLURM_PROCID}" -lt "${ctx_gpu_count}" ]; then
    worker_role="CTX"
    role_env_list=${ctx_env_list}
    profile_range=${ctx_range}
else
    worker_role="GEN"
    role_env_list=${gen_env_list}
    profile_range=${gen_range}
fi

echo "worker_role: ${worker_role}, profile_range: ${profile_range}"

# Export the per-role environment overrides (word-split on whitespace).
for kv in ${role_env_list}; do
    export "${kv}"
    echo "Exported: ${kv}"
done

# Optional NUMA memory binding for GB200/GB300 NVL72 nodes.
if [ "${bind_numa}" = "true" ]; then
    numa_bind_cmd="numactl -m 0,1"
    echo "numactl -m 0,1 - Only allocate memory from nodes on GB200/GB300 NVL72"
else
    numa_bind_cmd=""
    echo "Not binding memory. If on GB200/GB300 NVL72, use \"numactl -m 0,1\" to only allocate memory from nodes."
fi

echo "config_file: ${worker_config}"

# Optional nsys profiling wrapper; profiling env vars are only exported
# when nsys is enabled.
nsys_prefix=""
if [ "${use_nsys}" = "true" ]; then
    nsys_file=${output_dir}/nsys_worker_proc_${worker_role}_${SLURM_PROCID}
    export TLLM_PROFILE_RECORD_GC=1
    export TLLM_NVTX_DEBUG=1
    export NSYS_MPI_STORE_TEAMS_PER_RANK=1
    export TLLM_PROFILE_START_STOP=${profile_range}
    echo "nsys is enabled on ${worker_role} ranks, TLLM_PROFILE_START_STOP=${profile_range}"
    nsys_prefix="nsys profile -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
else
    echo "nsys is not enabled, start normal flow"
fi

# Launch the disaggregated MPI worker, optionally under nsys and/or numactl.
${nsys_prefix} ${numa_bind_cmd} trtllm-serve disaggregated_mpi_worker -c ${worker_config}
12 changes: 5 additions & 7 deletions examples/disaggregated/slurm/benchmark/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,13 @@ def assign_servers(
server_allocations[server_type][i] = server_allocation
port += 1

assign_servers(allocations, "GEN", num_gen_servers, gen_world_size,
gpus_per_node)
# Keep the allocation order aligned with disagg_utils, which builds
# server_configs as ctx_cfgs + gen_cfgs and assigns rank offsets in that
# same order during split_world_comm().
assign_servers(allocations, "CTX", num_ctx_servers, ctx_world_size,
gpus_per_node)
assign_servers(allocations, "GEN", num_gen_servers, gen_world_size,
gpus_per_node)

return allocations

Expand Down Expand Up @@ -506,17 +509,13 @@ def submit_job(config, log_dir, dry_run):
}
}

# Generate start worker commands with placeholder hostnames
for server_type in allocations.keys():
server_cfg = server_configs[server_type]

for server_id in allocations[server_type].keys():
allocation = allocations[server_type][server_id]
# Get GPU IDs for this server from allocation
# When multi-node, all nodes have same device list, so use first node [0]
gpu_ids = list(allocation["nodes"].values())[0]

# Build environment for this worker
cuda_devices = ','.join(map(str, gpu_ids))
worker_env = build_worker_environment(
worker_config=worker_config,
Expand All @@ -529,7 +528,6 @@ def submit_job(config, log_dir, dry_run):
)
export_str = format_export_string(worker_env)

# Use script_dir for start_worker.sh
cmd = [
"srun -l",
f"--nodelist {','.join(allocation['nodes'].keys())}",
Expand Down
Loading
Loading