Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
bbe48fa
[None][feat] Add DWDP (Distributed Weight Data Parallelism) support f…
tianyuz-nv Mar 19, 2026
7e4a855
Merge upstream/main into dwdp_productization
tianyuz-nv Mar 23, 2026
f76d026
Register DWDP accuracy test in CI test lists
tianyuz-nv Mar 23, 2026
f153099
Fix CI: remove forbidden from_dict, init helper, apply pre-commit for…
tianyuz-nv Mar 23, 2026
38d6e92
Fix CI and add env var config for disaggregated benchmark
tianyuz-nv Mar 24, 2026
d52a3cf
Merge remote-tracking branch 'upstream/main' into dwdp_productization
tianyuz-nv Mar 24, 2026
f52716d
Remove unused helper variable to fix ruff F841
tianyuz-nv Mar 24, 2026
64a7c7d
Merge branch 'main' into dwdp_productization
Kefeng-Duan Mar 28, 2026
47618f3
Merge branch 'main' into dwdp_productization
Kefeng-Duan Mar 29, 2026
b10dc89
Improve DwdpConfig docstring per review feedback
tianyuz-nv Mar 30, 2026
7040370
Decouple DWDP from mainline disagg scripts per reviewer feedback
tianyuz-nv Mar 30, 2026
28390e5
fix(moe): remove commented-out barrier in moeA2AInitializeOp
tianyuz-nv Mar 30, 2026
a4085ea
fix(dwdp): move record_compute_and_prefetch_next to per-layer level
tianyuz-nv Mar 30, 2026
a8a0a8f
style: fix pre-commit formatting for DWDP files
tianyuz-nv Mar 30, 2026
26e3e9f
Merge remote-tracking branch 'upstream/main' into dwdp_productization
tianyuz-nv Mar 31, 2026
85def53
refactor(dwdp): address reviewer feedback on DWDP config and lifecycle
tianyuz-nv Mar 31, 2026
afcd735
test(api_stability): align llm.yaml dwdp_config with Optional + proto…
tianyuz-nv Mar 31, 2026
be12482
Merge remote-tracking branch 'upstream/main' into dwdp_productization
tianyuz-nv Apr 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp
Comment thread
tianyuz-nv marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ torch::Tensor moeA2AInitializeOp(torch::Tensor const& workspace, int64_t epRank,

// Synchronize among ranks
cudaDeviceSynchronize();
tensorrt_llm::mpi::MpiComm::world().barrier();
tensorrt_llm::mpi::MpiComm::session().barrier();

return metainfo;
}
Expand Down
189 changes: 189 additions & 0 deletions examples/disaggregated/slurm/benchmark/disaggr_torch_dwdp.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/bin/bash
set -euo pipefail

# Consume "--flag value" pairs until the argument list is exhausted.
# Any flag outside the known set aborts the job submission immediately.
while (( $# > 0 )); do
    flag="$1"
    case "${flag}" in
        # Benchmark Configuration
        --benchmark-mode)     benchmark_mode="$2" ;;

        # Environment and paths
        --trtllm-repo)        trtllm_repo="$2" ;;
        --work-dir)           work_dir="$2" ;;
        --full-logdir)        full_logdir="$2" ;;
        --container-name)     container_name="$2" ;;
        --container-mount)    container_mount="$2" ;;
        --container-image)    container_image="$2" ;;
        --build-wheel)        build_wheel="$2" ;;
        --cuda-architectures) cuda_architectures="$2" ;;
        --trtllm-wheel-path)  trtllm_wheel_path="$2" ;;
        *)
            echo "Unknown argument: $1"
            exit 1
            ;;
    esac
    # Every recognized option takes exactly one value.
    shift 2
done

# Print all parsed arguments so the job log records the exact benchmark setup.
echo "Parsed arguments:"
echo
echo "Benchmark Configuration:"
echo " benchmark_mode: ${benchmark_mode}"
echo
echo "Environment Configuration:"
echo " trtllm_repo: ${trtllm_repo}"
echo " work_dir: ${work_dir}"
echo " full_logdir: ${full_logdir}"
# container_name was parsed above but previously missing from this dump.
echo " container_name: ${container_name}"
echo " container_mount: ${container_mount}"
echo " container_image: ${container_image}"
echo " build_wheel: ${build_wheel}"
echo " cuda_architectures: ${cuda_architectures}"
echo " trtllm_wheel_path: ${trtllm_wheel_path}"

# Set TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode.
# The worker-launch loop later in this script checks this variable to skip
# context-worker commands entirely.
if [ "${benchmark_mode}" = "gen_only_no_context" ]; then
    export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
    echo "Setting TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode"
fi

# Abort the whole run: report why, cancel every task in this SLURM job
# (server and workers included), and terminate this script with failure.
cleanup_on_failure() {
    local reason="$1"
    echo "Error: ${reason}"
    scancel ${SLURM_JOB_ID}
    exit 1
}

# Render a template file to a new path, substituting every
# "<nodeK_placeholder>" token with the K-th entry of a comma-separated
# node list. Used to turn pre-generated command/config templates into
# files that reference the hostnames SLURM actually allocated.
replace_placeholder() {
    local template="$1"
    local node_csv="$2"
    local rendered="$3"

    cp "${template}" "${rendered}"

    local nodes
    IFS=',' read -r -a nodes <<< "${node_csv}"

    local idx
    for idx in "${!nodes[@]}"; do
        local hostname_value="${nodes[$idx]}"
        local token="<node${idx}_placeholder>"

        # Use sed to replace the placeholder with the value in-place
        sed -i "s|${token}|${hostname_value}|g" "${rendered}"
        echo "Replaced ${token} with ${hostname_value} in ${rendered}"
    done
}

# Snapshot the full launch environment for post-mortem debugging.
env > ${full_logdir}/environment.txt

# Start container
# A trivial `echo` is srun'd so the container image is pulled and the named
# container is created on the nodes before any real work runs.
echo "Starting container..."
if ! srun -l --container-image=${container_image} \
    --container-name=${container_name} \
    --container-mounts=${container_mount} \
    --mpi=pmix \
    echo "Container up." &> ${full_logdir}/1_container_launch.log; then
    cleanup_on_failure "Failed to start container. Check ${full_logdir}/1_container_launch.log"
fi

# Install TensorRT-LLM
# Three strategies, in priority order:
#   1. pre-built wheel path given -> pip install the wheel on every node
#   2. repo checkout given        -> optionally build the wheel, then editable install
#   3. neither                    -> use the TensorRT-LLM already inside the container
if [ -n "${trtllm_wheel_path}" ]; then
    # Install from pre-built wheel if path is provided
    echo "Installing TensorRT-LLM from wheel: ${trtllm_wheel_path}..."
    if ! srun --container-name=${container_name} \
        --container-mounts=${container_mount} --no-container-mount-home \
        --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
        bash -c "pip install ${trtllm_wheel_path}[devel]" \
        &> ${full_logdir}/2_install.log; then
        cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM wheel installation completed successfully"
elif [ -d "${trtllm_repo}" ]; then
    # Build and install from repository if no wheel path provided
    echo "Installing TensorRT-LLM from ${trtllm_repo}..."
    TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
    echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"

    if [ "${build_wheel}" = "true" ]; then
        # The wheel only needs to be built once, so restrict the build to a
        # single node (-N 1); the install step below runs on all nodes.
        echo "Building TensorRT-LLM wheel on one node..."
        build_command="python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt --benchmarks --use_ccache --clean"
        if [ -n "${cuda_architectures:-}" ]; then
            build_command="${build_command} --cuda_architectures \"${cuda_architectures}\""
        fi
        if ! srun --container-name=${container_name} \
            --container-mounts=${container_mount} \
            --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
            bash -c "cd ${trtllm_repo} && ${build_command}" \
            &> ${full_logdir}/2_build.log; then
            cleanup_on_failure "TensorRT-LLM build failed. Check ${full_logdir}/2_build.log for details"
        fi
        echo "TensorRT-LLM build completed successfully"
    fi

    echo "Installing TensorRT-LLM..."
    # Editable install on every node so all ranks import the same source tree.
    if ! srun --container-name=${container_name} \
        --container-mounts=${container_mount} --no-container-mount-home \
        --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
        bash -c "cd ${trtllm_repo} && pip install -e .[devel]" \
        &> ${full_logdir}/2_install.log; then
        cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM installation completed successfully"
else
    echo "trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
    # get_env file is in the same directory as this script
    get_env_file=${work_dir}/get_env.py
    # Dump the container's TensorRT-LLM environment so the run is reproducible
    # even without a repo or wheel reference.
    if ! srun --container-name=${container_name} \
        --container-mounts=${container_mount} --no-container-mount-home \
        --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
        bash -c "python ${get_env_file} -e ${full_logdir}/env_vars.json" \
        &> ${full_logdir}/2_get_env.log; then
        cleanup_on_failure "Failed to get TensorRT-LLM environment variables. Check ${full_logdir}/2_get_env.log for details"
    fi
    echo "TensorRT-LLM environment variables saved to ${full_logdir}/env_vars.json"
fi

# Get node lists and replace the placeholder with the actual node names.
# submit.py wrote command/config templates containing "<nodeK_placeholder>"
# tokens; render them against the hostnames SLURM allocated to this job.
echo "SLURM_NODELIST: ${SLURM_NODELIST}"
all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
all_nodes_str=$(IFS=','; echo "${all_nodes[*]}")
echo "all_nodes_str: ${all_nodes_str}"

# Worker launch commands.
start_server_cmds_base_file=${full_logdir}/start_server_cmds_base.sh
start_server_cmds_file=${full_logdir}/start_server_cmds.sh
replace_placeholder "${start_server_cmds_base_file}" "${all_nodes_str}" "${start_server_cmds_file}"
# Disaggregated server configuration.
server_config_base_file=${full_logdir}/server_config_base.yaml
server_config_file=${full_logdir}/server_config.yaml
replace_placeholder "${server_config_base_file}" "${all_nodes_str}" "${server_config_file}"
# MPI worker configuration (optional: only rendered when the template exists).
mpi_worker_config_base_file=${full_logdir}/mpi_worker_config_base.yaml
mpi_worker_config_file=${full_logdir}/mpi_worker_config.yaml
if [ -f "${mpi_worker_config_base_file}" ]; then
    replace_placeholder "${mpi_worker_config_base_file}" "${all_nodes_str}" "${mpi_worker_config_file}"
fi
# Benchmark client commands.
client_cmds_base_file=${full_logdir}/client_cmds_base.sh
client_cmds_file=${full_logdir}/client_cmds.sh
replace_placeholder "${client_cmds_base_file}" "${all_nodes_str}" "${client_cmds_file}"

# start the servers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set).
# Commands are read on fd 3 (same pattern as the client loop below) so that an
# eval'd command reading stdin cannot swallow the remaining command lines,
# which could happen with `cat file | while read`.
echo "Starting worker commands from ${start_server_cmds_file}..."
while read -r cmd <&3; do
    # Skip ctx worker commands if in gen-only mode
    # CTX appears as argument to start_worker.sh and in log filename
    if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" = "1" ] && [[ "$cmd" == *"start_worker.sh CTX"* ]]; then
        echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}"
        continue
    fi
    echo "Executing command: ${cmd}"
    eval "${cmd}"
done 3< "${start_server_cmds_file}"
echo "Server is ready!"

# Start client commands, sequentially; the first failure cancels the whole job.
echo "Starting client commands from ${client_cmds_file}..."
while read -r cmd <&3; do
    echo "Starting client command: ${cmd}"
    # Run the eval inside the `if !` condition: under `set -e` a bare eval
    # followed by an `$? -ne 0` check is dead code (the script would already
    # have exited on failure, never reaching cleanup_on_failure).
    if ! eval "${cmd}"; then
        cleanup_on_failure "Command failed: ${cmd}."
    fi
done 3< "${client_cmds_file}"

echo "Job completed successfully, total runtime: $SECONDS seconds"

# try to kill the server and workers
scancel ${SLURM_JOB_ID}
61 changes: 61 additions & 0 deletions examples/disaggregated/slurm/benchmark/start_worker_dwdp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#! /bin/bash
set -u
set -e
set -x

# Positional arguments (supplied by the slurm benchmark launcher):
#   1: path to the disaggregated-worker YAML config
#   2: "true" to bind memory allocations to NUMA nodes 0,1
#   3: directory for log / nsys output
#   4: "true" to wrap the worker in an nsys profiling session
#   5: profile iteration range for CTX ranks
#   6: profile iteration range for GEN ranks
#   7: number of GPUs devoted to context workers (rank threshold)
#   8: whitespace-separated KEY=VALUE list exported on CTX ranks
#   9: whitespace-separated KEY=VALUE list exported on GEN ranks
worker_config=${1}
bind_numa=${2}
output_dir=${3}
use_nsys=${4}
ctx_range=${5}
gen_range=${6}
ctx_gpu_count=${7}
ctx_env_list=${8}
gen_env_list=${9}

# Drop any UCX transport settings inherited from the launch environment.
unset UCX_NET_DEVICES
unset UCX_TLS

echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname)"

# Ranks below the context GPU count act as context (prefill) workers;
# every remaining rank acts as a generation (decode) worker.
if [ "${SLURM_PROCID}" -lt "${ctx_gpu_count}" ]; then
    worker_role="CTX"
    role_env_list=${ctx_env_list}
    profile_range=${ctx_range}
else
    worker_role="GEN"
    role_env_list=${gen_env_list}
    profile_range=${gen_range}
fi

echo "worker_role: ${worker_role}, profile_range: ${profile_range}"

# Export the per-role environment overrides (word-split on whitespace).
for kv in ${role_env_list}; do
    export "${kv}"
    echo "Exported: ${kv}"
done

# Optional NUMA memory binding for GB200/GB300 NVL72 nodes.
if [ "${bind_numa}" = "true" ]; then
    numa_bind_cmd="numactl -m 0,1"
    echo "numactl -m 0,1 - Only allocate memory from nodes on GB200/GB300 NVL72"
else
    numa_bind_cmd=""
    echo "Not binding memory. If on GB200/GB300 NVL72, use \"numactl -m 0,1\" to only allocate memory from nodes."
fi

echo "config_file: ${worker_config}"

# Optional nsys profiling wrapper; profiling env vars are only exported
# when nsys is enabled.
nsys_prefix=""
if [ "${use_nsys}" = "true" ]; then
    nsys_file=${output_dir}/nsys_worker_proc_${worker_role}_${SLURM_PROCID}
    export TLLM_PROFILE_RECORD_GC=1
    export TLLM_NVTX_DEBUG=1
    export NSYS_MPI_STORE_TEAMS_PER_RANK=1
    export TLLM_PROFILE_START_STOP=${profile_range}
    echo "nsys is enabled on ${worker_role} ranks, TLLM_PROFILE_START_STOP=${profile_range}"
    nsys_prefix="nsys profile -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
else
    echo "nsys is not enabled, start normal flow"
fi

# Launch the disaggregated MPI worker, optionally under nsys and/or numactl.
${nsys_prefix} ${numa_bind_cmd} trtllm-serve disaggregated_mpi_worker -c ${worker_config}
12 changes: 5 additions & 7 deletions examples/disaggregated/slurm/benchmark/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,13 @@ def assign_servers(
server_allocations[server_type][i] = server_allocation
port += 1

assign_servers(allocations, "GEN", num_gen_servers, gen_world_size,
gpus_per_node)
# Keep the allocation order aligned with disagg_utils, which builds
# server_configs as ctx_cfgs + gen_cfgs and assigns rank offsets in that
# same order during split_world_comm().
assign_servers(allocations, "CTX", num_ctx_servers, ctx_world_size,
gpus_per_node)
assign_servers(allocations, "GEN", num_gen_servers, gen_world_size,
gpus_per_node)

return allocations

Expand Down Expand Up @@ -506,17 +509,13 @@ def submit_job(config, log_dir, dry_run):
}
}

# Generate start worker commands with placeholder hostnames
for server_type in allocations.keys():
server_cfg = server_configs[server_type]

for server_id in allocations[server_type].keys():
allocation = allocations[server_type][server_id]
# Get GPU IDs for this server from allocation
# When multi-node, all nodes have same device list, so use first node [0]
gpu_ids = list(allocation["nodes"].values())[0]

# Build environment for this worker
cuda_devices = ','.join(map(str, gpu_ids))
worker_env = build_worker_environment(
worker_config=worker_config,
Expand All @@ -529,7 +528,6 @@ def submit_job(config, log_dir, dry_run):
)
export_str = format_export_string(worker_env)

# Use script_dir for start_worker.sh
cmd = [
"srun -l",
f"--nodelist {','.join(allocation['nodes'].keys())}",
Expand Down
Loading
Loading