diff --git a/components/backends/trtllm/launch/agg.sh b/components/backends/trtllm/launch/agg.sh index 5c7021c59c..5070b10f45 100755 --- a/components/backends/trtllm/launch/agg.sh +++ b/components/backends/trtllm/launch/agg.sh @@ -9,6 +9,7 @@ export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export MODALITY=${MODALITY:-"text"} # If you want to use multimodal, set MODALITY to "multimodal" #export MODALITY=${MODALITY:-"multimodal"} +export MPI_CMD=${MPI_CMD:-""} # Setup cleanup trap cleanup() { @@ -21,11 +22,11 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --http-port 8000 & DYNAMO_PID=$! # run worker -python3 -m dynamo.trtllm \ +$MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --modality "$MODALITY" \ diff --git a/components/backends/trtllm/launch/agg_metrics.sh b/components/backends/trtllm/launch/agg_metrics.sh index 3232576d76..212a012c41 100755 --- a/components/backends/trtllm/launch/agg_metrics.sh +++ b/components/backends/trtllm/launch/agg_metrics.sh @@ -7,6 +7,7 @@ export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export MODALITY=${MODALITY:-"text"} +export MPI_CMD=${MPI_CMD:-""} # Setup cleanup trap cleanup() { @@ -18,12 +19,12 @@ cleanup() { trap cleanup EXIT INT TERM # Run frontend -python3 -m dynamo.frontend --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --http-port 8000 & DYNAMO_PID=$! # Run worker DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \ -python3 -m dynamo.trtllm \ +$MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --modality "$MODALITY" \ diff --git a/components/backends/trtllm/launch/agg_router.sh b/components/backends/trtllm/launch/agg_router.sh index ca6d439e63..907e6d4a8f 100755 --- a/components/backends/trtllm/launch/agg_router.sh +++ b/components/backends/trtllm/launch/agg_router.sh @@ -6,6 +6,7 @@ export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} +export MPI_CMD=${MPI_CMD:-""} # Setup cleanup trap cleanup() { @@ -18,11 +19,11 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --router-mode kv --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --router-mode kv --http-port 8000 & DYNAMO_PID=$! # run worker -python3 -m dynamo.trtllm \ +$MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$AGG_ENGINE_ARGS" \ diff --git a/components/backends/trtllm/launch/disagg.sh b/components/backends/trtllm/launch/disagg.sh index f89eba5c9e..bedb685ffe 100755 --- a/components/backends/trtllm/launch/disagg.sh +++ b/components/backends/trtllm/launch/disagg.sh @@ -13,6 +13,7 @@ export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export MODALITY=${MODALITY:-"text"} # If you want to use multimodal, set MODALITY to "multimodal" #export MODALITY=${MODALITY:-"multimodal"} +export MPI_CMD=${MPI_CMD:-""} # Setup cleanup trap cleanup() { @@ -25,11 +26,11 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --http-port 8000 & DYNAMO_PID=$! # run prefill worker -CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \ @@ -39,7 +40,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ PREFILL_PID=$! # run decode worker -CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \ diff --git a/components/backends/trtllm/launch/disagg_router.sh b/components/backends/trtllm/launch/disagg_router.sh index e29c851a56..bef13e8ab3 100755 --- a/components/backends/trtllm/launch/disagg_router.sh +++ b/components/backends/trtllm/launch/disagg_router.sh @@ -10,6 +10,7 @@ export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} +export MPI_CMD=${MPI_CMD:-""} # Setup cleanup trap cleanup() { @@ -22,7 +23,7 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --router-mode kv --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --router-mode kv --http-port 8000 & DYNAMO_PID=$! @@ -35,7 +36,7 @@ else fi # run prefill worker -CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \ @@ -45,7 +46,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ PREFILL_PID=$! # run decode worker -CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \ diff --git a/components/backends/trtllm/launch/epd_disagg.sh b/components/backends/trtllm/launch/epd_disagg.sh index 60cfa1c249..40805c9413 100755 --- a/components/backends/trtllm/launch/epd_disagg.sh +++ b/components/backends/trtllm/launch/epd_disagg.sh @@ -16,6 +16,7 @@ export ENCODE_ENDPOINT=${ENCODE_ENDPOINT:-"dyn://dynamo.tensorrt_llm_encode.gene export MODALITY=${MODALITY:-"multimodal"} export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"} export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50} +export MPI_CMD=${MPI_CMD:-""} # Setup cleanup trap cleanup() { @@ -28,11 +29,11 @@ trap cleanup EXIT INT TERM # run frontend -python3 -m dynamo.frontend --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --http-port 8000 & DYNAMO_PID=$! # run encode worker -CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$ENCODE_ENGINE_ARGS" \ @@ -44,7 +45,7 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ENCODE_PID=$! # run prefill worker -CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \ @@ -55,7 +56,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ PREFILL_PID=$! # run decode worker -CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \ diff --git a/components/backends/trtllm/launch/gpt_oss_disagg.sh b/components/backends/trtllm/launch/gpt_oss_disagg.sh index db42c01771..67acedd6a8 100755 --- a/components/backends/trtllm/launch/gpt_oss_disagg.sh +++ b/components/backends/trtllm/launch/gpt_oss_disagg.sh @@ -8,17 +8,18 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/gpt_oss/prefill.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/gpt_oss/decode.yaml"} +export MPI_CMD=${MPI_CMD:-""} set -e trap 'echo Cleaning up...; kill 0' EXIT # run frontend -python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & +$MPI_CMD python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & # With tensor_parallel_size=4, each worker needs 4 GPUs # run prefill worker -CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=0,1,2,3 $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \ @@ -33,7 +34,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ --expert-parallel-size 4 & # run decode worker -CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \ +CUDA_VISIBLE_DEVICES=4,5,6,7 $MPI_CMD python3 -m dynamo.trtllm \ --model-path "$MODEL_PATH" \ --served-model-name "$SERVED_MODEL_NAME" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \