5 changes: 3 additions & 2 deletions components/backends/trtllm/launch/agg.sh
@@ -9,6 +9,7 @@ export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"}
+export MPI_CMD=${MPI_CMD:-""}

# Setup cleanup trap
cleanup() {
@@ -21,11 +22,11 @@ trap cleanup EXIT INT TERM


# run frontend
-python3 -m dynamo.frontend --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run worker
-python3 -m dynamo.trtllm \
+$MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
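Note on usage: MPI_CMD defaults to an empty string, so every launch script behaves exactly as before unless a launcher prefix is supplied at invocation time. A minimal sketch, assuming an OpenMPI-style mpirun is available in the container (the exact launcher and flags below are an assumption, not something this change prescribes):

    # default: no launcher prefix, identical to the previous behavior
    ./launch/agg.sh

    # wrap the frontend and worker processes in a single-rank mpirun
    MPI_CMD="mpirun -n 1 --allow-run-as-root" ./launch/agg.sh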
5 changes: 3 additions & 2 deletions components/backends/trtllm/launch/agg_metrics.sh
@@ -7,6 +7,7 @@ export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
+export MPI_CMD=${MPI_CMD:-""}

# Setup cleanup trap
cleanup() {
@@ -18,12 +19,12 @@ cleanup() {
trap cleanup EXIT INT TERM

# Run frontend
-python3 -m dynamo.frontend --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# Run worker
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
-python3 -m dynamo.trtllm \
+$MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
5 changes: 3 additions & 2 deletions components/backends/trtllm/launch/agg_router.sh
@@ -6,6 +6,7 @@
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
+export MPI_CMD=${MPI_CMD:-""}

# Setup cleanup trap
cleanup() {
@@ -18,11 +19,11 @@ trap cleanup EXIT INT TERM


# run frontend
-python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
DYNAMO_PID=$!

# run worker
-python3 -m dynamo.trtllm \
+$MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
7 changes: 4 additions & 3 deletions components/backends/trtllm/launch/disagg.sh
@@ -13,6 +13,7 @@ export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"}
+export MPI_CMD=${MPI_CMD:-""}

# Setup cleanup trap
cleanup() {
@@ -25,11 +26,11 @@ trap cleanup EXIT INT TERM


# run frontend
-python3 -m dynamo.frontend --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run prefill worker
-CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
@@ -39,7 +40,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
PREFILL_PID=$!

# run decode worker
-CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
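Because $MPI_CMD is expanded unquoted, an empty value vanishes after word splitting, so the CUDA_VISIBLE_DEVICES=... prefixes above still apply directly to python3, exactly as before. When MPI_CMD is non-empty, the assignment applies to the launcher process instead, and whether it reaches the launched ranks depends on the MPI implementation (OpenMPI, for instance, offers -x for explicitly exporting variables). A small sketch using env as a stand-in launcher:

    # empty prefix: expands to `CUDA_VISIBLE_DEVICES=1 python3 ...`
    MPI_CMD=""
    CUDA_VISIBLE_DEVICES=1 $MPI_CMD python3 -c 'import os; print(os.environ["CUDA_VISIBLE_DEVICES"])'

    # non-empty prefix: the assignment lands on the launcher (here `env`),
    # which passes its environment through to python3
    MPI_CMD="env"
    CUDA_VISIBLE_DEVICES=1 $MPI_CMD python3 -c 'import os; print(os.environ["CUDA_VISIBLE_DEVICES"])'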
7 changes: 4 additions & 3 deletions components/backends/trtllm/launch/disagg_router.sh
@@ -10,6 +10,7 @@ export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
+export MPI_CMD=${MPI_CMD:-""}

# Setup cleanup trap
cleanup() {
@@ -22,7 +23,7 @@ trap cleanup EXIT INT TERM


# run frontend
-python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --router-mode kv --http-port 8000 &
DYNAMO_PID=$!


@@ -35,7 +36,7 @@ else
fi

# run prefill worker
-CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
@@ -45,7 +46,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
PREFILL_PID=$!

# run decode worker
-CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
9 changes: 5 additions & 4 deletions components/backends/trtllm/launch/epd_disagg.sh
@@ -16,6 +16,7 @@ export ENCODE_ENDPOINT=${ENCODE_ENDPOINT:-"dyn://dynamo.tensorrt_llm_encode.gene
export MODALITY=${MODALITY:-"multimodal"}
export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
+export MPI_CMD=${MPI_CMD:-""}

# Setup cleanup trap
cleanup() {
@@ -28,11 +29,11 @@ trap cleanup EXIT INT TERM


# run frontend
-python3 -m dynamo.frontend --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run encode worker
-CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$ENCODE_ENGINE_ARGS" \
@@ -44,7 +45,7 @@ CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
ENCODE_PID=$!

# run prefill worker
-CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
@@ -55,7 +56,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
PREFILL_PID=$!

# run decode worker
-CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
7 changes: 4 additions & 3 deletions components/backends/trtllm/launch/gpt_oss_disagg.sh
@@ -8,17 +8,18 @@ export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/gpt_oss/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/gpt_oss/decode.yaml"}
+export MPI_CMD=${MPI_CMD:-""}

set -e
trap 'echo Cleaning up...; kill 0' EXIT


# run frontend
-python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
+$MPI_CMD python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &

# With tensor_parallel_size=4, each worker needs 4 GPUs
# run prefill worker
-CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=0,1,2,3 $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
@@ -33,7 +34,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
--expert-parallel-size 4 &

# run decode worker
-CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
+CUDA_VISIBLE_DEVICES=4,5,6,7 $MPI_CMD python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
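In this script each worker already spans four GPUs via CUDA_VISIBLE_DEVICES, so a launcher supplied through MPI_CMD wraps the whole worker process. A hypothetical single-rank wrapper that leaves process binding alone might look like the line below; the rank count and binding flags are assumptions about an OpenMPI-style launcher and should be adjusted to how TensorRT-LLM is set up to use MPI in the target deployment:

    MPI_CMD="mpirun -n 1 --bind-to none" ./launch/gpt_oss_disagg.sh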