Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions recipes/llama-3-70b/model-cache/model-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,22 @@ spec:
- secretRef:
name: hf-token-secret
env:
# NOTE: This is the model name for the llama-3-70b recipe.
# Update it to the name of the model you are downloading.
- name: MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
- name: HF_HOME
value: /model-store
- name: HF_HUB_ENABLE_HF_TRANSFER
value: "1"
- name: MODEL_REVISION
value: ddb4128556dfcff99e0c41aee159ea6c3e655dcd
args:
- |
set -eux
pip install --no-cache-dir huggingface_hub hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1
huggingface-cli download $MODEL_NAME
hf download $MODEL_NAME --revision $MODEL_REVISION
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface/hub
mountPath: /model-store
volumes:
- name: model-cache
persistentVolumeClaim:
Expand Down
11 changes: 8 additions & 3 deletions recipes/llama-3-70b/vllm/agg/deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ spec:
size: 20Gi
extraPodSpec:
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
Expand All @@ -42,6 +47,6 @@ spec:
replicas: 1
resources:
limits:
gpu: "8"
gpu: "4"
requests:
gpu: "8"
gpu: "4"
159 changes: 115 additions & 44 deletions recipes/llama-3-70b/vllm/agg/perf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ kind: Job
metadata:
name: llama3-70b-agg-perf
spec:
backoffLimit: 3
backoffLimit: 1
completions: 1
parallelism: 1
template:
Expand All @@ -16,57 +16,128 @@ spec:
restartPolicy: Never
containers:
- name: perf
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
- |
# wait for the model to be ready
export ENDPOINT=llama3-70b-agg-0:8000
export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
export INTERVAL=5
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..."
sleep $INTERVAL
done
echo "✅ Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
# now run the benchmark
export ARTIFACT_DIR="/tmp/genai"
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
apt-get update && apt-get install -y curl jq procps git && apt-get clean
pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
aiperf profile \
--model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \
--concurrency 64 \
--warmup-request-count 2 \
--request-count 320 \
--extra-inputs max_tokens:1024 \
--synthetic-input-tokens-mean 8192 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 1024 \
--output-tokens-stddev 0 \
--extra-inputs min_tokens:1024 \
--extra-inputs ignore_eos:true \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--random-seed 1418186270 \
--artifact-dir $ARTIFACT_DIR \
--num-dataset-entries=3000 -- \
--max-threads 64
echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq .
echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV
echo "Benchmark completed successfully!"
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
# wait_for_model_ready
# Blocks until http://$ENDPOINT/v1/models lists an entry whose .id equals
# $TARGET_MODEL, polling every 5 seconds, then prints the full models response.
# Reads ENDPOINT and TARGET_MODEL from the environment; requires curl and jq.
# There is no timeout: the loop only exits once the model id is found.
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
# jq -e exits non-zero when the select() yields nothing, which keeps the loop
# running; a failed curl yields empty input and is treated the same way.
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep 5
done
# NOTE(review): the next two echo lines differ only by the emoji and look like
# the before/after of a single change — confirm only one is intended.
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
# Dump the full model listing for the job log.
curl -s "http://$ENDPOINT/v1/models" | jq .
}
# run_perf CONCURRENCY ISL OSL
# Runs a single `aiperf profile` pass against http://$ENDPOINT for $TARGET_MODEL.
#   $1  concurrency  - passed to --concurrency, --warmup-request-count and
#                      --workers-max; --request-count is 10x this value
#   $2  isl          - passed to --synthetic-input-tokens-mean (stddev 0)
#   $3  osl          - passed to --output-tokens-mean (stddev 0) and used for
#                      both max_tokens and min_tokens
# Reads ROOT_ARTIFACT_DIR, EPOCH, JOB_NAME, TARGET_MODEL and ENDPOINT from the
# caller's environment. Exports ARTIFACT_DIR, set to
# ${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/concurrency_<CONCURRENCY>, creates
# that directory and lists it after the run.
run_perf() {
  # Quote the positional parameters so the assignments are never word-split.
  local concurrency="$1"
  local isl="$2"
  local osl="$3"
  local max_threads="${concurrency}"
  key="concurrency_${concurrency}"
  export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
  mkdir -p "$ARTIFACT_DIR"
  echo "ARTIFACT_DIR: $ARTIFACT_DIR"
  # Every continuation line ends in " \" (space, then backslash) so that
  # adjacent arguments can never be joined into one word.
  aiperf profile --artifact-dir "$ARTIFACT_DIR" \
    --model "$TARGET_MODEL" \
    --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
    --endpoint-type chat --endpoint /v1/chat/completions \
    --streaming \
    --url "http://$ENDPOINT" \
    --synthetic-input-tokens-mean "$isl" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "$osl" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:$osl" \
    --extra-inputs "min_tokens:$osl" \
    --extra-inputs ignore_eos:true \
    --extra-inputs repetition_penalty:1.0 \
    --extra-inputs temperature:0.0 \
    --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
    --concurrency "$concurrency" \
    --request-count "$((10 * concurrency))" \
    --warmup-request-count "$concurrency" \
    --conversation-num 12800 \
    --random-seed 100 \
    --workers-max "$max_threads" \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream' \
    --record-processors 32 \
    --ui simple
  echo "ARTIFACT_DIR: $ARTIFACT_DIR"
  ls -la "$ARTIFACT_DIR"
}
#### Actual execution ####
# Flow: wait for the served model, create the per-run artifact directory,
# record the run parameters in input_config.json, then run one aiperf pass.
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
# "max_threads" is written from TOTAL_CONCURRENCY: run_perf sets its local
# max_threads to the concurrency it is given, and that local is not visible
# here. Expanding $max_threads at this point produced an empty value and
# therefore invalid JSON.
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
  "gpu_count": $DEPLOYMENT_GPU_COUNT,
  "max_threads": $TOTAL_CONCURRENCY,
  "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
  "total_concurrency": $TOTAL_CONCURRENCY,
  "mode": "$DEPLOYMENT_MODE",
  "isl": $ISL,
  "osl": $OSL,
  "endpoint": "$ENDPOINT",
  "model endpoint": "$TARGET_MODEL"
}
EOF

# Run perf with calculated total concurrency
run_perf "$TOTAL_CONCURRENCY" "$ISL" "$OSL"
echo "done with concurrency $TOTAL_CONCURRENCY"
env:
- name: TARGET_MODEL
value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
- name: ENDPOINT
value: llama3-70b-agg-frontend:8000
- name: CONCURRENCY_PER_GPU
value: "16"
- name: DEPLOYMENT_GPU_COUNT
value: "4"
- name: ISL
value: "8192"
- name: OSL
value: "1024"
- name: DEPLOYMENT_MODE
value: agg
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "200"
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/perf
- name: HF_HOME
value: /root/.cache/huggingface
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
securityContext:
privileged: true
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface
workingDir: /workspace
imagePullSecrets:
- name: nvcrimagepullsecret
volumes:
- name: model-cache
persistentVolumeClaim:
Expand Down
14 changes: 12 additions & 2 deletions recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ spec:
size: 80Gi
extraPodSpec:
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
Expand All @@ -56,8 +61,13 @@ spec:
size: 80Gi
extraPodSpec:
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
args:
- "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
Expand Down
Loading
Loading