From 665ce7f8fbe1eec4e65cae70b0c548ebbda1625a Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 15 Oct 2025 12:16:40 -0700 Subject: [PATCH] fix: update model recipe for llama-3 70b to match with common recipe template (#3637) --- .../model-cache/model-download.yaml | 18 +- recipes/llama-3-70b/vllm/agg/deploy.yaml | 11 +- recipes/llama-3-70b/vllm/agg/perf.yaml | 159 +++++++++++++----- .../vllm/disagg-multi-node/deploy.yaml | 14 +- .../vllm/disagg-multi-node/perf.yaml | 159 +++++++++++++----- .../vllm/disagg-single-node/deploy.yaml | 14 +- .../vllm/disagg-single-node/perf.yaml | 159 +++++++++++++----- 7 files changed, 385 insertions(+), 149 deletions(-) diff --git a/recipes/llama-3-70b/model-cache/model-download.yaml b/recipes/llama-3-70b/model-cache/model-download.yaml index d8e1dfaa8cb8..1a71923b1559 100644 --- a/recipes/llama-3-70b/model-cache/model-download.yaml +++ b/recipes/llama-3-70b/model-cache/model-download.yaml @@ -22,24 +22,22 @@ spec: - secretRef: name: hf-token-secret env: - # NOTE: This is the model name for the llama-3-70b model - # Update this to model name for the model you are downloading - name: MODEL_NAME value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: HF_TOKEN + - name: HF_HOME + value: /model-store + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: MODEL_REVISION + value: ddb4128556dfcff99e0c41aee159ea6c3e655dcd args: - | set -eux pip install --no-cache-dir huggingface_hub hf_transfer - export HF_HUB_ENABLE_HF_TRANSFER=1 - huggingface-cli download $MODEL_NAME + hf download $MODEL_NAME --revision $MODEL_REVISION volumeMounts: - name: model-cache - mountPath: /root/.cache/huggingface/hub + mountPath: /model-store volumes: - name: model-cache persistentVolumeClaim: diff --git a/recipes/llama-3-70b/vllm/agg/deploy.yaml b/recipes/llama-3-70b/vllm/agg/deploy.yaml index 274b4633901e..09e56ec6a891 100644 --- a/recipes/llama-3-70b/vllm/agg/deploy.yaml +++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml @@ -32,8 +32,13 @@ spec: size: 20Gi extraPodSpec: mainContainer: + env: + - name: SERVED_MODEL_NAME + value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" + - name: MODEL_PATH + value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" args: - - "python3 -m dynamo.vllm --model RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" + - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" command: - /bin/sh - -c @@ -42,6 +47,6 @@ spec: replicas: 1 resources: limits: - gpu: "8" + gpu: "4" requests: - gpu: "8" \ No newline at end of file + gpu: "4" \ No newline at end of file diff --git a/recipes/llama-3-70b/vllm/agg/perf.yaml b/recipes/llama-3-70b/vllm/agg/perf.yaml index b750eb709c64..8c5a470f119c 100644 --- a/recipes/llama-3-70b/vllm/agg/perf.yaml +++ b/recipes/llama-3-70b/vllm/agg/perf.yaml @@ -5,7 +5,7 @@ kind: Job metadata: name: llama3-70b-agg-perf spec: - backoffLimit: 3 + backoffLimit: 1 completions: 1 parallelism: 1 template: @@ -16,57 +16,128 @@ spec: restartPolicy: Never containers: - name: perf - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 - workingDir: /workspace/components/backends/vllm command: - /bin/sh - -c - | - # wait for the model to be ready - export ENDPOINT=llama3-70b-agg-0:8000 - export TARGET_MODEL=RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - export INTERVAL=5 - echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every ${INTERVAL}s)..." - while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." - sleep $INTERVAL - done - echo "✅ Model '$TARGET_MODEL' is now available!" - curl -s "http://$ENDPOINT/v1/models" | jq . - # now run the benchmark - export ARTIFACT_DIR="/tmp/genai" - mkdir -p "$ARTIFACT_DIR" - echo "Running benchmark..." + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 - aiperf profile \ - --model "$TARGET_MODEL" \ - --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --url "$ENDPOINT" --streaming \ - --concurrency 64 \ - --warmup-request-count 2 \ - --request-count 320 \ - --extra-inputs max_tokens:1024 \ - --synthetic-input-tokens-mean 8192 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 1024 \ - --output-tokens-stddev 0 \ - --extra-inputs min_tokens:1024 \ - --extra-inputs ignore_eos:true \ - --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --random-seed 1418186270 \ - --artifact-dir $ARTIFACT_DIR \ - --num-dataset-entries=3000 -- \ - --max-threads 64 - echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) - cat $PERF_JSON | jq . - echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) - cat $PERF_CSV - echo "Benchmark completed successfully!" + EPOCH=$(date +%s) + ## utility functions -- can be moved to a bash script / configmap + wait_for_model_ready() { + echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." + while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" + sleep 5 + done + echo "✅ Model '$TARGET_MODEL' is now available!" + echo "Model '$TARGET_MODEL' is now available!" + curl -s "http://$ENDPOINT/v1/models" | jq . + } + run_perf() { + local concurrency=$1 + local isl=$2 + local osl=$3 + local max_threads=${concurrency} + key=concurrency_${concurrency} + export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" + mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + aiperf profile --artifact-dir $ARTIFACT_DIR \ + --model $TARGET_MODEL \ + --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ + --endpoint-type chat --endpoint /v1/chat/completions \ + --streaming \ + --url http://$ENDPOINT \ + --synthetic-input-tokens-mean $isl \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean $osl \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:$osl \ + --extra-inputs min_tokens:$osl \ + --extra-inputs ignore_eos:true \ + --extra-inputs repetition_penalty:1.0 \ + --extra-inputs temperature:0.0 \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $((10*concurrency)) \ + --warmup-request-count $concurrency \ + --conversation-num 12800 \ + --random-seed 100 \ + --workers-max $max_threads \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream'\ + --record-processors 32 \ + --ui simple + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + ls -la $ARTIFACT_DIR + } + #### Actual execution #### + wait_for_model_ready + mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" + # Write input_config.json + cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" </dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." - sleep $INTERVAL - done - echo "✅ Model '$TARGET_MODEL' is now available!" - curl -s "http://$ENDPOINT/v1/models" | jq . - # now run the benchmark - export ARTIFACT_DIR="/tmp/genai" - mkdir -p "$ARTIFACT_DIR" - echo "Running benchmark..." + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 - aiperf profile \ - --model "$TARGET_MODEL" \ - --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --url "$ENDPOINT" --streaming \ - --concurrency 64 \ - --warmup-request-count 2 \ - --request-count 320 \ - --extra-inputs max_tokens:1024 \ - --synthetic-input-tokens-mean 8192 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 1024 \ - --output-tokens-stddev 0 \ - --extra-inputs min_tokens:1024 \ - --extra-inputs ignore_eos:true \ - --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --random-seed 1418186270 \ - --artifact-dir $ARTIFACT_DIR \ - --num-dataset-entries=3000 -- \ - --max-threads 64 - echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) - cat $PERF_JSON | jq . - echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) - cat $PERF_CSV - echo "Benchmark completed successfully!" + EPOCH=$(date +%s) + ## utility functions -- can be moved to a bash script / configmap + wait_for_model_ready() { + echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." + while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" + sleep 5 + done + echo "✅ Model '$TARGET_MODEL' is now available!" + echo "Model '$TARGET_MODEL' is now available!" + curl -s "http://$ENDPOINT/v1/models" | jq . + } + run_perf() { + local concurrency=$1 + local isl=$2 + local osl=$3 + local max_threads=${concurrency} + key=concurrency_${concurrency} + export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" + mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + aiperf profile --artifact-dir $ARTIFACT_DIR \ + --model $TARGET_MODEL \ + --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ + --endpoint-type chat --endpoint /v1/chat/completions \ + --streaming \ + --url http://$ENDPOINT \ + --synthetic-input-tokens-mean $isl \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean $osl \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:$osl \ + --extra-inputs min_tokens:$osl \ + --extra-inputs ignore_eos:true \ + --extra-inputs repetition_penalty:1.0 \ + --extra-inputs temperature:0.0 \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $((10*concurrency)) \ + --warmup-request-count $concurrency \ + --conversation-num 12800 \ + --random-seed 100 \ + --workers-max $max_threads \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream'\ + --record-processors 32 \ + --ui simple + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + ls -la $ARTIFACT_DIR + } + #### Actual execution #### + wait_for_model_ready + mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" + # Write input_config.json + cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" </dev/null 2>&1; do - echo "[$(date '+%H:%M:%S')] Model not ready yet, waiting ${INTERVAL}s..." - sleep $INTERVAL - done - echo "✅ Model '$TARGET_MODEL' is now available!" - curl -s "http://$ENDPOINT/v1/models" | jq . - # now run the benchmark - export ARTIFACT_DIR="/tmp/genai-$RANDOM" - mkdir -p "$ARTIFACT_DIR" - echo "Running benchmark..." + apt-get update && apt-get install -y curl jq procps git && apt-get clean + pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366; + echo "aiperf installation completed"; + sysctl -w net.ipv4.ip_local_port_range="1024 65000" + cat /proc/sys/net/ipv4/ip_local_port_range export COLUMNS=200 - aiperf profile \ - --model "$TARGET_MODEL" \ - --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --url "$ENDPOINT" --streaming \ - --concurrency 64 \ - --warmup-request-count 2 \ - --request-count 320 \ - --extra-inputs max_tokens:1024 \ - --synthetic-input-tokens-mean 8192 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 1024 \ - --output-tokens-stddev 0 \ - --extra-inputs min_tokens:1024 \ - --extra-inputs ignore_eos:true \ - --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --random-seed 1418186270 \ - --artifact-dir $ARTIFACT_DIR \ - --num-dataset-entries=3000 -- \ - --max-threads 64 - echo "----------------json----------------" - PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json) - cat $PERF_JSON | jq . - echo "----------------csv-----------------" - PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv) - cat $PERF_CSV - echo "Benchmark completed successfully!" + EPOCH=$(date +%s) + ## utility functions -- can be moved to a bash script / configmap + wait_for_model_ready() { + echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..." + while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do + echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models" + sleep 5 + done + echo "✅ Model '$TARGET_MODEL' is now available!" + echo "Model '$TARGET_MODEL' is now available!" + curl -s "http://$ENDPOINT/v1/models" | jq . + } + run_perf() { + local concurrency=$1 + local isl=$2 + local osl=$3 + local max_threads=${concurrency} + key=concurrency_${concurrency} + export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}" + mkdir -p "$ARTIFACT_DIR" + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + aiperf profile --artifact-dir $ARTIFACT_DIR \ + --model $TARGET_MODEL \ + --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ + --endpoint-type chat --endpoint /v1/chat/completions \ + --streaming \ + --url http://$ENDPOINT \ + --synthetic-input-tokens-mean $isl \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean $osl \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:$osl \ + --extra-inputs min_tokens:$osl \ + --extra-inputs ignore_eos:true \ + --extra-inputs repetition_penalty:1.0 \ + --extra-inputs temperature:0.0 \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency $concurrency \ + --request-count $((10*concurrency)) \ + --warmup-request-count $concurrency \ + --conversation-num 12800 \ + --random-seed 100 \ + --workers-max $max_threads \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream'\ + --record-processors 32 \ + --ui simple + echo "ARTIFACT_DIR: $ARTIFACT_DIR" + ls -la $ARTIFACT_DIR + } + #### Actual execution #### + wait_for_model_ready + mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}" + # Calculate total concurrency based on per-GPU concurrency and GPU count + TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT)) + echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)" + # Write input_config.json + cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <