From 6c32ec8fe49e340ae5b92b1619f3cde77b403422 Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Tue, 14 Oct 2025 18:15:41 +0300 Subject: [PATCH 1/5] draft intel gaudi 3 integration Signed-off-by: jakub-sochacki --- buildkite/test-template-ci.j2 | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2 index 64a6184e..a0fb27ff 100644 --- a/buildkite/test-template-ci.j2 +++ b/buildkite/test-template-ci.j2 @@ -9,6 +9,7 @@ {% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-torch-nightly" %} {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} +{% set docker_image_hpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-hpu" %} {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set default_working_dir = "/vllm-workspace/tests" %} @@ -507,6 +508,32 @@ steps: - exit_status: -10 # Agent was lost limit: 2 + - label: ":docker: build image HPU" + key: image-build-hpu + depends_on: ~ + agents: + queue: cpu_queue_postmerge_us_east_1 + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - | + #!/bin/bash + if [[ -z $(docker manifest inspect {{ docker_image_hpu }}) ]]; then + echo "Image not found, proceeding with build..." + else + echo "Image found" + exit 0 + fi + - "docker build --file docker/Dockerfile.hpu --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --tag {{ docker_image_hpu }} --progress plain ." 
+ - "docker push {{ docker_image_hpu }}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + {% for step in steps %} {% if step.fast_check_only != true %} From f472a8d52f771eff68968a186db1dc508a839e8e Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Wed, 15 Oct 2025 17:42:50 +0300 Subject: [PATCH 2/5] Add HPU image build with vllm-gaudi compatibility Signed-off-by: jakub-sochacki --- buildkite/test-template-ci.j2 | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2 index a0fb27ff..e0899ba3 100644 --- a/buildkite/test-template-ci.j2 +++ b/buildkite/test-template-ci.j2 @@ -9,7 +9,8 @@ {% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-torch-nightly" %} {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} -{% set docker_image_hpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-hpu" %} +{# Note: docker_image_hpu is NOT defined here because HPU uses a different commit tag (from vllm-gaudi compatibility) #} +{# HPU image tag is determined dynamically in the build step based on VLLM_STABLE_COMMIT #} {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set default_working_dir = "/vllm-workspace/tests" %} @@ -508,6 +509,7 @@ steps: - exit_status: -10 # Agent was lost limit: 2 + {% if branch == "main" %} - label: ":docker: build image HPU" key: image-build-hpu depends_on: ~ @@ -517,14 +519,33 @@ steps: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - | #!/bin/bash - if [[ -z $(docker manifest inspect {{ docker_image_hpu }}) ]]; then + # Fetch the 
compatible vLLM commit for vllm-gaudi + VLLM_STABLE_COMMIT=$(curl -s https://raw.githubusercontent.com/vllm-project/vllm-gaudi/main/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT | tr -d '\n') + echo "Compatible vLLM commit for vllm-gaudi: $VLLM_STABLE_COMMIT" + + # HPU images always use postmerge registry (main branch only) + REGISTRY="public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo" + + # HPU images use the stable commit tag, not BUILDKITE_COMMIT + HPU_IMAGE_TAG="${REGISTRY}:${VLLM_STABLE_COMMIT}-hpu" + + if [[ -z $(docker manifest inspect "$HPU_IMAGE_TAG") ]]; then echo "Image not found, proceeding with build..." else - echo "Image found" + echo "Image $HPU_IMAGE_TAG already exists" exit 0 fi - - "docker build --file docker/Dockerfile.hpu --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --tag {{ docker_image_hpu }} --progress plain ." - - "docker push {{ docker_image_hpu }}" + + # Build with the stable commit + docker build \ + --file docker/Dockerfile.hpu \ + --build-arg max_jobs=16 \ + --build-arg VLLM_COMMIT=$VLLM_STABLE_COMMIT \ + --build-arg VLLM_GAUDI_COMMIT=main \ + --tag "$HPU_IMAGE_TAG" \ + --progress plain . 
+ + docker push "$HPU_IMAGE_TAG" env: DOCKER_BUILDKIT: "1" retry: @@ -533,6 +554,7 @@ steps: limit: 2 - exit_status: -10 # Agent was lost limit: 2 + {% endif %} {% for step in steps %} {% if step.fast_check_only != true %} From 1d5e9899ba8461912886c5525d595131ef0b89d7 Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Wed, 29 Oct 2025 15:51:32 +0200 Subject: [PATCH 3/5] fix: correct branch path for vllm-gaudi VLLM_STABLE_COMMIT file Signed-off-by: jakub-sochacki --- buildkite/test-template-ci.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2 index e0899ba3..5702405a 100644 --- a/buildkite/test-template-ci.j2 +++ b/buildkite/test-template-ci.j2 @@ -520,7 +520,7 @@ steps: - | #!/bin/bash # Fetch the compatible vLLM commit for vllm-gaudi - VLLM_STABLE_COMMIT=$(curl -s https://raw.githubusercontent.com/vllm-project/vllm-gaudi/main/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT | tr -d '\n') + VLLM_STABLE_COMMIT=$(set -o pipefail; curl -fsS https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT | tr -d '\n' | grep .) 
|| { echo "Failed to fetch VLLM_STABLE_COMMIT from vllm-gaudi" >&2; exit 1; } echo "Compatible vLLM commit for vllm-gaudi: $VLLM_STABLE_COMMIT" # HPU images always use postmerge registry (main branch only) From 8e5f332800b529d3f851e742f6f2e7de318548d7 Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Wed, 29 Oct 2025 17:34:54 +0200 Subject: [PATCH 4/5] Use vllm-gaudi Dockerfile for HPU builds Signed-off-by: jakub-sochacki --- buildkite/test-template-ci.j2 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2 index 5702405a..1433c278 100644 --- a/buildkite/test-template-ci.j2 +++ b/buildkite/test-template-ci.j2 @@ -9,8 +9,6 @@ {% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-torch-nightly" %} {% set docker_image_cu118 = 
"public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} -{# Note: docker_image_hpu is NOT defined here because HPU uses a different commit tag (from vllm-gaudi compatibility) #} -{# HPU image tag is determined dynamically in the build step based on VLLM_STABLE_COMMIT #} {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set default_working_dir = "/vllm-workspace/tests" %} @@ -536,9 +534,10 @@ steps: exit 0 fi - # Build with the stable commit + rm -rf /tmp/vllm-gaudi && git clone --depth 1 https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi || exit 1 + docker build \ - --file docker/Dockerfile.hpu \ + --file /tmp/vllm-gaudi/tests/ci_benchmark/Dockerfile.hpu \ --build-arg max_jobs=16 \ --build-arg VLLM_COMMIT=$VLLM_STABLE_COMMIT \ --build-arg VLLM_GAUDI_COMMIT=main \ From bb802f2594cdaa415f2ecff709b2bf7757ec4722 Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Wed, 29 Oct 2025 17:44:28 +0200 Subject: [PATCH 5/5] Fix vllm-gaudi Dockerfile path Signed-off-by: jakub-sochacki --- buildkite/test-template-ci.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2 index 1433c278..54772646 100644 --- a/buildkite/test-template-ci.j2 +++ b/buildkite/test-template-ci.j2 @@ -537,7 +537,7 @@ steps: rm -rf /tmp/vllm-gaudi && git clone --depth 1 https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi || exit 1 docker build \ - --file /tmp/vllm-gaudi/tests/ci_benchmark/Dockerfile.hpu \ + --file /tmp/vllm-gaudi/tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ --build-arg max_jobs=16 \ --build-arg VLLM_COMMIT=$VLLM_STABLE_COMMIT \ --build-arg VLLM_GAUDI_COMMIT=main \