From ea787b08ff3c8167f5568a0634f385fa3212dab4 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 4 Feb 2025 23:17:11 +0000 Subject: [PATCH 1/9] Test build to check processing by different K8 queues. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-pipeline.yaml | 4 ++++ .buildkite/test-template.j2 | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a847a68a6ef7..a038fb592dac 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -93,6 +93,7 @@ steps: - label: Core Test # 10min mirror_hardwares: [amd] + amd_gpus: 4 # Just for the sake of queue testing fast_check: true source_file_dependencies: - vllm/core @@ -105,6 +106,7 @@ steps: working_dir: "/vllm-workspace/tests" fast_check: true mirror_hardwares: [amd] + amd_gpus: 2 # Just for the sake of queue testing source_file_dependencies: - vllm/ commands: @@ -257,6 +259,7 @@ steps: - label: LoRA Test %N # 15min each mirror_hardwares: [amd] + amd_gpus: 8 source_file_dependencies: - vllm/lora - tests/lora @@ -283,6 +286,7 @@ steps: - label: Kernels Test %N # 1h each mirror_hardwares: [amd] + amd_gpus: 8 source_file_dependencies: - csrc/ - vllm/attention diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index ce448836a827..0d7c4ca9c75d 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -27,7 +27,15 @@ steps: depends_on: - "amd-build" agents: - queue: amd_gpu +{% if step.amd_gpus and step.amd_gpus==8%} + queue: amd_gpu_8 +{% elif step.amd_gpus and step.amd_gpus==4%} + queue: amd_gpu_4 +{% elif step.amd_gpus and step.amd_gpus==2%} + queue: amd_gpu_4 +{% else%} + queue: amd_gpu_1 +{% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: From 01dfddaa3bc466fae336d19dc39c42605eeb97d5 Mon Sep 17 00:00:00 2001 From: "Alexei V. 
Ivanov" Date: Wed, 5 Feb 2025 00:12:56 +0000 Subject: [PATCH 2/9] Testing. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 009e929ebace..feda9b8dfaaf 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,5 +1,5 @@ # default base image -ARG REMOTE_VLLM="0" +ARG REMOTE_VLLM="1" ARG USE_CYTHON="0" ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app From 7f80bf893fbb7c7332dbb48f8da2da6119f31644 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 00:33:23 +0000 Subject: [PATCH 3/9] Copying over the tests directory to enable CI testing. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index feda9b8dfaaf..c28ffee09497 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,5 +1,5 @@ # default base image -ARG REMOTE_VLLM="1" +ARG REMOTE_VLLM="0" ARG USE_CYTHON="0" ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app @@ -108,6 +108,8 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 14aaf35a1871e0bea62d05ca7e7b2de199991c6a Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 05:06:38 +0000 Subject: [PATCH 4/9] Comparing with MI250 in the "mi250_8xGPU" queue. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-template.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 0d7c4ca9c75d..67bd8b5d15ae 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu_8 + queue: mi250_8xGPU {% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu_4 + queue: mi250_8xGPU {% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu_4 + queue: mi250_8xGPU {% else%} - queue: amd_gpu_1 + queue: mi250_8xGPU {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From a1064893a9eda82cf29f1181a04fe753dd47c58d Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 06:39:04 +0000 Subject: [PATCH 5/9] Building with "test" as a --target Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 10 +++++----- Dockerfile.rocm | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 67bd8b5d15ae..7106395910d3 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" depends_on: ~ commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ." 
- "docker push {{ docker_image_amd }}" key: "amd-build" env: @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: mi250_8xGPU + queue: amd_gpu_8 {% elif step.amd_gpus and step.amd_gpus==4%} - queue: mi250_8xGPU + queue: amd_gpu_4 {% elif step.amd_gpus and step.amd_gpus==2%} - queue: mi250_8xGPU + queue: amd_gpu_4 {% else%} - queue: mi250_8xGPU + queue: amd_gpu_1 {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" diff --git a/Dockerfile.rocm b/Dockerfile.rocm index c28ffee09497..3965880bfd7c 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,8 +108,8 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests - +#COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests +#COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 6acfc3aba4cbc7ad79ad9ed86315e39bc37ff065 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 08:04:00 +0000 Subject: [PATCH 6/9] Fixing working directory property. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-pipeline.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a038fb592dac..9df17920788d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,6 +92,7 @@ steps: - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 4 # Just for the sake of queue testing fast_check: true @@ -178,6 +179,7 @@ steps: - pytest -v -s engine test_sequence.py test_config.py test_logger.py # OOM in the CI unless we run this separately - pytest -v -s tokenization + working_dir: "/vllm-workspace/tests" # optional - label: V1 Test #mirror_hardwares: [amd] @@ -219,6 +221,7 @@ steps: - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -237,6 +240,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] source_file_dependencies: - vllm/model_executor/layers @@ -258,6 +262,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 8 source_file_dependencies: @@ -285,6 +290,7 @@ steps: - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 8 source_file_dependencies: @@ -296,6 +302,7 @@ steps: parallelism: 4 - label: Tensorizer Test # 11min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] soft_fail: true source_file_dependencies: @@ -338,6 +345,7 @@ steps: - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use # 
20 min + working_dir: "/vllm-workspace/tests" fast_check: false mirror_hardwares: [ amd ] source_file_dependencies: From 172e0e8bd375d43ccfc41aa1d83f2d21256e78cf Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 18:17:17 +0000 Subject: [PATCH 7/9] Dummy alteration to confirm trouble with simultaneous test execution. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 3965880bfd7c..edb042c68f69 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,8 +108,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -#COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests -#COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 114e750973a4225d90126aca8582ffb31e34695f Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Thu, 6 Feb 2025 17:20:59 +0000 Subject: [PATCH 8/9] Dummy alteration to trigger a re-build and re-test. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index edb042c68f69..8c86c618103e 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,7 +108,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples - +# "Dummy alternation" ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 57c1133ae3d1cd6a0db84fafd63dab21fff349d0 Mon Sep 17 00:00:00 2001 From: "Alexei V. 
Ivanov" Date: Mon, 17 Feb 2025 21:56:35 +0000 Subject: [PATCH 9/9] Enabling ROCm CI on MI250 machines: - correct build target - correct queue Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 10 +--------- Dockerfile.rocm | 1 - 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 7106395910d3..573675d67f86 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -27,15 +27,7 @@ steps: depends_on: - "amd-build" agents: -{% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu_8 -{% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu_4 -{% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu_4 -{% else%} - queue: amd_gpu_1 -{% endif%} + queue: amd_gpu commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 8c86c618103e..009e929ebace 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,7 +108,6 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -# "Dummy alternation" ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false