diff --git a/.buildkite/bootstrap-amd-omni.sh b/.buildkite/bootstrap-amd-omni.sh index a38b7622011..9e7021493c5 100755 --- a/.buildkite/bootstrap-amd-omni.sh +++ b/.buildkite/bootstrap-amd-omni.sh @@ -90,10 +90,18 @@ upload_pipeline() { FAIL_FAST=$(fail_fast) cd .buildkite + + # Select test definition file: merge suite for main, ready suite for PRs + if [[ $BUILDKITE_BRANCH == "main" ]]; then + TEST_YAML="test-amd-merge.yml" + else + TEST_YAML="test-amd-ready.yaml" + fi + ( set -x # Output pipeline.yaml with all blank lines removed - minijinja-cli test-template.j2 test-amd.yaml \ + minijinja-cli test-template.j2 "$TEST_YAML" \ -D branch="$BUILDKITE_BRANCH" \ -D list_file_diff="$LIST_FILE_DIFF" \ -D run_all="$RUN_ALL" \ diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index f86b4b5d958..a06cf96bff2 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -87,7 +87,11 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" -commands=$@ +if [[ -n "${TEST_COMMAND:-}" ]]; then + commands="$TEST_COMMAND" +else + commands="$@" +fi echo "Commands:$commands" PARALLEL_JOB_COUNT=8 @@ -102,6 +106,7 @@ if [[ -z "$render_gid" ]]; then fi # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +# TODO: @tjtanaa reenable to run VLLM_ROCM_USE_AITER=1 when AITER is shipped with prebuilt kernels. if [[ $commands == *"--shard-id="* ]]; then # assign job count as the number of shards used commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g') @@ -118,7 +123,7 @@ if [[ $commands == *"--shard-id="* ]]; then --rm \ -e MIOPEN_DEBUG_CONV_DIRECT=0 \ -e MIOPEN_DEBUG_CONV_GEMM=0 \ - -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_ROCM_USE_AITER=0 \ -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \ @@ -153,7 +158,7 @@ else --rm \ -e MIOPEN_DEBUG_CONV_DIRECT=0 \ -e MIOPEN_DEBUG_CONV_GEMM=0 \ - -e VLLM_ROCM_USE_AITER=1 \ + -e VLLM_ROCM_USE_AITER=0 \ -e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY \ diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml new file mode 100644 index 00000000000..78d910b2194 --- /dev/null +++ b/.buildkite/test-amd-merge.yml @@ -0,0 +1,197 @@ +steps: + +- label: "Simple Unit Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_ROCM_USE_AITER=0 + - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml" + +- label: "Diffusion Model Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "advanced_model and diffusion" --run-level "advanced_model" + +- label: "Diffusion Images API LoRA E2E" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 20m pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py + +- label: "Diffusion Model CPU offloading Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - | + timeout 20m bash -c ' + set +e + pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py + EXIT1=\$? + pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py + EXIT2=\$? + exit \$((EXIT1 | EXIT2)) + ' + +## ISSUE depends on `diffusers` package: https://github.com/huggingface/diffusers/issues/13274 +# - label: "Audio Generation Model Test" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + +- label: "Diffusion Cache Backend Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4" + +- label: "Diffusion Sequence Parallelism Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py + +# merge-only tests +- label: "Diffusion Tensor Parallelism Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - pytest -s -v tests/e2e/offline_inference/test_zimage_parallelism.py + +- label: "Diffusion GPU Worker Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py + +- label: "Benchmark & Engine Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - | + timeout 20m bash -c ' + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=\$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=\$? + exit \$((EXIT1 | EXIT2)) + ' + +- label: "Omni Model Test Qwen2-5-Omni" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py + +- label: "Omni Model Test Qwen3-Omni" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py + - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" + +- label: "Qwen3-TTS E2E Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py + +- label: "Diffusion Image Edit Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py + +# split Bagel Model Test with H100 (Real Weights) into three tests +- label: "Bagel Text2Img Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm" + +- label: "Bagel Img2Img Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm" + +- label: "Bagel Online Serving Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_IMAGE_FETCH_TIMEOUT=60 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm" diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml new file mode 100644 index 00000000000..b09fe2ebffa --- /dev/null +++ b/.buildkite/test-amd-ready.yaml @@ -0,0 +1,178 @@ +steps: + +- label: "Simple Unit Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_ROCM_USE_AITER=0 + - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml" + +- label: "Diffusion Model Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" + +- label: "Diffusion Model CPU offloading Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - | + timeout 20m bash -c ' + set +e + pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py + EXIT1=\$? + pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py + EXIT2=\$? + exit \$((EXIT1 | EXIT2)) + ' + +## ISSUE depends on `diffusers` package: https://github.com/huggingface/diffusers/issues/13274 +# - label: "Audio Generation Model Test" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + +- label: "Diffusion Cache Backend Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4" + +- label: "Diffusion Sequence Parallelism Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model + +- label: "Diffusion GPU Worker Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py + +- label: "Benchmark & Engine Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - | + timeout 30m bash -c ' + set +e + pytest -s -v tests/benchmarks/test_serve_cli.py + EXIT1=\$? + pytest -s -v tests/engine/test_async_omni_engine_abort.py + EXIT2=\$? + exit \$((EXIT1 | EXIT2)) + ' + +- label: "Omni Model Test Qwen2-5-Omni" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 17m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py + +- label: "Omni Model Test Qwen3-Omni" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py + - timeout 10m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" + +- label: "Qwen3-TTS E2E Test" + agent_pool: mi325_2 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py + +- label: "Diffusion Image Edit Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py + +- label: "Bagel Text2Img Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm" + +- label: "Bagel Img2Img Model Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm" + +- label: "Bagel Online Serving Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - export GPU_ARCHS=gfx942 + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - export VLLM_IMAGE_FETCH_TIMEOUT=60 + - export VLLM_LOGGING_LEVEL=DEBUG + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ROCM_USE_AITER_RMSNORM=0 + - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm" diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2 index 291ed0a9ade..4612f8ccd5b 100644 --- a/.buildkite/test-template-amd-omni.j2 +++ b/.buildkite/test-template-amd-omni.j2 @@ -40,9 +40,14 @@ {% else %} queue: amd_mi325_1 {% endif %} - command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" +{% set cmd_body = (step.command or (step.commands | join("\n"))) | trim %} +{% set indented_cmd = cmd_body | replace("\n", "\n ") %} + command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh env: DOCKER_BUILDKIT: "1" + TEST_COMMAND: |- + (command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} + {{ indented_cmd | safe }} priority: 100 {% if step.grade and step.grade == "Blocking" %} soft_fail: false diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 8e9a3bfce81..ee371330bc1 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -12,6 +12,8 @@ if current_omni_platform.is_xpu(): stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen2_5_omni_ci.yaml")] +elif current_omni_platform.is_rocm(): + stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")] # Create parameter combinations for model and stage config test_params = [ @@ -21,7 +23,7 @@ @pytest.mark.core_model @pytest.mark.benchmark -@hardware_test(res={"cuda": "L4", "xpu": "B60"}, num_cards=3) +@hardware_test(res={"cuda": "L4", "xpu": "B60", "rocm": "MI325"}, num_cards=3) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_bench_serve_chat(omni_server): command = [ diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index 8c734c6a250..c7df4f91bed 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -5,7 +5,7 @@ End-to-end test for Bagel img2img generation. This test validates that the Bagel model generates images from an input image -and text prompt that match expected reference pixel values within a ±5 tolerance. +and text prompt that match expected reference pixel values within a ±10 tolerance. Equivalent to running: python3 examples/offline_inference/bagel/end2end.py \ @@ -25,6 +25,7 @@ from tests.conftest import modify_stage_config from tests.utils import hardware_test from vllm_omni.entrypoints.omni import Omni +from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image # Generated with seed=52, num_inference_steps=15, @@ -43,7 +44,21 @@ {"position": (256, 256), "rgb": (181, 202, 222)}, ] -PIXEL_TOLERANCE = 5 +if current_omni_platform.is_rocm(): + REFERENCE_PIXELS = [ + {"position": (100, 100), "rgb": (156, 172, 215)}, + {"position": (400, 50), "rgb": (106, 144, 216)}, + {"position": (700, 100), "rgb": (118, 158, 231)}, + {"position": (150, 400), "rgb": (183, 23, 48)}, + {"position": (512, 336), "rgb": (218, 215, 191)}, + {"position": (700, 400), "rgb": (194, 14, 42)}, + {"position": (100, 600), "rgb": (105, 10, 16)}, + {"position": (400, 600), "rgb": (167, 33, 46)}, + {"position": (700, 600), "rgb": (102, 86, 92)}, + {"position": (256, 256), "rgb": (181, 201, 220)}, + ] + +PIXEL_TOLERANCE = 10 DEFAULT_PROMPT = "<|fim_middle|><|im_start|>Change the grass color to red<|im_end|>" @@ -191,7 +206,7 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str: @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) def test_bagel_img2img_shared_memory_connector(run_level): """Test Bagel img2img with shared memory connector.""" input_image = _load_input_image() diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index ed369aedd92..505e12438d0 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -31,6 +31,7 @@ from tests.conftest import modify_stage_config from tests.utils import hardware_test from vllm_omni.entrypoints.omni import Omni +from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image # Each entry contains (x, y) position and expected (R, G, B) values @@ -49,6 +50,20 @@ {"position": (256, 256), "rgb": (171, 160, 153)}, ] +if current_omni_platform.is_rocm(): + REFERENCE_PIXELS = [ + {"position": (100, 100), "rgb": (123, 119, 100)}, + {"position": (400, 50), "rgb": (162, 161, 142)}, + {"position": (700, 100), "rgb": (171, 156, 127)}, + {"position": (150, 400), "rgb": (131, 128, 112)}, + {"position": (512, 512), "rgb": (134, 61, 59)}, + {"position": (700, 400), "rgb": (204, 107, 43)}, + {"position": (100, 700), "rgb": (201, 180, 165)}, + {"position": (400, 700), "rgb": (140, 108, 87)}, + {"position": (700, 700), "rgb": (247, 205, 145)}, + {"position": (256, 256), "rgb": (171, 160, 153)}, + ] + # Maximum allowed difference per color channel PIXEL_TOLERANCE = 5 @@ -181,7 +196,7 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str: @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) def test_bagel_text2img_shared_memory_connector(run_level): """Test Bagel text2img with shared memory connector.""" config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py index a5e26db1ea1..62e79906c03 100644 --- a/tests/e2e/online_serving/test_bagel_online.py +++ b/tests/e2e/online_serving/test_bagel_online.py @@ -97,7 +97,7 @@ def test_bagel_text2img_online(omni_server, openai_client) -> None: @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_bagel_img2img_online(omni_server, openai_client) -> None: """Test Bagel img2img via OpenAI-compatible chat completions API.""" diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index ef4d40198f0..fcda20ba388 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -44,13 +44,9 @@ def get_chunk_config(): return path -# CI stage config for 2xH100-80G GPUs or AMD GPU MI325 -if current_omni_platform.is_rocm(): - # ROCm stage config optimized for MI325 GPU - stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")] -elif current_omni_platform.is_xpu(): +if current_omni_platform.is_xpu(): stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")] -else: +else: # MI325 GPU should share the same config as H100 stage_configs = [get_chunk_config()] # Create parameter combinations for model and stage config diff --git a/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml index 9fa5930c8ec..b95afd22394 100644 --- a/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml +++ b/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml @@ -13,8 +13,8 @@ stage_args: model_arch: Qwen2_5OmniForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 2400 - max_num_batched_tokens: 2400 + max_model_len: 16384 + max_num_batched_tokens: 16384 max_num_seqs: 1 gpu_memory_utilization: 0.8 skip_mm_profiling: true @@ -44,8 +44,8 @@ stage_args: model_arch: Qwen2_5OmniForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 2400 - max_num_batched_tokens: 2400 + max_model_len: 16384 + max_num_batched_tokens: 16384 max_num_seqs: 1 gpu_memory_utilization: 0.8 skip_mm_profiling: true @@ -59,7 +59,7 @@ stage_args: temperature: 0.9 top_p: 0.8 top_k: 40 - max_tokens: 128 + max_tokens: 4096 seed: 42 detokenize: True repetition_penalty: 1.05 @@ -79,6 +79,8 @@ stage_args: trust_remote_code: true enable_prefix_caching: false engine_output_type: audio + max_num_batched_tokens: 4096 + max_model_len: 4096 engine_input_source: [1] final_output: true final_output_type: audio @@ -86,7 +88,7 @@ stage_args: temperature: 0.0 top_p: 1.0 top_k: -1 - max_tokens: 128 + max_tokens: 4096 seed: 42 detokenize: True repetition_penalty: 1.1