#######################################
# Block until the GPU state file reports "clean", or abort on timeout.
# Globals:
#   SECONDS (read) - bash's elapsed-seconds counter, used for the deadline
# Arguments:
#   $1 - timeout in whole seconds (default: 300)
#   $2 - GPU state file to poll (default: /opt/amdgpu/etc/gpu_state)
#   $3 - delay between polls, handed to `sleep` as-is (default: 3)
# Outputs:
#   Progress messages on stdout, errors on stderr.
# Returns:
#   0 as soon as the state file contains the word "clean".
#   Exits the whole script with status 1 when the timeout is not a
#   non-negative integer or when the deadline passes first.
#######################################
wait_for_clean_gpus() {
  local timeout=${1:-300}
  local state_file=${2:-/opt/amdgpu/etc/gpu_state}
  local poll_interval=${3:-3}
  local start=$SECONDS

  # The timeout is used inside (( )), so reject anything that is not a plain
  # integer instead of letting bash evaluate it as an arithmetic expression.
  if [[ ! $timeout =~ ^[0-9]+$ ]]; then
    echo "Error: timeout must be a non-negative integer, got '${timeout}'" >&2
    exit 1
  fi

  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
  while true; do
    # Only grep a readable file: a missing state file is treated as "not
    # clean yet" rather than printing a grep error on every poll.
    if [[ -r "$state_file" ]] && grep -q clean -- "$state_file"; then
      echo "GPUs state is \"clean\""
      return 0
    fi
    if (( SECONDS - start >= timeout )); then
      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
      exit 1
    fi
    sleep "$poll_interval"
  done
}
#######################################
# Decide whether the given command string describes a multi-node job.
# Globals:
#   NUM_NODES (read) - optional node count set by the pipeline
# Arguments:
#   $1 - the full command string passed to this script
# Returns:
#   0 when the job is multi-node, 1 otherwise.
#######################################
is_multi_node() {
  local cmds="$1"
  local num_nodes="${NUM_NODES:-1}"
  # Two per-node command arrays joined by "&&": a closing bracket, optional
  # whitespace, "&&", optional whitespace, an opening bracket. Requiring the
  # brackets to be adjacent to "&&" keeps ordinary commands that merely
  # contain brackets (e.g. parametrized test ids such as "t[x]") from being
  # mistaken for the multi-node syntax.
  local bracket_pair_re='\][[:space:]]*&&[[:space:]]*\['

  # Primary signal: NUM_NODES. Only compare it when it is a plain decimal
  # number; the 10# prefix keeps values like "08" from being read as octal.
  if [[ $num_nodes =~ ^[0-9]+$ ]] && (( 10#$num_nodes > 1 )); then
    return 0
  fi

  # Fallback: detect the "[node0_cmds] && [node1_cmds]" syntax structurally.
  if [[ $cmds =~ $bracket_pair_re ]]; then
    return 0
  fi
  return 1
}
#######################################
# Re-quote pytest "-m" marker expressions whose quotes were stripped.
#
# An unquoted multi-word expression such as
#   pytest -v -s -m not cpu_test v1/core
# makes pytest treat "cpu_test" as a path, so the expression is wrapped in
# single quotes again:
#   pytest -v -s -m 'not cpu_test' v1/core
#
# Recognised expressions start with "not NAME" or a parenthesised term
# "(NAME)" / "(not NAME)" and may continue with any number of
# "and TERM" / "or TERM" parts, e.g.
#   -m not a and not b                          -> -m 'not a and not b'
#   -m (not core_model) and (not hybrid_model)  -> -m '(not core_model) and (not hybrid_model)'
# Expressions that are still quoted are left untouched because the quote
# character prevents the pattern from matching.
#
# Arguments:
#   $1 - command string to fix up
# Outputs:
#   The rewritten command string on stdout.
#######################################
re_quote_pytest_markers() {
  local cmds="$1"
  local id='[A-Za-z_][A-Za-z0-9_]*'
  # First term: "not NAME", "(NAME)" or "(not NAME)".
  local first="(not ${id}|\\((not )?${id}\\))"
  # Follow-up terms after and/or: NAME with optional "not" and parentheses.
  local more="\\(?(not )?${id}\\)?"
  local expr="${first}( (and|or) ${more})*"

  # printf instead of echo so the command text is never taken as echo options.
  printf '%s\n' "$cmds" | sed -E "s/-m (${expr})/-m '\\1'/g"
}
--ignore=kernels/attention/test_attention.py" + fi + + if [[ $cmds == *" kernels/quantization"* ]]; then + cmds="${cmds} \ + --ignore=kernels/quantization/test_int8_quant.py \ + --ignore=kernels/quantization/test_machete_mm.py \ + --ignore=kernels/quantization/test_block_fp8.py \ + --ignore=kernels/quantization/test_block_int8.py \ + --ignore=kernels/quantization/test_marlin_gemm.py \ + --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ + --ignore=kernels/quantization/test_int8_kernel.py" + fi + + if [[ $cmds == *" kernels/mamba"* ]]; then + cmds="${cmds} \ + --ignore=kernels/mamba/test_mamba_mixer2.py \ + --ignore=kernels/mamba/test_causal_conv1d.py \ + --ignore=kernels/mamba/test_mamba_ssm_ssd.py" + fi + + if [[ $cmds == *" kernels/moe"* ]]; then + cmds="${cmds} \ + --ignore=kernels/moe/test_moe.py \ + --ignore=kernels/moe/test_cutlass_moe.py \ + --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" + fi + + # --- Entrypoint ignores --- + if [[ $cmds == *" entrypoints/openai "* ]]; then + cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \ + --ignore=entrypoints/openai/test_audio.py \ + --ignore=entrypoints/openai/test_shutdown.py \ + --ignore=entrypoints/openai/test_completion.py \ + --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_lora_adapters.py \ + --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ + --ignore=entrypoints/openai/test_root_path.py \ + --ignore=entrypoints/openai/test_tokenization.py \ + --ignore=entrypoints/openai/test_prompt_validation.py "} + fi + + if [[ $cmds == *" entrypoints/llm "* ]]; then + cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \ + --ignore=entrypoints/llm/test_chat.py \ + --ignore=entrypoints/llm/test_accuracy.py \ + --ignore=entrypoints/llm/test_init.py \ + --ignore=entrypoints/llm/test_prompt_validation.py "} + fi + + # Clean up escaped newlines from --ignore appends + cmds=$(echo "$cmds" | sed 's/ \\ / /g') + + echo "$cmds" +} + 
###############################################################################
# Main
#
# Flow: wait for clean GPUs -> docker housekeeping -> reset GPUs -> pull the
# test image -> normalise the test commands -> run them either through the
# multi-node launcher or in a single container. The script's exit status is
# the status of the test commands.
###############################################################################

# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus

echo "--- ROCm info"
rocminfo

# --- Docker housekeeping ---
cleanup_docker

echo "--- Resetting GPUs"
# Write "reset" into the state file, then poll the same file until it reads
# "clean" again (presumably a host-side agent performs the reset — confirm).
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus

# --- Pull test image ---
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
# Random 10-character suffix keeps container names unique per invocation.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

# EXIT-trap handler: force-remove the test container. Because of the "||"
# chain the image is only removed when removing the container fails; the
# trailing "|| true" keeps the handler from ever failing.
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

# --- Prepare commands ---
echo "--- Running container"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# All script arguments joined into one string; it is later handed to bash -c.
commands="$*"
echo "Raw commands: $commands"

# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
commands=$(apply_rocm_test_overrides "$commands")

echo "Final commands: $commands"

MYPYTHONPATH=".."

# Verify GPU access: the container is added to the host's "render" group.
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
  echo "--- Multi-node job detected"
  # Assign first, export second, so the assignment's status is not masked.
  DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
  export DCKR_VER

  # Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
  #   BASH_REMATCH[1] = prefix (everything before first bracket)
  #   BASH_REMATCH[2] = comma-separated node0 commands
  #   BASH_REMATCH[3] = comma-separated node1 commands
  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
    prefix=${BASH_REMATCH[1]//;/}
    echo "PREFIX: ${prefix}"

    export composite_command="(command rocm-smi || true)"
    # IFS is overridden for the read commands only, so no save/restore needed.
    IFS=',' read -ra node0 <<< "${BASH_REMATCH[2]}"
    IFS=',' read -ra node1 <<< "${BASH_REMATCH[3]}"

    if (( ${#node0[@]} != ${#node1[@]} )); then
      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index." >&2
    fi

    for i in "${!node0[@]}"; do
      # Drop the double quotes that delimit each entry of the bracket lists.
      command_node_0=${node0[i]//\"/}
      command_node_1=${node1[i]//\"/}

      # %q shell-escapes every argument so that node commands containing
      # quotes or other metacharacters survive the bash -c round trip.
      printf -v step_cmd \
        './.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 %q %q %q' \
        "${image_name}" "${command_node_0}" "${command_node_1}"
      echo "COMMANDS: ${step_cmd}"
      composite_command="${composite_command} && ${step_cmd}"
    done

    /bin/bash -c "${composite_command}"
    # Remember the test result: cleanup_network runs next and would otherwise
    # replace it as the script's exit status, hiding test failures.
    test_status=$?
    cleanup_network
    exit "${test_status}"
  else
    echo "Multi-node job detected but failed to parse bracket command syntax." >&2
    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]" >&2
    echo "Got: $commands" >&2
    cleanup_network
    exit 111
  fi
else
  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  # BUILDKITE_AGENT_META_DATA_RENDER_DEVICES is intentionally unquoted so it
  # can expand to several docker arguments (presumably extra --device flags
  # — confirm against the agent configuration).
  # shellcheck disable=SC2086
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
v1/engine/test_preprocess_error_handling.py + mirror: + amd: + device: mi325_8 + depends_on: + - image-build-amd + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 6aebb9aabe3e..5c58e97ef16e 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -24,11 +24,6 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server 1) timeout_in_minutes: 130 @@ -65,6 +60,11 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (Responses API) timeout_in_minutes: 50 diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index c6b43b97aecd..5c5a9dbcbb69 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -16,6 +16,7 @@ steps: - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker + # TODO: create another `optional` test group for slow tests - pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/metrics @@ -25,6 +26,11 @@ steps: # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: V1 Others (CPU) depends_on: diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index 8982dccc4dec..a3bd21ccff3c 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -55,6 +55,15 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (PPL) timeout_in_minutes: 110 @@ -73,6 +82,11 @@ steps: - tests/models/language/pooling commands: - pytest -v -s models/language/pooling -m 'not core_model' + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Language Models Test (MTEB) timeout_in_minutes: 110 diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 3409f04a1bff..22226e8dab3e 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -305,6 +305,14 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ uv pip install --system /rixl_install/*.whl +# RIXL/MoRIIO runtime dependencies (RDMA userspace 
libraries) +RUN apt-get update -q -y && apt-get install -q -y \ + librdmacm1 \ + libibverbs1 \ + ibverbs-providers \ + ibverbs-utils \ + && rm -rf /var/lib/apt/lists/* + WORKDIR /vllm-workspace ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace @@ -330,6 +338,11 @@ RUN bash /tmp/install_torchcodec.sh \ # Copy in the v1 package (for python-only install test group) COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 +# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel +# See: https://github.com/pytorch/pytorch/issues/169857 +ENV MIOPEN_DEBUG_CONV_DIRECT=0 +ENV MIOPEN_DEBUG_CONV_GEMM=0 + # Source code is used in the `python_only_compile.sh` test # We hide it inside `src/` so that this source code # will not be imported by other tests diff --git a/tests/v1/kv_connector/unit/test_moriio_connector.py b/tests/v1/kv_connector/unit/test_moriio_connector.py index 1eca4964fd6c..17d951b914ea 100644 --- a/tests/v1/kv_connector/unit/test_moriio_connector.py +++ b/tests/v1/kv_connector/unit/test_moriio_connector.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util import os +import subprocess from unittest.mock import MagicMock, patch import msgspec @@ -40,6 +41,19 @@ aiter_available = importlib.util.find_spec("aiter") is not None mori_available = importlib.util.find_spec("mori") is not None + + +def _rdma_available() -> bool: + """Check if RDMA devices are available.""" + try: + result = subprocess.run(["ibv_devinfo"], capture_output=True, text=True) + return "No IB devices found" not in result.stderr + except FileNotFoundError: + return False + + +rdma_available = _rdma_available() + pytestmark = pytest.mark.skipif( not (current_platform.is_rocm() and mori_available), reason="MoRIIOs are only available on ROCm with aiter package installed", @@ -393,6 +407,7 @@ def 
test_read_mode_loads_remote_block_ids(moriio_read_mode): @pytest.mark.skipif( not aiter_available, reason="Requires aiter package for ROCm FlashAttention backend" ) +@pytest.mark.skipif(not rdma_available, reason="No RDMA devices available") def test_register_kv_caches(mock_parallel_groups): """Test that MoRIIOConnector.register_kv_caches correctly registers kv caches.""" ROLE = "kv_consumer" @@ -488,6 +503,7 @@ def test_register_kv_caches(mock_parallel_groups): @pytest.mark.skipif( not aiter_available, reason="Requires aiter package for ROCm FlashAttention backend" ) +@pytest.mark.skipif(not rdma_available, reason="No RDMA devices available") def test_moriio_handshake_returns_metadata(mock_parallel_groups): """MoRIIO handshake socket returns valid agent metadata over ZMQ."""