diff --git a/.github/workflows/test_pytorch_wheels.yml b/.github/workflows/test_pytorch_wheels.yml index 62beed1d9d2..b9ee701ccab 100644 --- a/.github/workflows/test_pytorch_wheels.yml +++ b/.github/workflows/test_pytorch_wheels.yml @@ -139,6 +139,20 @@ jobs: --index-subdir=${{ inputs.amdgpu_family }} \ --activate-in-future-github-actions-steps + # TODO: If setup_venv.py switches to uv, update this to use uv pip install. + - name: Install rocm[devel] + run: | + ROCM_VERSION=$(python -c "from importlib.metadata import version; print(version('rocm'))") + python -m pip install \ + --index-url="${{ inputs.package_index_url }}/${{ inputs.amdgpu_family }}" \ + "rocm[devel]==${ROCM_VERSION}" + + # hipconfig --rocmpath points to the devel package root so PyTorch's + # _find_rocm_home() resolves correctly. + - name: Set ROCM_PATH + run: | + echo "ROCM_PATH=$(hipconfig --rocmpath)" >> $GITHUB_ENV + - name: Install test requirements run: | python -m pip install -r external-builds/pytorch/requirements-test.txt diff --git a/external-builds/pytorch/README.md b/external-builds/pytorch/README.md index d06925fbf7c..01fa0dc86d7 100644 --- a/external-builds/pytorch/README.md +++ b/external-builds/pytorch/README.md @@ -232,6 +232,16 @@ python run_pytorch_smoke_tests.py -- \ ### Running full PyTorch tests +> [!WARNING] +> Our CI currently tests with `rocm[devel]` installed alongside `torch` because +> some tests require development headers at runtime. If you are running tests +> locally, install the devel packages and set `ROCM_PATH` accordingly: +> +> ```bash +> pip install "rocm[devel]" --index-url <package_index_url>/<amdgpu_family> +> export ROCM_PATH=$(hipconfig --rocmpath) +> ``` + We have a [`run_pytorch_tests.py`](run_pytorch_tests.py) script which runs PyTorch unit tests using pytest with additional test exclusion capabilities tailored for AMD ROCm GPUs. 
See the script for detailed diff --git a/external-builds/pytorch/skip_tests/generic.py b/external-builds/pytorch/skip_tests/generic.py index 275c7662612..08c75550e4a 100644 --- a/external-builds/pytorch/skip_tests/generic.py +++ b/external-builds/pytorch/skip_tests/generic.py @@ -24,15 +24,6 @@ # 'num_host_free': 0, 'reserved_bytes.allocated': 0, 'reserved_bytes.current': 0, 'reserved_bytes.freed': 0, # 'reserved_bytes.peak': 0, 'segment.allocated': 0, 'segment.current': 0, 'segment.freed': 0, 'segment.peak': 0}) "test_host_memory_stats", - # THIS IS AN OLD ERROR - # In file included from /home/tester/.cache/torch_extensions/py312_cpu/dummy_allocator/main_hip.cpp:5: - # /home/tester/TheRock/.venv/lib/python3.12/site-packages/torch/include/ATen/hip/Exceptions.h:4:10: fatal error: hipblas/hipblas.h: No such file or directory - # 4 | #include <hipblas/hipblas.h> - # | ^~~~~~~~~~~~~~~~~~~ - # compilation terminated. - # NEW ERROR - # RuntimeError: Error building extension 'dummy_allocator' - "test_mempool_with_allocator", # Change detector test (Cublaslt vs Cublas depending on gcn_arch and torch version) # Always skip as this test is very basic and needs manual intervention for new architectures # See diff --git a/external-builds/pytorch/skip_tests/pytorch_2.10.py b/external-builds/pytorch/skip_tests/pytorch_2.10.py index 99573b613a0..b617acf4091 100644 --- a/external-builds/pytorch/skip_tests/pytorch_2.10.py +++ b/external-builds/pytorch/skip_tests/pytorch_2.10.py @@ -29,10 +29,6 @@ "test_memory_snapshot_with_cpp", # AssertionError: Scalars are not equal! 
"test_mempool_ctx_multithread", - # RuntimeError: Error building extension 'dummy_allocator' - "test_mempool_empty_cache_inactive", - # RuntimeError: Error building extension 'dummy_allocator_v1' - "test_mempool_limited_memory_with_allocator", # OSError: libhiprtc.so: cannot open shared object file: No such file or directory # File "/home/tester/TheRock/.venv/lib/python3.12/site-packages/torch/cuda/_utils.py", line 57, in _get_hiprtc_library # lib = ctypes.CDLL("libhiprtc.so") diff --git a/external-builds/pytorch/skip_tests/pytorch_2.11.py b/external-builds/pytorch/skip_tests/pytorch_2.11.py index b537c2a2fe8..1e38f59b78b 100644 --- a/external-builds/pytorch/skip_tests/pytorch_2.11.py +++ b/external-builds/pytorch/skip_tests/pytorch_2.11.py @@ -23,15 +23,6 @@ "test_memory_snapshot_with_cpp", # AssertionError: Scalars are not equal! "test_mempool_ctx_multithread", - # RuntimeError: Error building extension 'dummy_allocator' - "test_mempool_empty_cache_inactive", - # RuntimeError: Error building extension 'dummy_allocator_v1' - "test_mempool_limited_memory_with_allocator", - # new for pytorch 2.11 - # RuntimeError: Error building extension 'dummy_allocator_v3' - "test_tensor_delete_after_allocator_delete", - # RuntimeError: Error building extension 'dummy_allocator' - "test_deleted_mempool_not_used_on_oom", # ModuleNotFoundError: No module named 'torchvision' "test_resnet", # RuntimeError: miopenStatusUnknownError diff --git a/external-builds/pytorch/skip_tests/pytorch_2.8.py b/external-builds/pytorch/skip_tests/pytorch_2.8.py index 473df362e99..e4322612119 100644 --- a/external-builds/pytorch/skip_tests/pytorch_2.8.py +++ b/external-builds/pytorch/skip_tests/pytorch_2.8.py @@ -23,10 +23,6 @@ "test_memory_snapshot_script", "test_memory_snapshot_with_cpp", "test_mempool_ctx_multithread", - # RuntimeError: Error building extension 'dummy_allocator' - "test_mempool_empty_cache_inactive", - # RuntimeError: Error building extension 'dummy_allocator_v1' - 
"test_mempool_limited_memory_with_allocator", # This test was fixed in torch 2.9, see # https://github.com/ROCm/TheRock/issues/2206 "test_hip_device_count", diff --git a/external-builds/pytorch/skip_tests/pytorch_2.9.py b/external-builds/pytorch/skip_tests/pytorch_2.9.py index bc774a3e568..5f260b37aa8 100644 --- a/external-builds/pytorch/skip_tests/pytorch_2.9.py +++ b/external-builds/pytorch/skip_tests/pytorch_2.9.py @@ -33,10 +33,6 @@ "test_memory_snapshot_with_cpp", # AssertionError: Scalars are not equal! "test_mempool_ctx_multithread", - # RuntimeError: Error building extension 'dummy_allocator' - "test_mempool_empty_cache_inactive", - # RuntimeError: Error building extension 'dummy_allocator_v1' - "test_mempool_limited_memory_with_allocator", # for whatever reason these are also flaky: if run standalone they pass? # AttributeError: module 'torch.backends.cudnn.rnn' has no attribute 'fp32_precision' "test_fp32_precision_with_float32_matmul_precision",