Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
name = 'PyTorch'
version = '1.12.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022a'}

sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
patches = [
'%(name)s-1.7.0_avoid-nan-in-test-torch.patch',
'%(name)s-1.7.0_disable-dev-shm-test.patch',
'%(name)s-1.8.1_dont-use-gpu-ccc-in-test.patch',
'%(name)s-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'%(name)s-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'%(name)s-1.10.0_skip_cmake_rpath.patch',
'%(name)s-1.11.0_increase-distributed-test-timeout.patch',
'%(name)s-1.11.0_increase_c10d_gloo_timeout.patch',
'%(name)s-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
]
checksums = [
None, # PyTorch-1.12.0.tar.gz
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
]

builddependencies = [
('CMake', '3.23.1'),
('hypothesis', '6.46.7'),
]
dependencies = [
('CUDA', '11.7.0', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.10.4'),
('protobuf', '3.19.4'),
('protobuf-python', '3.19.4'),
('pybind11', '2.9.2'),
('SciPy-bundle', '2022.05'),
('PyYAML', '6.0'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.4.2'),
('Pillow', '9.1.1'),
('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', True),
('magma', '2.6.2', '-CUDA-%(cudaver)s'),
('NCCL', '2.12.12', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

osdependencies = [('libibverbs-dev', 'libibverbs-devel', 'rdma-core-devel')]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = [
'3.5',
'3.7',
'5.2',
'6.0',
'6.1',
'7.0',
'7.2',
'7.5',
'8.0',
'8.6',
]

custom_opts = ['USE_CUPTI_SO=1']

excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least, they time out forever no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Test from this suite timeout often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
# This test fails constently when run as part of the test suite, but succeeds when run interactively
'test_model_dump',
# These tests appear flaky, possibly related to number of GPUs that are used
'distributed/fsdp/test_fsdp_memory',
'distributed/fsdp/test_fsdp_overlap',
]
}

runtest = "cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s"

# several tests are known to be flaky, and fail in some contexts (like having multiple GPUs available),
# so we allow up to 400 (out of ~90k) tests to fail before treating the installation to be faulty
max_failed_tests = 400

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['%(name)s-check-cpp-extension.py']

moduleclass = 'ai'