Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# EasyBuild easyconfig: PyTorch 1.11.0 built with the foss/2021b toolchain and CUDA support.
name = 'PyTorch'
version = '1.11.0'
# %(cudaver)s is an EasyBuild template value; presumably filled in from the version of the
# CUDA entry in `dependencies` - confirm.
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2021b'}

# NOTE(review): `github_account` is not set anywhere in this file, so %(github_account)s
# presumably relies on EasyBuild's default for that parameter - confirm it resolves to 'pytorch'.
source_urls = ['https://github.com/%(github_account)s/%(namelower)s/releases/download/v%(version)s']
sources = ['%(namelower)s-v%(version)s.tar.gz']
# Patch files applied on top of the release tarball. Many are carried over from easyconfigs for
# other PyTorch versions (see the version in each file name).
# The `checksums` list labels its entries in exactly this order, so keep both lists in sync
# whenever a patch is added, removed or moved.
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.10.0_fix-kineto-crash.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_fix-test-model_dump.patch',
'PyTorch-1.10.0_fix-vsx-vector-functions.patch',
'PyTorch-1.10.0_fix-XNNPACK-tests.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
'PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch',
'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
'PyTorch-1.11.0_fix-attention_cpp-compilation.patch',
'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
'PyTorch-1.11.0_fix_sharded_imports.patch',
'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
'PyTorch-1.11.0_fix-test_utils.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
'PyTorch-1.11.0_increase-tolerance-test_ops.patch',
'PyTorch-1.11.0_install-vsx-vec-headers.patch',
'PyTorch-1.11.0_skip_failing_ops_tests.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.11.1_skip-test_sibling_fusion.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
'PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch',
]
# One checksum per downloaded/applied file: the source tarball first, then every entry of
# `patches` in the same order (28 values in total). The 64-hex-digit values are presumably
# SHA256 digests - confirm.
# Each value is labelled with the file it belongs to, either in a trailing comment or, where
# the file name is long, in a comment on the line directly above the value.
checksums = [
'dc0c2b8d13c112a2b9ea8757a475b0ce2ca97cd19c50a8b70b8c286676616f1d', # pytorch-v1.11.0.tar.gz
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.10.0_fix-kineto-crash.patch
'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
# PyTorch-1.10.0_fix-test-model_dump.patch
'339148ae1a028cda6e750ac93fa38a599f66c7abe26586c9219f1a206ea14557',
# PyTorch-1.10.0_fix-vsx-vector-functions.patch
'7bef5f96cb83b2d655d2f76dd7468a171d446f0b3e06da2232ec7f886484d312',
# PyTorch-1.10.0_fix-XNNPACK-tests.patch
'd3e749a2a42efce463e3b8a1aebb21f0edf2256682c4417297d9a44a6210e5f8',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
'34ba476a7bcddec323bf9eca083cb4623d0f569d081aa3add3769c24f22849d2',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
# PyTorch-1.11.0_fix-attention_cpp-compilation.patch
'84214fcc7e30cf70659a7c3bd70bf11e73d58fd4f7fff2c233e3225619b0e42c',
'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13', # PyTorch-1.11.0_fix-fsdp-fp16-test.patch
'9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch
'21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
'4f7e25c4e2eb7094f92607df74488c6a4a35849fabf05fcf6c3655fa3f44a861', # PyTorch-1.11.0_fix-test_utils.patch
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
# PyTorch-1.11.0_increase_test_tolerances_TF32.patch
'26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba',
# PyTorch-1.11.0_increase-tolerance-test_ops.patch
'ceec745c68a405bba79efb4dc61c662ca84eb950cd0163c7104330f4bf614cf5',
'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch
'8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch
# PyTorch-1.11.1_skip-test_init_from_local_shards.patch
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
# PyTorch-1.11.1_skip-test_sibling_fusion.patch
'3d6f6395d98e8e4ad76b0b63c625fddf082cf7f066eb97d4d82401f96dab2555',
'63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349', # PyTorch-1.12.1_skip-test_round_robin.patch
# PyTorch-1.12.1_fix-test_wishart_log_prob.patch
'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45',
# PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch
'd97cd6b0570a167ecc3e631dc4ea884d95ace285cc38aa980566f4fec2c0d089',
]

# OS-level package requirement, given via a constant provided by EasyBuild
# (presumably the InfiniBand verbs development package - confirm).
osdependencies = [OS_PKG_IBVERBS_DEV]

# Build-time-only requirements as (name, version) pairs.
# NOTE(review): hypothesis is presumably only needed for running the test suite - confirm.
builddependencies = [
('CMake', '3.22.1'),
('hypothesis', '6.14.6'),
]

# Dependencies as (name, version[, versionsuffix[, toolchain]]) tuples.
# CUDA and cuDNN are pinned to SYSTEM as their toolchain, while magma and NCCL carry the same
# '-CUDA-%(cudaver)s' suffix but no explicit toolchain (presumably the easyconfig's own
# toolchain or a subtoolchain of it - confirm).
dependencies = [
('CUDA', '11.4.1', '', SYSTEM),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.6'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.7.1'),
('SciPy-bundle', '2021.10'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.3.2'),
('cuDNN', '8.2.2.26', '-CUDA-%(cudaver)s', SYSTEM),
('magma', '2.6.2', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']

# Extra option handed to the PyTorch build by the easyblock.
# NOTE(review): presumably makes PyTorch link against the shared CUPTI library - confirm.
custom_opts = ["USE_CUPTI_SO=1"]

# Test suites to leave out of the test run; the value is substituted into `runtest` through the
# %(excluded_tests)s template. The '' key presumably means "applies to every architecture" -
# confirm against the PyTorch easyblock.
# Entries that are commented out are NOT excluded, i.e. those suites still run.
excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least; they time out no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# NOTE(review): this easyconfig depends on NCCL 2.10.3, so the remark above may be stale.
# 'distributed/test_distributed_fork',
# 'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Tests from this suite time out often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
# This test fails consistently when run as part of the test suite, but succeeds when run interactively
# 'test_model_dump',
# These tests appear flaky, possibly related to number of GPUs that are used
'distributed/fsdp/test_fsdp_memory',
'distributed/fsdp/test_fsdp_overlap',
]
}

# Command used to run the PyTorch test suite from the `test` subdirectory. Output is unbuffered,
# the run continues past failing suites, and %(excluded_tests)s is filled in from `excluded_tests`.
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# Several tests are known to be flaky and fail in some contexts (like having multiple GPUs available),
# so we allow up to 10 (out of ~90k) tests to fail before treating the installation as faulty.
# For the RTX 6000 on Skylake, that number might be up to 24.
max_failed_tests = 10
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the merge of easybuilders/easybuild-easyblocks#2794 I'm guessing this will need to be higher. But let's see how many tests actually fail first, it might not be all that many since we still patched failing tests when the original EasyConfig 1.11.0 was developed :)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To (hopefully) add to this: I tried to install PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb which originally failed.
So I added the max_failed_tests = 10 to the EasyConfig file and tried to install it like that:

eb --include-easyblocks-from-pr=2794  --cuda-compute-capabilities=7.5 PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb

I got:

WARNING: 0 test failure, 463 test errors (out of 57757):
distributed/pipeline/sync/skip/test_gpipe (12 skipped, 1 warning, 1 error)
distributed/pipeline/sync/skip/test_leak (1 warning, 8 errors)
distributed/pipeline/sync/test_bugs (1 skipped, 1 warning, 3 errors)
distributed/pipeline/sync/test_inplace (2 xfailed, 1 warning, 1 error)
distributed/pipeline/sync/test_pipe (1 passed, 8 skipped, 1 warning, 47 errors)
distributed/pipeline/sync/test_transparency (1 warning, 1 error)
distributed/rpc/cuda/test_tensorpipe_agent (107 total tests, errors=1)
distributed/rpc/test_faulty_agent (28 total tests, errors=28)
distributed/rpc/test_tensorpipe_agent (424 total tests, errors=412)
distributed/test_store (19 total tests, errors=1)

I guess we need to do a bit more tuning here. :-)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, see the changes in #16339

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test report from installation.

The installation failed with:

Running test_xnnpack_integration ... [2022-10-12 02:18:04.597373]
Executing ['/sw-eb/software/Python/3.9.6-GCCcore-11.2.0/bin/python', 'test_xnnpack_integration.py', '-v'] ... [2022-10-12 02:18:04.597482]
/dev/shm/hpcsw/eb-kabv0vz7/tmpcfnhusx2/lib/python3.9/site-packages/torch/cuda/__init__.py:82: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up
 environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at  /dev/shm/hpcsw/PyTorch/1.11.0/foss-202
1b-CUDA-11.4.1/pytorch/c10/cuda/CUDAFunctions.cpp:112.)
  return torch._C._cuda_getDeviceCount() > 0
test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass) ... /dev/shm/hpcsw/eb-kabv0vz7/tmpcfnhusx2/lib/python3.9/site-packages/torch/testing/_internal/common_utils.py:424: UserWarning: 
Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered in
ternally at  /dev/shm/hpcsw/PyTorch/1.11.0/foss-2021b-CUDA-11.4.1/pytorch/c10/core/TensorImpl.h:1460.)
  return callable(*args, **kwargs)
ok
test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
test_conv2d (__main__.TestXNNPACKOps) ... ok
test_conv2d_transpose (__main__.TestXNNPACKOps) ... ok
test_linear (__main__.TestXNNPACKOps) ... ok
test_linear_1d_input (__main__.TestXNNPACKOps) ... ok
test_decomposed_linear (__main__.TestXNNPACKRewritePass) ... ok
test_linear (__main__.TestXNNPACKRewritePass) ... ok
test_combined_model (__main__.TestXNNPACKSerDes) ... ok
test_conv2d (__main__.TestXNNPACKSerDes) ... ok
test_conv2d_transpose (__main__.TestXNNPACKSerDes) ... ok
test_linear (__main__.TestXNNPACKSerDes) ... ok

----------------------------------------------------------------------
Ran 12 tests in 141.679s

OK (skipped=1)
distributed/pipeline/sync/skip/test_gpipe failed!
distributed/pipeline/sync/skip/test_leak failed!
distributed/pipeline/sync/test_bugs failed!
distributed/pipeline/sync/test_inplace failed!
distributed/pipeline/sync/test_pipe failed!
distributed/pipeline/sync/test_transparency failed!
distributed/rpc/cuda/test_tensorpipe_agent failed!
distributed/rpc/test_faulty_agent failed!
distributed/rpc/test_tensorpipe_agent failed!
distributed/test_store failed!
distributions/test_distributions failed!

== 2022-10-12 02:20:34,765 filetools.py:382 INFO Path /dev/shm/hpcsw/eb-kabv0vz7/tmpcfnhusx2 successfully removed.
== 2022-10-12 02:20:36,991 pytorch.py:344 WARNING 0 test failure, 24 test errors (out of 88784):
distributed/pipeline/sync/skip/test_gpipe (12 skipped, 1 warning, 1 error)
distributed/pipeline/sync/skip/test_leak (1 warning, 8 errors)
distributed/pipeline/sync/test_bugs (1 skipped, 1 warning, 3 errors)
distributed/pipeline/sync/test_inplace (2 xfailed, 1 warning, 1 error)
distributed/pipeline/sync/test_pipe (1 passed, 8 skipped, 1 warning, 47 errors)
distributed/pipeline/sync/test_transparency (1 warning, 1 error)
distributions/test_distributions (216 total tests, errors=3, skipped=5)

The PyTorch test suite is known to include some flaky tests, which may fail depending on the specifics of the system or the context in which they are run. For this PyTorch installation, EasyBuild allows up to 10 tests to fail. We recommend to double check that the failing tests listed above  are known to be flaky, or do not affect your intended usage of PyTorch. In case of doubt, reach out to the EasyBuild community (via GitHub, Slack, or mailing list).
== 2022-10-12 02:20:37,273 build_log.py:169 ERROR EasyBuild crashed with an error (at easybuild/base/exceptions.py:124 in __init__): Too many failed tests (24), maximum allowed is 10 (at easybuild/easyblocks/pytorch.py:348 in test_step)
== 2022-10-12 02:20:37,275 build_log.py:265 INFO ... (took 1 hour 58 mins 12 secs)
== 2022-10-12 02:20:37,278 filetools.py:2014 INFO Removing lock /sw-eb/software/.locks/_sw-eb_software_PyTorch_1.11.0-foss-2021b-CUDA-11.4.1.lock...
== 2022-10-12 02:20:37,280 filetools.py:382 INFO Path /sw-eb/software/.locks/_sw-eb_software_PyTorch_1.11.0-foss-2021b-CUDA-11.4.1.lock successfully removed.
== 2022-10-12 02:20:37,281 filetools.py:2018 INFO Lock removed: /sw-eb/software/.locks/_sw-eb_software_PyTorch_1.11.0-foss-2021b-CUDA-11.4.1.lock
== 2022-10-12 02:20:37,281 easyblock.py:4089 WARNING build failed (first 300 chars): Too many failed tests (24), maximum allowed is 10
== 2022-10-12 02:20:37,283 easyblock.py:319 INFO Closing log for application name PyTorch version 1.11.0

That was done before any changes were made.
Unfortunately, the log-file is too large to actually upload it.

Copy link
Copy Markdown
Contributor

@casparvl casparvl Oct 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused: 2 days ago you had 400-something errors (with pretty much all tests in distributed/rpc/test_tensorpipe_agent failing), and yesterday you had 24. What was the difference between these two runs?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for causing confusion. Two days ago I was trying out the new EasyBlock with PyTorch-1.10.0-foss-2021a-CUDA-11.3.1.eb, which is not the EasyConfig in this PR. It was my mistake to post that result here. Apologies.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok, clear then!

On a side note, I see the recent EasyBlock fails to properly count everything... Reported this in an issue and will fix it later.


# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite
# The doubled %% keeps %(pyshortver)s intact through the Python string formatting below, so that
# only SHLIB_EXT is substituted here and the template is left for EasyBuild to resolve.
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
# Passes only if the library has an RPATH/RUNPATH entry that is not a 'stubs' directory
# (the final grep exits non-zero when no such line is left).
# 'grep -E' is used instead of 'egrep': egrep is obsolescent and recent GNU grep releases print a
# warning every time it is invoked; the matching behaviour is identical.
sanity_check_commands = [
    "readelf -d %s | grep -E 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
# Additional test script for the installation
# (presumably verifies that C++ extensions can be built against it - confirm).
tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'