diff --git a/easybuild/easyblocks/t/tensorflow.py b/easybuild/easyblocks/t/tensorflow.py index 433d72aba1f..fd57b6db022 100644 --- a/easybuild/easyblocks/t/tensorflow.py +++ b/easybuild/easyblocks/t/tensorflow.py @@ -170,7 +170,7 @@ def is_version_ok(version_range): 'com_github_googleapis_googleapis': '2.0.0:2.5.0', 'com_github_googlecloudplatform_google_cloud_cpp': '2.0.0:', # Not used due to $TF_NEED_GCP=0 'com_github_grpc_grpc': '2.2.0:', - 'com_googlesource_code_re2': '2.0.0:', # Requires the RE2 version with Abseil (or 2023-06-01+) + 'com_googlesource_code_re2': '2.0.0:', # Requires or 2023-06-01+ and building TF with system Abseil 'grpc': '2.0.0:2.2.0', } # Python packages installed as extensions or in the Python module @@ -424,8 +424,8 @@ def get_system_libs(self): libpaths.append(os.path.join(openssl_root, libpath)) if ignored_system_deps: - print_warning('%d TensorFlow dependencies have not been resolved by EasyBuild. Check the log for details.', - len(ignored_system_deps)) + print_warning('%d TensorFlow dependencies have not been resolved by EasyBuild. ' + "Search the log for 'TF_SYSTEM_LIBS' for details.", len(ignored_system_deps)) self.log.warning('For the following $TF_SYSTEM_LIBS dependencies TensorFlow will download a copy ' + 'because an EB dependency was not found: \n%s\n' + 'EC Dependencies: %s\n' + @@ -562,6 +562,8 @@ def configure_step(self): self._with_cuda = bool(cuda_root) + repo_env = {} # Variables that need to be passed as --repo_env to Bazel + config_env_vars = { 'CC_OPT_FLAGS': os.getenv('CXXFLAGS'), 'MPI_HOME': mpi_home, @@ -573,7 +575,6 @@ def configure_step(self): 'TF_NEED_CUDA': ('0', '1')[self._with_cuda], 'TF_NEED_OPENCL': ('0', '1')[bool(opencl_root)], 'TF_NEED_ROCM': '0', - 'TF_NEED_TENSORRT': '0', 'TF_SET_ANDROID_WORKSPACE': '0', 'TF_SYSTEM_LIBS': ','.join(self.system_libs_info[0]), } @@ -610,10 +611,10 @@ def configure_step(self): # Clang toggle since 2.14.0 if LooseVersion(self.version) > LooseVersion('2.13'): config_env_vars['TF_NEED_CLANG'] = '0' - # Hermietic python version since 2.14.0 + # Hermetic python version since 2.14.0 if LooseVersion(self.version) > LooseVersion('2.13'): pyver = det_python_version(self.python_cmd) - config_env_vars['TF_PYTHON_VERSION'] = '.'.join(pyver.split('.')[:2]) + repo_env['TF_PYTHON_VERSION'] = '.'.join(pyver.split('.')[:2]) if self._with_cuda: cuda_version = get_software_version('CUDA') @@ -625,18 +626,9 @@ def configure_step(self): else: compiler_path = which(os.getenv('CC'), on_error=ERROR) - # list of CUDA compute capabilities to use can be specifed in two ways (where (2) overrules (1)): - # (1) in the easyconfig file, via the custom cuda_compute_capabilities; - # (2) in the EasyBuild configuration, via --cuda-compute-capabilities configuration option; - ec_cuda_cc = self.cfg['cuda_compute_capabilities'] - cfg_cuda_cc = build_option('cuda_compute_capabilities') - cuda_cc = cfg_cuda_cc or ec_cuda_cc or [] + cuda_cc = self.cfg.get_cuda_cc_template_value("cuda_cc_space_sep", required=False).split() - if cfg_cuda_cc and ec_cuda_cc: - warning_msg = "cuda_compute_capabilities specified in easyconfig (%s) are overruled by " % ec_cuda_cc - warning_msg += "--cuda-compute-capabilities configuration option (%s)" % cfg_cuda_cc - print_warning(warning_msg) - elif not cuda_cc: + if not cuda_cc: warning_msg = "No CUDA compute capabilities specified, so using TensorFlow default " warning_msg += "(which may not be optimal for your system).\nYou should use " warning_msg += "the --cuda-compute-capabilities configuration option or the cuda_compute_capabilities " @@ -660,19 +652,25 @@ def configure_step(self): 'GCC_HOST_COMPILER_PATH': compiler_path, # This is the binutils bin folder: https://github.com/tensorflow/tensorflow/issues/39263 'GCC_HOST_COMPILER_PREFIX': self.binutils_bin_path, - 'TF_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc), - 'TF_CUDA_VERSION': cuda_maj_min_ver, }) # from v2.18 TF with CUDA needs this envs be set if LooseVersion(self.version) >= LooseVersion('2.18'): config_env_vars.update({ 'CUDA_NVCC': '1', - 'HERMETIC_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc), + 'HERMETIC_CUDA_VERSION': cuda_version, + 'HERMETIC_CUDA_COMPUTE_CAPABILITIES': ','.join(f"sm_{cc.replace('.', '')}" for cc in cuda_cc), + 'LOCAL_CUDA_PATH': cuda_root, + }) + else: + config_env_vars.update({ + 'TF_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc), + 'TF_CUDA_VERSION': cuda_maj_min_ver, }) - # for recent TensorFlow versions, $TF_CUDA_PATHS and $TF_CUBLAS_VERSION must also be set - if LooseVersion(self.version) >= LooseVersion('1.14'): + # for these TensorFlow versions, $TF_CUDA_PATHS and $TF_CUBLAS_VERSION must also be set + # TF 2.18 introduced "Hermetic CUDA" which doesn't use those env vars anymore + if '1.14' <= LooseVersion(self.version) < '2.18': # figure out correct major/minor version for CUBLAS from cublas_api.h cublas_api_header_glob_pattern = os.path.join(cuda_root, 'targets', '*', 'include', 'cublas_api.h') @@ -696,41 +694,71 @@ def configure_step(self): 'TF_CUDA_PATHS': cuda_root, 'TF_CUBLAS_VERSION': '.'.join(cublas_ver_parts), }) + elif LooseVersion(self.version) >= '2.18': + # TF_CUDA_PATHS replaced CUDNN_INSTALL_PATH, TENSORRT_INSTALL_PATH, NCCL_INSTALL_PATH, NCCL_HDR_PATH + # in 2.0. Version guard set to 2.18 to avoid potentially breaking older easyconfigs + repo_env['TF_CUDA_PATHS'] = cuda_root if cudnn_root: cudnn_version = get_software_version('cuDNN') cudnn_maj_min_patch_ver = '.'.join(cudnn_version.split('.')[:3]) - config_env_vars.update({ - 'CUDNN_INSTALL_PATH': cudnn_root, - 'TF_CUDNN_VERSION': cudnn_maj_min_patch_ver, - }) + if LooseVersion(self.version) >= '2.18': + repo_env['TF_CUDA_PATHS'] += ',' + cudnn_root + repo_env['TF_CUDNN_VERSION'] = cudnn_version + config_env_vars.update({ + 'LOCAL_CUDNN_PATH': cudnn_root, + 'HERMETIC_CUDNN_VERSION': cudnn_version, + }) + else: + config_env_vars.update({ + 'CUDNN_INSTALL_PATH': cudnn_root, + 'TF_CUDNN_VERSION': cudnn_maj_min_patch_ver, + }) else: raise EasyBuildError("TensorFlow has a strict dependency on cuDNN if CUDA is enabled") + if nccl_root: nccl_version = get_software_version('NCCL') # Ignore the PKG_REVISION identifier if it exists (i.e., report 2.4.6 for 2.4.6-1 or 2.4.6-2) nccl_version = nccl_version.split('-')[0] - config_env_vars.update({ - 'NCCL_INSTALL_PATH': nccl_root, - }) + if LooseVersion(self.version) >= '2.18': + repo_env['TF_CUDA_PATHS'] += ',' + nccl_root + config_env_vars['LOCAL_NCCL_PATH'] = nccl_root + else: + config_env_vars['NCCL_INSTALL_PATH'] = nccl_root else: nccl_version = '1.3' # Use simple downloadable version - config_env_vars.update({ - 'TF_NCCL_VERSION': nccl_version, - }) + if LooseVersion(self.version) >= '2.18': + repo_env['TF_NCCL_VERSION'] = nccl_version + else: + config_env_vars['TF_NCCL_VERSION'] = nccl_version + if tensorrt_root: tensorrt_version = get_software_version('TensorRT') - config_env_vars.update({ + tensor_rt_vars = { 'TF_NEED_TENSORRT': '1', - 'TENSORRT_INSTALL_PATH': tensorrt_root, 'TF_TENSORRT_VERSION': tensorrt_version, - }) + 'TENSORRT_INSTALL_PATH': tensorrt_root, + } + if LooseVersion(self.version) >= '2.18': + repo_env['TF_CUDA_PATHS'] += ',' + tensorrt_root + else: + tensor_rt_vars = {'TF_NEED_TENSORRT': '0'} + if LooseVersion(self.version) >= '2.18': + repo_env.update(tensor_rt_vars) + else: + config_env_vars.update(tensor_rt_vars) + + nvshmem_root = get_software_root('NVSHMEM') + if nvshmem_root and LooseVersion(self.version) >= '2.18': + repo_env['LOCAL_NVSHMEM_PATH'] = nvshmem_root configure_py_contents = read_file('configure.py') for key, val in sorted(config_env_vars.items()): if key.startswith('TF_') and key not in configure_py_contents: - self.log.warning('Did not find %s option in configure.py. Setting might not have any effect', key) + print_warning('Did not find %s option in configure.py. Setting might not have any effect', + key, log=self.log) env.setvar(key, val) # configure.py (called by configure script) already calls bazel to determine the bazel version @@ -744,11 +772,19 @@ def configure_step(self): cmd = self.cfg['preconfigopts'] + './configure ' + self.cfg['configopts'] run_shell_cmd(cmd) + tf_conf_bazelrc = os.path.join(self.start_dir, '.tf_configure.bazelrc') + + if LooseVersion(self.version) >= '2.17': + repo_env['WHEEL_NAME'] = 'tensorflow' + + write_file(tf_conf_bazelrc, + '\n'.join(f'build --repo_env {key}="{value}"' for key, value in repo_env.items()), + append=True) + # when building on Arm 64-bit we can't just use --copt=-mcpu=native (or likewise for any -mcpu=...), # because it breaks the build of XNNPACK; # see also https://github.com/easybuilders/easybuild-easyconfigs/issues/18899 if get_cpu_architecture() == AARCH64: - tf_conf_bazelrc = os.path.join(self.start_dir, '.tf_configure.bazelrc') regex_subs = [ # use --per_file_copt instead of --copt to selectively use -mcpu=native (not for XNNPACK), # the leading '-' ensures that -mcpu=native is *not* used when building XNNPACK; @@ -817,7 +853,7 @@ def patch_crosstool_files(self): regex_subs.extend([('-fPIE', '-fPIC'), ('"-pie"', '"-fPIC"')]) # patch all CROSSTOOL* scripts to fix hardcoding of locations of binutils/GCC binaries - for path, dirnames, filenames in os.walk(os.getcwd()): + for path, _dirnames, filenames in os.walk(os.getcwd()): for filename in filenames: if filename.startswith('CROSSTOOL'): full_path = os.path.join(path, filename) @@ -972,12 +1008,12 @@ def build_step(self): + self.target_opts + [self.cfg['buildopts']] ) - if LooseVersion(self.version) < LooseVersion('2.16'): + if LooseVersion(self.version) < '2.16': cmd += ['//tensorflow/tools/pip_package:build_pip_package'] - elif LooseVersion(self.version) < LooseVersion('2.17'): # for v2.16.x - cmd += ['//tensorflow/tools/pip_package:v2/wheel --repo_env=WHEEL_NAME=tensorflow'] + elif LooseVersion(self.version) < '2.17': # for v2.16.x + cmd += ['//tensorflow/tools/pip_package:v2/wheel'] else: - cmd += ['//tensorflow/tools/pip_package:wheel --repo_env=WHEEL_NAME=tensorflow'] + cmd += ['//tensorflow/tools/pip_package:wheel'] with self.set_tmp_dir(): run_shell_cmd(' '.join(cmd))