From b75a718dae4480b4d446cbc1ec22669af88894fc Mon Sep 17 00:00:00 2001
From: Brian Van Essen
Date: Fri, 22 Sep 2023 15:06:02 -0700
Subject: [PATCH] Ci enable distconv (#2235)

* Enable CI testing for DistConv.

  Added DistConv CI tests
  Added Corona DistConv test and disabled FFT on ROCm
  Ensure that DistConv tests keep error signals
  Enable NVSHMEM on Lassen
  Added a multi-stage pipeline for Lassen
  Fixed a typo and disabled other tests.
  Added spack environment
  Added check stage for the catch tests
  Added the definition of the RESULTS_DIR environment variable
  Added release notes.
  Fixed the launcher for catch tests
  Changed the batch launch commands to be interactive to block completion.
  Added a wrapper shell script for launching the unit tests
  Added the number of nodes for the unit test.
  Cleaning up launching paths
  Added execute permissions for unit test script.
  Ingest the Spack dependent environment information.
  Fixing launch command and exclusion of externallayer
  Bugfix python
  Added number of tasks per node
  Added integration tests.
  Set some NVSHMEM runtime variables
  Uniquify the CI JOB_NAME fields for DistConv tests.

* Re-introduced the WITH_CLEAN_BUILD flag.
* Adapting new tests to the new build script framework with modules.
* Increased the time limit for the build on Lassen. Code cleanup.
* Removed duplicate get_distconv_environment function in the ci_test common
  python tools. Switched all tests to using the standard contrib args version.
* Changed the default behavior on MIOpen systems to use a local cache for JIT state.
* Added back note about existing issue in DiHydrogen.
* Enable CI runs to specify a subset of unit tests to run.
* Tweaking the allowed runtimes for tests.
* Debugging the test selection. Increasing some test time limits.
* Added test filter flags to all systems.
* Increasing time limits
* Added flags to skip integration tests on distconv CI runs.
* Bumped up pooling time limit.
* Testing out setting a set of MIOpen DB cache directories for CI testing,
  both for normal users and lbannusr.
* Adding caching options for Corona and changed how the username is queried.
* Updated CI tests to use common MIOpen caches. Split user and custom cache paths.
* Fix the lassen multi-stage pipeline to record the spack architecture.
* Increase the build time limit on Lassen.
* Fixed the new lassen build to avoid installing pytest through spack.
* Added the clean build flags into the multi-stage pipeline.
* Skip failing tests in distconv.
* Changed the test utils to set the cluster value to None, rather than
  "unset", when it is not set.
* Added support for passing in the system cluster name by default if it is known.
* Cleanup the paths for the MIOpen caches.
* Added a guard to skip the inplace test if DistConv is disabled.
* Removing unnecessary variable definitions.
* ResNet tests should run on Corona.
* Added support in the data coordinator for explicitly recording the
  dimensions of each field. This is then compared to the dimensions reported
  from the data reader, or if there are no valid data readers, it can be
  substituted. Note that for the time being this is redundant information,
  but it allows the catch2 test to properly construct a model without data
  readers. This fixes a bug seen on Corona where the MPI catch2 tests were
  failing because they allocated a set of buffers with a size of -1. Cleaned
  up the way in which the data coordinator checks for linearized size to
  reduce code duplication. Switched the data type of the data field
  dimensions to use El::Int rather than int values.
  Added a utility function to type cast between two vectors (see the sketch
  below).
* Force lassen to clean build.
* Fixed the legacy HDF5 data reader.
* Increased the timeout for the lassen build and test.
* Bumped up the time limit on the catch tests for ROCm systems.
* Increase the catch test sizes.
* Trying to avoid forcing static linking when using NVSHMEM.
* Changed the run catch tests script for flux to use a flux proxy.
* Export the lbann setup for Lassen unit and integration tests.
* Minimize what is saved from the catch2 unit tests.
* Cleaning up the environment variables.
* Added a flag to extend the spack env name.
* Tweaking the flux proxy.
* Change how the NVSHMEM variables are set up so that the .before_script
  sections do not collide.
* Removed the -o cpu-affinity=per-task flag from the flux run commands on the
  catch2 tests because it is causing a hang on Corona. Removed the nested flux
  proxy commands in the flux catch2 tests since they should be unnecessary due
  to the flux proxy command that invokes the script.
* Tweak the flux commands to resolve a hang on the Corona catch tests.
* Cleaning up the flux launch commands on Tioga and Corona to help avoid a hang.
* Added a job name suffix variable.
* Ensure that the spack environment names are unique.
* Tightened up the inclusion of the LBANN Python packages to avoid conflicts
  when using the test_compiler script to build LBANN.
* Added support to pip install into the lbann build directory. Removed setting
  the --test=root command from the extra root packages to avoid triggering
  spack build failures on Power systems.
* Updated the baseline modules used on Corona and package versions on Lassen.
* Fixing the allocation flux command for Tioga.
* Changing it so that only Corona adds the -o pmi=pmix flags to flux.
* Enable module generation for multiple core compilers.
* Making the flux commands consistent.
* Applied clang format.
* Fixed the compiler path on Pascal.
* Reenable lassen multi-stage distconv test pipeline.
* Fixed how the new Lassen distconv tests are invoked and avoid erroneously
  re-setting up the spack environment. Changed the saved spack environment
  name to SPACK_ENV_NAME. Cleaned up some dead code.
* Added a second if clause to the integration tests so that there is always at
  least one true clause, so the stage will schedule. Fixed the regex so that
  the distconv substring doesn't have to come at the start of the string.
* Consolidated the rules clause into a common one.
* Fix the rules regex.
* Added corona numbers for resnet.
* Tweaking the CI rules to avoid integrations on distconv builds.
* Tweaking how the lassen unit tests are called.
* Disable nvshmem build on Lassen. Code cleanup and adding suggestions.
* Changed the guard in the resnet50 test.
* Disable NVSHMEM environment variables.
* Disabled Lassen DistConv unit tests.
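  The vector type-cast helper mentioned above appears to live in
  include/lbann/utils/dim_helpers.hpp (only its diffstat entry is visible in
  this excerpt), so the sketch below is illustrative rather than the actual
  implementation: the name vector_cast is a placeholder, and std::int64_t
  stands in for El::Int so the example stays self-contained.

      #include <cstdint>
      #include <vector>

      // Hypothetical stand-in for the dim_helpers.hpp utility: cast every
      // element of one vector type to another via the range constructor.
      template <typename To, typename From>
      std::vector<To> vector_cast(std::vector<From> const& in)
      {
        return std::vector<To>(in.begin(), in.end());
      }

      int main()
      {
        // Dimensions previously carried around as int can be widened once
        // to the El::Int-style index type used when sizing field buffers.
        std::vector<int> dims = {4, 16, 16};
        std::vector<std::int64_t> field_dims = vector_cast<std::int64_t>(dims);
        return field_dims.size() == dims.size() ? 0 : 1;
      }

  Doing the conversion in one helper keeps the rest of the buffer-allocation
  code working in a single index type instead of casting at each call site.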
* Apply suggestions from code review Co-authored-by: Tom Benson --------- Co-authored-by: Tom Benson --- .gitlab-ci.yml | 53 ++++++ .gitlab/common/common.yml | 14 +- .gitlab/common/run-catch-tests-flux.sh | 11 +- .gitlab/common/run-catch-tests-lsf.sh | 78 ++++++++ .gitlab/corona/pipeline.yml | 13 +- .gitlab/lassen/multi_stage_pipeline.yml | 168 ++++++++++++++++++ .gitlab/lassen/run_integration_tests.sh | 37 ++++ .gitlab/lassen/run_unit_tests.sh | 36 ++++ .gitlab/pascal/pipeline.yml | 10 +- .gitlab/tioga/pipeline.yml | 14 +- CMakeLists.txt | 2 +- ReleaseNotes.txt | 2 + ci_test/common_python/test_util.py | 14 +- ci_test/common_python/tools.py | 39 ++-- .../test_integration_resnet50.py | 11 +- .../unit_tests/test_unit_inplace_distconv.py | 11 +- ...test_unit_layer_batched_matmul_distconv.py | 3 +- ...t_unit_layer_batchnorm_mem_opt_distconv.py | 6 +- ...er_channelwise_fully_connected_distconv.py | 3 +- .../test_unit_layer_convolution_distconv.py | 7 +- ...unit_layer_convolution_mem_opt_distconv.py | 5 +- .../test_unit_layer_gather_distconv.py | 3 +- .../test_unit_layer_identity_distconv.py | 3 +- .../test_unit_layer_leaky_relu_distconv.py | 3 +- .../test_unit_layer_pooling_distconv.py | 8 +- .../test_unit_layer_relu_distconv.py | 3 +- .../test_unit_layer_scatter_distconv.py | 3 +- .../buffered_data_coordinator.hpp | 4 +- .../data_coordinator/data_coordinator.hpp | 6 +- .../data_coordinator_metadata.hpp | 7 +- include/lbann/data_readers/data_reader.hpp | 4 +- .../lbann/data_readers/data_reader_HDF5.hpp | 6 +- .../lbann/data_readers/data_reader_csv.hpp | 2 +- .../data_readers/data_reader_hdf5_legacy.hpp | 4 +- .../lbann/data_readers/data_reader_image.hpp | 2 +- .../data_readers/data_reader_jag_conduit.hpp | 2 +- .../data_reader_merge_features.hpp | 2 +- .../data_reader_merge_samples.hpp | 4 +- .../lbann/data_readers/data_reader_mesh.hpp | 4 +- .../data_reader_npz_ras_lipid.hpp | 2 +- .../lbann/data_readers/data_reader_numpy.hpp | 4 +- .../data_readers/data_reader_numpy_npz.hpp | 4 +- .../data_reader_numpy_npz_conduit.hpp | 4 +- .../data_reader_pilot2_molecular.hpp | 4 +- .../lbann/data_readers/data_reader_python.hpp | 2 +- .../lbann/data_readers/data_reader_smiles.hpp | 2 +- .../data_readers/data_reader_synthetic.hpp | 16 +- include/lbann/layers/io/input_layer.hpp | 2 +- include/lbann/utils/dim_helpers.hpp | 6 + python/lbann/contrib/args.py | 1 + python/lbann/contrib/lc/launcher.py | 17 +- python/lbann/contrib/olcf/launcher.py | 5 +- python/lbann/launcher/flux.py | 5 - scripts/build_lbann.sh | 17 +- scripts/customize_build_env.sh | 12 +- .../buffered_data_coordinator.cpp | 14 +- src/data_coordinator/data_coordinator.cpp | 101 ++++------- src/data_readers/data_reader_HDF5.cpp | 4 +- src/data_readers/data_reader_hdf5_legacy.cpp | 2 +- src/data_readers/data_reader_jag_conduit.cpp | 6 +- .../data_reader_merge_samples.cpp | 4 +- src/data_readers/data_reader_python.cpp | 4 +- src/data_readers/data_reader_synthetic.cpp | 8 +- .../unit_test/data_reader_synthetic_test.cpp | 10 +- .../data_reader_synthetic_test_public_api.cpp | 12 +- src/layers/io/input_layer.cpp | 12 +- src/proto/proto_common.cpp | 6 +- src/utils/lbann_library.cpp | 14 +- src/utils/summary.cpp | 2 +- 69 files changed, 678 insertions(+), 231 deletions(-) create mode 100755 .gitlab/common/run-catch-tests-lsf.sh create mode 100644 .gitlab/lassen/multi_stage_pipeline.yml create mode 100755 .gitlab/lassen/run_integration_tests.sh create mode 100755 .gitlab/lassen/run_unit_tests.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 
55d0a967cba..26695f9b98b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -40,6 +40,19 @@ corona testing: strategy: depend include: .gitlab/corona/pipeline.yml +corona distconv testing: + stage: run-all-clusters + variables: + JOB_NAME_SUFFIX: _distconv + SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" + SPACK_SPECS: "+rocm +distconv" + WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" + WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" + TEST_FLAG: "test_*_distconv.py" + trigger: + strategy: depend + include: .gitlab/corona/pipeline.yml + lassen testing: stage: run-all-clusters variables: @@ -49,6 +62,20 @@ lassen testing: strategy: depend include: .gitlab/lassen/pipeline.yml +lassen distconv testing: + stage: run-all-clusters + variables: + JOB_NAME_SUFFIX: _distconv + SPACK_ENV_BASE_NAME_MODIFIER: "-multi-stage-distconv" + SPACK_SPECS: "+cuda +distconv +fft" +# SPACK_SPECS: "+cuda +distconv +nvshmem +fft" + WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" + WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" + TEST_FLAG: "test_*_distconv.py" + trigger: + strategy: depend + include: .gitlab/lassen/multi_stage_pipeline.yml + pascal testing: stage: run-all-clusters variables: @@ -68,6 +95,19 @@ pascal compiler testing: strategy: depend include: .gitlab/pascal/pipeline_compiler_tests.yml +pascal distconv testing: + stage: run-all-clusters + variables: + JOB_NAME_SUFFIX: _distconv + SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" + BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" + WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" + WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" + TEST_FLAG: "test_*_distconv.py" + trigger: + strategy: depend + include: .gitlab/pascal/pipeline.yml + tioga testing: stage: run-all-clusters variables: @@ -76,3 +116,16 @@ tioga testing: trigger: strategy: depend include: .gitlab/tioga/pipeline.yml + +tioga distconv testing: + stage: run-all-clusters + variables: + JOB_NAME_SUFFIX: _distconv + SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" + SPACK_SPECS: "+rocm +distconv" + WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" + WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" + TEST_FLAG: "test_*_distconv.py" + trigger: + strategy: depend + include: .gitlab/tioga/pipeline.yml diff --git a/.gitlab/common/common.yml b/.gitlab/common/common.yml index e2bdeaf1a8b..8e824f80656 100644 --- a/.gitlab/common/common.yml +++ b/.gitlab/common/common.yml @@ -29,11 +29,11 @@ variables: # This is based on the assumption that each runner will only ever # be able to run one pipeline on a given cluster at one time. - SPACK_ENV_BASE_NAME: gitlab-${CI_COMMIT_BRANCH}-${GITLAB_USER_LOGIN}-${SYSTEM_NAME}${SPACK_ENV_BASE_NAME_EXTENSION}-${CI_RUNNER_SHORT_TOKEN} + SPACK_ENV_BASE_NAME: gitlab${SPACK_ENV_BASE_NAME_MODIFIER}-${CI_COMMIT_BRANCH}-${GITLAB_USER_LOGIN}-${SYSTEM_NAME}${SPACK_ENV_BASE_NAME_EXTENSION}-${CI_RUNNER_SHORT_TOKEN} # This variable is the name used to identify the job in the Slurm # queue. We need this to be able to access the correct jobid. - JOB_NAME: ${CI_PROJECT_NAME}_${CI_PIPELINE_ID} + JOB_NAME: ${CI_PROJECT_NAME}_${CI_PIPELINE_ID}${JOB_NAME_SUFFIX} # This is needed to ensure that we run as lbannusr. 
LLNL_SERVICE_USER: lbannusr @@ -105,7 +105,7 @@ - ml use ${LBANN_MODFILES_DIR} - ml load lbann - echo "Using LBANN binary $(which lbann)" - - echo "export SPACK_DEP_ENV_NAME=${SPACK_ENV_NAME}" > spack-ci-env-name.sh + - echo "export SPACK_ENV_NAME=${SPACK_ENV_NAME}" > spack-ci-env-name.sh - echo "export SPACK_ARCH=${SPACK_ARCH}" >> spack-ci-env-name.sh - echo "export SPACK_ARCH_TARGET=${SPACK_ARCH_TARGET}" >> spack-ci-env-name.sh - echo "export LBANN_BUILD_PARENT_DIR=${LBANN_BUILD_PARENT_DIR}" >> spack-ci-env-name.sh @@ -137,7 +137,13 @@ - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/*.cmake - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/CMakeCache.txt - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/build.ninja - - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/unit_test/* - ${RESULTS_DIR}/* exclude: - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/**/*.o + - builds/lbann_${SYSTEM_NAME}_${SPACK_ENV_BASE_NAME}-*${CI_CONCURRENT_ID}-*/build/unit_test/* + +.lbann-test-rules: + rules: + - if: $JOB_NAME_SUFFIX == "_distconv" + when: never + - if: $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME == $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME diff --git a/.gitlab/common/run-catch-tests-flux.sh b/.gitlab/common/run-catch-tests-flux.sh index 236b330d925..b06d4d4dd09 100755 --- a/.gitlab/common/run-catch-tests-flux.sh +++ b/.gitlab/common/run-catch-tests-flux.sh @@ -52,14 +52,9 @@ export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${LD_LIBRARY_PATH} cd ${LBANN_BUILD_DIR} - -flux run --label-io -n4 -N2 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task sh -c 'taskset -cp $$; printenv | grep VISIBLE' | sort - -flux run --label-io -n4 -N2 -g 1 -o cpu-affinity=off -o gpu-affinity=per-task sh -c 'taskset -cp $$; printenv | grep VISIBLE' | sort - echo "Running sequential catch tests" -flux run -N 1 -n 1 -g 1 -t 5m \ +flux run -N 1 -n 1 --exclusive -o nosetpgrp ${EXTRA_FLUX_ARGS} -t 5m \ ./unit_test/seq-catch-tests \ -r JUnit \ -o ${OUTPUT_DIR}/seq-catch-results.xml @@ -71,7 +66,7 @@ echo "Running MPI catch tests with ${LBANN_NNODES} nodes and ${TEST_TASKS_PER_NO flux run \ -N ${LBANN_NNODES} -n $((${TEST_TASKS_PER_NODE} * ${LBANN_NNODES})) \ - -g 1 -t 5m -o gpu-affinity=per-task -o cpu-affinity=per-task -o mpibind=off \ + -t 5m --exclusive -o nosetpgrp ${EXTRA_FLUX_ARGS} \ ./unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]"\ -r JUnit \ -o "${OUTPUT_DIR}/mpi-catch-results-rank=%r-size=%s.xml" @@ -83,7 +78,7 @@ echo "Running MPI filesystem catch tests" flux run \ -N ${LBANN_NNODES} -n $((${TEST_TASKS_PER_NODE} * ${LBANN_NNODES})) \ - -g 1 -t 5m -o gpu-affinity=per-task -o cpu-affinity=per-task -o mpibind=off \ + -t 5m --exclusive -o nosetpgrp ${EXTRA_FLUX_ARGS} \ ./unit_test/mpi-catch-tests -s "[filesystem]" \ -r JUnit \ -o "${OUTPUT_DIR}/mpi-catch-filesystem-results-rank=%r-size=%s.xml" diff --git a/.gitlab/common/run-catch-tests-lsf.sh b/.gitlab/common/run-catch-tests-lsf.sh new file mode 100755 index 00000000000..97cd4029ea9 --- /dev/null +++ b/.gitlab/common/run-catch-tests-lsf.sh @@ -0,0 +1,78 @@ +################################################################################ +## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. 
+## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +#!/bin/bash +cd ${LBANN_BUILD_DIR} + +# Configure the output directory +OUTPUT_DIR=${CI_PROJECT_DIR}/${RESULTS_DIR} +if [[ -d ${OUTPUT_DIR} ]]; +then + rm -rf ${OUTPUT_DIR} +fi +mkdir -p ${OUTPUT_DIR} + +FAILED_JOBS="" + +lrun -N 1 -n 1 -W 5 \ + ./unit_test/seq-catch-tests \ + -r JUnit \ + -o ${OUTPUT_DIR}/seq-catch-results.xml +if [[ $? -ne 0 ]]; then + FAILED_JOBS+=" seq" +fi + +lrun -N ${LBANN_NNODES} -n $(($TEST_TASKS_PER_NODE * ${LBANN_NNODES})) \ + -T $TEST_TASKS_PER_NODE \ + -W 5 ${TEST_MPIBIND_FLAG} \ + ./unit_test/mpi-catch-tests "exclude:[externallayer]" "exclude:[filesystem]" \ + -r JUnit \ + -o "${OUTPUT_DIR}/mpi-catch-results-rank=%r-size=%s.xml" +if [[ $? -ne 0 ]]; then + FAILED_JOBS+=" mpi" +fi + +lrun -N ${LBANN_NNODES} -n $(($TEST_TASKS_PER_NODE * ${LBANN_NNODES})) \ + -T $TEST_TASKS_PER_NODE \ + -W 5 ${TEST_MPIBIND_FLAG} \ + ./unit_test/mpi-catch-tests "[filesystem]" \ + -r JUnit \ + -o "${OUTPUT_DIR}/mpi-catch-filesystem-results-rank=%r-size=%s.xml" +if [[ $? -ne 0 ]]; +then + FAILED_JOBS+=" mpi-filesystem" +fi + +# Try to write a semi-useful message to this file since it's being +# saved as an artifact. It's not completely outside the realm that +# someone would look at it. +if [[ -n "${FAILED_JOBS}" ]]; +then + echo "Some Catch2 tests failed:${FAILED_JOBS}" > ${OUTPUT_DIR}/catch-tests-failed.txt +fi + +# Return "success" so that the pytest-based testing can run. 
+exit 0 diff --git a/.gitlab/corona/pipeline.yml b/.gitlab/corona/pipeline.yml index 0039a19128a..60d76e2a120 100644 --- a/.gitlab/corona/pipeline.yml +++ b/.gitlab/corona/pipeline.yml @@ -55,7 +55,7 @@ allocate lc resources: - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "120m") - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") - export FLUX_F58_FORCE_ASCII=t - - jobid=$(flux --parent alloc -N ${LBANN_NNODES} -g 1 -t ${TEST_TIME} --job-name=${JOB_NAME} --bg) + - jobid=$(flux --parent alloc -N ${LBANN_NNODES} --exclusive -t ${TEST_TIME} --job-name=${JOB_NAME} --bg) - export JOB_ID=$jobid timeout: 6h @@ -79,6 +79,7 @@ build and install: - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux mini run -N 1 spack arch) - export SPACK_ARCH_TARGET=$(flux proxy ${JOB_ID} flux mini run -N 1 spack arch -t) + - export EXTRA_FLUX_ARGS="-o pmi=pmix" - !reference [.setup_lbann, script] - flux proxy ${JOB_ID} .gitlab/common/run-catch-tests-flux.sh @@ -97,7 +98,8 @@ unit tests: - export OMP_NUM_THREADS=10 - "export FLUX_JOB_ID=$(flux jobs -no {id}:{name} | grep ${JOB_NAME} | awk -F: '{print $1}')" - cd ci_test/unit_tests - - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml + # - echo "Running unit tests with file pattern: ${TEST_FLAG}" + - flux proxy ${FLUX_JOB_ID} python3 -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG} artifacts: when: always paths: @@ -114,6 +116,8 @@ integration tests: stage: test dependencies: - build and install + rules: + - !reference [.lbann-test-rules, rules] script: - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS ==" - echo "Testing $(which lbann)" @@ -121,8 +125,9 @@ integration tests: - "export FLUX_JOB_ID=$(flux jobs -no {id}:{name} | grep ${JOB_NAME} | awk -F: '{print $1}')" - cd ci_test/integration_tests - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly} - - echo "python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml" - - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml + # - echo "Running integration tests with file pattern: ${TEST_FLAG}" + # - echo "python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}" + - flux proxy ${FLUX_JOB_ID} python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG} artifacts: when: always paths: diff --git a/.gitlab/lassen/multi_stage_pipeline.yml b/.gitlab/lassen/multi_stage_pipeline.yml new file mode 100644 index 00000000000..212fe55f48d --- /dev/null +++ b/.gitlab/lassen/multi_stage_pipeline.yml @@ -0,0 +1,168 @@ +################################################################################ +## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. 
You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# This is the testing pipeline for the Lassen cluster at LLNL. This +# cluster builds the LBANN applications and libraries using a single +# compiler toolchain and then runs a collection of tests. Testing +# output is in JUnit format and parsed by the pipeline for web +# viewing. + +include: + - .gitlab/common/common.yml + +stages: + - build + - test + - cleanup + +# Build LBANN and establish the Spack environment for this pipeline. +build and install: + extends: + - .lassen common + - .lbann-base-vars + - .lbann-artifacts + stage: build + script: + - echo "== BUILDING LBANN ==" + - !reference [.setup_spack, script] + - lalloc 1 -W 60 -q pdebug ./scripts/build_lbann.sh ${SPACK_DEPS_FLAG} + -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + -p py-scipy@1.8.1 --pip pytest --pip tqdm -- + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + - export TEST_TASKS_PER_NODE=2 + - export LBANN_NNODES=1 + - export TEST_MPIBIND_FLAG="--mpibind=off" + - export SPACK_ARCH=$(spack arch) + - export SPACK_ARCH_TARGET=$(spack arch -t) + - !reference [.setup_lbann, script] + - lalloc ${LBANN_NNODES} -W 45 -q pdebug .gitlab/common/run-catch-tests-lsf.sh + timeout: 2h + +# Run the Python-based unit tests. +unit tests: + extends: + - .lassen common + - .lbann-base-vars + - .uses spack environment + stage: test + dependencies: + - build and install + script: + - echo "== RUNNING PYTHON-BASED UNIT TESTS ==" +# - !reference [.setup_nvshmem, script] + - echo "Testing $(which lbann)" + - export OMP_NUM_THREADS=10 + - export LBANN_NNODES=1 + - export TEST_TASKS_PER_NODE=4 + - echo "Skipping unit tests due to PMI issue on lassen." +# - lalloc ${LBANN_NNODES} -W 30 -G guests -q pdebug .gitlab/lassen/run_unit_tests.sh + artifacts: + when: always + paths: + - ci_test/unit_tests/results.xml + reports: + junit: ci_test/unit_tests/results.xml + timeout: 1h + +# Run the Python-based integration tests. +integration tests: + extends: + - .lassen common + - .lbann-base-vars + - .uses spack environment + stage: test + dependencies: + - build and install + rules: + - !reference [.lbann-test-rules, rules] + script: + - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS ==" +# - !reference [.setup_nvshmem, script] + - echo "Testing $(which lbann)" + - export OMP_NUM_THREADS=10 + - export LBANN_NNODES=2 + - export TEST_TASKS_PER_NODE=4 + - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly} + - lalloc ${LBANN_NNODES} -W 45 -G guests -q pdebug .gitlab/lassen/run_integration_tests.sh + artifacts: + when: always + paths: + - ci_test/integration_tests/results.xml + reports: + junit: ci_test/integration_tests/results.xml + timeout: 1h + +# This is a dummy job that checks the Catch2 testing. +check catch2 tests: + extends: + - .lassen common + - .lbann-base-vars + stage: test + dependencies: + - build and install + script: + - ([[ $(find ${RESULTS_DIR} -name "catch-tests-failed.txt" | wc -l) -eq 0 ]]) + artifacts: + reports: + junit: ${RESULTS_DIR}/*.xml + +# Cleanup the pipeline's Spack environment. 
+# Switching over to reusing Spack environments for each feature branch so don't remove them immediately +# Cleanup any build directories and spack environments older than 5 days since last use +remove spack environment: + extends: + - .lassen common + - .lbann-base-vars + - .cleanup old spack environment + stage: cleanup + variables: + GIT_STRATEGY: none + when: always + + +# Load the spack shell integration and load the environment. +.setup_nvshmem: + script: + - export NVSHMEM_IBRC_SUPPORT=1 + - export NVSHMEM_PMIX_SUPPORT="1" +# - export NVSHMEM_PMIX_SUPPORT="0" + - export NVSHMEM_LIBFABRIC_SUPPORT=0 + +# Variables for Lassen. Because this test uses "test_compiler.py", we +# don't need to specify the Spack spec or the job name. +.lassen common: + variables: + # Just the obvious identifier. Which specific node doesn't matter. + SYSTEM_NAME: lassen + SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} + SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git + + # These are system-specific specs that should be forwarded to the + # build script + SPACK_SPECS: "+cuda +half +fft" + tags: + - lassen + - shell diff --git a/.gitlab/lassen/run_integration_tests.sh b/.gitlab/lassen/run_integration_tests.sh new file mode 100755 index 00000000000..8637f071ebb --- /dev/null +++ b/.gitlab/lassen/run_integration_tests.sh @@ -0,0 +1,37 @@ +## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +#!/bin/bash + +echo "Task: Integration Tests" +cd ci_test/integration_tests +echo "Running integration tests with file pattern: ${TEST_FLAG}" +export OMP_NUM_THREADS=10 +lrun -N ${LBANN_NNODES} -T $TEST_TASKS_PER_NODE lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG} +#lrun -N ${LBANN_NNODES} -T $TEST_TASKS_PER_NODE python3 -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG} +status=$(($status + $?)) + +echo "Task: Finished" +exit $status diff --git a/.gitlab/lassen/run_unit_tests.sh b/.gitlab/lassen/run_unit_tests.sh new file mode 100755 index 00000000000..9725876d22e --- /dev/null +++ b/.gitlab/lassen/run_unit_tests.sh @@ -0,0 +1,36 @@ +## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved.
+## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +#!/bin/bash +echo "Task: Unit Tests" +cd ci_test/unit_tests +echo "Running unit tests with file pattern: ${TEST_FLAG}" +export OMP_NUM_THREADS=10 +lrun -N ${LBANN_NNODES} -T $TEST_TASKS_PER_NODE lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG} +#lrun -N ${LBANN_NNODES} -T $TEST_TASKS_PER_NODE python3 -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG} +status=$(($status + $?)) + +echo "Task: Finished" +exit $status diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 7a24c9adb41..de71122fc88 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -94,7 +94,8 @@ unit tests: - export OMP_NUM_THREADS=10 - export SLURM_JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A") - cd ci_test/unit_tests - - lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml + # - echo "Running unit tests with file pattern: ${TEST_FLAG}" + - lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG} artifacts: when: always paths: @@ -111,6 +112,8 @@ integration tests: stage: test dependencies: - build and install + rules: + - !reference [.lbann-test-rules, rules] script: - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS ==" - echo "Testing $(which lbann)" @@ -118,8 +121,9 @@ integration tests: - export SLURM_JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A") - cd ci_test/integration_tests - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly} - - echo "lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml" - - lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml + # - echo "Running integration tests with file pattern: ${TEST_FLAG}" + # - echo "lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}" + - lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG} artifacts: when: always paths: diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index d284a9ab3da..712797eface 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -55,7 +55,7 @@ allocate lc resources: - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "120m") - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") - export FLUX_F58_FORCE_ASCII=t - - jobid=$(flux --parent alloc -N ${LBANN_NNODES} -g 1 -t ${TEST_TIME} --job-name=${JOB_NAME} --bg) + - jobid=$(flux --parent alloc -N ${LBANN_NNODES} --exclusive -t ${TEST_TIME} --job-name=${JOB_NAME} --bg) - export JOB_ID=$jobid timeout: 6h @@ -72,14 +72,14 @@ build and install: - "export JOB_ID=$(flux jobs -no {id}:{name} | grep ${JOB_NAME} | awk -F: '{print $1}')" - "export 
LBANN_NNODES=$(flux jobs -no {id}:{name}:{nnodes} | grep ${JOB_NAME} | awk -F: '{print $3}')" - !reference [.setup_spack, script] - - flux proxy ${JOB_ID} flux mini run -N 1 -t 30m ./scripts/build_lbann.sh ${SPACK_DEPS_FLAG} + - flux proxy ${JOB_ID} flux run -N 1 -t 30m ./scripts/build_lbann.sh ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} -p py-pip --ci-pip -- +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} - export TEST_TASKS_PER_NODE=4 - export TEST_MPIBIND_FLAG="--mpibind=off" - - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux mini run -N 1 spack arch) - - export SPACK_ARCH_TARGET=$(flux proxy ${JOB_ID} flux mini run -N 1 spack arch -t) + - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux run -N 1 spack arch) + - export SPACK_ARCH_TARGET=$(flux proxy ${JOB_ID} flux run -N 1 spack arch -t) - export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${LD_LIBRARY_PATH} - !reference [.setup_lbann, script] @@ -102,7 +102,7 @@ unit tests: - "export FLUX_JOB_ID=$(flux jobs -no {id}:{name} | grep ${JOB_NAME} | awk -F: '{print $1}')" - cd ci_test/unit_tests - export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml + - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG} artifacts: when: always paths: @@ -119,6 +119,8 @@ integration tests: stage: test dependencies: - build and install + rules: + - !reference [.lbann-test-rules, rules] script: - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS ==" - echo "Testing $(which lbann)" @@ -128,7 +130,7 @@ - cd ci_test/integration_tests - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly} - export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml + - flux proxy ${FLUX_JOB_ID} lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG} artifacts: when: always paths: diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f9d6c6957d..c28334be7f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -394,7 +394,7 @@ if (LBANN_HAS_CUDA) if (LBANN_WITH_NVSHMEM) find_package(NVSHMEM REQUIRED) # Build LBANN as a static library to get around a bug in NVSHMEM - set(BUILD_SHARED_LIBS OFF) +# set(BUILD_SHARED_LIBS OFF) list(APPEND LBANN_CUDA_LIBS NVSHMEM::NVSHMEM) endif () set(LBANN_HAS_NVSHMEM "${NVSHMEM_FOUND}") diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index 33c14187e4a..41198d21605 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -156,6 +156,8 @@ Build system: significantly decreases the startup time of running a CI job - Enforce consistent GPU targets in Spack environment - Switched from Bamboo to GitLab CI framework + - Added support for a modular GitLab CI script on Lassen + - Added CI pipelines for +distconv and +nvshmem where appropriate Bug fixes: - Fixed GPU kernels that launched with more blocks than allowed diff --git a/ci_test/common_python/test_util.py b/ci_test/common_python/test_util.py index 8767a7c23b7..66bd3b26085 100644 --- a/ci_test/common_python/test_util.py +++ b/ci_test/common_python/test_util.py @@ -43,7 +43,7 @@ def lbann_test(check_gradients=False, train=False, **decorator_kwargs): The unit test in the wrapped function must return a ``test_util.ModelTester`` object, which contains all the necessary information to test the
model (e.g., model, input/reference tensors). - + The decorator wraps the test with the appropriate setup phase, data reading, callbacks, and metrics so that the test functions properly. """ @@ -79,7 +79,7 @@ def wrapped(*args, **kwargs): upper_bound=tester.tolerance, error_on_failure=True, execution_modes='train' if train else 'test')) - + obj_func = None if check_gradients: if tester.check_gradients_tensor is None: @@ -138,7 +138,9 @@ def setup_func(lbann, weekly): return trainer, model, data_reader, optimizer, None # Don't request any specific number of nodes test = tools.create_tests(setup_func, file, **decorator_kwargs)[0] - cluster = kwargs.get('cluster', 'unset') + cluster = kwargs.get('cluster', None) + if cluster is None: + cluster = tools.system(lbann) weekly = kwargs.get('weekly', False) test(cluster, weekly, False) @@ -159,9 +161,9 @@ class ModelTester: reference: Optional[lbann.Layer] = None #: Reference LBANN node (optional) reference_tensor: Optional[ Any] = None #: Optional reference tensor to compare with - + # Tensor that will be used as the model objective function when checking - # gradients. Required if check_gradients is True in tester. + # gradients. Required if check_gradients is True in tester. check_gradients_tensor: Optional[lbann.Layer] = None loss: Optional[lbann.Layer] = None # Optional loss test @@ -234,7 +236,7 @@ def make_reference(self, ref: Any) -> lbann.Input: self.reference = refnode self.reference_tensor = ref return self.reference - + def set_check_gradients_tensor(self, tensor: lbann.Layer): """ Sets the tensor to be used as the objective function when running the diff --git a/ci_test/common_python/tools.py b/ci_test/common_python/tools.py index d87cccfeaab..443bd6cdebd 100644 --- a/ci_test/common_python/tools.py +++ b/ci_test/common_python/tools.py @@ -734,6 +734,7 @@ def create_tests(setup_func, other output data. """ + import lbann.contrib.lc.systems # Make sure test name is valid test_file = os.path.realpath(test_file) @@ -744,6 +745,31 @@ def create_tests(setup_func, # Make sure test name is prefixed with 'test_' test_name_base = 'test_' + test_name_base + # Check to see if we are testing on a ROCm system and then set a cache for CI testing + system = lbann.contrib.lc.systems.system() + if system in ('tioga', 'rzvernal', 'corona'): + if 'environment' in kwargs: + environment = kwargs.get('environment') + else: + environment = {} + + if os.environ.get('USER') == 'lbannusr': + basepath = '/p/vast1/lbannusr' + else: + basepath = '/p/vast1/lbann' + + tmpdir = os.environ.get('TMPDIR') + if os.path.isdir(basepath) and os.access(basepath, os.R_OK | os.W_OK): + db_path = basepath + else: + db_path = tmpdir + + environment['MIOPEN_USER_DB_PATH'] = f'{db_path}/MIOpen_user_db' + # Empirically the cache dir cannot be on a parallel file system + environment['MIOPEN_CUSTOM_CACHE_DIR'] =f'{tmpdir}/MIOpen_custom_cache' + + kwargs['environment'] = environment + def test_func(cluster, dirname, weekly): """Function that can interact with PyTest. @@ -1020,16 +1046,3 @@ def gpus_per_node(lbann): return getattr(lbann.contrib, compute_center).systems.gpus_per_node() else: return 0 - - -# Get the environment variables for Distconv. -def get_distconv_environment(init_nvshmem=False): - # TODO: Use the default halo exchange and shuffle method. 
See https://github.com/LLNL/lbann/issues/1659 - environment = {"LBANN_DISTCONV_HALO_EXCHANGE": "AL", - "LBANN_DISTCONV_TENSOR_SHUFFLER": "AL", - "LBANN_KEEP_ERROR_SIGNALS": "1", - } - if init_nvshmem: - environment["LBANN_INIT_NVSHMEM"] = 1 - - return environment diff --git a/ci_test/integration_tests/test_integration_resnet50.py b/ci_test/integration_tests/test_integration_resnet50.py index 793535f4bfc..c5dcfc98970 100644 --- a/ci_test/integration_tests/test_integration_resnet50.py +++ b/ci_test/integration_tests/test_integration_resnet50.py @@ -41,6 +41,7 @@ 'lassen': 0.10, 'ray': 0.15, 'tioga': 0.25, + 'corona': 0.61, } } @@ -59,6 +60,7 @@ 'lassen': 0.15, 'ray': 0.23, 'tioga': 0.43, + 'corona': 0.61, } } @@ -77,6 +79,11 @@ def setup_experiment(lbann, weekly): message = f'{os.path.basename(__file__)} requires VISION support with OPENCV' print('Skip - ' + message) pytest.skip(message) + + # Skip test on CPU systems + if not lbann.has_feature('GPU'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + if weekly: options = weekly_options_and_targets else: @@ -165,10 +172,6 @@ def augment_test_func(test_func): # Define test function def func(cluster, dirname, weekly): - # Skip test on CPU systems - if cluster in ('catalyst', 'corona'): - pytest.skip('only run {} on GPU systems'.format(test_name)) - if weekly: targets = weekly_options_and_targets else: diff --git a/ci_test/unit_tests/test_unit_inplace_distconv.py b/ci_test/unit_tests/test_unit_inplace_distconv.py index 15f98c34c02..921ba3006b3 100644 --- a/ci_test/unit_tests/test_unit_inplace_distconv.py +++ b/ci_test/unit_tests/test_unit_inplace_distconv.py @@ -5,6 +5,7 @@ import os import sys import lbann.contrib.launcher +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -13,8 +14,16 @@ import tools @pytest.mark.parametrize('num_dims', [2, 3]) -@test_util.lbann_test(check_gradients=False, environment=tools.get_distconv_environment()) +@test_util.lbann_test(check_gradients=False, + environment=lbann.contrib.args.get_distconv_environment(), + skip_clusters=["corona"], + time_limit=3) def test_simple(num_dims): + if not lbann.has_feature('DISTCONV'): + message = f'{os.path.basename(__file__)} requires DISTCONV' + print('Skip - ' + message) + pytest.skip(message) + np.random.seed(20230607) # Two samples of 4x16x16 or 4x16x16x16 tensors shape = [2, 4] + [16] * num_dims diff --git a/ci_test/unit_tests/test_unit_layer_batched_matmul_distconv.py b/ci_test/unit_tests/test_unit_layer_batched_matmul_distconv.py index bbc6499a62e..24c3135ce2c 100644 --- a/ci_test/unit_tests/test_unit_layer_batched_matmul_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_batched_matmul_distconv.py @@ -5,6 +5,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -286,7 +287,7 @@ def construct_data_reader(lbann): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - environment=tools.get_distconv_environment()): + environment=lbann.contrib.args.get_distconv_environment()): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_batchnorm_mem_opt_distconv.py b/ci_test/unit_tests/test_unit_layer_batchnorm_mem_opt_distconv.py index f2bbefa6a05..e714d67d9b9 100644 --- a/ci_test/unit_tests/test_unit_layer_batchnorm_mem_opt_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_batchnorm_mem_opt_distconv.py @@ -5,6 +5,7 @@ import os import 
sys import lbann.contrib.launcher +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -13,7 +14,10 @@ import tools @pytest.mark.parametrize('num_dims', [2, 3]) -@test_util.lbann_test(check_gradients=True, train=True, environment=tools.get_distconv_environment()) +@test_util.lbann_test(check_gradients=True, + train=True, + environment=lbann.contrib.args.get_distconv_environment(), + time_limit=5) def test_simple(num_dims): np.random.seed(20230621) # Two samples of 4x16x16 or 4x16x16x16 tensors diff --git a/ci_test/unit_tests/test_unit_layer_channelwise_fully_connected_distconv.py b/ci_test/unit_tests/test_unit_layer_channelwise_fully_connected_distconv.py index 4ed627290b3..b5776b2e674 100644 --- a/ci_test/unit_tests/test_unit_layer_channelwise_fully_connected_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_channelwise_fully_connected_distconv.py @@ -5,6 +5,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -335,5 +336,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -for _test_func in tools.create_tests(setup_experiment, __file__, environment=tools.get_distconv_environment()): +for _test_func in tools.create_tests(setup_experiment, __file__, environment=lbann.contrib.args.get_distconv_environment()): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_convolution_distconv.py b/ci_test/unit_tests/test_unit_layer_convolution_distconv.py index d420c70a1d5..b68bebebfa2 100644 --- a/ci_test/unit_tests/test_unit_layer_convolution_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_convolution_distconv.py @@ -6,6 +6,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -123,7 +124,7 @@ def setup_experiment(lbann, weekly): message = f'{os.path.basename(__file__)} requires DISTCONV' print('Skip - ' + message) pytest.skip(message) - + mini_batch_size = num_samples() // 2 trainer = lbann.Trainer(mini_batch_size=mini_batch_size) model = construct_model(lbann) @@ -292,5 +293,7 @@ def construct_data_reader(lbann): # Note: Create test name by removing ".py" from file name _test_name = os.path.splitext(os.path.basename(current_file))[0] for _test_func in tools.create_tests(setup_experiment, _test_name, - environment=tools.get_distconv_environment()): + environment=lbann.contrib.args.get_distconv_environment(), + skip_clusters=["tioga"], + time_limit=10): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_convolution_mem_opt_distconv.py b/ci_test/unit_tests/test_unit_layer_convolution_mem_opt_distconv.py index 786c4d4a30d..dcd007bb84f 100644 --- a/ci_test/unit_tests/test_unit_layer_convolution_mem_opt_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_convolution_mem_opt_distconv.py @@ -5,6 +5,7 @@ import os import sys import lbann.contrib.launcher +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -13,7 +14,9 @@ import tools @pytest.mark.parametrize('num_dims', [2, 3]) -@test_util.lbann_test(check_gradients=True, environment=tools.get_distconv_environment()) +@test_util.lbann_test(check_gradients=True, + environment=lbann.contrib.args.get_distconv_environment(), + time_limit=10) def test_simple(num_dims): try: import torch diff --git 
a/ci_test/unit_tests/test_unit_layer_gather_distconv.py b/ci_test/unit_tests/test_unit_layer_gather_distconv.py index 81ddb42328d..30a703d268f 100644 --- a/ci_test/unit_tests/test_unit_layer_gather_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_gather_distconv.py @@ -3,6 +3,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -197,5 +198,5 @@ def construct_data_reader(lbann): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - environment=tools.get_distconv_environment(init_nvshmem=True)): + environment=lbann.contrib.args.get_distconv_environment(init_nvshmem=True)): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_identity_distconv.py b/ci_test/unit_tests/test_unit_layer_identity_distconv.py index d67ca4c5735..a301bd3ddf0 100644 --- a/ci_test/unit_tests/test_unit_layer_identity_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_identity_distconv.py @@ -5,6 +5,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -243,7 +244,7 @@ def construct_data_reader(lbann): # ============================================== # Runtime parameters/arguments -environment = tools.get_distconv_environment() +environment = lbann.contrib.args.get_distconv_environment() environment['LBANN_KEEP_ERROR_SIGNALS'] = 1 # Create test functions that can interact with PyTest diff --git a/ci_test/unit_tests/test_unit_layer_leaky_relu_distconv.py b/ci_test/unit_tests/test_unit_layer_leaky_relu_distconv.py index bf46ddcd347..55fbf8414d1 100644 --- a/ci_test/unit_tests/test_unit_layer_leaky_relu_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_leaky_relu_distconv.py @@ -5,6 +5,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -216,5 +217,5 @@ def construct_data_reader(lbann): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - environment=tools.get_distconv_environment()): + environment=lbann.contrib.args.get_distconv_environment()): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_pooling_distconv.py b/ci_test/unit_tests/test_unit_layer_pooling_distconv.py index ebbaf9bf907..3f58e23f64f 100644 --- a/ci_test/unit_tests/test_unit_layer_pooling_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_pooling_distconv.py @@ -6,6 +6,8 @@ import sys import numpy as np import pytest +import lbann.contrib.args +from lbann.contrib.lc.systems import * # Bamboo utilities current_file = os.path.realpath(__file__) @@ -108,7 +110,7 @@ def setup_experiment(lbann, weekly): message = f'{os.path.basename(__file__)} requires DISTCONV' print('Skip - ' + message) pytest.skip(message) - + mini_batch_size = num_samples() // 2 trainer = lbann.Trainer(mini_batch_size=mini_batch_size) model = construct_model(lbann) @@ -310,10 +312,10 @@ def construct_data_reader(lbann): # ============================================== # Setup PyTest # ============================================== - # Create test functions that can interact with PyTest # Note: Create test name by removing ".py" from file name _test_name = os.path.splitext(os.path.basename(current_file))[0] for _test_func in tools.create_tests(setup_experiment, _test_name, - 
environment=tools.get_distconv_environment()): + environment=lbann.contrib.args.get_distconv_environment(), + time_limit=15): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_relu_distconv.py b/ci_test/unit_tests/test_unit_layer_relu_distconv.py index f6b628c5ea2..b6fcfb59076 100644 --- a/ci_test/unit_tests/test_unit_layer_relu_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_relu_distconv.py @@ -5,6 +5,7 @@ import sys import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -214,5 +215,5 @@ def construct_data_reader(lbann): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - environment=tools.get_distconv_environment()): + environment=lbann.contrib.args.get_distconv_environment()): globals()[_test_func.__name__] = _test_func diff --git a/ci_test/unit_tests/test_unit_layer_scatter_distconv.py b/ci_test/unit_tests/test_unit_layer_scatter_distconv.py index be431d4cc75..36674fd363d 100644 --- a/ci_test/unit_tests/test_unit_layer_scatter_distconv.py +++ b/ci_test/unit_tests/test_unit_layer_scatter_distconv.py @@ -4,6 +4,7 @@ from unittest.mock import call import numpy as np import pytest +import lbann.contrib.args # Bamboo utilities current_file = os.path.realpath(__file__) @@ -212,5 +213,5 @@ def construct_data_reader(lbann): # Create test functions that can interact with PyTest for _test_func in tools.create_tests(setup_experiment, __file__, - environment=tools.get_distconv_environment(init_nvshmem=True)): + environment=lbann.contrib.args.get_distconv_environment(init_nvshmem=True)): globals()[_test_func.__name__] = _test_func diff --git a/include/lbann/data_coordinator/buffered_data_coordinator.hpp b/include/lbann/data_coordinator/buffered_data_coordinator.hpp index d4fbb7e09cb..c0959c36367 100644 --- a/include/lbann/data_coordinator/buffered_data_coordinator.hpp +++ b/include/lbann/data_coordinator/buffered_data_coordinator.hpp @@ -112,7 +112,9 @@ class buffered_data_coordinator : public data_coordinator /** @brief After registering the active data field, allocate storage for each * data field in the context maps within the double buffer. 
*/ - void register_active_data_field(data_field_type const data_field) override; + void register_active_data_field( + data_field_type const& data_field, + std::vector const& data_field_dim_map) override; void fp_setup_data(data_buffer& buffer, El::Int cur_mini_batch_size); diff --git a/include/lbann/data_coordinator/data_coordinator.hpp b/include/lbann/data_coordinator/data_coordinator.hpp index 3b3f2937bde..0a987f4fceb 100644 --- a/include/lbann/data_coordinator/data_coordinator.hpp +++ b/include/lbann/data_coordinator/data_coordinator.hpp @@ -281,7 +281,9 @@ class data_coordinator bool at_new_epoch() const; - virtual void register_active_data_field(data_field_type const data_field); + virtual void + register_active_data_field(data_field_type const& data_field, + std::vector const& data_field_dim_map); //************************************************************************ // @@ -306,6 +308,8 @@ class data_coordinator data_reader_map_t m_data_readers; // std::map m_dataset_stats; + data_field_dim_map_type m_active_data_fields_dim_map; + std::set m_active_data_fields; public: // @todo BVE FIXME diff --git a/include/lbann/data_coordinator/data_coordinator_metadata.hpp b/include/lbann/data_coordinator/data_coordinator_metadata.hpp index dd7d1cabd8f..f76dfd044e8 100644 --- a/include/lbann/data_coordinator/data_coordinator_metadata.hpp +++ b/include/lbann/data_coordinator/data_coordinator_metadata.hpp @@ -31,6 +31,7 @@ #include "lbann/utils/enum_iterator.hpp" #include "lbann_config.hpp" +#include "lbann/data_readers/utils/input_data_type.hpp" #include #include @@ -52,12 +53,16 @@ enum class data_reader_target_mode std::string to_string(data_reader_target_mode m); /// Map from target modes to dimension maps using TargetModeDimMap = - std::unordered_map>; + std::unordered_map>; using data_reader_target_mode_iterator = enum_iterator; +/// Map from data_field_type to dimension maps +using data_field_dim_map_type = + std::unordered_map>; + enum class slice_points_mode { INDEPENDENT, diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index 1920f9b1d2d..5a12a945579 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -384,9 +384,9 @@ class generic_data_reader virtual int get_linearized_size(data_field_type const& data_field) const; /// Get the dimensions of the data. 
- virtual const std::vector get_data_dims() const + virtual const std::vector get_data_dims() const { - return std::vector(0); + return std::vector(0); } virtual std::vector diff --git a/include/lbann/data_readers/data_reader_HDF5.hpp b/include/lbann/data_readers/data_reader_HDF5.hpp index c47799aa3a4..8fef8c53eab 100644 --- a/include/lbann/data_readers/data_reader_HDF5.hpp +++ b/include/lbann/data_readers/data_reader_HDF5.hpp @@ -136,7 +136,7 @@ class hdf5_data_reader return m_data_schema_filename; } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return get_data_dims(INPUT_DATA_TYPE_SAMPLES); } @@ -192,7 +192,7 @@ class hdf5_data_reader /** filled in by construct_linearized_size_lookup_tables; * used by get_data_dims() */ - std::unordered_map> m_data_dims_lookup_table; + std::unordered_map> m_data_dims_lookup_table; /** filled in by construct_linearized_size_lookup_tables; * used by get_linearized_size() @@ -268,7 +268,7 @@ class hdf5_data_reader // methods follow //========================================================================= - const std::vector get_data_dims(std::string name = "") const; + const std::vector get_data_dims(std::string name = "") const; /** Returns the size of the requested field (datum, label, response, etc) */ int get_linearized_size(data_field_type const& data_field) const override; diff --git a/include/lbann/data_readers/data_reader_csv.hpp b/include/lbann/data_readers/data_reader_csv.hpp index a355b474ff3..28946c203d0 100644 --- a/include/lbann/data_readers/data_reader_csv.hpp +++ b/include/lbann/data_readers/data_reader_csv.hpp @@ -129,7 +129,7 @@ class csv_reader : public generic_data_reader } } int get_linearized_label_size() const override { return m_num_labels; } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } diff --git a/include/lbann/data_readers/data_reader_hdf5_legacy.hpp b/include/lbann/data_readers/data_reader_hdf5_legacy.hpp index 395eb56c243..9ae06fb9536 100644 --- a/include/lbann/data_readers/data_reader_hdf5_legacy.hpp +++ b/include/lbann/data_readers/data_reader_hdf5_legacy.hpp @@ -124,7 +124,7 @@ class hdf5_reader : public generic_data_reader } return m_all_responses.size(); } - const std::vector get_data_dims() const override { return m_data_dims; } + const std::vector get_data_dims() const override { return m_data_dims; } #ifdef LBANN_HAS_DISTCONV bool is_tensor_shuffle_required() const override { return false; } @@ -155,7 +155,7 @@ class hdf5_reader : public generic_data_reader std::vector m_all_responses; std::vector m_file_paths; MPI_Comm m_comm; - std::vector m_data_dims; + std::vector m_data_dims; std::vector m_hyperslab_dims; hid_t m_fapl; hid_t m_dxpl; diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 1e261d89ec0..171f5ce4594 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -75,7 +75,7 @@ class image_data_reader : public generic_data_reader return m_image_linearized_size; } int get_linearized_label_size() const override { return m_num_labels; } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return {m_image_num_channels, m_image_height, m_image_width}; } diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 
b6c93d90cec..c9747be8665 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -213,7 +213,7 @@ class data_reader_jag_conduit : public generic_data_reader std::vector get_linearized_response_sizes() const; /// Return the dimension of data - const std::vector get_data_dims() const override; + const std::vector get_data_dims() const override; int get_num_data() const override; int get_num_labels() const override; diff --git a/include/lbann/data_readers/data_reader_merge_features.hpp b/include/lbann/data_readers/data_reader_merge_features.hpp index abfe50104d2..c774a44c33a 100644 --- a/include/lbann/data_readers/data_reader_merge_features.hpp +++ b/include/lbann/data_readers/data_reader_merge_features.hpp @@ -73,7 +73,7 @@ class data_reader_merge_features : public generic_compound_data_reader { return m_label_reader->get_linearized_label_size(); } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { // Todo: Can we merge the dimensions of each reader sensibly? return {get_linearized_data_size()}; diff --git a/include/lbann/data_readers/data_reader_merge_samples.hpp b/include/lbann/data_readers/data_reader_merge_samples.hpp index 9417efff1a2..75d8658d2ac 100644 --- a/include/lbann/data_readers/data_reader_merge_samples.hpp +++ b/include/lbann/data_readers/data_reader_merge_samples.hpp @@ -75,7 +75,7 @@ class data_reader_merge_samples : public generic_compound_data_reader { return m_data_readers[0]->get_linearized_response_size(); } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return m_data_readers[0]->get_data_dims(); } @@ -101,7 +101,7 @@ class data_reader_merge_samples : public generic_compound_data_reader void sanity_check_for_consistency(int num_labels, int data_size, int label_size, - const std::vector& data_dims); + const std::vector& data_dims); }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_mesh.hpp b/include/lbann/data_readers/data_reader_mesh.hpp index 884e4122cd9..5cebe22060f 100644 --- a/include/lbann/data_readers/data_reader_mesh.hpp +++ b/include/lbann/data_readers/data_reader_mesh.hpp @@ -72,9 +72,9 @@ class mesh_reader : public generic_data_reader { return m_data_height * m_data_width; } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { - return {static_cast(m_channels.size()), m_data_height, m_data_width}; + return {static_cast(m_channels.size()), m_data_height, m_data_width}; } protected: diff --git a/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp index 67a8d2766ae..9a1dde96ae1 100644 --- a/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp +++ b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp @@ -77,7 +77,7 @@ class ras_lipid_conduit_data_reader : public generic_data_reader } // const std::vector get_data_dims() const override { return // m_data_dims; } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } diff --git a/include/lbann/data_readers/data_reader_numpy.hpp b/include/lbann/data_readers/data_reader_numpy.hpp index 37778e0ece6..b2988323ea4 100644 --- a/include/lbann/data_readers/data_reader_numpy.hpp +++ b/include/lbann/data_readers/data_reader_numpy.hpp @@ -61,9 +61,9 @@ class numpy_reader : public generic_data_reader int 
get_num_labels() const override { return m_num_labels; } int get_linearized_data_size() const override { return m_num_features; } int get_linearized_label_size() const override { return m_num_labels; } - const std::vector<int> get_data_dims() const override + const std::vector<El::Int> get_data_dims() const override { - std::vector<int> dims(m_data.shape.begin() + 1, m_data.shape.end()); + std::vector<El::Int> dims(m_data.shape.begin() + 1, m_data.shape.end()); if (m_supported_input_types.at(INPUT_DATA_TYPE_LABELS) || m_supported_input_types.at(INPUT_DATA_TYPE_RESPONSES)) { dims.back() -= 1; diff --git a/include/lbann/data_readers/data_reader_numpy_npz.hpp b/include/lbann/data_readers/data_reader_numpy_npz.hpp index ee96150aceb..78609418790 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz.hpp @@ -74,9 +74,9 @@ class numpy_npz_reader : public generic_data_reader { return m_num_response_features; } - const std::vector<int> get_data_dims() const override + const std::vector<El::Int> get_data_dims() const override { - std::vector<int> dims(m_data.shape.begin() + 1, m_data.shape.end()); + std::vector<El::Int> dims(m_data.shape.begin() + 1, m_data.shape.end()); return dims; } diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index 64c34bb3a9a..43ac9516982 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -75,7 +75,7 @@ class numpy_npz_conduit_reader : public generic_data_reader { return m_num_response_features; } - const std::vector<int> get_data_dims() const override { return m_data_dims; } + const std::vector<El::Int> get_data_dims() const override { return m_data_dims; } protected: void do_preload_data_store() override; @@ -93,7 +93,7 @@ class numpy_npz_conduit_reader : public generic_data_reader /// Number of features in each response.
int m_num_response_features = 0; - std::vector m_data_dims; + std::vector m_data_dims; int m_data_word_size = 0; size_t m_response_word_size = 0; diff --git a/include/lbann/data_readers/data_reader_pilot2_molecular.hpp b/include/lbann/data_readers/data_reader_pilot2_molecular.hpp index 8cd4acd63fc..f0e82f02d09 100644 --- a/include/lbann/data_readers/data_reader_pilot2_molecular.hpp +++ b/include/lbann/data_readers/data_reader_pilot2_molecular.hpp @@ -59,7 +59,7 @@ class pilot2_molecular_reader : public generic_data_reader { return m_num_features * (m_num_neighbors + 1); } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return m_shape; // return {m_num_neighbors + 1, (int) m_features.shape[2], @@ -138,7 +138,7 @@ class pilot2_molecular_reader : public generic_data_reader DataType bond_len_scale_factor = 10.0; /// support for data_store_pilot2_molecular - std::vector m_shape; + std::vector m_shape; /// support for data_store_pilot2_molecular int m_word_size; diff --git a/include/lbann/data_readers/data_reader_python.hpp b/include/lbann/data_readers/data_reader_python.hpp index 8255ccb73fe..7765643b8e8 100644 --- a/include/lbann/data_readers/data_reader_python.hpp +++ b/include/lbann/data_readers/data_reader_python.hpp @@ -49,7 +49,7 @@ class python_reader : public generic_data_reader std::string get_type() const override { return "python_reader"; } - const std::vector get_data_dims() const override; + const std::vector get_data_dims() const override; int get_num_labels() const override; int get_linearized_data_size() const override; int get_linearized_label_size() const override; diff --git a/include/lbann/data_readers/data_reader_smiles.hpp b/include/lbann/data_readers/data_reader_smiles.hpp index 92aff6ef65b..661b08132fa 100644 --- a/include/lbann/data_readers/data_reader_smiles.hpp +++ b/include/lbann/data_readers/data_reader_smiles.hpp @@ -75,7 +75,7 @@ class smiles_data_reader { return m_linearized_response_size; } - const std::vector get_data_dims() const override + const std::vector get_data_dims() const override { return {get_linearized_data_size()}; } diff --git a/include/lbann/data_readers/data_reader_synthetic.hpp b/include/lbann/data_readers/data_reader_synthetic.hpp index e148bb7e9ce..d4b73351b5b 100644 --- a/include/lbann/data_readers/data_reader_synthetic.hpp +++ b/include/lbann/data_readers/data_reader_synthetic.hpp @@ -46,15 +46,15 @@ class data_reader_synthetic : public generic_data_reader // TODO: add what data distribution to use data_reader_synthetic(int num_samples, int num_features, bool shuffle = true); data_reader_synthetic(int num_samples, - std::vector dims, + std::vector dims, int num_labels, bool shuffle = true); data_reader_synthetic(int num_samples, - std::vector dims, - std::vector response_dims, + std::vector dims, + std::vector response_dims, bool shuffle = true); data_reader_synthetic(int num_samples, - std::map> data_fields, + std::map> data_fields, bool shuffle = true); data_reader_synthetic(const data_reader_synthetic&) = default; data_reader_synthetic& operator=(const data_reader_synthetic&) = default; @@ -86,7 +86,7 @@ class data_reader_synthetic : public generic_data_reader return get_linear_size(m_response_dimensions); } - const std::vector get_data_dims() const override { return m_dimensions; } + const std::vector get_data_dims() const override { return m_dimensions; } int get_num_labels() const override { return m_num_labels; } int get_num_responses() const override @@ -112,11 +112,11 @@ class 
data_reader_synthetic : public generic_data_reader /** Number of labels in the dataset. */ int m_num_labels; /** Shape of the data. */ - std::vector m_dimensions; + std::vector m_dimensions; /** Shape of the responses. */ - std::vector m_response_dimensions; + std::vector m_response_dimensions; - std::map> m_synthetic_data_fields; + std::map> m_synthetic_data_fields; }; } // namespace lbann diff --git a/include/lbann/layers/io/input_layer.hpp b/include/lbann/layers/io/input_layer.hpp index ab8114dad72..1df759cb168 100644 --- a/include/lbann/layers/io/input_layer.hpp +++ b/include/lbann/layers/io/input_layer.hpp @@ -160,7 +160,7 @@ class input_layer : public data_type_layer /** * Get the dimensions of the underlying data. */ - std::vector get_data_dims(DataReaderMetaData& dr_metadata, + std::vector get_data_dims(DataReaderMetaData& dr_metadata, int child_index = 0) const; /** @name Serialization */ diff --git a/include/lbann/utils/dim_helpers.hpp b/include/lbann/utils/dim_helpers.hpp index 78a1bcc2040..d3b55ea0bde 100644 --- a/include/lbann/utils/dim_helpers.hpp +++ b/include/lbann/utils/dim_helpers.hpp @@ -99,6 +99,12 @@ auto get_packed_strides(std::vector const& dims) return get_strides(dims.size(), dims.data(), T(1)); } +template +auto vector_cast(std::vector const& from) +{ + return std::vector{from.cbegin(), from.cend()}; +} + namespace details { template diff --git a/python/lbann/contrib/args.py b/python/lbann/contrib/args.py index 9cc02647eae..b038b6e3e2d 100644 --- a/python/lbann/contrib/args.py +++ b/python/lbann/contrib/args.py @@ -80,6 +80,7 @@ def get_distconv_environment(parallel_io=False, num_io_partitions=1, init_nvshme num_io_partitions (int): The number of processes to read a single sample. """ + # TODO: Use the default halo exchange and shuffle method. See https://github.com/LLNL/lbann/issues/1659 environment = { 'DISTCONV_WS_CAPACITY_FACTOR': 0.8, 'LBANN_DISTCONV_HALO_EXCHANGE': 'AL', diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index 0595f6559b2..2b6d6049b73 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -61,6 +61,9 @@ def prepend_environment_path(key, prefix): if scheduler == 'slurm' and has_gpu(system): launcher_args.extend(['--mpibind=off']) + if scheduler == 'flux' and system == 'corona': + launcher_args.extend(['-o pmi=pmix']) + # Optimized thread affinity for Pascal # Note: Both GPUs are on socket 0, so we only use cores on that # socket. 
@@ -91,8 +94,11 @@ def prepend_environment_path(key, prefix): # Optimizations for Tioga if system in ('tioga', 'rzvernal'): #set_environment('NCCL_SOCKET_IFNAME', 'hsi') - set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '1') - set_environment('MIOPEN_DISABLE_CACHE', '1') + set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '0') + set_environment('MIOPEN_DISABLE_CACHE', '0') + tmpdir = os.environ.get('TMPDIR') + set_environment('MIOPEN_USER_DB_PATH', f'{tmpdir}/MIOpen_user_db') + set_environment('MIOPEN_CUSTOM_CACHE_DIR', f'{tmpdir}/MIOpen_custom_cache') if os.getenv('CRAY_LD_LIBRARY_PATH') is not None: prepend_environment_path('LD_LIBRARY_PATH', os.getenv('CRAY_LD_LIBRARY_PATH')) if os.getenv('ROCM_PATH') is not None: @@ -117,8 +123,11 @@ def prepend_environment_path(key, prefix): set_environment('OMPI_MCA_mpi_warn_on_fork', 0) #set_environment('NCCL_SOCKET_IFNAME', 'hsi') - set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '1') - set_environment('MIOPEN_DISABLE_CACHE', '1') + set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '0') + set_environment('MIOPEN_DISABLE_CACHE', '0') + tmpdir = os.environ.get('TMPDIR') + set_environment('MIOPEN_USER_DB_PATH', f'{tmpdir}/MIOpen_user_db') + set_environment('MIOPEN_CUSTOM_CACHE_DIR', f'{tmpdir}/MIOpen_custom_cache') if os.getenv('ROCM_PATH') is not None: prepend_environment_path('LD_LIBRARY_PATH', os.path.join(os.getenv('ROCM_PATH'), 'llvm', 'lib')) diff --git a/python/lbann/contrib/olcf/launcher.py b/python/lbann/contrib/olcf/launcher.py index 408cd86ee8c..234d98048d9 100644 --- a/python/lbann/contrib/olcf/launcher.py +++ b/python/lbann/contrib/olcf/launcher.py @@ -103,8 +103,9 @@ def prepend_environment_path(key, prefix): #set_environment('NCCL_SOCKET_IFNAME', 'hsi') set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '1') set_environment('MIOPEN_DISABLE_CACHE', '1') - set_environment('MIOPEN_USER_DB_PATH', '/tmp') - set_environment('MIOPEN_CUSTOM_CACHE_DIR', '/tmp') + tmpdir = os.environ.get('TMPDIR') + set_environment('MIOPEN_USER_DB_PATH', f'{tmpdir}/MIOpen_user_db') + set_environment('MIOPEN_CUSTOM_CACHE_DIR', f'{tmpdir}/MIOpen_custom_cache') # set_environment('MIOPEN_ENABLE_LOGGING','1') # set_environment('MIOPEN_ENABLE_LOGGING_CMD', '1') # set_environment('MIOPEN_LOG_LEVEL', '6') diff --git a/python/lbann/launcher/flux.py b/python/lbann/launcher/flux.py index f68d2ef4639..0c7a38b8fa8 100644 --- a/python/lbann/launcher/flux.py +++ b/python/lbann/launcher/flux.py @@ -121,14 +121,9 @@ def add_parallel_command(self, args.append(f'--setattr=system.cwd={work_dir}') args.append(f'--nodes={nodes}') args.append(f'--ntasks={nodes * procs_per_node}') - args.append(f'-o per-resource.type=node') - args.append(f'-o per-resource.count={procs_per_node}') args.append(f'--exclusive') - args.append(f'-g 1') # --gpus-per-task # Ramesh had used a -c flag but doesn't seem to use it right now # args.append(f'-c {int(self.cores_per_node / procs_per_node)}') #--cores-per-task - args.append(f'-o gpu-affinity=per-task') - args.append(f'-o cpu-affinity=per-task') args.append(f'-o nosetpgrp') use_this_rccl=os.getenv('LBANN_USE_THIS_RCCL') if use_this_rccl is not None: diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index f2f918563a7..ea0128a7244 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -470,7 +470,7 @@ fi if [[ ! 
"${LBANN_VARIANTS}" =~ .*"^dihydrogen".* ]]; then # If the user didn't supply a specific version of DiHydrogen on the command line add one # Due to concretizer errors force the openmp variant for DiHydrogen -# DIHYDROGEN="^dihydrogen${DIHYDROGEN_VER}}" +# DIHYDROGEN="^dihydrogen${DIHYDROGEN_VER}" DIHYDROGEN="^dihydrogen${DIHYDROGEN_VER} ${CENTER_BLAS_LIBRARY}" fi @@ -539,7 +539,10 @@ if [[ ! "${LBANN_VARIANTS}" =~ .*"~python".* ]]; then # Specifically, for use within the data reader, NumPy has to have the same # C++ std library if [[ ! "${PKG_LIST}" =~ .*"py-numpy".* ]]; then - PKG_LIST="${PKG_LIST} py-numpy@1.16.0:" + PKG_LIST="${PKG_LIST} py-numpy@1.16.0:1.24.3" + fi + if [[ ! "${PKG_LIST}" =~ .*"py-pip".* ]]; then + PKG_LIST="${PKG_LIST} py-pip@22.2.2:" fi fi fi @@ -771,7 +774,7 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then # Put the compilers into the SITE scope so that we can execute # spack load commands later without activating the environment - CMD="spack compiler find --scope site ${CENTER_COMPILER_PATHS}" + CMD="spack compiler find --scope env:${LBANN_ENV} ${CENTER_COMPILER_PATHS}" echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } @@ -922,7 +925,7 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then ########################################################################################## # Actually install LBANN's dependencies from local source - CMD="spack install --test root --reuse --only dependencies ${BUILD_JOBS}" + CMD="spack install --reuse --only dependencies ${BUILD_JOBS}" echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } @@ -938,7 +941,7 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then if [[ -n "${SPACK_EXTRA_ROOT_PACKAGES:-}" ]]; then for p in ${SPACK_EXTRA_ROOT_PACKAGES} do - CMD="spack install --test root --reuse ${BUILD_JOBS} ${p}" + CMD="spack install --reuse ${BUILD_JOBS} ${p}" echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } done @@ -949,9 +952,9 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then for p in ${PIP_EXTRAS} do if [[ -e "${p}" ]]; then - CMD="python3 -m pip install -r ${p}" + CMD="python3 -m pip install --prefix ${LBANN_INSTALL_DIR} -r ${p}" else - CMD="python3 -m pip install ${p}" + CMD="python3 -m pip install --prefix ${LBANN_INSTALL_DIR} ${p}" fi echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 4ac956be36f..3b6c09c1a42 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -116,7 +116,7 @@ set_center_specific_modules() MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" ;; "zen" | "zen2") # Corona - MODULE_CMD="module load StdEnv gcc-tce/10.3.1 cmake/3.23.1 openmpi-tce/4.1.2 rocm/5.6.0" + MODULE_CMD="module load StdEnv gcc/10.3.1-magic cmake/3.23.1 openmpi/4.1.2 rocm/5.6.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3") # Tioga, RZVernal @@ -189,12 +189,12 @@ set_center_specific_spack_dependencies() CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-8.3.1 /usr/tce/packages/clang/clang-10.0.1-gcc-8.3.1/" CENTER_COMPILER="%gcc@8.3.1" DEPENDENTS_CENTER_COMPILER="%gcc@8.3.1" - CENTER_DEPENDENCIES="^spectrum-mpi ^cuda@11.6.112 ^libtool@2.4.2 ^python@3.9.10 ^protobuf@3.10.0 ^py-protobuf@3.10.0 ^openblas ^nccl@2.16.2-1" + CENTER_DEPENDENCIES="^spectrum-mpi ^cuda@11.6.112 ^libtool@2.4.2 ^python@3.9.10: ^protobuf@3.20.3 ^py-protobuf@3.20.3 ^openblas 
^nccl@2.16.2-1" CENTER_BLAS_LIBRARY="blas=openblas" ;; "broadwell" | "haswell" | "sandybridge") # Pascal, RZHasGPU, Surface # On LC the mvapich2 being used is built against HWLOC v1 - CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-10.3.1/ /usr/workspace/brain/tom/pascal/llvm/latest/" + CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-10.3.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" CENTER_COMPILER="%gcc" # CENTER_COMPILER="%clang" # DEPENDENTS_CENTER_COMPILER="%gcc@10.3.1" @@ -608,6 +608,12 @@ cat <> ${yaml} core_compilers: - '${CORE_COMPILER}' EOF + if [[ ${DEPENDENTS_CENTER_COMPILER} ]]; then + DEPENDENTS_CORE_COMPILER=$(echo "${DEPENDENTS_CENTER_COMPILER}" | tr -d '%') +cat <> ${yaml} + - '${DEPENDENTS_CORE_COMPILER}' +EOF + fi fi } diff --git a/src/data_coordinator/buffered_data_coordinator.cpp b/src/data_coordinator/buffered_data_coordinator.cpp index 85a1681231e..52ff3138e6b 100644 --- a/src/data_coordinator/buffered_data_coordinator.cpp +++ b/src/data_coordinator/buffered_data_coordinator.cpp @@ -43,9 +43,11 @@ namespace lbann { template void buffered_data_coordinator::register_active_data_field( - data_field_type const data_field) + data_field_type const& data_field, + std::vector const& data_field_dim_map) + { - data_coordinator::register_active_data_field(data_field); + data_coordinator::register_active_data_field(data_field, data_field_dim_map); for (const auto& buf_map : m_data_buffers) { const data_buffer_map_t& buffer_map = buf_map; for (auto& [mode, buffer] : buffer_map) { @@ -119,6 +121,10 @@ void buffered_data_coordinator::setup_data_fields( if (phase_io_buffer->IsEmpty() || phase_io_buffer->Width() == 0 || phase_io_buffer->Height() == 0) { El::Int linearized_size = get_linearized_size(data_field); + if (linearized_size == -1) { + LBANN_ERROR("Invalid value for the linearized size of data field ", + data_field); + } data_buffer->m_input_buffers[data_field]->Resize(linearized_size, max_mini_batch_size); @@ -482,7 +488,7 @@ bool buffered_data_coordinator::load_from_checkpoint_shared( #ifdef LBANN_HAS_CEREAL_XML_ARCHIVES "_dc.xml" #else // defined LBANN_HAS_CEREAL_BINARY_ARCHIVES - "_dc.bin" + "_dc.bin" #endif // LBANN_HAS_CEREAL_XML_ARCHIVES ); buf = create_cereal_archive_binary_string(*this); @@ -529,7 +535,7 @@ bool buffered_data_coordinator< #ifdef LBANN_HAS_CEREAL_XML_ARCHIVES "_dc.xml" #else // defined LBANN_HAS_CEREAL_BINARY_ARCHIVES - "_dc.bin" + "_dc.bin" #endif // LBANN_HAS_CEREAL_XML_ARCHIVES ); return true; diff --git a/src/data_coordinator/data_coordinator.cpp b/src/data_coordinator/data_coordinator.cpp index e918ec2f23f..861df3644ab 100644 --- a/src/data_coordinator/data_coordinator.cpp +++ b/src/data_coordinator/data_coordinator.cpp @@ -29,6 +29,7 @@ #include "lbann/execution_algorithms/execution_context.hpp" #include #include +#include #include #include @@ -230,21 +231,22 @@ TargetModeDimMap data_coordinator::get_data_dims() map[data_reader_target_mode::INPUT] = dr->get_data_dims(); if (dr->has_labels()) { map[data_reader_target_mode::CLASSIFICATION] = - std::vector(1, dr->get_num_labels()); + std::vector(1, dr->get_num_labels()); } else { - map[data_reader_target_mode::CLASSIFICATION] = std::vector(1, 0); + map[data_reader_target_mode::CLASSIFICATION] = + std::vector(1, 0); } if (dr->has_responses()) { map[data_reader_target_mode::REGRESSION] = - std::vector(1, dr->get_num_responses()); + std::vector(1, dr->get_num_responses()); } else { - map[data_reader_target_mode::REGRESSION] = std::vector(1, 0); + map[data_reader_target_mode::REGRESSION] = 
std::vector(1, 0); } map[data_reader_target_mode::RECONSTRUCTION] = dr->get_data_dims(); map[data_reader_target_mode::LABEL_RECONSTRUCTION] = dr->get_data_dims(); - map[data_reader_target_mode::NA] = std::vector(1, 0); + map[data_reader_target_mode::NA] = std::vector(1, 0); return map; } } @@ -349,6 +351,26 @@ long data_coordinator::get_linearized_size( linearized_size = tmp_size; } } + auto& dim_map = m_active_data_fields_dim_map.at(data_field); + if (linearized_size != get_linear_size(dim_map)) { + if (linearized_size == -1) { + LBANN_WARNING("Unable to find data readers; using data field map for " + "linearized size for data field: ", + data_field, + " = ", + get_linear_size(dim_map)); + linearized_size = get_linear_size(dim_map); + } + else { + LBANN_ERROR("The data readers and data field map disagree on the " + "linearized size of the field: ", + data_field, + ": ", + linearized_size, + " != ", + get_linear_size(dim_map)); + } + } return linearized_size; } @@ -357,26 +379,7 @@ long data_coordinator::get_linearized_size( */ long data_coordinator::get_linearized_data_size() const { - long linearized_data_size = -1; - generic_data_reader* dr; - for (auto mode : execution_mode_iterator()) { - dr = get_data_reader(mode); - if (dr != nullptr) { - long tmp_data_size = dr->get_linearized_data_size(); - if (linearized_data_size != -1 && linearized_data_size != tmp_data_size) { - LBANN_ERROR( - "data_coordinator: ", - to_string(mode), - " data set size (", - std::to_string(tmp_data_size), - ") does not match the currently established data set size (", - std::to_string(linearized_data_size), - ")"); - } - linearized_data_size = tmp_data_size; - } - } - return linearized_data_size; + return get_linearized_size(INPUT_DATA_TYPE_SAMPLES); } /** @@ -384,27 +387,7 @@ long data_coordinator::get_linearized_data_size() const */ long data_coordinator::get_linearized_label_size() const { - long linearized_label_size = -1; - generic_data_reader* dr; - for (auto mode : execution_mode_iterator()) { - dr = get_data_reader(mode); - if (dr != nullptr) { - long tmp_label_size = dr->get_linearized_label_size(); - if (linearized_label_size != -1 && - linearized_label_size != tmp_label_size) { - LBANN_ERROR( - "data_coordinator: ", - to_string(mode), - " label set size (", - std::to_string(tmp_label_size), - ") does not match the currently established data set size (", - std::to_string(linearized_label_size), - ")"); - } - linearized_label_size = tmp_label_size; - } - } - return linearized_label_size; + return get_linearized_size(INPUT_DATA_TYPE_LABELS); } /** @@ -412,27 +395,7 @@ long data_coordinator::get_linearized_label_size() const */ long data_coordinator::get_linearized_response_size() const { - long linearized_response_size = -1; - generic_data_reader* dr; - for (auto mode : execution_mode_iterator()) { - dr = get_data_reader(mode); - if (dr != nullptr) { - long tmp_response_size = dr->get_linearized_response_size(); - if (linearized_response_size != -1 && - linearized_response_size != tmp_response_size) { - LBANN_ERROR( - "data_coordinator: ", - to_string(mode), - " response set size (", - std::to_string(tmp_response_size), - ") does not match the currently established data set size (", - std::to_string(linearized_response_size), - ")"); - } - linearized_response_size = tmp_response_size; - } - } - return linearized_response_size; + return get_linearized_size(INPUT_DATA_TYPE_RESPONSES); } // At the start of the epoch, set the execution mode and make sure @@ -578,9 +541,11 @@ bool 
data_coordinator::at_new_epoch() const } void data_coordinator::register_active_data_field( - data_field_type const data_field) + data_field_type const& data_field, + std::vector const& data_field_dim_map) { m_active_data_fields.insert(data_field); + m_active_data_fields_dim_map[data_field] = data_field_dim_map; } size_t data_coordinator::get_num_iterations_per_epoch(execution_mode mode) const diff --git a/src/data_readers/data_reader_HDF5.cpp b/src/data_readers/data_reader_HDF5.cpp index 643d6bbfa5e..14b16ce4c68 100644 --- a/src/data_readers/data_reader_HDF5.cpp +++ b/src/data_readers/data_reader_HDF5.cpp @@ -877,9 +877,9 @@ void hdf5_data_reader::repack_image(conduit::Node& node, } } -const std::vector hdf5_data_reader::get_data_dims(std::string name) const +const std::vector hdf5_data_reader::get_data_dims(std::string name) const { - std::unordered_map>::const_iterator iter = + std::unordered_map>::const_iterator iter = m_data_dims_lookup_table.find(name); if (iter == m_data_dims_lookup_table.end()) { LBANN_ERROR( diff --git a/src/data_readers/data_reader_hdf5_legacy.cpp b/src/data_readers/data_reader_hdf5_legacy.cpp index 1702be96132..049c79ba701 100644 --- a/src/data_readers/data_reader_hdf5_legacy.cpp +++ b/src/data_readers/data_reader_hdf5_legacy.cpp @@ -225,7 +225,7 @@ void hdf5_reader::load() hsize_t dims[4]; CHECK_HDF5(H5Sget_simple_extent_dims(h_space, dims, NULL)); CHECK_HDF5(H5Dclose(h_data)); - m_data_dims = std::vector(dims, dims + 4); + m_data_dims = std::vector(dims, dims + 4); } else { LBANN_ERROR("The number of HDF5 samples should not be zero"); diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index ad5485287d8..0512b0048ae 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -1188,14 +1188,14 @@ const std::vector data_reader_jag_conduit::get_dims( return {}; } -const std::vector data_reader_jag_conduit::get_data_dims() const +const std::vector data_reader_jag_conduit::get_data_dims() const { #if 1 return {get_linearized_data_size()}; #else - std::vector all_dim; + std::vector all_dim; for (const auto t : m_independent) { - const std::vector ld = get_dims(t); + const std::vector ld = get_dims(t); all_dim.insert(all_dim.end(), ld.begin(), ld.end()); } if (all_dim.empty()) { diff --git a/src/data_readers/data_reader_merge_samples.cpp b/src/data_readers/data_reader_merge_samples.cpp index 4c241da5426..d138af82a1a 100644 --- a/src/data_readers/data_reader_merge_samples.cpp +++ b/src/data_readers/data_reader_merge_samples.cpp @@ -74,7 +74,7 @@ void data_reader_merge_samples::sanity_check_for_consistency( int num_labels, int data_size, int label_size, - const std::vector& data_dims) + const std::vector& data_dims) { for (auto&& reader : m_data_readers) { if (num_labels != reader->get_num_labels()) { @@ -118,7 +118,7 @@ void data_reader_merge_samples::load() int num_labels = m_data_readers[0]->get_num_labels(); int data_size = m_data_readers[0]->get_linearized_data_size(); int label_size = m_data_readers[0]->get_linearized_label_size(); - const std::vector data_dims = m_data_readers[0]->get_data_dims(); + const std::vector data_dims = m_data_readers[0]->get_data_dims(); sanity_check_for_consistency(num_labels, data_size, label_size, data_dims); size_t global_num_samples = compute_num_samples_psum(); diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp index e9a7898cfd9..e5db6ca3c7d 100644 --- 
a/src/data_readers/data_reader_python.cpp +++ b/src/data_readers/data_reader_python.cpp @@ -92,9 +92,9 @@ python_reader::~python_reader() } } -const std::vector python_reader::get_data_dims() const +const std::vector python_reader::get_data_dims() const { - std::vector dims; + std::vector dims; for (const auto& d : m_sample_dims) { dims.push_back(d); } diff --git a/src/data_readers/data_reader_synthetic.cpp b/src/data_readers/data_reader_synthetic.cpp index 2f7a6a551ea..a02fa799761 100644 --- a/src/data_readers/data_reader_synthetic.cpp +++ b/src/data_readers/data_reader_synthetic.cpp @@ -56,7 +56,7 @@ data_reader_synthetic::data_reader_synthetic(int num_samples, {} data_reader_synthetic::data_reader_synthetic(int num_samples, - std::vector dims, + std::vector dims, int num_labels, bool shuffle) : generic_data_reader(shuffle), @@ -74,8 +74,8 @@ data_reader_synthetic::data_reader_synthetic(int num_samples, } data_reader_synthetic::data_reader_synthetic(int num_samples, - std::vector dims, - std::vector response_dims, + std::vector dims, + std::vector response_dims, bool shuffle) : generic_data_reader(shuffle), m_num_samples(num_samples), @@ -94,7 +94,7 @@ data_reader_synthetic::data_reader_synthetic(int num_samples, data_reader_synthetic::data_reader_synthetic( int num_samples, - std::map> data_fields, + std::map> data_fields, bool shuffle) : generic_data_reader(shuffle), m_num_samples(num_samples), diff --git a/src/data_readers/unit_test/data_reader_synthetic_test.cpp b/src/data_readers/unit_test/data_reader_synthetic_test.cpp index 221fc971487..ae60e529c62 100644 --- a/src/data_readers/unit_test/data_reader_synthetic_test.cpp +++ b/src/data_readers/unit_test/data_reader_synthetic_test.cpp @@ -91,7 +91,7 @@ TEST_CASE("Synthetic data reader classification tests", auto s = GENERATE(range(1, 11)); El::Int num_samples = s; - std::vector dims = {s, s}; + std::vector dims = {s, s}; ; El::Int num_labels = s * 2; @@ -152,9 +152,9 @@ TEST_CASE("Synthetic data reader regression tests", auto s = GENERATE(range(1, 11)); El::Int num_samples = s; - std::vector dims = {s, s}; + std::vector dims = {s, s}; ; - std::vector response_dims = {s + 1, s + 1}; + std::vector response_dims = {s + 1, s + 1}; SECTION("fetch data and response") { @@ -217,10 +217,10 @@ TEST_CASE("Synthetic data reader data field", auto s = GENERATE(range(1, 4)); El::Int num_samples = s; std::vector data_fields = {"foo", "bar"}; - std::map> fields; + std::map> fields; int f = 0; for (auto const& data_field : data_fields) { - std::vector dims = {s + f, s + f}; + std::vector dims = {s + f, s + f}; fields[data_field] = dims; ++f; } diff --git a/src/data_readers/unit_test/data_reader_synthetic_test_public_api.cpp b/src/data_readers/unit_test/data_reader_synthetic_test_public_api.cpp index c1cf5f1bcf5..3b5b42bc32d 100644 --- a/src/data_readers/unit_test/data_reader_synthetic_test_public_api.cpp +++ b/src/data_readers/unit_test/data_reader_synthetic_test_public_api.cpp @@ -95,9 +95,9 @@ TEST_CASE("Synthetic data reader public API tests", GENERATE(std::string("labels"), std::string("responses"))); auto s = GENERATE(range(1, 11)); El::Int num_samples = s; - std::vector dims = {s, s}; + std::vector dims = {s, s}; El::Int num_labels = s * 2; - std::vector response_dims = {s + 1, s + 1}; + std::vector response_dims = {s + 1, s + 1}; std::map> owning_local_input_buffers; @@ -209,13 +209,13 @@ TEST_CASE("Synthetic data reader public API tests - arbitrary field", auto s = GENERATE(range(1, 2)); El::Int num_samples = s; std::set data_fields = {"foo", 
"bar"}; - std::map> fields; + std::map> fields; int f = 0; std::map> owning_local_input_buffers; std::map local_input_buffers; for (auto const& data_field : data_fields) { - std::vector dims = {s + f, s + f}; + std::vector dims = {s + f, s + f}; fields[data_field] = dims; ++f; auto local_mat = std::make_unique(); @@ -263,7 +263,7 @@ TEST_CASE("Synthetic data reader public API tests - arbitrary field", SECTION("fetch arbitrary bad data field with extra fields") { - std::map> test_fields; + std::map> test_fields; lbann::data_field_type bad_field = "bar"; for (auto const& data_field : data_fields) { if (data_field != bad_field) { @@ -296,7 +296,7 @@ TEST_CASE("Synthetic data reader public API tests - arbitrary field", SECTION("fetch arbitrary bad data fields - no extra buffers") { - std::map> test_fields; + std::map> test_fields; std::map test_local_input_buffers; lbann::data_field_type bad_field = "bar"; for (auto const& data_field : data_fields) { diff --git a/src/layers/io/input_layer.cpp b/src/layers/io/input_layer.cpp index 64f0424efe4..ba32466f64a 100644 --- a/src/layers/io/input_layer.cpp +++ b/src/layers/io/input_layer.cpp @@ -82,12 +82,16 @@ void input_layer::setup_dims( { data_type_layer::setup_dims(dr_metadata); for (int i = 0; i < this->get_num_children(); ++i) { - this->set_output_dims(get_data_dims(dr_metadata, i), i); + this->set_output_dims(vector_cast(get_data_dims(dr_metadata, i)), i); } if (m_data_field == "") { LBANN_ERROR("Failed to setup input layer with empty data field"); } - get_trainer().get_data_coordinator().register_active_data_field(m_data_field); + get_trainer().get_data_coordinator().register_active_data_field(m_data_field, + // BVE FIXME HACK FOR NOW + // Redundantly store + // the dimensions + get_data_dims(dr_metadata, 0)); } template @@ -181,7 +185,7 @@ void input_layer::set_samples( } template -std::vector input_layer::get_data_dims( +std::vector input_layer::get_data_dims( DataReaderMetaData& dr_metadata, int child_index) const { @@ -203,7 +207,7 @@ std::vector input_layer::get_data_dims( else { LBANN_ERROR("Unknown data_field_type value provided: " + m_data_field); } - return std::vector(1, 0); + return std::vector(1, 0); } #ifdef LBANN_HAS_ONNX diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 951f4936098..848531e6592 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -425,15 +425,15 @@ void init_data_readers( if (readme.num_labels() != 0) { reader = new data_reader_synthetic(readme.num_samples(), - parse_list(readme.synth_dimensions()), + parse_list(readme.synth_dimensions()), readme.num_labels(), shuffle); } else { reader = new data_reader_synthetic( readme.num_samples(), - parse_list(readme.synth_dimensions()), - parse_list(readme.synth_response_dimensions()), + parse_list(readme.synth_dimensions()), + parse_list(readme.synth_response_dimensions()), shuffle); } } diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index 09f315594c2..18a013e2172 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -53,7 +53,7 @@ namespace lbann { // Creates a datareader metadata to get around the need for an actual // datareader in inference only mode -auto mock_dr_metadata(std::vector input_dims, std::vector output_dims) +auto mock_dr_metadata(std::vector input_dims, std::vector output_dims) { DataReaderMetaData drmd; auto& md_dims = drmd.data_dims; @@ -66,8 +66,8 @@ auto mock_dr_metadata(std::vector input_dims, std::vector output_dims) std::unique_ptr 
load_inference_model(lbann_comm* lc, std::string cp_dir, int mbs, - std::vector<int> input_dims, - std::vector<int> output_dims) + std::vector<El::Int> input_dims, + std::vector<El::Int> output_dims) { persist p; p.open_restart(cp_dir.c_str()); @@ -628,6 +628,14 @@ void print_lbann_configuration(lbann_comm* comm, std::cout << " MV2_USE_CUDA : " << (env != nullptr ? env : "") << std::endl; std::cout << std::endl; +#ifdef LBANN_HAS_ROCM + std::cout << " MIOpen DB Cache : " << std::endl; + const auto* env_db = std::getenv("MIOPEN_USER_DB_PATH"); + std::cout << " MIOPEN_USER_DB_PATH : " << (env_db != nullptr ? env_db : "") << std::endl; + const auto* env_cache = std::getenv("MIOPEN_CUSTOM_CACHE_DIR"); + std::cout << " MIOPEN_CUSTOM_CACHE_DIR : " << (env_cache != nullptr ? env_cache : "") << std::endl; +#endif // LBANN_HAS_ROCM + #ifdef LBANN_HAS_DIHYDROGEN std::cout << "DiHydrogen Features:" << std::endl; std::cout << " DaCe : "; diff --git a/src/utils/summary.cpp b/src/utils/summary.cpp index 6a3342f4517..486a2be89d8 100644 --- a/src/utils/summary.cpp +++ b/src/utils/summary.cpp @@ -61,7 +61,7 @@ lbann_summary::~lbann_summary() void lbann_summary::report_image(std::string const& tag, std::string const& img_format, CPUMat const& image, - std::vector<int> const& dims_in, + std::vector<El::Int> const& dims_in, int step) { std::vector dims(dims_in.begin(), dims_in.end());
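Illustrative sketch (not part of the patch): with this change the data coordinator records the dimensions of each active data field at registration time, and the new vector_cast helper in dim_helpers.hpp converts between int and El::Int dimension vectors. The fragment below shows one way the pieces fit together; the wrapper function name, the literal field name "samples", and the assumption that vector_cast takes the destination element type as its first template argument are illustrative, not taken from this patch.

#include <vector>
#include <El.hpp>                                        // El::Int
#include "lbann/data_coordinator/data_coordinator.hpp"
#include "lbann/utils/dim_helpers.hpp"                   // vector_cast (added by this patch)

// Hypothetical helper: register a "samples" field whose shape is already known
// to the caller, so get_linearized_size() can fall back to the registered
// dimensions when no data reader is attached (e.g. in the MPI catch2 tests).
void register_samples_field(lbann::data_coordinator& dc)
{
  std::vector<int> layer_dims = {3, 32, 32}; // channels, height, width
  dc.register_active_data_field("samples",
                                lbann::vector_cast<El::Int>(layer_dims));
}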