diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 1f0b729ed..e3c808410 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -49,7 +49,7 @@ env: jobs: run_tests: - name: Run tests ${{ matrix.subset }} with ${{ matrix.os }}, Python ${{ matrix.py_v}}, RedisAI ${{ matrix.rai }} + name: Run tests ${{ matrix.subset }} with ${{ matrix.os }}, Python ${{ matrix.py_v}} runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -63,9 +63,6 @@ jobs: - os: macos-14 py_v: "3.9" - env: - SMARTSIM_REDISAI: ${{ matrix.rai }} - steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -109,15 +106,10 @@ jobs: - name: Install SmartSim (with ML backends) run: | python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis - python -m pip install .[dev,mypy,ml] - - - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) - if: contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12') - run: smart build --device cpu --onnx -v + python -m pip install .[dev,mypy] - - name: Install ML Runtimes with Smart (no ONNX,TF on Apple Silicon) - if: contains( matrix.os, 'macos-14' ) - run: smart build --device cpu --no_tf -v + - name: Install ML Runtimes + run: smart build --device cpu -v - name: Run mypy run: | diff --git a/.gitignore b/.gitignore index 77b91d586..97132aff7 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ tests/test_output # Dependencies smartsim/_core/.third-party smartsim/_core/.dragon +smartsim/_core/build # Docs _build diff --git a/README.md b/README.md index c0986042e..610d6608c 100644 --- a/README.md +++ b/README.md @@ -643,11 +643,11 @@ from C, C++, Fortran and Python with the SmartRedis Clients: 1.2.7 PyTorch - 2.0.1 + 2.1.0 TensorFlow\Keras - 2.13.1 + 2.15.0 ONNX diff --git a/doc/changelog.md b/doc/changelog.md index 26388a05e..8dcb08d3a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -9,6 +9,39 @@ Jump to: ## SmartSim +### 
Cuda 12 and ROCm support branch + +To be merged into `develop` at some future point in time + +Description + +- Refactor the RedisAI build to allow more flexibility in versions + and sources of ML backends +- Add Dockerfiles with GPU support +- Fine grain build support for GPUs +- Update Torch to 2.1.0, Tensorflow to 2.15.0 +- Better error messages in build process + +Detailed Notes + +- The RedisAIBuilder class was completely overhauled to allow users to + express a wider range of support for hardware/software stacks. This + will be extended to support ROCm, CUDA-11, and CUDA-12. +- Versions for each of these packages are no longer specified in an + internal class. Instead a default set of JSON files specifies the + sources and versions. Users can specify their own custom specifications + at smart build time +- Two new Dockerfiles are now provided (one each for 11.8 and 12.1) that + can be used to build a container to run the tutorials. No HPC support + should be expected at this time +- SmartSim can now be built using Cuda version 11.8 or Cuda 12.1 by specifying + `smart build --device=cuda118` or `smart build --device=cuda121`. The + original `smart build --device=gpu` will default to using Cuda 11.8. +- As a result of the previous change, SmartSim now requires C++17 and a + minimum Cuda version of 11.8 in order to build Torch 2.1.0. +- Error messages were not being interpolated correctly. This has been + addressed to provide more context when exposing error messages to users. 
+ ### Development branch To be released at some future point in time diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 02c17e1fd..226ccb085 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -18,7 +18,7 @@ Prerequisites Basic ===== -The base prerequisites to install SmartSim and SmartRedis are: +The base prerequisites to install SmartSim and SmartRedis with CPU-only support are: - Python 3.9-3.11 - Pip @@ -27,13 +27,11 @@ The base prerequisites to install SmartSim and SmartRedis are: - C++ compiler - GNU Make > 4.0 - git - - `git-lfs`_ - -.. _git-lfs: https://github.com/git-lfs/git-lfs?utm_source=gitlfs_site&utm_medium=installation_link&utm_campaign=gitlfs .. note:: - GCC 5-9, 11, and 12 is recommended. There are known bugs with GCC 10. + GCC 9, 11-13 is recommended (there are known issues compiling with GCC 10). For + CUDA 11.8, GCC 9 or 11 must be used. .. warning:: @@ -43,66 +41,146 @@ The base prerequisites to install SmartSim and SmartRedis are: `which gcc g++` do not point to Apple Clang. -GPU Support -=========== +ML Library Support +================== -The machine-learning backends have additional requirements in order to -use GPUs for inference +We currently support both Nvidia and AMD GPUs when using RedisAI for GPU inference. The support +for these GPUs often depends on the version of the CUDA or ROCm stack that is available on your +machine. In _most_ cases, the versions are backwards compatible. If you encounter problems, please +contact us and we can build the backend libraries for your desired version of CUDA and ROCm. - - `CUDA Toolkit 11 (tested with 11.8) `_ - - `cuDNN 8 (tested with 8.9.1) `_ - - OS: Linux - - GPU: Nvidia +CPU backends are provided for Apple (both Intel and Apple Silicon) and Linux (x86_64). 
-Be sure to reference the :ref:`installation notes ` for helpful +Be sure to reference the table below to find which versions of the ML libraries are supported for +your particular platform. Additionally, see :ref:`installation notes ` for helpful information regarding various system types before installation. -================== -Supported Versions -================== +Linux +----- +.. tabs:: -.. list-table:: Supported System for Pre-built Wheels - :widths: 50 50 50 50 - :header-rows: 1 - :align: center + .. group-tab:: CUDA 11 + + Additional requirements: + + * GCC <= 11 + * CUDA Toolkit 11.7 or 11.8 + * cuDNN 8.9 + + .. list-table:: Nvidia CUDA 11 + :widths: 50 50 50 50 + :header-rows: 1 + :align: center + + * - Python Versions + - Torch + - Tensorflow + - ONNX Runtime + * - 3.9-3.11 + - 2.3.1 + - 2.14.1 + - 1.17.3 + + .. group-tab:: CUDA 12 + + Additional requirements: + + * CUDA Toolkit 12 + * cuDNN 8.9 + + .. list-table:: Nvidia CUDA 12 + :widths: 50 50 50 50 + :header-rows: 1 + :align: center + + * - Python Versions + - Torch + - Tensorflow + - ONNX Runtime + * - 3.9-3.11 + - 2.3.1 + - 2.17 + - 1.17.3 + + .. group-tab:: ROCm 6 + + .. list-table:: AMD ROCm 6.1 + :widths: 50 50 50 50 + :header-rows: 1 + :align: center + + * - Python Versions + - Torch + - Tensorflow + - ONNX Runtime + * - 3.9-3.11 + - 2.4.1 + - N/A + - N/A + + .. group-tab:: CPU + + .. list-table:: CPU-only + :widths: 50 50 50 50 + :header-rows: 1 + :align: center + + * - Python Versions + - Torch + - Tensorflow + - ONNX Runtime + * - 3.9-3.11 + - 2.4.0 + - 2.15 + - 1.17.3 + +MacOSX +------ - * - Platform - - CPU - - GPU - - Python Versions - * - MacOS - - x86_64, aarch64 - - Not supported - - 3.9 - 3.11 - * - Linux - - x86_64 - - Nvidia - - 3.9 - 3.11 +.. tabs:: + .. group-tab:: Apple Silicon -.. note:: + .. 
list-table:: Apple Silicon ARM64 (no Metal support) + :widths: 50 50 50 50 + :header-rows: 1 + :align: center - Users have succesfully run SmartSim on Windows using Windows Subsystem for Linux - with Nvidia support. Generally, users should follow the Linux instructions here, - however we make no guarantee or offer of support. + * - Python Versions + - Torch + - Tensorflow + - ONNX Runtime + * - 3.9-3.11 + - 2.4.0 + - 2.17 + - 1.17.3 + .. group-tab:: Intel Mac (x86) -Native support for various machine learning libraries and their -versions is dictated by our dependency on RedisAI_ 1.2.7. + .. list-table:: CPU-only + :widths: 50 50 50 50 + :header-rows: 1 + :align: center -+------------------+----------+-------------+---------------+ -| RedisAI | PyTorch | Tensorflow | ONNX Runtime | -+==================+==========+=============+===============+ -| 1.2.7 (default) | 2.0.1 | 2.13.1 | 1.16.3 | -+------------------+----------+-------------+---------------+ + * - Python Versions + - Torch + - Tensorflow + - ONNX Runtime + * - 3.9-3.11 + - 2.2.0 + - 2.15 + - 1.17.3 -.. warning:: - On Apple Silicon, only the PyTorch backend is supported for now. Please contact us - if you need support for other backends +.. note:: -TensorFlow_ 2.0 and Keras_ are supported through `graph freezing`_. + Users have successfully run SmartSim on Windows using Windows Subsystem for Linux + with Nvidia support. Generally, users should follow the Linux instructions here, + however we make no guarantee or offer of support. + + +TensorFlow_ and Keras_ are supported through `graph freezing`_. 
ScikitLearn_ and Spark_ models are supported by SmartSim as well through the use of the ONNX_ runtime (which is not built by @@ -167,21 +245,8 @@ and install SmartSim from PyPI with the following command: pip install smartsim -If you would like SmartSim to also install python machine learning libraries -that can be used outside SmartSim to build SmartSim-compatible models, you -can request their installation through the ``[ml]`` optional dependencies, -as follows: - -.. code-block:: bash - - # For bash - pip install smartsim[ml] - # For zsh - pip install smartsim\[ml\] - -At this point, SmartSim is installed and can be used for more basic features. -If you want to use the machine learning features of SmartSim, you will need -to install the ML backends in the section below. +At this point, SmartSim can be used for describing and launching experiments, but +without any database/feature store functionality which allows for ML-enabled workflows. Step 2: Build SmartSim @@ -198,19 +263,19 @@ To see all the installation options: smart --help -CPU Install ------------ - -To install the default ML backends for CPU, run - .. code-block:: bash # run one of the following - smart build --device cpu # install PT and TF for cpu - smart build --device cpu --onnx # install all backends (PT, TF, ONNX) on cpu + smart build --device cpu # For unaccelerated AI/ML loads + smart build --device cuda118 # Nvidia Accelerator with CUDA 11.8 + smart build --device cuda125 # Nvidia Accelerator with CUDA 12.5 + smart build --device rocm57 # AMD Accelerator with ROCm 5.7.0 -By default, ``smart`` will install PyTorch and TensorFlow backends -for use in SmartSim. +By default, ``smart`` will install all backends available for the specified accelerator +_and_ the compatible versions of the Python packages associated with the backends. 
To +disable support for a specific backend, ``smart build`` accepts the flags +``--skip-torch``, ``--skip-tensorflow``, ``--skip-onnx`` which can also be used in +combination. .. note:: @@ -218,19 +283,6 @@ for use in SmartSim. all of the previous installs for the ML backends and ``smart clobber`` will remove all pre-built dependencies as well as the ML backends. - -GPU Install ------------ - -With the proper environment setup (see :ref:`GPU support`) the only difference -to building SmartSim with GPU support is to specify a different ``device`` - -.. code-block:: bash - - # run one of the following - smart build --device gpu # install PT and TF for gpu - smart build --device gpu --onnx # install all backends (PT, TF, ONNX) on gpu - .. note:: GPU builds can be troublesome due to the way that RedisAI and the ML-package @@ -251,9 +303,7 @@ For example, to install dragon alongside the RedisAI CPU backends, you can run .. code-block:: bash - # run one of the following smart build --device cpu --dragon # install Dragon, PT and TF for cpu - smart build --device cpu --onnx --dragon # install Dragon and all backends (PT, TF, ONNX) on cpu .. note:: Dragon is only supported on Linux systems. For further information, you @@ -319,35 +369,11 @@ source remains at the site of the clone instead of in site-packages. .. code-block:: bash cd smartsim - pip install -e .[dev,ml] # for bash users - pip install -e .\[dev,ml\] # for zsh users - -Use the now installed ``smart`` cli to install the machine learning runtimes and dragon. - -.. tabs:: - - .. tab:: Linux - - .. code-block:: bash - - # run one of the following - smart build --device cpu --onnx --dragon # install with cpu-only support - smart build --device gpu --onnx --dragon # install with both cpu and gpu support - - - .. tab:: MacOS (Intel x64) - - .. code-block:: bash - - smart build --device cpu --onnx # install all backends (PT, TF, ONNX) on gpu - - - .. tab:: MacOS (Apple Silicon) - - .. 
code-block:: bash - - smart build --device cpu --no_tf # Only install PyTorch (TF/ONNX unsupported) + pip install -e .[dev] # for bash users + pip install -e ".[dev]" # for zsh users +Use the now installed ``smart`` cli to install the machine learning runtimes and +dragon. Refer to "Step 2: Build SmartSim" above. Build the SmartRedis library ============================ diff --git a/doc/installation_instructions/platform.rst b/doc/installation_instructions/platform.rst index 086fc2951..057a25d87 100644 --- a/doc/installation_instructions/platform.rst +++ b/doc/installation_instructions/platform.rst @@ -12,6 +12,8 @@ that SmartSim may be used on. .. include:: platform/frontier.rst +.. include:: platform/perlmutter.rst + .. include:: platform/cray.rst .. include:: platform/ncar-cheyenne.rst diff --git a/doc/installation_instructions/platform/frontier.rst b/doc/installation_instructions/platform/frontier.rst index e23856155..d4db76a6d 100644 --- a/doc/installation_instructions/platform/frontier.rst +++ b/doc/installation_instructions/platform/frontier.rst @@ -1,23 +1,14 @@ OLCF Frontier ============= -Summary -------- - -Frontier is an AMD CPU/AMD GPU system. - -As of 2023-07-06, users can use the following instructions, however we -anticipate that all the SmartSim dependencies will be available system-wide via -the modules system. - Known limitations ----------------- We are continually working on getting all the features of SmartSim working on Frontier, however we do have some known limitations: -* For now, only Torch models are supported. We are working to find a recipe to - install Tensorflow with ROCm support from scratch +* For now, only Torch and ONNX runtime models are supported. If you need + Tensorflow support please contact us * The colocated database will fail without specifying ``custom_pinning``. This is because the default pinning assumes that processor 0 is available, but the 'low-noise' default on Frontier reserves the processor on each NUMA node. 
@@ -30,8 +21,8 @@ Frontier, however we do have some known limitations: Please raise an issue in the SmartSim Github or contact the developers if the above issues are affecting your workflow or if you find any other problems. -Build process -------------- +One-time Setup +-------------- To install the SmartRedis and SmartSim python packages on Frontier, please follow these instructions, being sure to set the following variables @@ -41,23 +32,22 @@ these instructions, being sure to set the following variables export PROJECT_NAME=CHANGE_ME export VENV_NAME=CHANGE_ME -Then continue with the install: +**Step 1:** Create and activate a virtual environment for SmartSim: .. code:: bash - module load PrgEnv-gnu-amd git-lfs cmake cray-python - module unload xalt amd-mixed - module load rocm/4.5.2 - export CC=gcc - export CXX=g++ + module load PrgEnv-gnu cray-python + module load rocm/6.1.3 export SCRATCH=/lustre/orion/$PROJECT_NAME/scratch/$USER/ export VENV_HOME=$SCRATCH/$VENV_NAME/ python3 -m venv $VENV_HOME source $VENV_HOME/bin/activate - pip install torch==1.11.0+rocm4.5.2 torchvision==0.12.0+rocm4.5.2 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/rocm4.5.2 +**Step 2:** Install SmartSim in the virtual environment: + +.. code:: bash cd $SCRATCH git clone https://github.com/CrayLabs/SmartRedis.git @@ -67,34 +57,33 @@ Then continue with the install: # Download SmartSim and site-specific files cd $SCRATCH - git clone https://github.com/CrayLabs/site-deployments.git - git clone https://github.com/CrayLabs/SmartSim.git - cd SmartSim - pip install -e .[dev] + pip install git+https://github.com/CrayLabs/SmartSim.git -Next to finish the compilation, we need to manually modify one of the auxiliary -cmake files that comes packaged with Torch +**Step 3:** Build Redis, RedisAI, the backends, and all the Python packages: .. 
code:: bash - export TORCH_CMAKE_DIR=$(python -c 'import torch;print(torch.utils.cmake_prefix_path)') - # Manual step: modify all references to the 'rocm' directory to rocm-4.5.2 - vim $TORCH_CMAKE_DIR/Caffe2/Caffe2Targets.cmake + smart build --device=rocm-6 -Finally, build Redis (or keydb for a more performant solution), RedisAI, and the -machine-learning backends using: +**Step 4:** Check that SmartSim has been installed and built correctly: .. code:: bash - KEYDB_FLAG="" # set this to --keydb if desired - smart build --device gpu --torch_dir $TORCH_CMAKE_DIR --no_tf -v $(KEYDB_FLAG) + smart validate --device gpu + +The following output indicates a successful install: + +.. code:: bash -Set up environment ------------------- + [SmartSim] INFO Verifying Tensor Transfer + [SmartSim] INFO Verifying Torch Backend + 16:26:35 login SmartSim[557020:MainThread] INFO Success! + +Post-installation +----------------- Before running SmartSim, the environment should match the one used to -build, and some variables should be set to work around some ROCm PyTorch -issues: +build, and some variables should be set to optimize performance: .. code:: bash @@ -104,10 +93,10 @@ issues: .. 
code:: bash - module load PrgEnv-gnu-amd git-lfs cmake cray-python - module unload xalt amd-mixed - module load rocm/4.5.2 + module load PrgEnv-gnu + module load rocm/6.1.3 + # Optimizations for inference export SCRATCH=/lustre/orion/$PROJECT_NAME/scratch/$USER/ export MIOPEN_USER_DB_PATH=/tmp/miopendb/ export MIOPEN_SYSTEM_DB_PATH=$MIOPEN_USER_DB_PATH @@ -115,7 +104,6 @@ issues: export MIOPEN_DISABLE_CACHE=1 export VENV_HOME=$SCRATCH/$VENV_NAME/ source $VENV_HOME/bin/activate - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$VENV_HOME/lib/python3.9/site-packages/torch/lib Binding DBs to Slingshot ------------------------ @@ -129,17 +117,3 @@ following way: exp = Experiment("my_exp", launcher="slurm") orc = exp.create_database(db_nodes=3, interface=["hsn0","hsn1","hsn2","hsn3"], single_cmd=True) - -Running tests -------------- - -The same environment set to run SmartSim must be set to run tests. The -environment variables needed to run the test suite are the following: - -.. code:: bash - - export SMARTSIM_TEST_ACCOUNT=PROJECT_NAME # Change this to above - export SMARTSIM_TEST_LAUNCHER=slurm - export SMARTSIM_TEST_DEVICE=gpu - export SMARTSIM_TEST_PORT=6789 - export SMARTSIM_TEST_INTERFACE="hsn0,hsn1,hsn2,hsn3" diff --git a/doc/installation_instructions/platform/perlmutter.rst b/doc/installation_instructions/platform/perlmutter.rst new file mode 100644 index 000000000..6d1e22e1e --- /dev/null +++ b/doc/installation_instructions/platform/perlmutter.rst @@ -0,0 +1,55 @@ +NERSC Perlmutter +================ + +One-time Setup +-------------- + +To install SmartSim on Perlmutter, follow these steps: + +**Step 1:** Create and activate a conda environment for SmartSim: + +.. code:: bash + + module load conda + conda create -n smartsim python=3.11 + conda activate smartsim + +**Step 2:** Install SmartSim in the conda environment: + +.. 
code:: bash + + pip install git+https://github.com/CrayLabs/SmartSim.git + +**Step 3:** Build Redis, RedisAI, the backends, and all the Python packages: + +.. code:: bash + + module load cudatoolkit/12.2 cudnn/8.9.3_cuda12 + smart build --device=cuda-12 + +**Step 4:** Check that SmartSim has been installed and built correctly: + +.. code:: bash + + smart validate --device gpu + +The following output indicates a successful install: + +.. code:: bash + + [SmartSim] INFO Verifying Tensor Transfer + [SmartSim] INFO Verifying Torch Backend + [SmartSim] INFO Verifying ONNX Backend + [SmartSim] INFO Verifying TensorFlow Backend + 16:26:35 login SmartSim[557020:MainThread] INFO Success! + +Post-installation +----------------- + +After completing the above steps to install SmartSim in a conda environment, you +can reload the conda environment by running the following commands: + +.. code:: bash + + module load conda cudatoolkit/12.2 cudnn/8.9.3_cuda12 + conda activate smartsim diff --git a/doc/installation_instructions/site-install.rst b/doc/installation_instructions/site-install.rst index 26ecd6c13..53e0ff8bf 100644 --- a/doc/installation_instructions/site-install.rst +++ b/doc/installation_instructions/site-install.rst @@ -11,5 +11,5 @@ from source with the following steps replacing ``COMPILER_VERSION`` and module use -a /lus/scratch/smartsim/local/modulefiles module load cudatoolkit/11.8 cudnn smartsim-deps/COMPILER_VERSION/SMARTSIM_VERSION - pip install smartsim[ml] - smart build --only_python_packages --device gpu [--onnx] + pip install smartsim + smart build --skip-backends --device gpu [--onnx] diff --git a/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb b/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb index 2d19cab13..2b5f0a3a5 100644 --- a/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb +++ b/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb @@ -132,7 +132,7 @@ "\n", "ML Backends Requested\n", "╒════════════╤════════╤══════╕\n", - "│ 
PyTorch │ 2.0.1 │ \u001b[32mTrue\u001b[0m │\n", + "│ PyTorch │ 2.1.0 │ \u001b[32mTrue\u001b[0m │\n", "│ TensorFlow │ 2.13.1 │ \u001b[32mTrue\u001b[0m │\n", "│ ONNX │ 1.14.1 │ \u001b[32mTrue\u001b[0m │\n", "╘════════════╧════════╧══════╛\n", diff --git a/docker/prod-cuda11/Dockerfile b/docker/prod-cuda11/Dockerfile new file mode 100644 index 000000000..ef73e2e01 --- /dev/null +++ b/docker/prod-cuda11/Dockerfile @@ -0,0 +1,61 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +FROM ubuntu:22.04 + +LABEL maintainer="Cray Labs" +LABEL org.opencontainers.image.source https://github.com/CrayLabs/SmartSim + +ARG DEBIAN_FRONTEND="noninteractive" +ENV TZ=US/Seattle + +# Make basic dependencies +RUN apt-get update \ + && apt-get install --no-install-recommends -y build-essential \ + git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \ + python3-pip python3 python3-dev cmake wget apt-utils + +# # Install Cudatoolkit 11.8 +ENV TERM="xterm" +RUN wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run && \ + chmod +x ./cuda_11.8.0_520.61.05_linux.run && \ + ./cuda_11.8.0_520.61.05_linux.run --silent --toolkit && \ + rm ./cuda_11.8.0_520.61.05_linux.run + +# Install cuDNN 8.9.7 +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcudnn8_8.9.7.29-1+cuda11.8_amd64.deb && \ + dpkg -i libcudnn8_8.9.7.29-1+cuda11.8_amd64.deb && \ + rm ./libcudnn8_8.9.7.29-1+cuda11.8_amd64.deb + + # Install SmartSim and SmartRedis + RUN pip install git+https://github.com/CrayLabs/SmartRedis.git && \ + pip install "smartsim[ml] @ git+https://github.com/CrayLabs/SmartSim.git" + + ENV CUDA_HOME="/usr/local/cuda/" + ENV PATH="${PATH}:${CUDA_HOME}/bin" + + # Build ML Backends + RUN smart build --device=gpu --onnx diff --git a/docker/prod-cuda12/Dockerfile b/docker/prod-cuda12/Dockerfile new file mode 100644 index 000000000..bbdfd3513 --- /dev/null +++ b/docker/prod-cuda12/Dockerfile @@ -0,0 +1,64 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FROM ubuntu:22.04 + +LABEL maintainer="Cray Labs" +LABEL org.opencontainers.image.source https://github.com/CrayLabs/SmartSim + +ARG DEBIAN_FRONTEND="noninteractive" +ENV TZ=US/Seattle + +# Make basic dependencies +RUN apt-get update \ + && apt-get install --no-install-recommends -y build-essential \ + git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \ + python3-pip python3 python3-dev cmake wget + +# Install Cudatoolkit 12.5 +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get update -y && \ + apt-get install -y cuda-toolkit-12-5 + +# Install cuDNN 8.9.7 +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb && \ + dpkg -i libcudnn8_8.9.7.29-1+cuda12.2_amd64.deb + +# Install SmartSim and SmartRedis +RUN pip install git+https://github.com/CrayLabs/SmartRedis.git && \ + pip install 
git+https://github.com/CrayLabs/SmartSim.git@cuda-12-support + +ENV CUDA_HOME="/usr/local/cuda/" +ENV PATH="${PATH}:${CUDA_HOME}/bin" + +# Install machine-learning python packages consistent with RedisAI +# Note: pytorch gets installed in the smart build step +# This step will be deprecated in a future update +RUN pip install tensorflow==2.15.0 + +# Build ML Backends +RUN smart build --device=cuda121 diff --git a/setup.py b/setup.py index 42892ed7a..5b23fca4c 100644 --- a/setup.py +++ b/setup.py @@ -137,7 +137,7 @@ class BuildError(Exception): "types-redis", "types-tabulate", "types-tqdm", - "types-tensorflow==2.12.0.9", + "types-tensorflow", "types-setuptools", "typing_extensions>=4.1.0", ], @@ -151,7 +151,7 @@ class BuildError(Exception): "nbsphinx==0.9.3", "docutils==0.18.1", "torch==2.0.1", - "tensorflow==2.13.1", + "tensorflow>=2.14,<3.0", "ipython", "jinja2==3.1.2", "sphinx-design", @@ -159,8 +159,6 @@ class BuildError(Exception): "sphinx-autodoc-typehints", "myst_parser", ], - # see smartsim/_core/_install/buildenv.py for more details - **versions.ml_extras_required(), } @@ -175,10 +173,11 @@ class BuildError(Exception): "redis>=4.5", "tqdm>=4.50.2", "filelock>=3.4.2", - "protobuf~=3.20", + "GitPython<=3.1.43", + "protobuf<=3.20.3", "jinja2>=3.1.2", "watchdog>4,<5", - "pydantic==1.10.14", + "pydantic>2", "pyzmq>=25.1.2", "pygithub>=2.3.0", "numpy<2", diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 951521f17..5d094b72f 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -25,26 +25,34 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import importlib.metadata +import operator import os -import platform -import sys +import re +import shutil +import textwrap import typing as t from pathlib import Path from tabulate import tabulate from smartsim._core._cli.scripts.dragon_install import install_dragon -from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, color_bool, pip +from smartsim._core._cli.utils import SMART_LOGGER_FORMAT from smartsim._core._install import builder -from smartsim._core._install.buildenv import ( - BuildEnv, - DbEngine, - SetupError, - Version_, - VersionConflictError, - Versioner, +from smartsim._core._install.buildenv import BuildEnv, DbEngine, Version_, Versioner +from smartsim._core._install.mlpackages import ( + DEFAULT_MLPACKAGE_PATH, + DEFAULT_MLPACKAGES, + MLPackageCollection, + load_platform_configs, ) -from smartsim._core._install.builder import BuildError, Device +from smartsim._core._install.platform import ( + Architecture, + Device, + OperatingSystem, + Platform, +) +from smartsim._core._install.redisaiBuilder import RedisAIBuilder from smartsim._core.config import CONFIG from smartsim._core.utils.helpers import installed_redisai_backends from smartsim.error import SSConfigError @@ -55,25 +63,6 @@ # NOTE: all smartsim modules need full paths as the smart cli # may be installed into a different directory. 
-_TPinningStr = t.Literal["==", "!=", ">=", ">", "<=", "<", "~="] - - -def check_py_onnx_version(versions: Versioner) -> None: - """Check Python environment for ONNX installation""" - _check_packages_in_python_env( - { - "onnx": Version_(versions.ONNX), - "skl2onnx": Version_(versions.REDISAI.skl2onnx), - "onnxmltools": Version_(versions.REDISAI.onnxmltools), - "scikit-learn": Version_(getattr(versions.REDISAI, "scikit-learn")), - }, - ) - - -def check_py_tf_version(versions: Versioner) -> None: - """Check Python environment for TensorFlow installation""" - _check_packages_in_python_env({"tensorflow": Version_(versions.TENSORFLOW)}) - def check_backends_install() -> bool: """Checks if backends have already been installed. @@ -115,8 +104,6 @@ def build_database( database_builder = builder.DatabaseBuilder( build_env(), jobs=build_env.JOBS, - _os=builder.OperatingSystem.from_str(platform.system()), - architecture=builder.Architecture.from_str(platform.machine()), malloc=build_env.MALLOC, verbose=verbose, ) @@ -125,220 +112,92 @@ def build_database( f"Building {database_name} version {versions.REDIS} " f"from {versions.REDIS_URL}" ) - database_builder.build_from_git(versions.REDIS_URL, versions.REDIS_BRANCH) + database_builder.build_from_git( + versions.REDIS_URL, branch=versions.REDIS_BRANCH + ) database_builder.cleanup() - logger.info(f"{database_name} build complete!") + logger.info(f"{database_name} build complete!") + else: + logger.warning( + f"{database_name} was previously built, run 'smart clobber' to rebuild" + ) def build_redis_ai( + platform: Platform, + mlpackages: MLPackageCollection, build_env: BuildEnv, - versions: Versioner, - device: Device, - use_torch: bool = True, - use_tf: bool = True, - use_onnx: bool = False, - torch_dir: t.Union[str, Path, None] = None, - libtf_dir: t.Union[str, Path, None] = None, - verbose: bool = False, - torch_with_mkl: bool = True, + verbose: bool, ) -> None: - # make sure user isn't trying to do something silly on MacOS - 
if build_env.PLATFORM == "darwin" and device == Device.GPU: - raise BuildError("SmartSim does not support GPU on MacOS") - - # decide which runtimes to build - print("\nML Backends Requested") - backends_table = [ - ["PyTorch", versions.TORCH, color_bool(use_torch)], - ["TensorFlow", versions.TENSORFLOW, color_bool(use_tf)], - ["ONNX", versions.ONNX, color_bool(use_onnx)], - ] - print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n") - print(f"Building for GPU support: {color_bool(device == Device.GPU)}\n") - - if not check_backends_install(): - sys.exit(1) - - # TORCH - if use_torch and torch_dir: - torch_dir = Path(torch_dir).resolve() - if not torch_dir.is_dir(): - raise SetupError( - f"Could not find requested user Torch installation: {torch_dir}" - ) - - # TF - if use_tf and libtf_dir: - libtf_dir = Path(libtf_dir).resolve() - if not libtf_dir.is_dir(): - raise SetupError( - f"Could not find requested user TF installation: {libtf_dir}" - ) - - build_env_dict = build_env() - - rai_builder = builder.RedisAIBuilder( - build_env=build_env_dict, - jobs=build_env.JOBS, - _os=builder.OperatingSystem.from_str(platform.system()), - architecture=builder.Architecture.from_str(platform.machine()), - torch_dir=str(torch_dir) if torch_dir else "", - libtf_dir=str(libtf_dir) if libtf_dir else "", - build_torch=use_torch, - build_tf=use_tf, - build_onnx=use_onnx, - verbose=verbose, - torch_with_mkl=torch_with_mkl, + logger.info("Building RedisAI and backends...") + rai_builder = RedisAIBuilder( + platform, mlpackages, build_env, CONFIG.build_path, verbose ) - - if rai_builder.is_built: - logger.info("RedisAI installed. 
Run `smart clean` to remove.") - else: - # get the build environment, update with CUDNN env vars - # if present and building for GPU, otherwise warn the user - if device == Device.GPU: - gpu_env = build_env.get_cudnn_env() - cudnn_env_vars = [ - "CUDNN_LIBRARY", - "CUDNN_INCLUDE_DIR", - "CUDNN_INCLUDE_PATH", - "CUDNN_LIBRARY_PATH", - ] - if not gpu_env: - logger.warning( - "CUDNN environment variables not found.\n" - f"Looked for {cudnn_env_vars}" - ) - else: - build_env_dict.update(gpu_env) - # update RAI build env with cudnn env vars - rai_builder.env = build_env_dict - - logger.info( - f"Building RedisAI version {versions.REDISAI}" - f" from {versions.REDISAI_URL}" - ) - - # NOTE: have the option to add other builds here in the future - # like "from_tarball" - rai_builder.build_from_git( - versions.REDISAI_URL, versions.REDISAI_BRANCH, device - ) - logger.info("ML Backends and RedisAI build complete!") - - -def check_py_torch_version(versions: Versioner, device: Device = Device.CPU) -> None: - """Check Python environment for TensorFlow installation""" - if BuildEnv.is_macos(): - if device == Device.GPU: - raise BuildError("SmartSim does not support GPU on MacOS") - device_suffix = "" - else: # linux - if device == Device.CPU: - device_suffix = versions.TORCH_CPU_SUFFIX - elif device == Device.GPU: - device_suffix = versions.TORCH_CUDA_SUFFIX - else: - raise BuildError("Unrecognized device requested") - - torch_deps = { - "torch": Version_(f"{versions.TORCH}{device_suffix}"), - "torchvision": Version_(f"{versions.TORCHVISION}{device_suffix}"), + rai_builder.build() + rai_builder.cleanup_build() + + +def parse_requirement( + requirement: str, +) -> t.Tuple[str, t.Optional[str], t.Callable[[Version_], bool]]: + operators = { + "==": operator.eq, + "<=": operator.le, + ">=": operator.ge, + "<": operator.lt, + ">": operator.gt, } - missing, conflicts = _assess_python_env( - torch_deps, - package_pinning="==", - 
validate_installed_version=_create_torch_version_validator( - with_suffix=device_suffix - ), + semantic_version_pattern = r"\d+(?:\.\d+(?:\.\d+)?)?([^\s]*)" + pattern = ( + r"^" # Start + r"([a-zA-Z0-9_\-]+)" # Package name + r"(?:\[[a-zA-Z0-9_\-,]+\])?" # Any extras + r"(?:([<>=!~]{1,2})" # Pinning string + rf"({semantic_version_pattern}))?" # A version number + r"$" # End ) + match = re.match(pattern, requirement) + if match is None: + raise ValueError(f"Invalid requirement string: {requirement}") + module_name, cmp_op, version_str, suffix = match.groups() + version = Version_(version_str) if version_str is not None else None + if cmp_op is None: + is_compatible = lambda _: True # pylint: disable=unnecessary-lambda-assignment + elif (cmp := operators.get(cmp_op, None)) is None: + raise ValueError(f"Unrecognized comparison operator: {cmp_op}") + else: - if len(missing) == len(torch_deps) and not conflicts: - # All PyTorch deps are not installed and there are no conflicting - # python packages. We can try to install torch deps into the current env. - logger.info( - "Torch version not found in python environment. 
" - "Attempting to install via `pip`" - ) - wheel_device = ( - device.value if device == Device.CPU else device_suffix.replace("+", "") - ) - pip( - "install", - "--extra-index-url", - f"https://download.pytorch.org/whl/{wheel_device}", - *(f"{package}=={version}" for package, version in torch_deps.items()), - ) - elif missing or conflicts: - logger.warning(_format_incompatible_python_env_message(missing, conflicts)) - - -def _create_torch_version_validator( - with_suffix: str, -) -> t.Callable[[str, t.Optional[Version_]], bool]: - def check_torch_version(package: str, version: t.Optional[Version_]) -> bool: - if not BuildEnv.check_installed(package, version): - return False - # Default check only looks at major/minor version numbers, - # Torch requires we look at the patch as well - installed = BuildEnv.get_py_package_version(package) - if with_suffix and with_suffix not in installed.patch: - raise VersionConflictError( - package, - installed, - version or Version_(f"X.X.X{with_suffix}"), - msg=( - f"{package}=={installed} does not satisfy device " - f"suffix requirement: {with_suffix}" - ), + def is_compatible(other: Version_) -> bool: + assert version is not None # For type check, always should be true + match_ = re.match(rf"^{semantic_version_pattern}$", other) + return ( + cmp(other, version) and match_ is not None and match_.group(1) == suffix ) - return True - return check_torch_version + return module_name, f"{cmp_op}{version}" if version else None, is_compatible -def _check_packages_in_python_env( - packages: t.Mapping[str, t.Optional[Version_]], - package_pinning: _TPinningStr = "==", - validate_installed_version: t.Optional[ - t.Callable[[str, t.Optional[Version_]], bool] - ] = None, -) -> None: - # TODO: Do not like how the default validation function will always look for - # a `==` pinning. Maybe turn `BuildEnv.check_installed` into a factory - # that takes a pinning and returns an appropriate validation fn? 
- validate_installed_version = validate_installed_version or BuildEnv.check_installed - missing, conflicts = _assess_python_env( - packages, - package_pinning, - validate_installed_version, - ) +def check_ml_python_packages(packages: MLPackageCollection) -> None: + missing = [] + conflicts = [] + + for package in packages.values(): + for requirement in package.python_packages: + module_name, version_spec, is_compatible = parse_requirement(requirement) + try: + installed = BuildEnv.get_py_package_version(module_name) + if not is_compatible(installed): + conflicts.append( + f"{module_name}: {installed} is installed, " + f"but {version_spec or 'Any'} is required" + ) + except importlib.metadata.PackageNotFoundError: + missing.append(module_name) if missing or conflicts: logger.warning(_format_incompatible_python_env_message(missing, conflicts)) -def _assess_python_env( - packages: t.Mapping[str, t.Optional[Version_]], - package_pinning: _TPinningStr, - validate_installed_version: t.Callable[[str, t.Optional[Version_]], bool], -) -> t.Tuple[t.List[str], t.List[str]]: - missing: t.List[str] = [] - conflicts: t.List[str] = [] - - for name, version in packages.items(): - spec = f"{name}{package_pinning}{version}" if version else name - try: - if not validate_installed_version(name, version): - # Not installed! 
- missing.append(spec) - except VersionConflictError: - # Incompatible version found - conflicts.append(spec) - - return missing, conflicts - - def _format_incompatible_python_env_message( missing: t.Collection[str], conflicting: t.Collection[str] ) -> str: @@ -349,20 +208,24 @@ def _format_incompatible_python_env_message( missing_str = fmt_list("Missing", missing) conflict_str = fmt_list("Conflicting", conflicting) sep = "\n" if missing_str and conflict_str else "" - return ( - "Python Env Status Warning!\n" - "Requested Packages are Missing or Conflicting:\n\n" - f"{missing_str}{sep}{conflict_str}\n\n" - "Consider installing packages at the requested versions via `pip` or " - "uninstalling them, installing SmartSim with optional ML dependencies " - "(`pip install smartsim[ml]`), and running `smart clean && smart build ...`" - ) + + return textwrap.dedent(f"""\ + Python Package Warning: + + Requested packages are missing or have a version mismatch with + their respective backend: + + {missing_str}{sep}{conflict_str} + + Consider uninstalling any conflicting packages and rerunning + `smart build` if you encounter issues. 
+ """) def _configure_keydb_build(versions: Versioner) -> None: """Configure the redis versions to be used during the build operation""" versions.REDIS = Version_("6.2.0") - versions.REDIS_URL = "https://github.com/EQ-Alpha/KeyDB" + versions.REDIS_URL = "https://github.com/EQ-Alpha/KeyDB.git" versions.REDIS_BRANCH = "v6.2.0" CONFIG.conf_path = Path(CONFIG.core_path, "config", "keydb.conf") @@ -376,14 +239,33 @@ def _configure_keydb_build(versions: Versioner) -> None: def execute( args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / ) -> int: + + # Unpack various arguments verbose = args.v keydb = args.keydb - device = Device(args.device.lower()) + device = Device.from_str(args.device.lower()) is_dragon_requested = args.dragon - # torch and tf build by default - pt = not args.no_pt # pylint: disable=invalid-name - tf = not args.no_tf # pylint: disable=invalid-name - onnx = args.onnx + + if Path(CONFIG.build_path).exists(): + logger.warning(f"Build path already exists, removing: {CONFIG.build_path}") + shutil.rmtree(CONFIG.build_path) + + # The user should never have to specify the OS and Architecture + current_platform = Platform( + OperatingSystem.autodetect(), Architecture.autodetect(), device + ) + + # Configure the ML Packages + configs = load_platform_configs(Path(args.config_dir)) + mlpackages = configs[current_platform] + + # Build all backends by default, pop off the ones that user wants skipped + if args.skip_torch and "libtorch" in mlpackages: + mlpackages.pop("libtorch") + if args.skip_tensorflow and "libtensorflow" in mlpackages: + mlpackages.pop("libtensorflow") + if args.skip_onnx and "onnxruntime" in mlpackages: + mlpackages.pop("onnxruntime") build_env = BuildEnv(checks=True) logger.info("Running SmartSim build process...") @@ -409,6 +291,9 @@ def execute( version_names = list(vers.keys()) print(tabulate(vers, headers=version_names, tablefmt="github"), "\n") + logger.info("ML Packages") + print(mlpackages) + if 
is_dragon_requested: install_to = CONFIG.core_path / ".dragon" return_code = install_dragon(install_to) @@ -420,42 +305,25 @@ def execute( else: logger.warning("Dragon installation failed") - try: - if not args.only_python_packages: - # REDIS/KeyDB - build_database(build_env, versions, keydb, verbose) - - # REDISAI - build_redis_ai( - build_env, - versions, - device, - pt, - tf, - onnx, - args.torch_dir, - args.libtensorflow_dir, - verbose=verbose, - torch_with_mkl=args.torch_with_mkl, - ) - except (SetupError, BuildError) as e: - logger.error(str(e)) - return os.EX_SOFTWARE + # REDIS/KeyDB + build_database(build_env, versions, keydb, verbose) + + if (CONFIG.lib_path / "redisai.so").exists(): + logger.warning("RedisAI was previously built, run 'smart clean' to rebuild") + elif not args.skip_backends: + build_redis_ai(current_platform, mlpackages, build_env, verbose) + else: + logger.info("Skipping compilation of RedisAI and backends") backends = installed_redisai_backends() backends_str = ", ".join(s.capitalize() for s in backends) if backends else "No" - logger.info(f"{backends_str} backend(s) built") - - try: - if "torch" in backends: - check_py_torch_version(versions, device) - if "tensorflow" in backends: - check_py_tf_version(versions) - if "onnxruntime" in backends: - check_py_onnx_version(versions) - except (SetupError, BuildError) as e: - logger.error(str(e)) - return os.EX_SOFTWARE + logger.info(f"{backends_str} backend(s) available") + + if not args.skip_python_packages: + for package in mlpackages.values(): + logger.info(f"Installing python packages for {package.name}") + package.pip_install(quiet=not verbose) + check_ml_python_packages(mlpackages) logger.info("SmartSim build complete!") return os.EX_OK @@ -463,7 +331,14 @@ def execute( def configure_parser(parser: argparse.ArgumentParser) -> None: """Builds the parser for the command""" - warn_usage = "(ONLY USE IF NEEDED)" + + available_devices = [] + for platform in DEFAULT_MLPACKAGES: + if 
(platform.operating_system == OperatingSystem.autodetect()) and ( + platform.architecture == Architecture.autodetect() + ): + available_devices.append(platform.device.value) + parser.add_argument( "-v", action="store_true", @@ -474,7 +349,7 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: "--device", type=str.lower, default=Device.CPU.value, - choices=[device.value for device in Device], + choices=available_devices, help="Device to build ML runtimes for", ) parser.add_argument( @@ -484,40 +359,35 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: help="Install the dragon runtime", ) parser.add_argument( - "--only_python_packages", + "--skip-python-packages", action="store_true", - default=False, - help="Only evaluate the python packages (i.e. skip building backends)", + help="Do not install the python packages that match the backends", ) parser.add_argument( - "--no_pt", + "--skip-backends", action="store_true", - default=False, - help="Do not build PyTorch backend", + help="Do not compile RedisAI and the backends", ) parser.add_argument( - "--no_tf", + "--skip-torch", action="store_true", - default=False, - help="Do not build TensorFlow backend", + help="Do not build PyTorch backend", ) parser.add_argument( - "--onnx", + "--skip-tensorflow", action="store_true", - default=False, - help="Build ONNX backend (off by default)", + help="Do not build TensorFlow backend", ) parser.add_argument( - "--torch_dir", - default=None, - type=str, - help=f"Path to custom /torch/share/cmake/Torch/ directory {warn_usage}", + "--skip-onnx", + action="store_true", + help="Do not build the ONNX backend", ) parser.add_argument( - "--libtensorflow_dir", - default=None, + "--config-dir", + default=str(DEFAULT_MLPACKAGE_PATH), type=str, - help=f"Path to custom libtensorflow directory {warn_usage}", + help="Path to directory with JSON files describing platform and packages", ) parser.add_argument( "--keydb", @@ -525,9 +395,3 @@ def configure_parser(parser: 
argparse.ArgumentParser) -> None: default=False, help="Build KeyDB instead of Redis", ) - parser.add_argument( - "--no_torch_with_mkl", - dest="torch_with_mkl", - action="store_false", - help="Do not build Torch with Intel MKL", - ) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index a2e8ed36f..8028b8ecf 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -7,7 +7,7 @@ from github.GitReleaseAsset import GitReleaseAsset from smartsim._core._cli.utils import pip -from smartsim._core._install.builder import WebTGZ +from smartsim._core._install.utils import retrieve from smartsim._core.config import CONFIG from smartsim._core.utils.helpers import check_platform, is_crayex_platform from smartsim.error.errors import SmartSimCLIActionCancelled @@ -159,8 +159,7 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib if working_dir.exists() and list(working_dir.rglob("*.whl")): return working_dir - archive = WebTGZ(asset.browser_download_url) - archive.extract(working_dir) + retrieve(asset.browser_download_url, working_dir) logger.debug(f"Retrieved {asset.browser_download_url} to {working_dir}") return working_dir diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 6d7c72f17..b7905b773 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -27,7 +27,6 @@ import argparse import contextlib import io -import multiprocessing as mp import os import os.path import tempfile @@ -39,7 +38,7 @@ from smartsim import Experiment from smartsim._core._cli.utils import SMART_LOGGER_FORMAT -from smartsim._core._install.builder import Device +from smartsim._core.types import Device from smartsim._core.utils.helpers import installed_redisai_backends from smartsim._core.utils.network import find_free_port from smartsim.log import get_logger @@ -207,25 +206,8 @@ def 
_make_managed_local_orc( def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: - recv_conn, send_conn = mp.Pipe(duplex=False) - # Build the model in a subproc so that keras does not hog the gpu - proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir)) - proc.start() - - # do not need the sending connection in this proc anymore - send_conn.close() - - proc.join(timeout=600) - if proc.is_alive(): - proc.terminate() - raise Exception("Failed to build a simple keras model within 2 minutes") - try: - model_path, inputs, outputs = recv_conn.recv() - except EOFError as e: - raise Exception( - "Failed to receive serialized model from subprocess. " - "Is the `tensorflow` python package installed?" - ) from e + + model_path, inputs, outputs = _build_tf_frozen_model(tmp_dir) client.set_model_from_file( "keras-fcn", @@ -240,8 +222,9 @@ def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: client.get_tensor("keras-output") -def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: - from tensorflow import keras +def _build_tf_frozen_model(tmp_dir: str) -> t.Tuple[str, t.List[str], t.List[str]]: + + from tensorflow import keras # pylint: disable=no-name-in-module from smartsim.ml.tf import freeze_model @@ -258,7 +241,7 @@ def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] ) model_path, inputs, outputs = freeze_model(fcn, tmp_dir, "keras_model.pb") - conn.send((model_path, inputs, outputs)) + return model_path, inputs, outputs def _test_torch_install(client: Client, device: Device) -> None: @@ -283,10 +266,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: net.eval() forward_input = torch.rand(1, 1, 3, 3).to(device_) - traced = torch.jit.trace(net, forward_input) # type: ignore[no-untyped-call] + traced = torch.jit.trace( # type: ignore[no-untyped-call, unused-ignore] + net, forward_input + ) buffer = 
io.BytesIO() - torch.jit.save(traced, buffer) # type: ignore[no-untyped-call] + torch.jit.save(traced, buffer) # type: ignore[no-untyped-call, unused-ignore] model = buffer.getvalue() client.set_model("torch-nn", model, backend="TORCH", device=device.value.upper()) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index a066ab16a..ac5c345fc 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -55,30 +55,6 @@ class SetupError(Exception): """ -class VersionConflictError(SetupError): - """An error for when version numbers of some library/package/program/etc - do not match and build may not be able to continue - """ - - def __init__( - self, - name: str, - current_version: "Version_", - target_version: "Version_", - msg: t.Optional[str] = None, - ) -> None: - if msg is None: - msg = ( - f"Incompatible version for {name} detected: " - f"{name} {target_version} requested but {name} {current_version} " - "installed." - ) - super().__init__(msg) - self.name = name - self.current_version = current_version - self.target_version = target_version - - # so as to not conflict with pkg_resources.packaging.version.Version # pylint: disable-next=invalid-name class Version_(str): @@ -156,74 +132,6 @@ def get_env(var: str, default: str) -> str: return os.environ.get(var, default) -class RedisAIVersion(Version_): - """A subclass of Version_ that holds the dependency sets for RedisAI - - this class serves two purposes: - - 1. It is used to populate the [ml] ``extras_require`` of the setup.py. - This is because the RedisAI version will determine which ML based - dependencies are required. - - 2. Used to set the default values for PyTorch, TF, and ONNX - given the SMARTSIM_REDISAI env var set by the user. 
- - NOTE: Torch requires additional information depending on whether - CPU or GPU support is requested - """ - - defaults = { - "1.2.7": { - "tensorflow": "2.13.1", - "onnx": "1.14.1", - "skl2onnx": "1.16.0", - "onnxmltools": "1.12.0", - "scikit-learn": "1.3.2", - "torch": "2.0.1", - "torch_cpu_suffix": "+cpu", - "torch_cuda_suffix": "+cu117", - "torchvision": "0.15.2", - }, - } - - def __init__(self, vers: str) -> None: # pylint: disable=super-init-not-called - min_rai_version = min(Version_(ver) for ver in self.defaults) - if min_rai_version > vers: - raise SetupError( - f"RedisAI version must be greater than or equal to {min_rai_version}" - ) - if vers not in self.defaults: - if vers.startswith("1.2"): - # resolve to latest version for 1.2.x - # the str representation will still be 1.2.x - self.version = "1.2.7" - else: - raise SetupError( - ( - f"Invalid RedisAI version {vers}. Options are " - f"{self.defaults.keys()}" - ) - ) - else: - self.version = vers - - def __getattr__(self, name: str) -> str: - try: - return self.defaults[self.version][name] - except KeyError: - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{name}'\n\n" - "This is likely a problem with the SmartSim build process;" - "if this problem persists please log a new issue at " - "https://github.com/CrayLabs/SmartSim/issues " - "or get in contact with us at " - "https://www.craylabs.org/docs/community.html" - ) from None - - def get_defaults(self) -> t.Dict[str, str]: - return self.defaults[self.version].copy() - - class Versioner: """Versioner is responsible for managing all the versions within SmartSim including SmartSim itself. @@ -242,9 +150,7 @@ class Versioner: ``smart build`` command to determine which dependency versions to look for and download. - Default versions for SmartSim, Redis, and RedisAI are - all set here. Setting a default version for RedisAI also dictates - default versions of the machine learning libraries. 
+ Default versions for SmartSim, Redis, and RedisAI are specified here. """ # compatible Python version @@ -256,61 +162,24 @@ class Versioner: # Redis REDIS = Version_(get_env("SMARTSIM_REDIS", "7.2.4")) - REDIS_URL = get_env("SMARTSIM_REDIS_URL", "https://github.com/redis/redis.git/") + REDIS_URL = get_env("SMARTSIM_REDIS_URL", "https://github.com/redis/redis.git") REDIS_BRANCH = get_env("SMARTSIM_REDIS_BRANCH", REDIS) # RedisAI - REDISAI = RedisAIVersion(get_env("SMARTSIM_REDISAI", "1.2.7")) + REDISAI = "1.2.7" REDISAI_URL = get_env( - "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git/" + "SMARTSIM_REDISAI_URL", "https://github.com/RedisAI/RedisAI.git" ) REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}") - # ML/DL (based on RedisAI version defaults) - # torch can be set by the user because we download that for them - TORCH = Version_(get_env("SMARTSIM_TORCH", REDISAI.torch)) - TORCHVISION = Version_(get_env("SMARTSIM_TORCHVIS", REDISAI.torchvision)) - TORCH_CPU_SUFFIX = Version_(get_env("TORCH_CPU_SUFFIX", REDISAI.torch_cpu_suffix)) - TORCH_CUDA_SUFFIX = Version_( - get_env("TORCH_CUDA_SUFFIX", REDISAI.torch_cuda_suffix) - ) - - # TensorFlow and ONNX only use the defaults, but these are not built into - # the RedisAI package and therefore the user is free to pick other versions. - TENSORFLOW = Version_(REDISAI.tensorflow) - ONNX = Version_(REDISAI.onnx) - def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: pkg_map = { "SMARTSIM": self.SMARTSIM, db_name: self.REDIS, "REDISAI": self.REDISAI, - "TORCH": self.TORCH, - "TENSORFLOW": self.TENSORFLOW, - "ONNX": self.ONNX, } return {"Packages": tuple(pkg_map), "Versions": tuple(pkg_map.values())} - def ml_extras_required(self) -> t.Dict[str, t.List[str]]: - """Optional ML/DL dependencies we suggest for the user. 
- - The defaults are based on the RedisAI version - """ - ml_defaults = self.REDISAI.get_defaults() - - # remove torch-related fields as they are subject to change - # by having the user change hardware (cpu/gpu) - _torch_fields = [ - "torch", - "torchvision", - "torch_cpu_suffix", - "torch_cuda_suffix", - ] - for field in _torch_fields: - ml_defaults.pop(field) - - return {"ml": [f"{lib}=={vers}" for lib, vers in ml_defaults.items()]} - @staticmethod def get_sha(setup_py_dir: Path) -> str: """Get the git sha of the current branch""" @@ -385,7 +254,7 @@ def __init__(self, checks: bool = True) -> None: self.check_dependencies() def check_dependencies(self) -> None: - deps = ["git", "git-lfs", "make", "wget", "cmake", self.CC, self.CXX] + deps = ["git", "make", "wget", "cmake", self.CC, self.CXX] if int(self.CHECKS) == 0: for dep in deps: self.check_build_dependency(dep) @@ -498,23 +367,6 @@ def check_build_dependency(command: str) -> None: except OSError: raise SetupError(f"{command} must be installed to build SmartSim") from None - @classmethod - def check_installed( - cls, package: str, version: t.Optional[Version_] = None - ) -> bool: - """Check if a package is installed. If version is provided, check if - it's a compatible version. 
(major and minor the same) - """ - try: - installed = cls.get_py_package_version(package) - except importlib.metadata.PackageNotFoundError: - return False - if version: - # detect if major or minor versions differ - if installed.major != version.major or installed.minor != version.minor: - raise VersionConflictError(package, installed, version) - return True - @staticmethod def get_py_package_version(package: str) -> Version_: return Version_(importlib.metadata.version(package)) diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index 8f5bdc557..17036e825 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -26,98 +26,32 @@ # pylint: disable=too-many-lines -import concurrent.futures -import enum -import fileinput -import itertools import os -import platform import re import shutil import stat import subprocess -import sys -import tarfile -import tempfile import typing as t -import urllib.request -import zipfile -from abc import ABC, abstractmethod -from dataclasses import dataclass from pathlib import Path -from shutil import which from subprocess import SubprocessError -# NOTE: This will be imported by setup.py and hence no smartsim related -# items should be imported into this file. 
+from smartsim._core._install.utils import retrieve +from smartsim._core.utils import expand_exe_path + +if t.TYPE_CHECKING: + from typing_extensions import Never # TODO: check cmake version and use system if possible to avoid conflicts -TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"] _PathLike = t.Union[str, "os.PathLike[str]"] _T = t.TypeVar("_T") _U = t.TypeVar("_U") -def expand_exe_path(exe: str) -> str: - """Takes an executable and returns the full path to that executable - - :param exe: executable or file - :raises TypeError: if file is not an executable - :raises FileNotFoundError: if executable cannot be found - """ - - # which returns none if not found - in_path = which(exe) - if not in_path: - if os.path.isfile(exe) and os.access(exe, os.X_OK): - return os.path.abspath(exe) - if os.path.isfile(exe) and not os.access(exe, os.X_OK): - raise TypeError(f"File, {exe}, is not an executable") - raise FileNotFoundError(f"Could not locate executable {exe}") - return os.path.abspath(in_path) - - class BuildError(Exception): pass -class Architecture(enum.Enum): - X64 = ("x86_64", "amd64") - ARM64 = ("arm64",) - - @classmethod - def from_str(cls, string: str, /) -> "Architecture": - string = string.lower() - for type_ in cls: - if string in type_.value: - return type_ - raise BuildError(f"Unrecognized or unsupported architecture: {string}") - - -class Device(enum.Enum): - CPU = "cpu" - GPU = "gpu" - - -class OperatingSystem(enum.Enum): - LINUX = ("linux", "linux2") - DARWIN = ("darwin",) - - @classmethod - def from_str(cls, string: str, /) -> "OperatingSystem": - string = string.lower() - for type_ in cls: - if string in type_.value: - return type_ - raise BuildError(f"Unrecognized or unsupported operating system: {string}") - - -class Platform(t.NamedTuple): - os: OperatingSystem - architecture: Architecture - - class Builder: """Base class for building third-party libraries""" @@ -135,13 +69,10 @@ def __init__( self, env: t.Dict[str, 
str], jobs: int = 1, - _os: OperatingSystem = OperatingSystem.from_str(platform.system()), - architecture: Architecture = Architecture.from_str(platform.machine()), verbose: bool = False, ) -> None: # build environment from buildenv self.env = env - self._platform = Platform(_os, architecture) # Find _core directory and set up paths _core_dir = Path(os.path.abspath(__file__)).parent.parent @@ -176,11 +107,6 @@ def out(self) -> t.Optional[int]: def is_built(self) -> bool: raise NotImplementedError - def build_from_git( - self, git_url: str, branch: str, device: Device = Device.CPU - ) -> None: - raise NotImplementedError - @staticmethod def binary_path(binary: str) -> str: binary_ = shutil.which(binary) @@ -256,15 +182,11 @@ def __init__( build_env: t.Optional[t.Dict[str, str]] = None, malloc: str = "libc", jobs: int = 1, - _os: OperatingSystem = OperatingSystem.from_str(platform.system()), - architecture: Architecture = Architecture.from_str(platform.machine()), verbose: bool = False, ) -> None: super().__init__( build_env or {}, jobs=jobs, - _os=_os, - architecture=architecture, verbose=verbose, ) self.malloc = malloc @@ -277,9 +199,7 @@ def is_built(self) -> bool: keydb_files = {"keydb-server", "keydb-cli"} return redis_files.issubset(bin_files) or keydb_files.issubset(bin_files) - def build_from_git( - self, git_url: str, branch: str, device: Device = Device.CPU - ) -> None: + def build_from_git(self, git_url: str, branch: str) -> None: """Build Redis from git :param git_url: url from which to retrieve Redis :param branch: branch to checkout @@ -301,23 +221,7 @@ def build_from_git( if not self.is_valid_url(git_url): raise BuildError(f"Malformed {database_name} URL: {git_url}") - clone_cmd = config_git_command( - self._platform, - [ - self.binary_path("git"), - "clone", - git_url, - "--branch", - branch, - "--depth", - "1", - database_name, - ], - ) - - # clone Redis - self.run_command(clone_cmd, cwd=self.build_dir) - + retrieve(git_url, self.build_dir / 
database_name, branch=branch, depth=1) # build Redis build_cmd = [ self.binary_path("make"), @@ -354,724 +258,3 @@ def build_from_git( _ = expand_exe_path(str(redis_cli)) except (TypeError, FileNotFoundError) as e: raise BuildError("Installation of redis-cli failed!") from e - - -class _RAIBuildDependency(ABC): - """An interface with a collection of magic methods so that - ``RedisAIBuilder`` can fetch and place its own dependencies - """ - - @property - @abstractmethod - def __rai_dependency_name__(self) -> str: ... - - @abstractmethod - def __place_for_rai__(self, target: _PathLike) -> Path: ... - - @staticmethod - @abstractmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: ... - - -def _place_rai_dep_at( - target: _PathLike, verbose: bool -) -> t.Callable[[_RAIBuildDependency], Path]: - def _place(dep: _RAIBuildDependency) -> Path: - if verbose: - print(f"Placing: '{dep.__rai_dependency_name__}'") - path = dep.__place_for_rai__(target) - if verbose: - print(f"Placed: '{dep.__rai_dependency_name__}' at '{path}'") - return path - - return _place - - -class RedisAIBuilder(Builder): - """Class to build RedisAI from Source - Supported build method: - - from git - See buildenv.py for buildtime configuration of RedisAI - version and url. 
- """ - - def __init__( - self, - _os: OperatingSystem = OperatingSystem.from_str(platform.system()), - architecture: Architecture = Architecture.from_str(platform.machine()), - build_env: t.Optional[t.Dict[str, str]] = None, - torch_dir: str = "", - libtf_dir: str = "", - build_torch: bool = True, - build_tf: bool = True, - build_onnx: bool = False, - jobs: int = 1, - verbose: bool = False, - torch_with_mkl: bool = True, - ) -> None: - super().__init__( - build_env or {}, - jobs=jobs, - _os=_os, - architecture=architecture, - verbose=verbose, - ) - - self.rai_install_path: t.Optional[Path] = None - - # convert to int for RAI build script - self._torch = build_torch - self._tf = build_tf - self._onnx = build_onnx - self.libtf_dir = libtf_dir - self.torch_dir = torch_dir - - # extra configuration options - self.torch_with_mkl = torch_with_mkl - - # Sanity checks - self._validate_platform() - - def _validate_platform(self) -> None: - unsupported = [] - if self._platform not in _DLPackRepository.supported_platforms(): - unsupported.append("DLPack") - if self.fetch_tf and (self._platform not in _TFArchive.supported_platforms()): - unsupported.append("Tensorflow") - if self.fetch_onnx and ( - self._platform not in _ORTArchive.supported_platforms() - ): - unsupported.append("ONNX") - if self.fetch_torch and ( - self._platform not in _PTArchive.supported_platforms() - ): - unsupported.append("PyTorch") - if unsupported: - raise BuildError( - f"The {', '.join(unsupported)} backend(s) are not supported " - f"on {self._platform.os} with {self._platform.architecture}" - ) - - @property - def rai_build_path(self) -> Path: - return Path(self.build_dir, "RedisAI") - - @property - def is_built(self) -> bool: - server = self.lib_path.joinpath("backends").is_dir() - cli = self.lib_path.joinpath("redisai.so").is_file() - return server and cli - - @property - def build_torch(self) -> bool: - return self._torch - - @property - def fetch_torch(self) -> bool: - return self.build_torch 
and not self.torch_dir - - @property - def build_tf(self) -> bool: - return self._tf - - @property - def fetch_tf(self) -> bool: - return self.build_tf and not self.libtf_dir - - @property - def build_onnx(self) -> bool: - return self._onnx - - @property - def fetch_onnx(self) -> bool: - return self.build_onnx - - def get_deps_dir_path_for(self, device: Device) -> Path: - def fail_to_format(reason: str) -> BuildError: # pragma: no cover - return BuildError(f"Failed to format RedisAI dependency path: {reason}") - - _os, architecture = self._platform - if _os == OperatingSystem.DARWIN: - os_ = "macos" - elif _os == OperatingSystem.LINUX: - os_ = "linux" - else: # pragma: no cover - raise fail_to_format(f"Unknown operating system: {_os}") - if architecture == Architecture.X64: - arch = "x64" - elif architecture == Architecture.ARM64: - arch = "arm64v8" - else: # pragma: no cover - raise fail_to_format(f"Unknown architecture: {architecture}") - return self.rai_build_path / f"deps/{os_}-{arch}-{device.value}" - - def _get_deps_to_fetch_for( - self, device: Device - ) -> t.Tuple[_RAIBuildDependency, ...]: - os_, arch = self._platform - # TODO: It would be nice if the backend version numbers were declared - # alongside the python package version numbers so that all of the - # dependency versions were declared in single location. - # Unfortunately importing into this module is non-trivial as it - # is used as script in the SmartSim `setup.py`. 
- - # DLPack is always required - fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")] - if self.fetch_torch: - pt_dep = _choose_pt_variant(os_)(arch, device, "2.0.1", self.torch_with_mkl) - fetchable_deps.append(pt_dep) - if self.fetch_tf: - fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1")) - if self.fetch_onnx: - fetchable_deps.append(_ORTArchive(os_, device, "1.16.3")) - - return tuple(fetchable_deps) - - def symlink_libtf(self, device: Device) -> None: - """Add symbolic link to available libtensorflow in RedisAI deps. - - :param device: cpu or gpu - """ - rai_deps_path = sorted( - self.rai_build_path.glob(os.path.join("deps", f"*{device.value}*")) - ) - if not rai_deps_path: - raise FileNotFoundError("Could not find RedisAI 'deps' directory") - - # There should only be one path for a given device, - # and this should hold even if in the future we use - # an external build of RedisAI - rai_libtf_path = rai_deps_path[0] / "libtensorflow" - rai_libtf_path.resolve() - if rai_libtf_path.is_dir(): - shutil.rmtree(rai_libtf_path) - - os.makedirs(rai_libtf_path) - libtf_path = Path(self.libtf_dir).resolve() - - # Copy include directory to deps/libtensorflow - include_src_path = libtf_path / "include" - if not include_src_path.exists(): - raise FileNotFoundError(f"Could not find include directory in {libtf_path}") - os.symlink(include_src_path, rai_libtf_path / "include") - - # RedisAI expects to find a lib directory, which is only - # available in some distributions. 
- rai_libtf_lib_dir = rai_libtf_path / "lib" - os.makedirs(rai_libtf_lib_dir) - src_libtf_lib_dir = libtf_path / "lib" - # If the lib directory existed in the libtensorflow distribution, - # copy its content, otherwise gather library files from - # libtensorflow base dir and copy them into destination lib dir - if src_libtf_lib_dir.is_dir(): - library_files = sorted(src_libtf_lib_dir.glob("*")) - if not library_files: - raise FileNotFoundError( - f"Could not find libtensorflow library files in {src_libtf_lib_dir}" - ) - else: - library_files = sorted(libtf_path.glob("lib*.so*")) - if not library_files: - raise FileNotFoundError( - f"Could not find libtensorflow library files in {libtf_path}" - ) - - for src_file in library_files: - dst_file = rai_libtf_lib_dir / src_file.name - if not dst_file.is_file(): - os.symlink(src_file, dst_file) - - def build_from_git( - self, git_url: str, branch: str, device: Device = Device.CPU - ) -> None: - """Build RedisAI from git - - :param git_url: url from which to retrieve RedisAI - :param branch: branch to checkout - :param device: cpu or gpu - """ - # delete previous build dir (should never be there) - if self.rai_build_path.is_dir(): - shutil.rmtree(self.rai_build_path) - - # Check RedisAI URL - if not self.is_valid_url(git_url): - raise BuildError(f"Malformed RedisAI URL: {git_url}") - - # clone RedisAI - clone_cmd = config_git_command( - self._platform, - [ - self.binary_path("env"), - "GIT_LFS_SKIP_SMUDGE=1", - "git", - "clone", - "--recursive", - git_url, - "--branch", - branch, - "--depth=1", - os.fspath(self.rai_build_path), - ], - ) - - self.run_command(clone_cmd, out=subprocess.DEVNULL, cwd=self.build_dir) - self._fetch_deps_for(device) - - if self.libtf_dir and device.value: - self.symlink_libtf(device) - - build_cmd = self._rai_build_env_prefix( - with_pt=self.build_torch, - with_tf=self.build_tf, - with_ort=self.build_onnx, - extra_env={"GPU": "1" if device == Device.GPU else "0"}, - ) - - if self.torch_dir: - 
self.env["Torch_DIR"] = str(self.torch_dir) - - build_cmd.extend( - [ - self.binary_path("make"), - "-C", - str(self.rai_build_path / "opt"), - "-j", - f"{self.jobs}", - "build", - ] - ) - self.run_command(build_cmd, cwd=self.rai_build_path) - - self._install_backends(device) - if self.user_supplied_backend("torch"): - self._move_torch_libs() - self.cleanup() - - def user_supplied_backend(self, backend: TRedisAIBackendStr) -> bool: - if backend == "torch": - return bool(self.build_torch and not self.fetch_torch) - if backend == "tensorflow": - return bool(self.build_tf and not self.fetch_tf) - if backend == "onnxruntime": - return bool(self.build_onnx and not self.fetch_onnx) - if backend == "tflite": - return False - raise BuildError(f"Unrecognized backend requested {backend}") - - def _rai_build_env_prefix( - self, - with_tf: bool, - with_pt: bool, - with_ort: bool, - extra_env: t.Optional[t.Dict[str, str]] = None, - ) -> t.List[str]: - extra_env = extra_env or {} - return [ - self.binary_path("env"), - f"WITH_PT={1 if with_pt else 0}", - f"WITH_TF={1 if with_tf else 0}", - "WITH_TFLITE=0", # never use TF Lite (for now) - f"WITH_ORT={1 if with_ort else 0}", - *(f"{key}={val}" for key, val in extra_env.items()), - ] - - def _fetch_deps_for(self, device: Device) -> None: - if not self.rai_build_path.is_dir(): - raise BuildError("RedisAI build directory not found") - - deps_dir = self.get_deps_dir_path_for(device) - deps_dir.mkdir(parents=True, exist_ok=True) - if any(deps_dir.iterdir()): - raise BuildError("RAI build dependency directory is not empty") - to_fetch = self._get_deps_to_fetch_for(device) - placed_paths = _threaded_map( - _place_rai_dep_at(deps_dir, self.verbose), to_fetch - ) - unique_placed_paths = {os.fspath(path.resolve()) for path in placed_paths} - if len(unique_placed_paths) != len(to_fetch): - raise BuildError( - f"Expected to place {len(to_fetch)} dependencies, but only " - f"found {len(unique_placed_paths)}" - ) - - def _install_backends(self, 
device: Device) -> None: - """Move backend libraries to smartsim/_core/lib/ - :param device: cpu or cpu - """ - self.rai_install_path = self.rai_build_path.joinpath( - f"install-{device.value}" - ).resolve() - rai_lib = self.rai_install_path / "redisai.so" - rai_backends = self.rai_install_path / "backends" - - if rai_backends.is_dir(): - self.copy_dir(rai_backends, self.lib_path / "backends", set_exe=True) - if rai_lib.is_file(): - self.copy_file(rai_lib, self.lib_path / "redisai.so", set_exe=True) - - def _move_torch_libs(self) -> None: - """Move pip install torch libraries - Since we use pip installed torch libraries for building - RedisAI, we need to move them into the LD_runpath of redisai.so - in the smartsim/_core/lib directory. - """ - ss_rai_torch_path = self.lib_path / "backends" / "redisai_torch" - ss_rai_torch_lib_path = ss_rai_torch_path / "lib" - - # retrieve torch shared libraries and copy to the - # smartsim/_core/lib/backends/redisai_torch/lib dir - # self.torch_dir should be /path/to/torch/share/cmake/Torch - # so we take the great grandparent here - pip_torch_path = Path(self.torch_dir).parent.parent.parent - pip_torch_lib_path = pip_torch_path / "lib" - - self.copy_dir(pip_torch_lib_path, ss_rai_torch_lib_path, set_exe=True) - - # also move the openmp files if on a mac - if sys.platform == "darwin": - dylibs = pip_torch_path / ".dylibs" - self.copy_dir(dylibs, ss_rai_torch_path / ".dylibs", set_exe=True) - - -def _threaded_map(fn: t.Callable[[_T], _U], items: t.Iterable[_T]) -> t.Sequence[_U]: - items = tuple(items) - if not items: # No items so no work to do - return () - num_workers = min(len(items), (os.cpu_count() or 4) * 5) - with concurrent.futures.ThreadPoolExecutor(num_workers) as pool: - return tuple(pool.map(fn, items)) - - -class _WebLocation(ABC): - @property - @abstractmethod - def url(self) -> str: ... 
- - -class _WebGitRepository(_WebLocation): - def clone( - self, - target: _PathLike, - depth: t.Optional[int] = None, - branch: t.Optional[str] = None, - ) -> None: - depth_ = ("--depth", str(depth)) if depth is not None else () - branch_ = ("--branch", branch) if branch is not None else () - _git("clone", "-q", *depth_, *branch_, self.url, os.fspath(target)) - - -@t.final -@dataclass(frozen=True) -class _DLPackRepository(_WebGitRepository, _RAIBuildDependency): - version: str - - @staticmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: - return ( - (OperatingSystem.LINUX, Architecture.X64), - (OperatingSystem.DARWIN, Architecture.X64), - (OperatingSystem.DARWIN, Architecture.ARM64), - ) - - @property - def url(self) -> str: - return "https://github.com/RedisAI/dlpack.git" - - @property - def __rai_dependency_name__(self) -> str: - return f"dlpack@{self.url}" - - def __place_for_rai__(self, target: _PathLike) -> Path: - target = Path(target) / "dlpack" - self.clone(target, branch=self.version, depth=1) - if not target.is_dir(): - raise BuildError("Failed to place dlpack") - return target - - -class _WebArchive(_WebLocation): - @property - def name(self) -> str: - _, name = self.url.rsplit("/", 1) - return name - - def download(self, target: _PathLike) -> Path: - target = Path(target) - if target.is_dir(): - target = target / self.name - file, _ = urllib.request.urlretrieve(self.url, target) - return Path(file).resolve() - - -class _ExtractableWebArchive(_WebArchive, ABC): - @abstractmethod - def _extract_download(self, download_path: Path, target: _PathLike) -> None: ... 
- - def extract(self, target: _PathLike) -> None: - with tempfile.TemporaryDirectory() as tmp_dir: - arch_path = self.download(tmp_dir) - self._extract_download(arch_path, target) - - -class _WebTGZ(_ExtractableWebArchive): - def _extract_download(self, download_path: Path, target: _PathLike) -> None: - with tarfile.open(download_path, "r") as tgz_file: - tgz_file.extractall(target) - - -class _WebZip(_ExtractableWebArchive): - def _extract_download(self, download_path: Path, target: _PathLike) -> None: - with zipfile.ZipFile(download_path, "r") as zip_file: - zip_file.extractall(target) - - -class WebTGZ(_WebTGZ): - def __init__(self, url: str) -> None: - self._url = url - - @property - def url(self) -> str: - return self._url - - -@dataclass(frozen=True) -class _PTArchive(_WebZip, _RAIBuildDependency): - architecture: Architecture - device: Device - version: str - with_mkl: bool - - @staticmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: - # TODO: This will need to be revisited if the inheritance tree gets deeper - return tuple( - itertools.chain.from_iterable( - var.supported_platforms() for var in _PTArchive.__subclasses__() - ) - ) - - @property - def __rai_dependency_name__(self) -> str: - return f"libtorch@{self.url}" - - @staticmethod - def _patch_out_mkl(libtorch_root: Path) -> None: - _modify_source_files( - libtorch_root / "share/cmake/Caffe2/public/mkl.cmake", - r"find_package\(MKL QUIET\)", - "# find_package(MKL QUIET)", - ) - - def extract(self, target: _PathLike) -> None: - super().extract(target) - if not self.with_mkl: - self._patch_out_mkl(Path(target)) - - def __place_for_rai__(self, target: _PathLike) -> Path: - self.extract(target) - target = Path(target) / "libtorch" - if not target.is_dir(): - raise BuildError("Failed to place RAI dependency: `libtorch`") - return target - - -@t.final -class _PTArchiveLinux(_PTArchive): - @staticmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, 
Architecture]]: - return ((OperatingSystem.LINUX, Architecture.X64),) - - @property - def url(self) -> str: - if self.device == Device.GPU: - pt_build = "cu117" - else: - pt_build = Device.CPU.value - # pylint: disable-next=line-too-long - libtorch_archive = ( - f"libtorch-cxx11-abi-shared-without-deps-{self.version}%2B{pt_build}.zip" - ) - return f"https://download.pytorch.org/libtorch/{pt_build}/{libtorch_archive}" - - -@t.final -class _PTArchiveMacOSX(_PTArchive): - @staticmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: - return ( - (OperatingSystem.DARWIN, Architecture.ARM64), - (OperatingSystem.DARWIN, Architecture.X64), - ) - - @property - def url(self) -> str: - if self.device == Device.GPU: - raise BuildError("RedisAI does not currently support GPU on Mac OSX") - if self.architecture == Architecture.X64: - pt_build = Device.CPU.value - libtorch_archive = f"libtorch-macos-{self.version}.zip" - root_url = "https://download.pytorch.org/libtorch" - return f"{root_url}/{pt_build}/{libtorch_archive}" - if self.architecture == Architecture.ARM64: - libtorch_archive = f"libtorch-macos-arm64-{self.version}.zip" - # pylint: disable-next=line-too-long - root_url = ( - "https://github.com/CrayLabs/ml_lib_builder/releases/download/v0.1/" - ) - return f"{root_url}/{libtorch_archive}" - - raise BuildError(f"Unsupported architecture for Pytorch: {self.architecture}") - - -def _choose_pt_variant( - os_: OperatingSystem, -) -> t.Union[t.Type[_PTArchiveLinux], t.Type[_PTArchiveMacOSX]]: - if os_ == OperatingSystem.DARWIN: - return _PTArchiveMacOSX - if os_ == OperatingSystem.LINUX: - return _PTArchiveLinux - - raise BuildError(f"Unsupported OS for PyTorch: {os_}") - - -@t.final -@dataclass(frozen=True) -class _TFArchive(_WebTGZ, _RAIBuildDependency): - os_: OperatingSystem - architecture: Architecture - device: Device - version: str - - @staticmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: - 
return ( - (OperatingSystem.LINUX, Architecture.X64), - (OperatingSystem.DARWIN, Architecture.X64), - ) - - @property - def url(self) -> str: - if self.architecture == Architecture.X64: - tf_arch = "x86_64" - else: - raise BuildError( - f"Unexpected Architecture for TF Archive: {self.architecture}" - ) - - if self.os_ == OperatingSystem.LINUX: - tf_os = "linux" - tf_device = self.device - elif self.os_ == OperatingSystem.DARWIN: - tf_os = "darwin" - if self.device == Device.GPU: - raise BuildError("RedisAI does not currently support GPU on Macos") - tf_device = Device.CPU - else: - raise BuildError(f"Unexpected OS for TF Archive: {self.os_}") - return ( - "https://storage.googleapis.com/tensorflow/libtensorflow/" - f"libtensorflow-{tf_device.value}-{tf_os}-{tf_arch}-{self.version}.tar.gz" - ) - - @property - def __rai_dependency_name__(self) -> str: - return f"libtensorflow@{self.url}" - - def __place_for_rai__(self, target: _PathLike) -> Path: - target = Path(target) / "libtensorflow" - target.mkdir() - self.extract(target) - return target - - -@t.final -@dataclass(frozen=True) -class _ORTArchive(_WebTGZ, _RAIBuildDependency): - os_: OperatingSystem - device: Device - version: str - - @staticmethod - def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: - return ( - (OperatingSystem.LINUX, Architecture.X64), - (OperatingSystem.DARWIN, Architecture.X64), - ) - - @property - def url(self) -> str: - ort_url_base = ( - "https://github.com/microsoft/onnxruntime/releases/" - f"download/v{self.version}" - ) - if self.os_ == OperatingSystem.LINUX: - ort_os = "linux" - ort_arch = "x64" - ort_build = "-gpu" if self.device == Device.GPU else "" - elif self.os_ == OperatingSystem.DARWIN: - ort_os = "osx" - ort_arch = "x86_64" - ort_build = "" - if self.device == Device.GPU: - raise BuildError("RedisAI does not currently support GPU on Macos") - else: - raise BuildError(f"Unexpected OS for TF Archive: {self.os_}") - ort_archive = 
f"onnxruntime-{ort_os}-{ort_arch}{ort_build}-{self.version}.tgz" - return f"{ort_url_base}/{ort_archive}" - - @property - def __rai_dependency_name__(self) -> str: - return f"onnxruntime@{self.url}" - - def __place_for_rai__(self, target: _PathLike) -> Path: - target = Path(target).resolve() / "onnxruntime" - self.extract(target) - try: - (extracted_dir,) = target.iterdir() - except ValueError: - raise BuildError( - "Unexpected number of files extracted from ORT archive" - ) from None - for file in extracted_dir.iterdir(): - file.rename(target / file.name) - extracted_dir.rmdir() - return target - - -def _git(*args: str) -> None: - git = Builder.binary_path("git") - cmd = (git,) + args - with subprocess.Popen(cmd) as proc: - proc.wait() - if proc.returncode != 0: - raise BuildError( - f"Command `{' '.join(cmd)}` failed with exit code {proc.returncode}" - ) - - -def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]: - """Modify git commands to include autocrlf when on a platform that needs - autocrlf enabled to behave correctly - """ - cmd = list(cmd) - where = next((i for i, tok in enumerate(cmd) if tok.endswith("git")), len(cmd)) + 2 - if where >= len(cmd): - raise ValueError(f"Failed to locate git command in '{' '.join(cmd)}'") - if plat == Platform(OperatingSystem.DARWIN, Architecture.ARM64): - cmd = ( - cmd[:where] - + ["--config", "core.autocrlf=false", "--config", "core.eol=lf"] - + cmd[where:] - ) - return cmd - - -def _modify_source_files( - files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str -) -> None: - compiled_regex = re.compile(regex) - with fileinput.input(files=files, inplace=True) as handles: - for line in handles: - line = compiled_regex.sub(replacement, line) - print(line, end="") diff --git a/smartsim/_core/_install/configs/mlpackages/DarwinARM64CPU.json b/smartsim/_core/_install/configs/mlpackages/DarwinARM64CPU.json new file mode 100644 index 000000000..2f49a393e --- /dev/null +++ 
b/smartsim/_core/_install/configs/mlpackages/DarwinARM64CPU.json @@ -0,0 +1,47 @@ +{ + "platform": { + "operating_system":"darwin", + "architecture":"arm64", + "device":"cpu" + }, + "ml_packages": [ + { + "name": "dlpack", + "version": "v0.5_RAI", + "pip_index": "", + "python_packages": [], + "lib_source": "https://github.com/RedisAI/dlpack.git" + }, + { + "name": "libtorch", + "version": "2.4.0", + "pip_index": "", + "python_packages": [ + "torch==2.4.0", + "torchvision==0.19.0", + "torchaudio==2.4.0" + ], + "lib_source": "https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-2.4.0.zip", + "rai_patches": [ + { + "description": "Patch RedisAI module to require C++17 standard instead of C++14", + "source_file": "src/backends/libtorch_c/CMakeLists.txt", + "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)", + "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)" + } + ] + }, + { + "name": "onnxruntime", + "version": "1.17.3", + "pip_index": "", + "python_packages": [ + "onnx==1.15", + "skl2onnx", + "scikit-learn", + "onnxmltools" + ], + "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + } + ] +} diff --git a/smartsim/_core/_install/configs/mlpackages/DarwinX64CPU.json b/smartsim/_core/_install/configs/mlpackages/DarwinX64CPU.json new file mode 100644 index 000000000..e7b67e35b --- /dev/null +++ b/smartsim/_core/_install/configs/mlpackages/DarwinX64CPU.json @@ -0,0 +1,56 @@ +{ + "platform": { + "operating_system":"darwin", + "architecture":"x86_64", + "device":"cpu" + }, + "ml_packages": [ + { + "name": "dlpack", + "version": "v0.5_RAI", + "pip_index": "", + "python_packages": [], + "lib_source": "https://github.com/RedisAI/dlpack.git" + }, + { + "name": "libtorch", + "version": "2.2.2", + "pip_index": "", + "python_packages": [ + "torch==2.2.2", + "torchvision==0.17.2", + "torchaudio==2.2.2" + ], + "lib_source": 
"https://download.pytorch.org/libtorch/cpu/libtorch-macos-x86_64-2.2.2.zip", + "rai_patches": [ + { + "description": "Patch RedisAI module to require C++17 standard instead of C++14", + "source_file": "src/backends/libtorch_c/CMakeLists.txt", + "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)", + "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)" + } + ] + }, + { + "name": "libtensorflow", + "version": "2.15", + "pip_index": "", + "python_packages": [ + "tensorflow==2.15" + ], + "lib_source": "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-2.15.0.tar.gz" + }, + { + "name": "onnxruntime", + "version": "1.17.3", + "pip_index": "", + "python_packages": [ + "onnx==1.15", + "skl2onnx", + "scikit-learn", + "onnxmltools" + ], + "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-x86_64-1.17.3.tgz" + } + ] +} diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64CPU.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64CPU.json new file mode 100644 index 000000000..cc2f81194 --- /dev/null +++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64CPU.json @@ -0,0 +1,56 @@ +{ + "platform": { + "operating_system":"linux", + "architecture":"x86_64", + "device":"cpu" + }, + "ml_packages": [ + { + "name": "dlpack", + "version": "v0.5_RAI", + "pip_index": "", + "python_packages": [], + "lib_source": "https://github.com/RedisAI/dlpack.git" + }, + { + "name": "libtorch", + "version": "2.4.0", + "pip_index": "https://download.pytorch.org/whl/cpu", + "python_packages": [ + "torch==2.4.0+cpu", + "torchvision==0.19.0+cpu", + "torchaudio==2.4.0+cpu" + ], + "lib_source": "https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcpu.zip", + "rai_patches": [ + { + "description": "Patch RedisAI module to require C++17 standard instead of C++14", + "source_file": "src/backends/libtorch_c/CMakeLists.txt", + 
"regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)", + "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)" + } + ] + }, + { + "name": "libtensorflow", + "version": "2.15", + "pip_index": "", + "python_packages": [ + "tensorflow==2.15" + ], + "lib_source": "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-2.15.0.tar.gz" + }, + { + "name": "onnxruntime", + "version": "1.17.3", + "pip_index": "", + "python_packages": [ + "onnx<=1.15", + "skl2onnx", + "scikit-learn", + "onnxmltools" + ], + "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" + } + ] +} diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA11.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA11.json new file mode 100644 index 000000000..cf302534c --- /dev/null +++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA11.json @@ -0,0 +1,56 @@ +{ + "platform": { + "operating_system":"linux", + "architecture":"x86_64", + "device":"cuda-11" + }, + "ml_packages": [ + { + "name": "dlpack", + "version": "v0.5_RAI", + "pip_index": "", + "python_packages": [], + "lib_source": "https://github.com/RedisAI/dlpack.git" + }, + { + "name": "libtorch", + "version": "2.3.1", + "pip_index": "https://download.pytorch.org/whl/cu118", + "python_packages": [ + "torch==2.3.1+cu118", + "torchvision==0.18.1+cu118", + "torchaudio==2.3.1+cu118" + ], + "lib_source": "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu118.zip", + "rai_patches": [ + { + "description": "Patch RedisAI module to require C++17 standard instead of C++14", + "source_file": "src/backends/libtorch_c/CMakeLists.txt", + "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)", + "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)" + } + ] + }, + { + "name": "libtensorflow", + "version": "2.14.1", 
+ "pip_index": "", + "python_packages": [ + "tensorflow==2.14.1" + ], + "lib_source": "https://github.com/CrayLabs/ml_lib_builder/releases/download/v0.2/libtensorflow-2.14.1-linux-x64-cuda-11.8.0.tgz" + }, + { + "name": "onnxruntime", + "version": "1.17.3", + "pip_index": "", + "python_packages": [ + "onnx==1.15", + "skl2onnx", + "scikit-learn", + "onnxmltools" + ], + "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + } + ] +} diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA12.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA12.json new file mode 100644 index 000000000..a415b3103 --- /dev/null +++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64CUDA12.json @@ -0,0 +1,64 @@ +{ + "platform": { + "operating_system":"linux", + "architecture":"x86_64", + "device":"cuda-12" + }, + "ml_packages": [ + { + "name": "dlpack", + "version": "v0.5_RAI", + "pip_index": "", + "python_packages": [], + "lib_source": "https://github.com/RedisAI/dlpack.git" + }, + { + "name": "libtorch", + "version": "2.3.1", + "pip_index": "https://download.pytorch.org/whl/cu121", + "python_packages": [ + "torch==2.3.1+cu121", + "torchvision==0.18.1+cu121", + "torchaudio==2.3.1+cu121" + ], + "lib_source": "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu121.zip", + "rai_patches": [ + { + "description": "Patch RedisAI module to require C++17 standard instead of C++14", + "source_file": "src/backends/libtorch_c/CMakeLists.txt", + "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)", + "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)" + } + ] + }, + { + "name": "libtensorflow", + "version": "2.15", + "pip_index": "", + "python_packages": [ + "tensorflow==2.15" + ], + "lib_source": "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-2.15.0.tar.gz", + "rai_patches": [ + 
{ + "description": "Patch RedisAI to point to correct tsl directory", + "source_file": "CMakeLists.txt", + "regex": "INCLUDE_DIRECTORIES\\(\\$\\{depsAbs\\}/libtensorflow/include\\)", + "replacement": "INCLUDE_DIRECTORIES(${depsAbs}/libtensorflow/include ${depsAbs}/libtensorflow/include/external/local_tsl)" + } + ] + }, + { + "name": "onnxruntime", + "version": "1.17.3", + "pip_index": "", + "python_packages": [ + "onnx==1.15", + "skl2onnx", + "scikit-learn", + "onnxmltools" + ], + "lib_source": "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-cuda12-1.17.3.tgz" + } + ] +} diff --git a/smartsim/_core/_install/configs/mlpackages/LinuxX64ROCM6.json b/smartsim/_core/_install/configs/mlpackages/LinuxX64ROCM6.json new file mode 100644 index 000000000..b4673e901 --- /dev/null +++ b/smartsim/_core/_install/configs/mlpackages/LinuxX64ROCM6.json @@ -0,0 +1,47 @@ +{ + "platform": { + "operating_system":"linux", + "architecture":"x86_64", + "device":"rocm-6" + }, + "ml_packages": [ + { + "name": "dlpack", + "version": "v0.5_RAI", + "pip_index": "", + "python_packages": [], + "lib_source": "https://github.com/RedisAI/dlpack.git" + }, + { + "name": "libtorch", + "version": "2.4.0", + "pip_index": "https://download.pytorch.org/whl/rocm6.1", + "python_packages": [ + "torch==2.4.0+rocm6.1", + "torchvision==0.19.0+rocm6.1", + "torchaudio==2.4.0+rocm6.1" + ], + "lib_source": "https://download.pytorch.org/libtorch/rocm6.1/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Brocm6.1.zip", + "rai_patches": [ + { + "description": "Patch RedisAI module to require C++17 standard instead of C++14", + "source_file": "src/backends/libtorch_c/CMakeLists.txt", + "regex": "set_property\\(TARGET\\storch_c\\sPROPERTY\\sCXX_STANDARD\\s(98|11|14)\\)", + "replacement": "set_property(TARGET torch_c PROPERTY CXX_STANDARD 17)" + }, + { + "description": "Fix Regex, Load HIP", + "source_file": "../package/libtorch/share/cmake/Caffe2/public/LoadHIP.cmake", + "regex": 
".*string.*", + "replacement": "" + }, + { + "description": "Replace `/opt/rocm` with `$ENV{ROCM_PATH}`", + "source_file": "../package/libtorch/share/cmake/Caffe2/Caffe2Targets.cmake", + "regex": "/opt/rocm", + "replacement": "$ENV{ROCM_PATH}" + } + ] + } + ] +} diff --git a/smartsim/_core/_install/mlpackages.py b/smartsim/_core/_install/mlpackages.py new file mode 100644 index 000000000..04e3798d3 --- /dev/null +++ b/smartsim/_core/_install/mlpackages.py @@ -0,0 +1,198 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import os +import pathlib +import re +import subprocess +import sys +import typing as t +from collections.abc import MutableMapping +from dataclasses import dataclass + +from tabulate import tabulate + +from .platform import Platform +from .types import PathLike +from .utils import retrieve + + +class RequireRelativePath(Exception): + pass + + +@dataclass +class RAIPatch: + """Holds information about how to patch a RedisAI source file + + :param description: Human-readable description of the patch's purpose + :param replacement: "The replacement for the line found by the regex" + :param source_file: A relative path to the chosen file + :param regex: A regex pattern to match in the given file + + """ + + description: str + replacement: str + source_file: pathlib.Path + regex: re.Pattern[str] + + def __post_init__(self) -> None: + self.source_file = pathlib.Path(self.source_file) + self.regex = re.compile(self.regex) + + +@dataclass +class MLPackage: + """Describes the python and C/C++ library for an ML package""" + + name: str + version: str + pip_index: str + python_packages: t.List[str] + lib_source: PathLike + rai_patches: t.Tuple[RAIPatch, ...] 
= () + + def retrieve(self, destination: PathLike) -> None: + """Retrieve an archive and/or repository for the package + + :param destination: Path to place the extracted package or repository + """ + retrieve(self.lib_source, pathlib.Path(destination)) + + def pip_install(self, quiet: bool = False) -> None: + """Install associated python packages + + :param quiet: If True, suppress most of the pip output, defaults to False + """ + if self.python_packages: + install_command = [sys.executable, "-m", "pip", "install"] + if self.pip_index: + install_command += ["--index-url", self.pip_index] + if quiet: + install_command += ["--quiet", "--no-warn-conflicts"] + install_command += self.python_packages + subprocess.check_call(install_command) + + +class MLPackageCollection(MutableMapping[str, MLPackage]): + """Collects multiple MLPackages + + Define a collection of MLPackages available for a specific platform + """ + + def __init__(self, platform: Platform, ml_packages: t.Sequence[MLPackage]): + self.platform = platform + self._ml_packages = {pkg.name: pkg for pkg in ml_packages} + + @classmethod + def from_json_file(cls, json_file: PathLike) -> "MLPackageCollection": + """Create an MLPackageCollection specified from a JSON file + + :param json_file: path to the JSON file + :return: An instance of MLPackageCollection for a platform + """ + with open(json_file, "r", encoding="utf-8") as file_handle: + config_json = json.load(file_handle) + platform = Platform.from_strs(**config_json["platform"]) + + for ml_package in config_json["ml_packages"]: + # Convert the dictionary representation to a RAIPatch + if "rai_patches" in ml_package: + patch_list = ml_package.pop("rai_patches") + ml_package["rai_patches"] = [RAIPatch(**patch) for patch in patch_list] + + ml_packages = [ + MLPackage(**ml_package) for ml_package in config_json["ml_packages"] + ] + return cls(platform, ml_packages) + + def __iter__(self) -> t.Iterator[str]: + """Iterate over the mlpackages in the collection + 
+ :return: Iterator over mlpackages + """ + return iter(self._ml_packages) + + def __getitem__(self, key: str) -> MLPackage: + """Retrieve an MLPackage based on its name + + :param key: Name of the python package (e.g. libtorch) + :return: MLPackage with all requirements + """ + return self._ml_packages[key] + + def __len__(self) -> int: + return len(self._ml_packages) + + def __delitem__(self, key: str) -> None: + del self._ml_packages[key] + + def __setitem__(self, key: t.Any, value: t.Any) -> t.NoReturn: + raise TypeError(f"{type(self).__name__} does not support item assignment") + + def __contains__(self, key: object) -> bool: + return key in self._ml_packages + + def __str__(self, tablefmt: str = "github") -> str: + """Display package names and versions as a table + + :param tablefmt: Tabulate format, defaults to "github" + """ + + return tabulate( + [[k, v.version] for k, v in self._ml_packages.items()], + headers=["Package", "Version"], + tablefmt=tablefmt, + ) + + +def load_platform_configs( + config_file_path: pathlib.Path, +) -> t.Dict[Platform, MLPackageCollection]: + """Create MLPackageCollections from JSON files in directory + + :param config_file_path: Directory with JSON files describing the + configuration by platform + :return: Dictionary whose keys are the supported platform and values + are its associated MLPackageCollection + """ + if not config_file_path.is_dir(): + path = os.fspath(config_file_path) + msg = f"Platform configuration directory `{path}` does not exist" + raise FileNotFoundError(msg) + configs = {} + for config_file in config_file_path.glob("*.json"): + dependencies = MLPackageCollection.from_json_file(config_file) + configs[dependencies.platform] = dependencies + return configs + + +DEFAULT_MLPACKAGE_PATH: t.Final = ( + pathlib.Path(__file__).parent / "configs" / "mlpackages" +) +DEFAULT_MLPACKAGES: t.Final = load_platform_configs(DEFAULT_MLPACKAGE_PATH) diff --git a/smartsim/_core/_install/platform.py 
b/smartsim/_core/_install/platform.py new file mode 100644 index 000000000..bef13c6a0 --- /dev/null +++ b/smartsim/_core/_install/platform.py @@ -0,0 +1,226 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import enum +import json +import os +import pathlib +import platform +import typing as t +from dataclasses import dataclass + +from typing_extensions import Self + + +class PlatformError(Exception): + pass + + +class UnsupportedError(PlatformError): + pass + + +class Architecture(enum.Enum): + """Identifiers for supported CPU architectures + + :return: An enum representing the CPU architecture + """ + + X64 = "x86_64" + ARM64 = "arm64" + + @classmethod + def from_str(cls, string: str) -> "Architecture": + """Return enum associated with the architecture + + :param string: String representing the architecture, see platform.machine + :return: Enum for a specific architecture + """ + string = string.lower() + return cls(string) + + @classmethod + def autodetect(cls) -> "Architecture": + """Automatically return the architecture of the current machine + + :return: enum of this platform's architecture + """ + return cls.from_str(platform.machine()) + + +class Device(enum.Enum): + """Identifiers for the device stack + + :return: Enum associated with the device stack + """ + + CPU = "cpu" + CUDA11 = "cuda-11" + CUDA12 = "cuda-12" + ROCM5 = "rocm-5" + ROCM6 = "rocm-6" + + @classmethod + def from_str(cls, str_: str) -> "Device": + """Return enum associated with the device + + :param string: String representing the device and version + :return: Enum for a specific device + """ + str_ = str_.lower() + if str_ == "gpu": + # TODO: auto detect which device to use + # currently hard coded to `cuda11` + return cls.CUDA11 + return cls(str_) + + @classmethod + def detect_cuda_version(cls) -> t.Optional["Device"]: + """Find the enum based on environment CUDA + + :return: Enum for the version of CUDA currently available + """ + if cuda_home := os.environ.get("CUDA_HOME"): + cuda_path = pathlib.Path(cuda_home) + with open(cuda_path / "version.json", "r", encoding="utf-8") as file_handle: + cuda_versions = json.load(file_handle) + major = cuda_versions["cuda"]["version"].split(".")[0] 
+ return cls.from_str(f"cuda-{major}") + return None + + @classmethod + def detect_rocm_version(cls) -> t.Optional["Device"]: + """Find the enum based on environment ROCm + + :return: Enum for the version of ROCm currently available + """ + if rocm_home := os.environ.get("ROCM_HOME"): + rocm_path = pathlib.Path(rocm_home) + fname = rocm_path / ".info" / "version" + with open(fname, "r", encoding="utf-8") as file_handle: + major = file_handle.readline().split("-")[0].split(".")[0] + return cls.from_str(f"rocm-{major}") + return None + + def is_gpu(self) -> bool: + """Whether the enum is categorized as a GPU + + :return: True if GPU + """ + return self != type(self).CPU + + def is_cuda(self) -> bool: + """Whether the enum is associated with a CUDA device + + :return: True for any supported CUDA enums + """ + cls = type(self) + return self in cls.cuda_enums() + + def is_rocm(self) -> bool: + """Whether the enum is associated with a ROCm device + + :return: True for any supported ROCm enums + """ + cls = type(self) + return self in cls.rocm_enums() + + @classmethod + def cuda_enums(cls) -> t.Tuple["Device", ...]: + """Detect all CUDA devices supported by SmartSim + + :return: all enums associated with CUDA + """ + return tuple(device for device in cls if "cuda" in device.value) + + @classmethod + def rocm_enums(cls) -> t.Tuple["Device", ...]: + """Detect all ROCm devices supported by SmartSim + + :return: all enums associated with ROCm + """ + return tuple(device for device in cls if "rocm" in device.value) + + +class OperatingSystem(enum.Enum): + """Enum for all supported operating systems""" + + LINUX = "linux" + DARWIN = "darwin" + + @classmethod + def from_str(cls, string: str, /) -> "OperatingSystem": + """Return enum associated with the OS + + :param string: String representing the OS + :return: Enum for a specific OS + """ + string = string.lower() + return cls(string) + + @classmethod + def autodetect(cls) -> "OperatingSystem": + """Automatically return the OS 
of the current machine + + :return: enum of this platform's OS + """ + return cls.from_str(platform.system()) + + +@dataclass(frozen=True) +class Platform: + """Container describing relevant identifiers for a platform""" + + operating_system: OperatingSystem + architecture: Architecture + device: Device + + @classmethod + def from_strs(cls, operating_system: str, architecture: str, device: str) -> Self: + """Factory method for Platform from string onput + + :param os: String identifier for the OS + :param architecture: String identifier for the architecture + :param device: String identifer for the device and version + :return: Instance of Platform + """ + return cls( + OperatingSystem.from_str(operating_system), + Architecture.from_str(architecture), + Device.from_str(device), + ) + + def __str__(self) -> str: + """Human-readable representation of Platform + + :return: String created from the values of the enums for each property + """ + output = [ + self.operating_system.name, + self.architecture.name, + self.device.name, + ] + return "-".join(output) diff --git a/smartsim/_core/_install/redisaiBuilder.py b/smartsim/_core/_install/redisaiBuilder.py new file mode 100644 index 000000000..1dce6ddb4 --- /dev/null +++ b/smartsim/_core/_install/redisaiBuilder.py @@ -0,0 +1,301 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import fileinput +import os +import pathlib +import shutil +import stat +import subprocess +import typing as t +from collections import deque + +from smartsim._core._cli.utils import SMART_LOGGER_FORMAT +from smartsim._core._install.buildenv import BuildEnv +from smartsim._core._install.mlpackages import MLPackageCollection, RAIPatch +from smartsim._core._install.platform import OperatingSystem, Platform +from smartsim._core._install.utils import retrieve +from smartsim._core.config import CONFIG +from smartsim.log import get_logger + +logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT) +_SUPPORTED_ROCM_ARCH = "gfx90a" + + +class RedisAIBuildError(Exception): + pass + + +class RedisAIBuilder: + """Class to build RedisAI from Source""" + + def __init__( + self, + platform: Platform, + mlpackages: MLPackageCollection, + build_env: BuildEnv, + main_build_path: pathlib.Path, + verbose: bool = False, + source: t.Union[str, pathlib.Path] = "https://github.com/RedisAI/RedisAI.git", + version: str = "v1.2.7", + ) -> None: + + self.platform = platform + self.mlpackages = mlpackages + self.build_env = build_env + self.verbose = verbose + self.source = source + self.version = version + 
self._root_path = main_build_path / "RedisAI" + + self.cleanup_build() + + @property + def src_path(self) -> pathlib.Path: + return pathlib.Path(self._root_path / "src") + + @property + def build_path(self) -> pathlib.Path: + return pathlib.Path(self._root_path / "build") + + @property + def package_path(self) -> pathlib.Path: + return pathlib.Path(self._root_path / "package") + + def cleanup_build(self) -> None: + """Removes all directories associated with the build""" + shutil.rmtree(self.src_path, ignore_errors=True) + shutil.rmtree(self.build_path, ignore_errors=True) + shutil.rmtree(self.package_path, ignore_errors=True) + + @property + def is_built(self) -> bool: + """Determine whether RedisAI and backends were built + + :return: True if all backends and RedisAI module are in + the expected location + """ + backend_dir = CONFIG.lib_path / "backends" + rai_exists = [ + (backend_dir / f"redisai_{backend_name}").is_dir() + for backend_name in self.mlpackages + ] + rai_exists.append((CONFIG.lib_path / "redisai.so").is_file()) + return all(rai_exists) + + @property + def build_torch(self) -> bool: + """Whether to build torch backend + + :return: True if torch backend should be built + """ + return "libtorch" in self.mlpackages + + @property + def build_tensorflow(self) -> bool: + """Whether to build tensorflow backend + + :return: True if tensorflow backend should be built + """ + return "libtensorflow" in self.mlpackages + + @property + def build_onnxruntime(self) -> bool: + """Whether to build onnx backend + + :return: True if onnx backend should be built + """ + return "onnxruntime" in self.mlpackages + + def build(self) -> None: + """Build RedisAI + + :param git_url: url from which to retrieve RedisAI + :param branch: branch to checkout + :param device: cpu or gpu + """ + + # Following is needed to make sure that the clone/checkout is not + # impeded by git LFS limits imposed by RedisAI + os.environ["GIT_LFS_SKIP_SMUDGE"] = "1" + + 
self.src_path.mkdir(parents=True) + self.build_path.mkdir(parents=True) + self.package_path.mkdir(parents=True) + + retrieve(self.source, self.src_path, depth=1, branch=self.version) + + self._prepare_packages() + + for package in self.mlpackages.values(): + self._patch_source_files(package.rai_patches) + cmake_command = self._rai_cmake_cmd() + build_command = self._rai_build_cmd + + if self.platform.device.is_rocm() and "libtorch" in self.mlpackages: + pytorch_rocm_arch = os.environ.get("PYTORCH_ROCM_ARCH") + if not pytorch_rocm_arch: + logger.info( + f"PYTORCH_ROCM_ARCH not set. Defaulting to '{_SUPPORTED_ROCM_ARCH}'" + ) + os.environ["PYTORCH_ROCM_ARCH"] = _SUPPORTED_ROCM_ARCH + elif pytorch_rocm_arch != _SUPPORTED_ROCM_ARCH: + logger.warning( + f"PYTORCH_ROCM_ARCH is not {_SUPPORTED_ROCM_ARCH} which is the " + "only officially supported architecture. This may still work " + "if you are supplying your own version of libtensorflow." + ) + + logger.info("Configuring CMake Build") + if self.verbose: + print(" ".join(cmake_command)) + self.run_command(cmake_command, self.build_path) + + logger.info("Building RedisAI") + if self.verbose: + print(" ".join(build_command)) + self.run_command(build_command, self.build_path) + + if self.platform.operating_system == OperatingSystem.LINUX: + self._set_execute(CONFIG.lib_path / "redisai.so") + + @staticmethod + def _set_execute(target: pathlib.Path) -> None: + """Set execute permissions for file + + :param target: The target file to add execute permission + """ + permissions = os.stat(target).st_mode | stat.S_IXUSR + os.chmod(target, permissions) + + @staticmethod + def _find_closest_object( + start_path: pathlib.Path, target_obj: str + ) -> t.Optional[pathlib.Path]: + queue = deque([start_path]) + while queue: + current_dir = queue.popleft() + current_target = current_dir / target_obj + if current_target.exists(): + return current_target.parent + for sub_dir in current_dir.iterdir(): + if sub_dir.is_dir(): + 
queue.append(sub_dir) + return None + + def _prepare_packages(self) -> None: + """Ensure that retrieved archives/packages are in the expected location + + RedisAI requires that the root directory of the backend is at + DEP_PATH/example_backend. Due to difficulties in retrieval methods and + naming conventions from different sources, this cannot be standardized. + Instead we try to find the parent of the "include" directory and assume + this is the root. + """ + + for package in self.mlpackages.values(): + logger.info(f"Retrieving package: {package.name} {package.version}") + target_dir = self.package_path / package.name + package.retrieve(target_dir) + # Move actual contents to root of the expected location + actual_root = self._find_closest_object(target_dir, "include") + if actual_root and actual_root != target_dir: + logger.debug( + ( + "Non-standard location found: \n", + f"{actual_root} -> {target_dir}", + ) + ) + for file in actual_root.iterdir(): + file.rename(target_dir / file.name) + + def run_command(self, cmd: t.Union[str, t.List[str]], cwd: pathlib.Path) -> None: + """Executor of commands usedi in the build + + :param cmd: The actual command to execute + :param cwd: The working directory to execute in + """ + stdout = None if self.verbose else subprocess.DEVNULL + stderr = None if self.verbose else subprocess.PIPE + proc = subprocess.run( + cmd, cwd=str(cwd), stdout=stdout, stderr=stderr, check=False + ) + if proc.returncode != 0: + if stderr: + print(proc.stderr.decode("utf-8")) + raise RedisAIBuildError( + f"RedisAI build failed during command: {' '.join(cmd)}" + ) + + def _rai_cmake_cmd(self) -> t.List[str]: + """Build the CMake configuration command + + :return: CMake command with correct options + """ + + def on_off(expression: bool) -> t.Literal["ON", "OFF"]: + return "ON" if expression else "OFF" + + cmake_args = { + "BUILD_TF": on_off(self.build_tensorflow), + "BUILD_ORT": on_off(self.build_onnxruntime), + "BUILD_TORCH": 
on_off(self.build_torch), + "BUILD_TFLITE": "OFF", + "DEPS_PATH": str(self.package_path), + "DEVICE": "gpu" if self.platform.device.is_gpu() else "cpu", + "INSTALL_PATH": str(CONFIG.lib_path), + "CMAKE_C_COMPILER": self.build_env.CC, + "CMAKE_CXX_COMPILER": self.build_env.CXX, + } + if self.platform.device.is_rocm(): + cmake_args["Torch_DIR"] = str(self.package_path / "libtorch") + cmd = ["cmake"] + cmd += (f"-D{key}={value}" for key, value in cmake_args.items()) + cmd.append(str(self.src_path)) + return cmd + + @property + def _rai_build_cmd(self) -> t.List[str]: + """Shell command to build RedisAI and modules + + With the CMake based install, very little needs to be done here. + "make install" is used to ensure that all resulting RedisAI backends + and their dependencies end up in the same location with the correct + RPATH if applicable. + + :return: Command used to compile RedisAI and backends + """ + return "make install -j VERBOSE=1".split(" ") + + def _patch_source_files(self, patches: t.Tuple[RAIPatch, ...]) -> None: + """Apply specified RedisAI patches""" + for patch in patches: + with fileinput.input( + str(self.src_path / patch.source_file), inplace=True + ) as file_handle: + for line in file_handle: + line = patch.regex.sub(patch.replacement, line) + print(line, end="") diff --git a/smartsim/_core/_install/types.py b/smartsim/_core/_install/types.py new file mode 100644 index 000000000..0266ace34 --- /dev/null +++ b/smartsim/_core/_install/types.py @@ -0,0 +1,30 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import typing as t + +PathLike = t.Union[str, pathlib.Path] diff --git a/smartsim/_core/_install/utils/__init__.py b/smartsim/_core/_install/utils/__init__.py new file mode 100644 index 000000000..4e47cf282 --- /dev/null +++ b/smartsim/_core/_install/utils/__init__.py @@ -0,0 +1,27 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .retrieve import retrieve diff --git a/smartsim/_core/_install/utils/retrieve.py b/smartsim/_core/_install/utils/retrieve.py new file mode 100644 index 000000000..fcac565d4 --- /dev/null +++ b/smartsim/_core/_install/utils/retrieve.py @@ -0,0 +1,185 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pathlib +import shutil +import tarfile +import typing as t +import zipfile +from urllib.parse import urlparse +from urllib.request import urlretrieve + +import git +from tqdm import tqdm + +from smartsim._core._install.platform import Architecture, OperatingSystem +from smartsim._core._install.types import PathLike + + +class UnsupportedArchive(Exception): + pass + + +class _TqdmUpTo(tqdm): # type: ignore[type-arg] + """Provides `update_to(n)` which uses `tqdm.update(delta_n)` + + From tqdm doumentation for progress bar when downloading + """ + + def update_to( + self, num_blocks: int = 1, bsize: int = 1, tsize: t.Optional[int] = None + ) -> t.Optional[bool]: + """Update progress in tqdm-like way + + :param b: number of blocks transferred so far, defaults to 1 + :param bsize: size of each block (in tqdm units), defaults to 1 + :param tsize: total size (in tqdm units), defaults to None + :return: Update + """ + + if tsize is not None: + self.total = tsize + return self.update(num_blocks * bsize - self.n) # also sets self.n = b * bsize + + +def _from_local_archive( + source: PathLike, + destination: pathlib.Path, + **kwargs: t.Any, +) -> None: + """Decompress a local archive + + :param source: Path to the archive on a local system + :param destination: Where to unpack the archive + """ + if tarfile.is_tarfile(source): + with tarfile.open(source) as archive: + archive.extractall(path=destination, **kwargs) + if 
zipfile.is_zipfile(source): + with zipfile.ZipFile(source) as archive: + archive.extractall(path=destination, **kwargs) + + +def _from_local_directory( + source: PathLike, + destination: pathlib.Path, + **kwargs: t.Any, +) -> None: + """Copy the contents of a directory + + :param source: source directory + :param destination: desitnation directory + """ + shutil.copytree(source, destination, **kwargs) + + +def _from_http( + source: str, + destination: pathlib.Path, + **kwargs: t.Any, +) -> None: + """Download and decompress a package + + :param source: URL to a particular package + :param destination: Where to unpack the archive + """ + with _TqdmUpTo( + unit="B", + unit_scale=True, + unit_divisor=1024, + miniters=1, + desc=source.split("/")[-1], + ) as _t: # all optional kwargs + local_file, _ = urlretrieve(source, reporthook=_t.update_to, **kwargs) + _t.total = _t.n + + _from_local_archive(local_file, destination) + os.remove(local_file) + + +def _from_git(source: str, destination: pathlib.Path, **clone_kwargs: t.Any) -> None: + """Clone a repository + + :param source: Path to the remote (URL or local) repository + :param destination: where to clone the repository + :param clone_kwargs: various options to send to the clone command + """ + is_mac = OperatingSystem.autodetect() == OperatingSystem.DARWIN + is_arm64 = Architecture.autodetect() == Architecture.ARM64 + if is_mac and is_arm64: + config_options = ["--config core.autocrlf=false", "--config core.eol=lf"] + allow_unsafe_options = True + else: + config_options = None + allow_unsafe_options = False + git.Repo.clone_from( + source, + destination, + multi_options=config_options, + allow_unsafe_options=allow_unsafe_options, + **clone_kwargs, + ) + + +def retrieve( + source: PathLike, destination: pathlib.Path, **retrieve_kwargs: t.Any +) -> None: + """Primary method for retrieval + + Automatically choose the correct method based on the extension and/or source + of the archive. 
If downloaded, this will also decompress the archive and + extract it + + :param source: URL or path to find the package + :param destination: where to place the package + :raises UnsupportedArchive: Unknown archive type + :raises FileNotFoundError: Path to archive does not exist + """ + parsed_url = urlparse(str(source)) + url_scheme = parsed_url.scheme + if parsed_url.path.endswith(".git"): + _from_git(str(source), destination, **retrieve_kwargs) + elif url_scheme == "http": + _from_http(str(source), destination, **retrieve_kwargs) + elif url_scheme == "https": + _from_http(str(source), destination, **retrieve_kwargs) + else: # This is probably a path + source_path = pathlib.Path(source) + if not source_path.exists(): + raise FileNotFoundError(f"Package path or file does not exist: {source}") + if source_path.is_dir(): + _from_local_directory(source, destination, **retrieve_kwargs) + elif source_path.is_file() and source_path.suffix in ( + ".gz", + ".zip", + ".tgz", + ): + _from_local_archive(source, destination, **retrieve_kwargs) + else: + raise UnsupportedArchive( + f"Source ({source}) is not a supported archive or directory " + ) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 9cf950b21..03c284edb 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -33,7 +33,7 @@ import psutil from ...error import SSConfigError -from ..utils.helpers import expand_exe_path +from ..utils import expand_exe_path # Configuration Values # @@ -94,13 +94,28 @@ class Config: def __init__(self) -> None: # SmartSim/smartsim/_core self.core_path = Path(os.path.abspath(__file__)).parent.parent + # TODO: Turn this into a property.
Need to modify the configuration + # of KeyDB vs Redis at build time + self.conf_dir = self.core_path / "config" + self.conf_path = self.conf_dir / "redis.conf" - dependency_path = os.environ.get("SMARTSIM_DEP_INSTALL_PATH", self.core_path) + @property + def dependency_path(self) -> Path: + return Path( + os.environ.get("SMARTSIM_DEP_INSTALL_PATH", str(self.core_path)) + ).resolve() + + @property + def lib_path(self) -> Path: + return Path(self.dependency_path, "lib") - self.lib_path = Path(dependency_path, "lib").resolve() - self.bin_path = Path(dependency_path, "bin").resolve() - self.conf_path = Path(dependency_path, "config", "redis.conf") - self.conf_dir = Path(self.core_path, "config") + @property + def bin_path(self) -> Path: + return Path(self.dependency_path, "bin") + + @property + def build_path(self) -> Path: + return Path(self.dependency_path, "build") @property def redisai(self) -> str: @@ -157,7 +172,7 @@ def database_file_parse_interval(self) -> int: @property def dragon_dotenv(self) -> Path: """Returns the path to a .env file containing dragon environment variables""" - return self.conf_dir / "dragon" / ".env" + return Path(self.conf_dir / "dragon" / ".env") @property def dragon_server_path(self) -> t.Optional[str]: diff --git a/smartsim/_core/types.py b/smartsim/_core/types.py new file mode 100644 index 000000000..d3dc029ea --- /dev/null +++ b/smartsim/_core/types.py @@ -0,0 +1,32 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import enum + + +class Device(enum.Enum): + CPU = "cpu" + GPU = "gpu" diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index 3ea928797..cddbc4ce9 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -29,6 +29,7 @@ colorize, delete_elements, execute_platform_cmd, + expand_exe_path, installed_redisai_backends, is_crayex_platform, ) diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index df2c016a1..b17be763b 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -39,12 +39,11 @@ from pathlib import Path from shutil import which -from smartsim._core._install.builder import TRedisAIBackendStr as _TRedisAIBackendStr - if t.TYPE_CHECKING: from types import FrameType +_TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime"] _TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object] @@ -230,7 +229,9 @@ def redis_install_base(backends_path: 
t.Optional[str] = None) -> Path: # pylint: disable-next=import-outside-toplevel from ..._core.config import CONFIG - base_path = Path(backends_path) if backends_path else CONFIG.lib_path / "backends" + base_path: Path = ( + Path(backends_path) if backends_path else CONFIG.lib_path / "backends" + ) return base_path @@ -255,10 +256,10 @@ def installed_redisai_backends( "tensorflow", "torch", "onnxruntime", - "tflite", } - return {backend for backend in backends if _installed(base_path, backend)} + installed = {backend for backend in backends if _installed(base_path, backend)} + return installed def get_ts_ms() -> int: diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 5cb0d061f..fa9983c50 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -27,7 +27,8 @@ import typing as t from pathlib import Path -from .._core._install.builder import Device +from smartsim._core.types import Device + from ..error import SSUnsupportedError __all__ = ["DBObject", "DBModel", "DBScript"] diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index cab138685..965b10db7 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -31,7 +31,8 @@ from tabulate import tabulate -from .._core._install.builder import Device +from smartsim._core.types import Device + from ..error import ( EntityExistsError, SmartSimError, diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index a11a594fc..3e8baad5c 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -35,7 +35,8 @@ from os import getcwd from os import path as osp -from .._core._install.builder import Device +from smartsim._core.types import Device + from .._core.utils.helpers import cat_arg_and_value from ..error import EntityExistsError, SSUnsupportedError from ..log import get_logger diff --git a/smartsim/ml/tf/__init__.py b/smartsim/ml/tf/__init__.py index 46d89d733..ee791ea98 100644 --- a/smartsim/ml/tf/__init__.py +++ 
b/smartsim/ml/tf/__init__.py @@ -31,23 +31,12 @@ logger = get_logger(__name__) vers = Versioner() -TF_VERSION = vers.TENSORFLOW try: import tensorflow as tf except ImportError: # pragma: no cover raise ModuleNotFoundError( - f"TensorFlow {TF_VERSION} is not installed. " - "Please install it to use smartsim.ml.tf" - ) from None - -try: - installed_tf = Version_(tf.__version__) - assert installed_tf >= TF_VERSION -except AssertionError: # pragma: no cover - raise SmartSimError( - f"TensorFlow >= {TF_VERSION} is required for smartsim. " - f"tf, you have {tf.__version__}" + f"TensorFlow is not installed. Please install it to use smartsim.ml.tf" ) from None diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index cf69b65e5..4e45f1847 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -29,7 +29,7 @@ import keras import tensorflow as tf -from tensorflow.python.framework.convert_to_constants import ( +from tensorflow.python.framework.convert_to_constants import ( # type: ignore[import-not-found,unused-ignore] convert_variables_to_constants_v2, ) @@ -62,7 +62,7 @@ def freeze_model( tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) ) - frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func = convert_variables_to_constants_v2(full_model) # type: ignore[no-untyped-call,unused-ignore] frozen_func.graph.as_graph_def() input_names = [x.name.split(":")[0] for x in frozen_func.inputs] @@ -97,7 +97,7 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) ) - frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func = convert_variables_to_constants_v2(full_model) # type: ignore[no-untyped-call,unused-ignore] frozen_func.graph.as_graph_def() input_names = [x.name.split(":")[0] for x in frozen_func.inputs] diff --git a/tests/backends/run_torch.py b/tests/backends/run_torch.py index 6e9ba2859..b3c0fc964 100644 --- 
a/tests/backends/run_torch.py +++ b/tests/backends/run_torch.py @@ -74,7 +74,7 @@ def calc_svd(input_tensor): return input_tensor.svd() -def run(device): +def run(device, num_devices): # connect a client to the database client = Client(cluster=False) @@ -92,9 +92,23 @@ def run(device): net = create_torch_model() # 20 samples of "image" data example_forward_input = torch.rand(20, 1, 28, 28) - client.set_model("cnn", net, "TORCH", device=device) client.put_tensor("input", example_forward_input.numpy()) - client.run_model("cnn", inputs=["input"], outputs=["output"]) + if device == "CPU": + client.set_model("cnn", net, "TORCH", device=device) + client.run_model("cnn", inputs=["input"], outputs=["output"]) + else: + client.set_model_multigpu( + "cnn", net, "TORCH", first_gpu=0, num_gpus=num_devices + ) + client.run_model_multigpu( + "cnn", + offset=1, + first_gpu=0, + num_gpus=num_devices, + inputs=["input"], + outputs=["output"], + ) + output = client.get_tensor("output") print(f"Prediction: {output}") @@ -106,5 +120,11 @@ def run(device): parser.add_argument( "--device", type=str, default="CPU", help="device type for model execution" ) + parser.add_argument( + "--num-devices", + type=int, + default=1, + help="Number of devices to set the model on", + ) args = parser.parse_args() - run(args.device) + run(args.device, args.num_devices) diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index 2fde2ff5f..3379bf2ee 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -32,6 +32,7 @@ import smartsim._core._cli.validate import smartsim._core._install.builder as build +from smartsim._core._install.platform import Device from smartsim._core.utils.helpers import installed_redisai_backends sklearn_available = True @@ -79,7 +80,7 @@ def _mock_make_managed_local_orc(*a, **kw): location=test_dir, port=db_port, # Always test on CPU, heads don't always have GPU - device=build.Device.CPU, + device=Device.CPU, 
# Test the backends the dev has installed with_tf="tensorflow" in backends, with_pt="torch" in backends, diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index c995f76ca..6aff6b0ba 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -65,9 +65,11 @@ def test_torch_model_and_script( db = prepare_db(single_db).orchestrator wlm_experiment.reconnect_orchestrator(db.checkpoint_file) test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 run_settings = wlm_experiment.create_run_settings( - "python", f"run_torch.py --device={test_device}" + "python", + ["run_torch.py", f"--device={test_device}", f"--num-devices={test_num_gpus}"], ) if wlmutils.get_test_launcher() != "local": run_settings.set_tasks(1) diff --git a/tests/install/test_build.py b/tests/install/test_build.py new file mode 100644 index 000000000..f8a5c4896 --- /dev/null +++ b/tests/install/test_build.py @@ -0,0 +1,148 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import operator + +import pytest + +from smartsim._core._cli.build import parse_requirement +from smartsim._core._install.buildenv import Version_ + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +_SUPPORTED_OPERATORS = ("==", ">=", ">", "<=", "<") + + +@pytest.mark.parametrize( + "spec, name, pin", + ( + pytest.param("foo", "foo", None, id="Just Name"), + pytest.param("foo==1", "foo", "==1", id="With Major"), + pytest.param("foo==1.2", "foo", "==1.2", id="With Minor"), + pytest.param("foo==1.2.3", "foo", "==1.2.3", id="With Patch"), + pytest.param("foo[with-extras]==1.2.3", "foo", "==1.2.3", id="With Extra"), + pytest.param( + "foo[with,many,extras]==1.2.3", "foo", "==1.2.3", id="With Many Extras" + ), + *( + pytest.param( + f"foo{symbol}1.2.3{tag}", + "foo", + f"{symbol}1.2.3{tag}", + id=f"{symbol=} | {tag=}", + ) + for symbol in _SUPPORTED_OPERATORS + for tag in ("", "+cuda", "+rocm", "+cpu") + ), + ), +) +def test_parse_requirement_name_and_version(spec, name, pin): + p_name, p_pin, _ = parse_requirement(spec) + assert p_name == name + assert p_pin == pin + + +# fmt: off +@pytest.mark.parametrize( + "spec, ver, should_pass", + ( + pytest.param("foo" , Version_("1.2.3") , True, id="No spec"), + # EQ -------------------------------------------------------------------------- + pytest.param("foo==1.2.3" , Version_("1.2.3") , True, id="EQ Spec, EQ Version"), + pytest.param("foo==1.2.3" , 
Version_("1.2.5") , False, id="EQ Spec, GT Version"), + pytest.param("foo==1.2.3" , Version_("1.2.2") , False, id="EQ Spec, LT Version"), + pytest.param("foo==1.2.3+rocm", Version_("1.2.3+rocm"), True, id="EQ Spec, Compatible Version with suffix"), + pytest.param("foo==1.2.3" , Version_("1.2.3+cuda"), False, id="EQ Spec, Compatible Version, Extra Suffix"), + pytest.param("foo==1.2.3+cuda", Version_("1.2.3") , False, id="EQ Spec, Compatible Version, Missing Suffix"), + pytest.param("foo==1.2.3+cuda", Version_("1.2.3+rocm"), False, id="EQ Spec, Compatible Version, Mismatched Suffix"), + # LT -------------------------------------------------------------------------- + pytest.param("foo<1.2.3" , Version_("1.2.3") , False, id="LT Spec, EQ Version"), + pytest.param("foo<1.2.3" , Version_("1.2.5") , False, id="LT Spec, GT Version"), + pytest.param("foo<1.2.3" , Version_("1.2.2") , True, id="LT Spec, LT Version"), + pytest.param("foo<1.2.3+rocm" , Version_("1.2.2+rocm"), True, id="LT Spec, Compatible Version with suffix"), + pytest.param("foo<1.2.3" , Version_("1.2.2+cuda"), False, id="LT Spec, Compatible Version, Extra Suffix"), + pytest.param("foo<1.2.3+cuda" , Version_("1.2.2") , False, id="LT Spec, Compatible Version, Missing Suffix"), + pytest.param("foo<1.2.3+cuda" , Version_("1.2.2+rocm"), False, id="LT Spec, Compatible Version, Mismatched Suffix"), + # LE -------------------------------------------------------------------------- + pytest.param("foo<=1.2.3" , Version_("1.2.3") , True, id="LE Spec, EQ Version"), + pytest.param("foo<=1.2.3" , Version_("1.2.5") , False, id="LE Spec, GT Version"), + pytest.param("foo<=1.2.3" , Version_("1.2.2") , True, id="LE Spec, LT Version"), + pytest.param("foo<=1.2.3+rocm", Version_("1.2.3+rocm"), True, id="LE Spec, Compatible Version with suffix"), + pytest.param("foo<=1.2.3" , Version_("1.2.3+cuda"), False, id="LE Spec, Compatible Version, Extra Suffix"), + pytest.param("foo<=1.2.3+cuda", Version_("1.2.3") , False, id="LE Spec, 
Compatible Version, Missing Suffix"), + pytest.param("foo<=1.2.3+cuda", Version_("1.2.3+rocm"), False, id="LE Spec, Compatible Version, Mismatched Suffix"), + # GT -------------------------------------------------------------------------- + pytest.param("foo>1.2.3" , Version_("1.2.3") , False, id="GT Spec, EQ Version"), + pytest.param("foo>1.2.3" , Version_("1.2.5") , True, id="GT Spec, GT Version"), + pytest.param("foo>1.2.3" , Version_("1.2.2") , False, id="GT Spec, LT Version"), + pytest.param("foo>1.2.3+rocm" , Version_("1.2.4+rocm"), True, id="GT Spec, Compatible Version with suffix"), + pytest.param("foo>1.2.3" , Version_("1.2.4+cuda"), False, id="GT Spec, Compatible Version, Extra Suffix"), + pytest.param("foo>1.2.3+cuda" , Version_("1.2.4") , False, id="GT Spec, Compatible Version, Missing Suffix"), + pytest.param("foo>1.2.3+cuda" , Version_("1.2.4+rocm"), False, id="GT Spec, Compatible Version, Mismatched Suffix"), + # GE -------------------------------------------------------------------------- + pytest.param("foo>=1.2.3" , Version_("1.2.3") , True, id="GE Spec, EQ Version"), + pytest.param("foo>=1.2.3" , Version_("1.2.5") , True, id="GE Spec, GT Version"), + pytest.param("foo>=1.2.3" , Version_("1.2.2") , False, id="GE Spec, LT Version"), + pytest.param("foo>=1.2.3+rocm", Version_("1.2.3+rocm"), True, id="GE Spec, Compatible Version with suffix"), + pytest.param("foo>=1.2.3" , Version_("1.2.3+cuda"), False, id="GE Spec, Compatible Version, Extra Suffix"), + pytest.param("foo>=1.2.3+cuda", Version_("1.2.3") , False, id="GE Spec, Compatible Version, Missing Suffix"), + pytest.param("foo>=1.2.3+cuda", Version_("1.2.3+rocm"), False, id="GE Spec, Compatible Version, Mismatched Suffix"), + ) +) +# fmt: on +def test_parse_requirement_comparison_fn(spec, ver, should_pass): + _, _, cmp = parse_requirement(spec) + assert cmp(ver) == should_pass + + +@pytest.mark.parametrize( + "spec, ctx", + ( + *( + pytest.param( + f"thing{symbol}", + pytest.raises(ValueError, 
match="Invalid requirement string:"), + id=f"No version w/ operator {symbol}", + ) + for symbol in _SUPPORTED_OPERATORS + ), + pytest.param( + "thing>=>1.2.3", + pytest.raises(ValueError, match="Invalid requirement string:"), + id="Operator too long", + ), + pytest.param( + "thing<>1.2.3", + pytest.raises(ValueError, match="Unrecognized comparison operator: <>"), + id="Nonsense operator", + ), + ), +) +def test_parse_requirement_errors_on_invalid_spec(spec, ctx): + with ctx: + parse_requirement(spec) diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py deleted file mode 100644 index feaf7e54f..000000000 --- a/tests/install/test_builder.py +++ /dev/null @@ -1,404 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -import functools -import pathlib -import textwrap -import time - -import pytest - -import smartsim._core._install.builder as build -from smartsim._core._install.buildenv import RedisAIVersion - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - -RAI_VERSIONS = RedisAIVersion("1.2.7") - -for_each_device = pytest.mark.parametrize( - "device", [build.Device.CPU, build.Device.GPU] -) - -_toggle_build_optional_backend = lambda backend: pytest.mark.parametrize( - f"build_{backend}", - [ - pytest.param(switch, id=f"with{'' if switch else 'out'}-{backend}") - for switch in (True, False) - ], -) -toggle_build_tf = _toggle_build_optional_backend("tf") -toggle_build_pt = _toggle_build_optional_backend("pt") -toggle_build_ort = _toggle_build_optional_backend("ort") - - -@pytest.mark.parametrize( - "mock_os", [pytest.param(os_, id=f"os='{os_}'") for os_ in ("Windows", "Java", "")] -) -def test_os_enum_raises_on_unsupported(mock_os): - with pytest.raises(build.BuildError, match="operating system") as err_info: - build.OperatingSystem.from_str(mock_os) - - -@pytest.mark.parametrize( - "mock_arch", - [ - pytest.param(arch_, id=f"arch='{arch_}'") - for arch_ in ("i386", "i686", "i86pc", "aarch64", "armv7l", "") - ], -) -def test_arch_enum_raises_on_unsupported(mock_arch): - with pytest.raises(build.BuildError, match="architecture"): - build.Architecture.from_str(mock_arch) - - -@pytest.fixture -def 
p_test_dir(test_dir): - yield pathlib.Path(test_dir).resolve() - - -@for_each_device -def test_rai_builder_raises_if_attempting_to_place_deps_when_build_dir_dne( - monkeypatch, p_test_dir, device -): - monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) - monkeypatch.setattr( - build.RedisAIBuilder, - "rai_build_path", - property(lambda self: p_test_dir / "path/to/dir/that/dne"), - ) - rai_builder = build.RedisAIBuilder() - with pytest.raises(build.BuildError, match=r"build directory not found"): - rai_builder._fetch_deps_for(device) - - -@for_each_device -def test_rai_builder_raises_if_attempting_to_place_deps_in_nonempty_dir( - monkeypatch, p_test_dir, device -): - (p_test_dir / "some_file.txt").touch() - monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) - monkeypatch.setattr( - build.RedisAIBuilder, "rai_build_path", property(lambda self: p_test_dir) - ) - monkeypatch.setattr( - build.RedisAIBuilder, "get_deps_dir_path_for", lambda *a, **kw: p_test_dir - ) - rai_builder = build.RedisAIBuilder() - - with pytest.raises(build.BuildError, match=r"is not empty"): - rai_builder._fetch_deps_for(device) - - -invalid_build_arm64 = [ - dict(build_tf=True, build_onnx=True), - dict(build_tf=False, build_onnx=True), - dict(build_tf=True, build_onnx=False), -] -invalid_build_ids = [ - ",".join([f"{key}={value}" for key, value in d.items()]) - for d in invalid_build_arm64 -] - - -@pytest.mark.parametrize("build_options", invalid_build_arm64, ids=invalid_build_ids) -def test_rai_builder_raises_if_unsupported_deps_on_arm64(build_options): - with pytest.raises(build.BuildError, match=r"are not supported on.*ARM64"): - build.RedisAIBuilder( - _os=build.OperatingSystem.DARWIN, - architecture=build.Architecture.ARM64, - **build_options, - ) - - -def _confirm_inst_presence(type_, should_be_present, seq): - expected_num_occurrences = 1 if should_be_present else 0 - occurrences = filter(lambda item: isinstance(item, type_), seq) 
- return expected_num_occurrences == len(tuple(occurrences)) - - -# Helper functions to check for the presence (or absence) of a -# ``_RAIBuildDependency`` dependency in a list of dependencies that need to be -# fetched by a ``RedisAIBuilder`` instance -dlpack_dep_presence = functools.partial( - _confirm_inst_presence, build._DLPackRepository, True -) -pt_dep_presence = functools.partial(_confirm_inst_presence, build._PTArchive) -tf_dep_presence = functools.partial(_confirm_inst_presence, build._TFArchive) -ort_dep_presence = functools.partial(_confirm_inst_presence, build._ORTArchive) - - -@for_each_device -@toggle_build_tf -@toggle_build_pt -@toggle_build_ort -def test_rai_builder_will_add_dep_if_backend_requested_wo_duplicates( - monkeypatch, device, build_tf, build_pt, build_ort -): - monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) - - rai_builder = build.RedisAIBuilder( - build_tf=build_tf, build_torch=build_pt, build_onnx=build_ort - ) - requested_backends = rai_builder._get_deps_to_fetch_for(build.Device(device)) - assert dlpack_dep_presence(requested_backends) - assert tf_dep_presence(build_tf, requested_backends) - assert pt_dep_presence(build_pt, requested_backends) - assert ort_dep_presence(build_ort, requested_backends) - - -@for_each_device -@toggle_build_tf -@toggle_build_pt -def test_rai_builder_will_not_add_dep_if_custom_dep_path_provided( - monkeypatch, device, p_test_dir, build_tf, build_pt -): - monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) - mock_ml_lib = p_test_dir / "some/ml/lib" - mock_ml_lib.mkdir(parents=True) - rai_builder = build.RedisAIBuilder( - build_tf=build_tf, - build_torch=build_pt, - build_onnx=False, - libtf_dir=str(mock_ml_lib if build_tf else ""), - torch_dir=str(mock_ml_lib if build_pt else ""), - ) - requested_backends = rai_builder._get_deps_to_fetch_for(device) - assert dlpack_dep_presence(requested_backends) - assert tf_dep_presence(False, 
requested_backends) - assert pt_dep_presence(False, requested_backends) - assert ort_dep_presence(False, requested_backends) - assert len(requested_backends) == 1 - - -def test_rai_builder_raises_if_it_fetches_an_unexpected_number_of_ml_deps( - monkeypatch, p_test_dir -): - monkeypatch.setattr(build.RedisAIBuilder, "_validate_platform", lambda a: None) - monkeypatch.setattr( - build.RedisAIBuilder, "rai_build_path", property(lambda self: p_test_dir) - ) - monkeypatch.setattr( - build, - "_place_rai_dep_at", - lambda target, verbose: lambda dep: target - / "whoops_all_ml_deps_extract_to_a_dir_with_this_name", - ) - rai_builder = build.RedisAIBuilder(build_tf=True, build_torch=True, build_onnx=True) - with pytest.raises( - build.BuildError, - match=r"Expected to place \d+ dependencies, but only found \d+", - ): - rai_builder._fetch_deps_for(build.Device.CPU) - - -def test_threaded_map(): - def _some_io_op(x): - return x * x - - assert (0, 1, 4, 9, 16) == tuple(build._threaded_map(_some_io_op, range(5))) - - -def test_threaded_map_returns_early_if_nothing_to_map(): - sleep_duration = 60 - - def _some_long_io_op(_): - time.sleep(sleep_duration) - - start = time.time() - build._threaded_map(_some_long_io_op, []) - end = time.time() - assert end - start < sleep_duration - - -def test_correct_pt_variant_os(): - # Check that all Linux variants return Linux - for linux_variant in build.OperatingSystem.LINUX.value: - os_ = build.OperatingSystem.from_str(linux_variant) - assert build._choose_pt_variant(os_) == build._PTArchiveLinux - - # Check that ARM64 and X86_64 Mac OSX return the Mac variant - all_archs = (build.Architecture.ARM64, build.Architecture.X64) - for arch in all_archs: - os_ = build.OperatingSystem.DARWIN - assert build._choose_pt_variant(os_) == build._PTArchiveMacOSX - - -def test_PTArchiveMacOSX_url(): - arch = build.Architecture.X64 - pt_version = RAI_VERSIONS.torch - - pt_linux_cpu = build._PTArchiveLinux( - build.Architecture.X64, build.Device.CPU, 
pt_version, False - ) - x64_prefix = "https://download.pytorch.org/libtorch/" - assert x64_prefix in pt_linux_cpu.url - - pt_macosx_cpu = build._PTArchiveMacOSX( - build.Architecture.ARM64, build.Device.CPU, pt_version, False - ) - arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/" - assert arm64_prefix in pt_macosx_cpu.url - - -def test_PTArchiveMacOSX_gpu_error(): - with pytest.raises(build.BuildError, match="support GPU on Mac OSX"): - build._PTArchiveMacOSX( - build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch, False - ).url - - -def test_valid_platforms(): - assert build.RedisAIBuilder( - _os=build.OperatingSystem.LINUX, - architecture=build.Architecture.X64, - build_tf=True, - build_torch=True, - build_onnx=True, - ) - assert build.RedisAIBuilder( - _os=build.OperatingSystem.DARWIN, - architecture=build.Architecture.X64, - build_tf=True, - build_torch=True, - build_onnx=False, - ) - assert build.RedisAIBuilder( - _os=build.OperatingSystem.DARWIN, - architecture=build.Architecture.X64, - build_tf=False, - build_torch=True, - build_onnx=False, - ) - - -@pytest.mark.parametrize( - "plat,cmd,expected_cmd", - [ - # Bare Word - pytest.param( - build.Platform(build.OperatingSystem.LINUX, build.Architecture.X64), - ["git", "clone", "my-repo"], - ["git", "clone", "my-repo"], - id="git-Linux-X64", - ), - pytest.param( - build.Platform(build.OperatingSystem.LINUX, build.Architecture.ARM64), - ["git", "clone", "my-repo"], - ["git", "clone", "my-repo"], - id="git-Linux-Arm64", - ), - pytest.param( - build.Platform(build.OperatingSystem.DARWIN, build.Architecture.X64), - ["git", "clone", "my-repo"], - ["git", "clone", "my-repo"], - id="git-Darwin-X64", - ), - pytest.param( - build.Platform(build.OperatingSystem.DARWIN, build.Architecture.ARM64), - ["git", "clone", "my-repo"], - [ - "git", - "clone", - "--config", - "core.autocrlf=false", - "--config", - "core.eol=lf", - "my-repo", - ], - id="git-Darwin-Arm64", - ), - # Abs path - 
pytest.param( - build.Platform(build.OperatingSystem.LINUX, build.Architecture.X64), - ["/path/to/git", "clone", "my-repo"], - ["/path/to/git", "clone", "my-repo"], - id="Abs-Linux-X64", - ), - pytest.param( - build.Platform(build.OperatingSystem.LINUX, build.Architecture.ARM64), - ["/path/to/git", "clone", "my-repo"], - ["/path/to/git", "clone", "my-repo"], - id="Abs-Linux-Arm64", - ), - pytest.param( - build.Platform(build.OperatingSystem.DARWIN, build.Architecture.X64), - ["/path/to/git", "clone", "my-repo"], - ["/path/to/git", "clone", "my-repo"], - id="Abs-Darwin-X64", - ), - pytest.param( - build.Platform(build.OperatingSystem.DARWIN, build.Architecture.ARM64), - ["/path/to/git", "clone", "my-repo"], - [ - "/path/to/git", - "clone", - "--config", - "core.autocrlf=false", - "--config", - "core.eol=lf", - "my-repo", - ], - id="Abs-Darwin-Arm64", - ), - ], -) -def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd): - assert build.config_git_command(plat, cmd) == expected_cmd - - -def test_modify_source_files(p_test_dir): - def make_text_blurb(food): - return textwrap.dedent(f"""\ - My favorite food is {food} - {food} is an important part of a healthy breakfast - {food} {food} {food} {food} - This line should be unchanged! 
- --> {food} <-- - """) - - original_word = "SPAM" - mutated_word = "EGGS" - - source_files = [] - for i in range(3): - source_file = p_test_dir / f"test_{i}" - source_file.touch() - source_file.write_text(make_text_blurb(original_word)) - source_files.append(source_file) - # Modify a single file - build._modify_source_files(source_files[0], original_word, mutated_word) - assert source_files[0].read_text() == make_text_blurb(mutated_word) - assert source_files[1].read_text() == make_text_blurb(original_word) - assert source_files[2].read_text() == make_text_blurb(original_word) - - # Modify multiple files - build._modify_source_files( - (source_files[1], source_files[2]), original_word, mutated_word - ) - assert source_files[1].read_text() == make_text_blurb(mutated_word) - assert source_files[2].read_text() == make_text_blurb(mutated_word) diff --git a/tests/install/test_mlpackage.py b/tests/install/test_mlpackage.py new file mode 100644 index 000000000..d27e69b2b --- /dev/null +++ b/tests/install/test_mlpackage.py @@ -0,0 +1,122 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pathlib +from unittest.mock import MagicMock + +import pytest + +from smartsim._core._install.mlpackages import ( + MLPackage, + MLPackageCollection, + RAIPatch, + load_platform_configs, +) +from smartsim._core._install.platform import Platform + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +mock_platform = MagicMock(spec=Platform) + + +@pytest.fixture +def mock_ml_packages(): + foo = MagicMock(spec=MLPackage) + foo.name = "foo" + bar = MagicMock(spec=MLPackage) + bar.name = "bar" + yield [foo, bar] + + +@pytest.mark.parametrize( + "patch", + [MagicMock(spec=RAIPatch), [MagicMock(spec=RAIPatch) for i in range(3)], ()], + ids=["one patch", "multiple patches", "no patch"], +) +def test_mlpackage_constructor(patch): + MLPackage( + "foo", + "0.0.0", + "https://nothing.com", + ["bar==0.1", "baz==0.2"], + pathlib.Path("/nothing/fake"), + patch, + ) + + +def test_mlpackage_collection_constructor(mock_ml_packages): + MLPackageCollection(mock_platform, mock_ml_packages) + + +def test_mlpackage_collection_mutable_mapping_methods(mock_ml_packages): + ml_packages = MLPackageCollection(mock_platform, mock_ml_packages) + for val in ml_packages._ml_packages.values(): + val.version = "0.0.0" + assert ml_packages._ml_packages == ml_packages + + # Test iter + package_names = [pkg.name for pkg in mock_ml_packages] + assert [name for name in ml_packages] == package_names + + # Test get item + for 
pkg in mock_ml_packages: + assert ml_packages[pkg.name] is pkg + + # Test len + assert len(ml_packages) == len(mock_ml_packages) + + # Test delitem + key = next(iter(mock_ml_packages)).name + del ml_packages[key] + with pytest.raises(KeyError): + ml_packages[key] + assert len(ml_packages) == (len(mock_ml_packages) - 1) + + # Test setitem + with pytest.raises(TypeError): + ml_packages["baz"] = MagicMock(spec=MLPackage) + + # Test contains + name, package = next(iter(ml_packages.items())) + assert name in ml_packages + + # Test str + assert "Package" in str(ml_packages) + assert "Version" in str(ml_packages) + assert package.version in str(ml_packages) + assert name in str(ml_packages) + + +def test_load_configs_raises_when_dir_dne(test_dir): + dne_dir = pathlib.Path(test_dir, "dne") + dir_str = os.fspath(dne_dir) + with pytest.raises( + FileNotFoundError, + match=f"Platform configuration directory `{dir_str}` does not exist", + ): + load_platform_configs(dne_dir) diff --git a/tests/install/test_package_retriever.py b/tests/install/test_package_retriever.py new file mode 100644 index 000000000..d415ae235 --- /dev/null +++ b/tests/install/test_package_retriever.py @@ -0,0 +1,106 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import contextlib +import filecmp +import os +import pathlib +import random +import string +import tarfile +import zipfile + +import pytest + +from smartsim._core._install.utils import retrieve + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +@contextlib.contextmanager +def temp_cd(path): + original = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(original) + + +def make_test_file(test_file): + data = "".join(random.choices(string.ascii_letters + string.digits, k=1024)) + with open(test_file, "w") as f: + f.write(data) + + +def test_local_archive_zip(test_dir): + with temp_cd(test_dir): + test_file = "./test.data" + make_test_file(test_file) + + zip_file = "./test.zip" + with zipfile.ZipFile(zip_file, "w") as f: + f.write(test_file) + + retrieve(zip_file, pathlib.Path("./output")) + + assert filecmp.cmp( + test_file, pathlib.Path("./output") / "test.data", shallow=False + ) + + +def test_local_archive_tgz(test_dir): + with temp_cd(test_dir): + test_file = "./test.data" + make_test_file(test_file) + + tgz_file = "./test.tgz" + with tarfile.open(tgz_file, "w:gz") as f: + f.add(test_file) + + retrieve(tgz_file, 
pathlib.Path("./output")) + + assert filecmp.cmp( + test_file, pathlib.Path("./output") / "test.data", shallow=False + ) + + +def test_git(test_dir): + retrieve( + "https://github.com/CrayLabs/SmartSim.git", + f"{test_dir}/smartsim_git", + branch="master", + ) + assert pathlib.Path(f"{test_dir}/smartsim_git").is_dir() + + +def test_https(test_dir): + output_dir = pathlib.Path(test_dir) / "output" + retrieve( + "https://github.com/CrayLabs/SmartSim/archive/refs/tags/v0.5.0.zip", output_dir + ) + assert output_dir.exists() diff --git a/tests/install/test_platform.py b/tests/install/test_platform.py new file mode 100644 index 000000000..76ff3f76b --- /dev/null +++ b/tests/install/test_platform.py @@ -0,0 +1,89 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import os +import platform + +import pytest + +from smartsim._core._install.platform import Architecture, Device, OperatingSystem + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_device_cpu(): + cpu_enum = Device.CPU + assert not cpu_enum.is_gpu() + assert not cpu_enum.is_cuda() + assert not cpu_enum.is_rocm() + + +@pytest.mark.parametrize("cuda_device", Device.cuda_enums()) +def test_cuda(monkeypatch, test_dir, cuda_device): + version = cuda_device.value.split("-")[1] + fake_full_version = version + ".8888" ".9999" + monkeypatch.setenv("CUDA_HOME", test_dir) + + mock_version = dict(cuda=dict(version=fake_full_version)) + print(mock_version) + with open(f"{test_dir}/version.json", "w") as outfile: + json.dump(mock_version, outfile) + + assert Device.detect_cuda_version() == cuda_device + assert cuda_device.is_gpu() + assert cuda_device.is_cuda() + assert not cuda_device.is_rocm() + + +@pytest.mark.parametrize("rocm_device", Device.rocm_enums()) +def test_rocm(monkeypatch, test_dir, rocm_device): + version = rocm_device.value.split("-")[1] + fake_full_version = version + ".8888" + "-9999" + monkeypatch.setenv("ROCM_HOME", test_dir) + info_dir = f"{test_dir}/.info" + os.mkdir(info_dir) + + with open(f"{info_dir}/version", "w") as outfile: + outfile.write(fake_full_version) + + assert Device.detect_rocm_version() == rocm_device + assert rocm_device.is_gpu() + assert not 
rocm_device.is_cuda() + assert rocm_device.is_rocm() + + +@pytest.mark.parametrize("os", ("linux", "darwin")) +def test_operating_system(monkeypatch, os): + monkeypatch.setattr(platform, "system", lambda: os) + assert OperatingSystem.autodetect().value == os + + +@pytest.mark.parametrize("arch", ("x86_64", "arm64")) +def test_architecture(monkeypatch, arch): + monkeypatch.setattr(platform, "machine", lambda: arch) + assert Architecture.autodetect().value == arch diff --git a/tests/install/test_redisai_builder.py b/tests/install/test_redisai_builder.py new file mode 100644 index 000000000..81673a7f1 --- /dev/null +++ b/tests/install/test_redisai_builder.py @@ -0,0 +1,60 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path + +import pytest + +from smartsim._core._install.buildenv import BuildEnv +from smartsim._core._install.mlpackages import ( + DEFAULT_MLPACKAGE_PATH, + MLPackage, + load_platform_configs, +) +from smartsim._core._install.platform import Platform +from smartsim._core._install.redisaiBuilder import RedisAIBuilder + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +DEFAULT_MLPACKAGES = load_platform_configs(DEFAULT_MLPACKAGE_PATH) + + +@pytest.mark.parametrize( + "platform", + [platform for platform in DEFAULT_MLPACKAGES], + ids=[str(platform) for platform in DEFAULT_MLPACKAGES], +) +def test_backends_to_be_installed(monkeypatch, test_dir, platform): + mlpackages = DEFAULT_MLPACKAGES[platform] + monkeypatch.setattr(MLPackage, "retrieve", lambda *args, **kwargs: None) + builder = RedisAIBuilder(platform, mlpackages, BuildEnv(), Path(test_dir)) + + BACKENDS = ["libtorch", "libtensorflow", "onnxruntime"] + TOGGLES = ["build_torch", "build_tensorflow", "build_onnxruntime"] + + for backend, toggle in zip(BACKENDS, TOGGLES): + assert getattr(builder, toggle) == (backend in mlpackages) diff --git a/tests/test_cli.py b/tests/test_cli.py index 710a9a659..1cead7625 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -436,24 +436,23 @@ def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = No # fmt: off @pytest.mark.parametrize( - 
"command,mock_location,exp_output,optional_arg,exp_valid,exp_err_msg,check_prop,exp_prop_val", + "command, mock_location, exp_output, optional_arg, exp_valid, exp_err_msg, check_prop, exp_prop_val", [ - pytest.param("build", "build_execute", "verbose mocked-build", "-v", True, "", "v", True, id="verbose 'on'"), - pytest.param("build", "build_execute", "cpu mocked-build", "--device=cpu", True, "", "device", "cpu", id="device 'cpu'"), - pytest.param("build", "build_execute", "gpu mocked-build", "--device=gpu", True, "", "device", "gpu", id="device 'gpu'"), - pytest.param("build", "build_execute", "gpuX mocked-build", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="set bad device 'gpuX'"), - pytest.param("build", "build_execute", "no tensorflow mocked-build", "--no_tf", True, "", "no_tf", True, id="set no TF"), - pytest.param("build", "build_execute", "no torch mocked-build", "--no_pt", True, "", "no_pt", True, id="set no torch"), - pytest.param("build", "build_execute", "onnx mocked-build", "--onnx", True, "", "onnx", True, id="set w/onnx"), - pytest.param("build", "build_execute", "torch-dir mocked-build", "--torch_dir /foo/bar", True, "", "torch_dir", "/foo/bar", id="set torch dir"), - pytest.param("build", "build_execute", "bad-torch-dir mocked-build", "--torch_dir", False, "error: argument --torch_dir", "", "", id="set torch dir, no path"), - pytest.param("build", "build_execute", "keydb mocked-build", "--keydb", True, "", "keydb", True, id="keydb on"), - pytest.param("clean", "clean_execute", "clobbering mocked-clean", "--clobber", True, "", "clobber", True, id="clean w/clobber"), - pytest.param("validate", "validate_execute", "port mocked-validate", "--port=12345", True, "", "port", 12345, id="validate w/ manual port"), - pytest.param("validate", "validate_execute", "abbrv port mocked-validate", "-p 12345", True, "", "port", 12345, id="validate w/ manual abbreviated port"), - pytest.param("validate", "validate_execute", "cpu mocked-validate", 
"--device=cpu", True, "", "device", "cpu", id="validate: device 'cpu'"), - pytest.param("validate", "validate_execute", "gpu mocked-validate", "--device=gpu", True, "", "device", "gpu", id="validate: device 'gpu'"), - pytest.param("validate", "validate_execute", "gpuX mocked-validate", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="validate: set bad device 'gpuX'"), + pytest.param( "build", "build_execute", "verbose mocked-build", "-v", True, "", "v", True, id="verbose 'on'"), + pytest.param( "build", "build_execute", "cpu mocked-build", "--device=cpu", True, "", "device", "cpu", id="device 'cpu'"), + pytest.param( "build", "build_execute", "gpuX mocked-build", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="set bad device 'gpuX'"), + pytest.param( "build", "build_execute", "no tensorflow mocked-build", "--skip-tensorflow", True, "", "no_tf", True, id="Skip TF"), + pytest.param( "build", "build_execute", "no torch mocked-build", "--skip-torch", True, "", "no_pt", True, id="Skip Torch"), + pytest.param( "build", "build_execute", "onnx mocked-build", "--skip-onnx", True, "", "onnx", True, id="Skip Onnx"), + pytest.param( "build", "build_execute", "config-dir mocked-build", "--config-dir /foo/bar", True, "", "config-dir", "/foo/bar", id="set torch dir"), + pytest.param( "build", "build_execute", "bad-config-dir mocked-build", "--config-dir", False, "error: argument --config-dir", "", "", id="set config dir w/o path"), + pytest.param( "build", "build_execute", "keydb mocked-build", "--keydb", True, "", "keydb", True, id="keydb on"), + pytest.param( "clean", "clean_execute", "clobbering mocked-clean", "--clobber", True, "", "clobber", True, id="clean w/clobber"), + pytest.param("validate", "validate_execute", "port mocked-validate", "--port=12345", True, "", "port", 12345, id="validate w/ manual port"), + pytest.param("validate", "validate_execute", "abbrv port mocked-validate", "-p 12345", True, "", "port", 12345, id="validate w/ manual 
abbreviated port"), + pytest.param("validate", "validate_execute", "cpu mocked-validate", "--device=cpu", True, "", "device", "cpu", id="validate: device 'cpu'"), + pytest.param("validate", "validate_execute", "gpu mocked-validate", "--device=gpu", True, "", "device", "gpu", id="validate: device 'gpu'"), + pytest.param("validate", "validate_execute", "gpuX mocked-validate", "--device=gpux", False, "invalid choice: 'gpux'", "", "", id="validate: set bad device 'gpuX'"), ] ) # fmt: on @@ -735,15 +734,6 @@ def mock_operation(*args, **kwargs) -> int: monkeypatch.setattr(smartsim._core._cli.build, "tabulate", mock_operation) monkeypatch.setattr(smartsim._core._cli.build, "build_database", mock_operation) monkeypatch.setattr(smartsim._core._cli.build, "build_redis_ai", mock_operation) - monkeypatch.setattr( - smartsim._core._cli.build, "check_py_torch_version", mock_operation - ) - monkeypatch.setattr( - smartsim._core._cli.build, "check_py_tf_version", mock_operation - ) - monkeypatch.setattr( - smartsim._core._cli.build, "check_py_onnx_version", mock_operation - ) command = "build" cfg = MenuItemConfig( diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 4fe8bf71b..4bd07e920 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -593,11 +593,14 @@ def test_run_step_fail(test_dir: str) -> None: step0 = DragonStep("step0", test_dir, rs) step0.meta["status_dir"] = status_dir - mock_connector = MagicMock() # DragonConnector() + mock_connector = MagicMock(spec=DragonConnector) mock_connector.is_connected = True mock_connector.send_request = MagicMock( return_value=DragonRunResponse(step_id=step0.name, error_message="mock fail!") ) + mock_connector.merge_persisted_env = MagicMock( + return_value={"FOO": "bar", "BAZ": "boop"} + ) launcher = DragonLauncher() launcher._connector = mock_connector @@ -676,7 +679,7 @@ def test_run_step_success(test_dir: str) -> None: step0 = DragonStep("step0", test_dir, rs) 
step0.meta["status_dir"] = status_dir - mock_connector = MagicMock() # DragonConnector() + mock_connector = MagicMock(spec=DragonConnector) mock_connector.is_connected = True mock_connector.send_request = MagicMock( return_value=DragonRunResponse(step_id=step0.name) @@ -684,6 +687,9 @@ def test_run_step_success(test_dir: str) -> None: launcher = DragonLauncher() launcher._connector = mock_connector + mock_connector.merge_persisted_env = MagicMock( + return_value={"FOO": "bar", "BAZ": "boop"} + ) result = launcher.run(step0) diff --git a/tests/test_dragon_run_request_nowlm.py b/tests/test_dragon_run_request_nowlm.py index afd25aa9d..3dd7099c8 100644 --- a/tests/test_dragon_run_request_nowlm.py +++ b/tests/test_dragon_run_request_nowlm.py @@ -101,5 +101,5 @@ def test_run_request_with_negative_affinity( ), ) - assert f"{device}_affinity" in str(ex.value.args[0]) - assert "NumberNotGeError" in str(ex.value.args[0]) + assert f"{device}_affinity" in str(ex.value) + assert "greater than or equal to 0" in str(ex.value)