diff --git a/.gitignore b/.gitignore index 7ae724e708f..db403810bd5 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,6 @@ compile_commands.json .dir-locals.el .devcontainer/devcontainer.env.user .devcontainer/docker-compose.override.yml + +# Enroot sqsh files +enroot/tensorrt_llm.devel.sqsh diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 3c3b12712cd..7410c44b896 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -15,70 +15,61 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so # The default values come from `nvcr.io/nvidia/pytorch` ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc} ENV ENV=${ENV:-/etc/shinit_v2} + ARG GITHUB_MIRROR="" RUN echo "Using GitHub mirror: $GITHUB_MIRROR" -SHELL ["/bin/bash", "-c"] -# Clean up the pip constraint file from the base NGC PyTorch image. -RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true +ARG PYTHON_VERSION="3.12.3" +RUN echo "Using Python version: $PYTHON_VERSION" + +SHELL ["/bin/bash", "-c"] FROM base AS devel -ARG PYTHON_VERSION="3.12.3" -RUN echo "Using Python version: $PYTHON_VERSION" +# +# NB: PyTorch requires this to be < 1.0 +ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999" + +COPY docker/common/install.sh install.sh + COPY docker/common/install_base.sh install_base.sh -RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh +RUN GITHUB_MIRROR=${GITHUB_MIRROR} \ + PYTHON_VERSION=${PYTHON_VERSION} \ + bash ./install.sh --base && rm install_base.sh COPY docker/common/install_cmake.sh install_cmake.sh -RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh +RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh COPY docker/common/install_ccache.sh install_ccache.sh -RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh +RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache && rm 
install_ccache.sh -# Only take effect when the base image is Rocky Linux 8 with old CUDA version. COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh -RUN bash ./install_cuda_toolkit.sh && rm install_cuda_toolkit.sh +RUN bash ./install.sh --cuda_toolkit && rm install_cuda_toolkit.sh -# Download & install latest TRT release ARG TRT_VER ARG CUDA_VER ARG CUDNN_VER ARG NCCL_VER ARG CUBLAS_VER COPY docker/common/install_tensorrt.sh install_tensorrt.sh -RUN bash ./install_tensorrt.sh \ - --TRT_VER=${TRT_VER} \ - --CUDA_VER=${CUDA_VER} \ - --CUDNN_VER=${CUDNN_VER} \ - --NCCL_VER=${NCCL_VER} \ - --CUBLAS_VER=${CUBLAS_VER} && \ - rm install_tensorrt.sh - -# Install latest Polygraphy +RUN TRT_VER=${TRT_VER} \ + CUDA_VER=${CUDA_VER} \ + CUDNN_VER=${CUDNN_VER} \ + NCCL_VER=${NCCL_VER} \ + CUBLAS_VER=${CUBLAS_VER} \ + bash ./install.sh --tensorrt && rm install_tensorrt.sh + COPY docker/common/install_polygraphy.sh install_polygraphy.sh -RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh +RUN bash ./install.sh --polygraphy && rm install_polygraphy.sh -# Install mpi4py COPY docker/common/install_mpi4py.sh install_mpi4py.sh -RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh +RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4py.sh -# Install PyTorch ARG TORCH_INSTALL_TYPE="skip" COPY docker/common/install_pytorch.sh install_pytorch.sh -RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh -# -# NB: PyTorch requires this to be < 1.0 -ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999" - -# Install OpenCV with FFMPEG support -RUN pip3 uninstall -y opencv && \ - rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \ - pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir +RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh -# WARs against security issues inherited from pytorch:25.06 -# * 
https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 -RUN pip3 install --upgrade --no-cache-dir \ - "protobuf>=4.25.8" +RUN bash ./install.sh --opencv && bash ./install.sh --protobuf && rm install.sh FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton diff --git a/docker/common/install.sh b/docker/common/install.sh new file mode 100755 index 00000000000..2637c303026 --- /dev/null +++ b/docker/common/install.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Single entry point for the install_*.sh component installers located next to this script (OpenCV and protobuf are handled inline with pip3). +# Usage: install.sh [--base] [--cmake] [--ccache] [--cuda_toolkit] [--tensorrt] [--polygraphy] [--mpi4py] [--pytorch] [--opencv] [--protobuf] [--all] +# Selected steps always run in the fixed order listed above, regardless of the order of the flags; an unknown flag aborts with exit code 1. +# Inputs are read from the environment: GITHUB_MIRROR, PYTHON_VERSION, TORCH_INSTALL_TYPE, TRT_VER, CUDA_VER, CUDNN_VER, NCCL_VER, CUBLAS_VER. +# PYTHON_VERSION and TORCH_INSTALL_TYPE are forwarded unquoted on purpose so that an empty value passes no positional argument. +set -Eeo pipefail +shopt -s nullglob +trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR + +# Resolve script directory for robust relative pathing +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" + +# Default values +base=0 +cmake=0 +ccache=0 +cuda_toolkit=0 +tensorrt=0 +polygraphy=0 +mpi4py=0 +pytorch=0 +opencv=0 +protobuf=0 + +while [[ $# -gt 0 ]]; do + case $1 in + --base) + base=1 + shift 1 + ;; + --cmake) + cmake=1 + shift 1 + ;; + --ccache) + ccache=1 + shift 1 + ;; + --cuda_toolkit) + cuda_toolkit=1 + shift 1 + ;; + --tensorrt) + tensorrt=1 + shift 1 + ;; + --polygraphy) + polygraphy=1 + shift 1 + ;; + --mpi4py) + mpi4py=1 + shift 1 + ;; + --pytorch) + pytorch=1 + shift 1 + ;; + --opencv) + opencv=1 + shift 1 + ;; + --protobuf) + protobuf=1 + shift 1 + ;; + --all) + base=1 + cmake=1 + ccache=1 + cuda_toolkit=1 + tensorrt=1 + polygraphy=1 + mpi4py=1 + pytorch=1 + opencv=1 + protobuf=1 + shift 1 + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +if [ $base -eq 1 ]; then + echo "Installing base dependencies..." + # Clean up the pip constraint file from the base NGC PyTorch image. + [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true + + echo "Using Python version: $PYTHON_VERSION" + GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_base.sh" $PYTHON_VERSION +fi + +if [ $cmake -eq 1 ]; then + echo "Installing CMake..." + GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_cmake.sh" +fi + +if [ $ccache -eq 1 ]; then + echo "Installing ccache..." 
+ GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_ccache.sh" +fi + +if [ $cuda_toolkit -eq 1 ]; then + echo "Installing CUDA toolkit..." + bash "$SCRIPT_DIR/install_cuda_toolkit.sh" +fi + +if [ $tensorrt -eq 1 ]; then + echo "Installing TensorRT..." + bash "$SCRIPT_DIR/install_tensorrt.sh" \ + --TRT_VER=${TRT_VER} \ + --CUDA_VER=${CUDA_VER} \ + --CUDNN_VER=${CUDNN_VER} \ + --NCCL_VER=${NCCL_VER} \ + --CUBLAS_VER=${CUBLAS_VER} +fi + +if [ $polygraphy -eq 1 ]; then + echo "Installing Polygraphy..." + bash "$SCRIPT_DIR/install_polygraphy.sh" +fi + +if [ $mpi4py -eq 1 ]; then + echo "Installing mpi4py..." + GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_mpi4py.sh" +fi + +if [ $pytorch -eq 1 ]; then + echo "Installing PyTorch..." + bash "$SCRIPT_DIR/install_pytorch.sh" $TORCH_INSTALL_TYPE +fi + +if [ $opencv -eq 1 ]; then + echo "Installing OpenCV..." + pip3 uninstall -y opencv + rm -rf /usr/local/lib/python3*/dist-packages/cv2/ + pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir +fi + +# WARs against security issues inherited from pytorch:25.06 +# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 +if [ $protobuf -eq 1 ]; then + pip3 install --upgrade --no-cache-dir \ + "protobuf>=4.25.8" +fi diff --git a/docs/source/installation/build-from-source-linux.md b/docs/source/installation/build-from-source-linux.md index f4b6f3836ff..fb6adf93d4a 100644 --- a/docs/source/installation/build-from-source-linux.md +++ b/docs/source/installation/build-from-source-linux.md @@ -88,6 +88,33 @@ Follow the linked catalog entry to enter a new container based on the pre-built make -C docker run LOCAL_USER=1 ``` +If you wish to use enroot instead of docker, then you can build a sqsh file that has the identical environment as the development image `tensorrt_llm/devel:latest` as follows. + +1. Allocate a compute node: + ```bash + salloc --nodes=1 + ``` + +2. 
Create a sqsh file with essential TensorRT LLM dependencies installed: + ```bash + # Using default sqsh filename (enroot/tensorrt_llm.devel.sqsh) + make -C enroot build_sqsh + + # Or specify a custom path (optional) + make -C enroot build_sqsh SQSH_PATH=/path/to/dev_trtllm_image.sqsh + ``` + +3. Once this squash file is ready, you can follow the steps under [Build TensorRT LLM](#build-tensorrt-llm) by launching an enroot sandbox from `dev_trtllm_image.sqsh`. To do this, proceed as follows: + ```bash + export SQSH_PATH=/path/to/dev_trtllm_image.sqsh + + # Start a pseudo terminal for an interactive session + make -C enroot run_sqsh + + # Or, you could run commands directly + make -C enroot run_sqsh RUN_CMD="python3 scripts/build_wheel.py" + ``` + **On systems without GNU `make`** 1. Create a Docker image for development. diff --git a/enroot/Makefile b/enroot/Makefile new file mode 100644 index 00000000000..4f15f028140 --- /dev/null +++ b/enroot/Makefile @@ -0,0 +1,45 @@ +ifndef MAKEFILE_PYXIS_INCLUDED +MAKEFILE_PYXIS_INCLUDED := 1 + +BASE_IMAGE ?= $(shell grep '^ARG BASE_IMAGE=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="') +BASE_TAG ?= $(shell grep '^ARG BASE_TAG=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="') +SQSH_PATH ?= tensorrt_llm.devel.sqsh +SOURCE_DIR ?= $(shell readlink -f ..) +CODE_DIR ?= /code/tensorrt_llm +RUN_CMD ?= --pty bash + +PYTHON_VERSION ?= 3.12.3 +TORCH_INSTALL_TYPE ?= skip +GITHUB_MIRROR ?= +CUDA_VERSION ?= +CUDNN_VERSION ?= +NCCL_VERSION ?= +CUBLAS_VERSION ?= +TRT_VERSION ?= + +build_sqsh: + @echo "Building trtllm sqsh image." 
+ @echo "Base image: $(BASE_IMAGE):$(BASE_TAG)" + @echo "Location: $(SQSH_PATH)" + + srun \ + --container-image "$(BASE_IMAGE):$(BASE_TAG)" \ + --container-save "$(abspath $(SQSH_PATH))" \ + --container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)/docker/common" \ + --container-mount-home --container-remap-root \ + --export PYTHON_VERSION=$(PYTHON_VERSION),GITHUB_MIRROR=$(GITHUB_MIRROR),TORCH_INSTALL_TYPE=$(TORCH_INSTALL_TYPE),CUDA_VER=$(CUDA_VERSION),CUDNN_VER=$(CUDNN_VERSION),NCCL_VER=$(NCCL_VERSION),CUBLAS_VER=$(CUBLAS_VERSION),TRT_VER=$(TRT_VERSION) \ + ./install.sh --all + +run_sqsh: + @echo "Running srun job step with:" + @echo " sqsh image: $(SQSH_PATH)" + @echo " run command: $(RUN_CMD)" + + srun \ + --container-image "$(abspath $(SQSH_PATH))" \ + --container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)" \ + --container-mount-home --container-remap-root \ + --export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \ + $(RUN_CMD) + +endif diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 47818c781d5..2d82d52792e 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -12,7 +12,7 @@ # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509081850-5980 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509081850-5980 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383