Skip to content
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3c8ac96
Add and use install.sh
v-shobhit Aug 29, 2025
4b5b5fa
Clear the pip constraints
v-shobhit Aug 30, 2025
ead95a6
fail on error
v-shobhit Aug 30, 2025
846ca6f
Make install.sh executable
v-shobhit Sep 1, 2025
1a2b1bc
update enroot/Makefile
v-shobhit Sep 2, 2025
be25635
Address comment
v-shobhit Sep 2, 2025
719d46a
fix end of files
v-shobhit Sep 2, 2025
d444aa4
allow unset variables
v-shobhit Sep 2, 2025
bc55715
Add run_sqsh target
v-shobhit Sep 2, 2025
3877e05
Document enroot flow
v-shobhit Sep 2, 2025
38409bd
fix typo
v-shobhit Sep 2, 2025
4f4bb4d
fix docker build layering
v-shobhit Sep 2, 2025
85bca7c
address comments
v-shobhit Sep 2, 2025
f65f93b
remove temp workaround
v-shobhit Sep 2, 2025
91fc4e1
add vars
v-shobhit Sep 2, 2025
8b0171f
clean up srun command
v-shobhit Sep 2, 2025
981d489
clean up
v-shobhit Sep 2, 2025
762a66b
fix variable export
v-shobhit Sep 2, 2025
82bddf6
Merge branch 'main' into dev/shobhitv/shell-build-ctx
nvzhihanj Sep 3, 2025
ec3cf54
address comment
v-shobhit Sep 3, 2025
67fcaba
attempt fix CI
v-shobhit Sep 3, 2025
134d831
Merge branch 'main' into dev/shobhitv/shell-build-ctx
v-shobhit Sep 6, 2025
349e17d
Merge branch 'main' into dev/shobhitv/shell-build-ctx
nvzhihanj Sep 8, 2025
3ee39e5
Update current_image_tags.properties
v-shobhit Sep 9, 2025
80b542e
Merge branch 'main' into dev/shobhitv/shell-build-ctx
v-shobhit Sep 9, 2025
faf0a66
Update current_image_tags.properties
v-shobhit Sep 9, 2025
6ba3720
address review
v-shobhit Sep 9, 2025
0148f97
review - keep pytorch comment
v-shobhit Sep 9, 2025
a387c11
Update install.sh
v-shobhit Sep 9, 2025
39d02b3
Update install.sh
v-shobhit Sep 9, 2025
42f44b0
Update Dockerfile.multi
v-shobhit Sep 9, 2025
b27a7b8
TensorRT-LLM -> TensorRT LLM
v-shobhit Sep 9, 2025
6c66a1c
Update Dockerfile.multi
v-shobhit Sep 9, 2025
a80a5c4
Update Dockerfile.multi
v-shobhit Sep 9, 2025
6318484
remove hyphen
v-shobhit Sep 10, 2025
39b1df3
add missing 'rm'
v-shobhit Sep 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,6 @@ compile_commands.json
.dir-locals.el
.devcontainer/devcontainer.env.user
.devcontainer/docker-compose.override.yml

# Enroot sqsh files
enroot/tensorrt_llm.devel.sqsh
74 changes: 38 additions & 36 deletions docker/Dockerfile.multi
Original file line number Diff line number Diff line change
Expand Up @@ -15,70 +15,72 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so
# The default values come from `nvcr.io/nvidia/pytorch`
ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
ENV ENV=${ENV:-/etc/shinit_v2}

ARG GITHUB_MIRROR=""
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]

# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true
ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"

SHELL ["/bin/bash", "-c"]

FROM base AS devel

ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"

COPY docker/common/install.sh install.sh

COPY docker/common/install_base.sh install_base.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
PYTHON_VERSION=${PYTHON_VERSION} \
bash ./install.sh --base
RUN rm install_base.sh

COPY docker/common/install_cmake.sh install_cmake.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake
RUN rm install_cmake.sh

COPY docker/common/install_ccache.sh install_ccache.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache
RUN rm install_ccache.sh

# Only take effect when the base image is Rocky Linux 8 with old CUDA version.
COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh
RUN bash ./install_cuda_toolkit.sh && rm install_cuda_toolkit.sh
RUN bash ./install.sh --cuda_toolkit
RUN rm install_cuda_toolkit.sh

# Download & install latest TRT release
ARG TRT_VER
ARG CUDA_VER
ARG CUDNN_VER
ARG NCCL_VER
ARG CUBLAS_VER
COPY docker/common/install_tensorrt.sh install_tensorrt.sh
RUN bash ./install_tensorrt.sh \
--TRT_VER=${TRT_VER} \
--CUDA_VER=${CUDA_VER} \
--CUDNN_VER=${CUDNN_VER} \
--NCCL_VER=${NCCL_VER} \
--CUBLAS_VER=${CUBLAS_VER} && \
rm install_tensorrt.sh

# Install latest Polygraphy
RUN TRT_VER=${TRT_VER} \
CUDA_VER=${CUDA_VER} \
CUDNN_VER=${CUDNN_VER} \
NCCL_VER=${NCCL_VER} \
CUBLAS_VER=${CUBLAS_VER} \
bash ./install.sh --tensorrt
RUN rm install_tensorrt.sh

COPY docker/common/install_polygraphy.sh install_polygraphy.sh
RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh
RUN bash ./install.sh --polygraphy
RUN rm install_polygraphy.sh

# Install mpi4py
COPY docker/common/install_mpi4py.sh install_mpi4py.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py
# Remove the mpi4py installer script once it has been run (mirrors the other install steps).
RUN rm install_mpi4py.sh

# Install PyTorch
ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch
RUN rm install_pytorch.sh

# Install OpenCV with FFMPEG support
RUN pip3 uninstall -y opencv && \
rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
RUN bash ./install.sh --opencv
RUN bash ./install.sh --protobuf

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
RUN pip3 install --upgrade --no-cache-dir \
"protobuf>=4.25.8"
RUN rm install.sh

FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton

Expand Down
144 changes: 144 additions & 0 deletions docker/common/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/bin/bash
#
# Unified installer entry point for the development environment.
#
# Every --<component> flag selects one installation step and --all selects all
# of them. Selected steps always run in the fixed order shown by usage(),
# regardless of the order in which the flags appear on the command line.
#
# Values are taken from the environment and forwarded to the per-component
# scripts located next to this file:
#   GITHUB_MIRROR       exported to the base, cmake, ccache and mpi4py installers
#   PYTHON_VERSION      positional argument of install_base.sh
#   TORCH_INSTALL_TYPE  positional argument of install_pytorch.sh
#   TRT_VER, CUDA_VER, CUDNN_VER, NCCL_VER, CUBLAS_VER
#                       passed to install_tensorrt.sh as --NAME=value
#
# `set -u` is intentionally not enabled: any of the variables above may be
# unset (presumably the per-component scripts apply their own defaults --
# verify against those scripts).
set -Eeo pipefail
shopt -s nullglob
trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR

# Resolve script directory for robust relative pathing
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

# Print the supported flags, listed in execution order.
usage() {
    cat <<'EOF'
Usage: install.sh [OPTION]...

Components (run in this order when selected):
  --base          base dependencies (also clears the pip constraint file)
  --cmake         CMake
  --ccache        ccache
  --cuda_toolkit  CUDA toolkit
  --tensorrt      TensorRT
  --polygraphy    Polygraphy
  --mpi4py        mpi4py
  --pytorch       PyTorch
  --opencv        OpenCV (headless)
  --protobuf      protobuf upgrade

Other options:
  --all           select every component above
  -h, --help      show this help and exit
EOF
}

# Default values
base=0
cmake=0
ccache=0
cuda_toolkit=0
tensorrt=0
polygraphy=0
mpi4py=0
pytorch=0
opencv=0
protobuf=0

# Without any flag nothing is selected; keep the zero exit status but say so.
if [[ $# -eq 0 ]]; then
    echo "[install.sh] No components selected; nothing to do." >&2
    usage >&2
fi

while [[ $# -gt 0 ]]; do
    case $1 in
        --base)
            base=1
            shift 1
            ;;
        --cmake)
            cmake=1
            shift 1
            ;;
        --ccache)
            ccache=1
            shift 1
            ;;
        --cuda_toolkit)
            cuda_toolkit=1
            shift 1
            ;;
        --tensorrt)
            tensorrt=1
            shift 1
            ;;
        --polygraphy)
            polygraphy=1
            shift 1
            ;;
        --mpi4py)
            mpi4py=1
            shift 1
            ;;
        --pytorch)
            pytorch=1
            shift 1
            ;;
        --opencv)
            opencv=1
            shift 1
            ;;
        --protobuf)
            protobuf=1
            shift 1
            ;;
        --all)
            base=1
            cmake=1
            ccache=1
            cuda_toolkit=1
            tensorrt=1
            polygraphy=1
            mpi4py=1
            pytorch=1
            opencv=1
            protobuf=1
            shift 1
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            # Diagnostics belong on stderr so they are not mixed into build logs' stdout.
            echo "Unknown option: $1" >&2
            usage >&2
            exit 1
            ;;
    esac
done

if [ "$base" -eq 1 ]; then
    echo "Installing base dependencies..."
    # Clean up the pip constraint file from the base NGC PyTorch image.
    # Best effort: a missing file or a failed truncation must not abort the install.
    [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true

    echo "Using Python version: $PYTHON_VERSION"
    # ${VAR:+"$VAR"} passes the value as one quoted argument when it is set and
    # passes no argument at all when it is empty, matching the unquoted original.
    GITHUB_MIRROR="${GITHUB_MIRROR:-}" bash "$SCRIPT_DIR/install_base.sh" ${PYTHON_VERSION:+"$PYTHON_VERSION"}
fi

if [ "$cmake" -eq 1 ]; then
    echo "Installing CMake..."
    GITHUB_MIRROR="${GITHUB_MIRROR:-}" bash "$SCRIPT_DIR/install_cmake.sh"
fi

if [ "$ccache" -eq 1 ]; then
    echo "Installing ccache..."
    GITHUB_MIRROR="${GITHUB_MIRROR:-}" bash "$SCRIPT_DIR/install_ccache.sh"
fi

if [ "$cuda_toolkit" -eq 1 ]; then
    echo "Installing CUDA toolkit..."
    bash "$SCRIPT_DIR/install_cuda_toolkit.sh"
fi

if [ "$tensorrt" -eq 1 ]; then
    echo "Installing TensorRT..."
    # Each option is always passed, with an empty value when the variable is unset.
    bash "$SCRIPT_DIR/install_tensorrt.sh" \
        "--TRT_VER=${TRT_VER:-}" \
        "--CUDA_VER=${CUDA_VER:-}" \
        "--CUDNN_VER=${CUDNN_VER:-}" \
        "--NCCL_VER=${NCCL_VER:-}" \
        "--CUBLAS_VER=${CUBLAS_VER:-}"
fi

if [ "$polygraphy" -eq 1 ]; then
    echo "Installing Polygraphy..."
    bash "$SCRIPT_DIR/install_polygraphy.sh"
fi

if [ "$mpi4py" -eq 1 ]; then
    echo "Installing mpi4py..."
    GITHUB_MIRROR="${GITHUB_MIRROR:-}" bash "$SCRIPT_DIR/install_mpi4py.sh"
fi

if [ "$pytorch" -eq 1 ]; then
    echo "Installing PyTorch..."
    bash "$SCRIPT_DIR/install_pytorch.sh" ${TORCH_INSTALL_TYPE:+"$TORCH_INSTALL_TYPE"}
fi

if [ "$opencv" -eq 1 ]; then
    echo "Installing OpenCV..."
    pip3 uninstall -y opencv
    # With nullglob the pattern expands to nothing when no directory matches,
    # and `rm -rf` without operands is a no-op.
    rm -rf /usr/local/lib/python3*/dist-packages/cv2/
    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
fi

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
if [ "$protobuf" -eq 1 ]; then
    echo "Installing protobuf..."
    pip3 install --upgrade --no-cache-dir \
        "protobuf>=4.25.8"
fi
27 changes: 27 additions & 0 deletions docs/source/installation/build-from-source-linux.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,33 @@ Follow the linked catalog entry to enter a new container based on the pre-built
make -C docker run LOCAL_USER=1
```

If you wish to use enroot instead of Docker, you can build a sqsh file whose environment is identical to that of the development image `tensorrt_llm/devel:latest`, as follows.

1. Allocate a compute node:
```bash
salloc --nodes=1
```

2. Create a sqsh file with the essential TensorRT LLM dependencies installed:
```bash
# Using default sqsh filename (enroot/tensorrt_llm.devel.sqsh)
make -C enroot build_sqsh

# Or specify a custom path (optional)
make -C enroot build_sqsh SQSH_PATH=/path/to/dev_trtllm_image.sqsh
```

3. Once this sqsh file is ready, you can follow the steps under [Build TensorRT LLM](#build-tensorrt-llm) by launching an enroot sandbox from `dev_trtllm_image.sqsh`. To do this, proceed as follows:
```bash
export SQSH_PATH=/path/to/dev_trtllm_image.sqsh

# Start a pseudo terminal for interactive session
make -C enroot run_sqsh

# Or, you could run commands directly
make -C enroot run_sqsh RUN_CMD="python3 scripts/build_wheel.py"
```

**On systems without GNU `make`**

1. Create a Docker image for development.
Expand Down
45 changes: 45 additions & 0 deletions enroot/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Targets for building and running the development environment as an enroot
# sqsh image through `srun` container options.
#
# The include guard makes it safe to include this file more than once.
ifndef MAKEFILE_PYXIS_INCLUDED
MAKEFILE_PYXIS_INCLUDED := 1

# Base image and tag default to the values of the `ARG BASE_IMAGE=` and
# `ARG BASE_TAG=` lines in ../docker/Dockerfile.multi.
BASE_IMAGE         ?= $(shell grep '^ARG BASE_IMAGE=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
BASE_TAG           ?= $(shell grep '^ARG BASE_TAG=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
# Path of the sqsh file written by build_sqsh and read by run_sqsh.
SQSH_PATH          ?= tensorrt_llm.devel.sqsh
# Host checkout (parent of this directory) and where it is mounted in the container.
SOURCE_DIR         ?= $(shell readlink -f ..)
CODE_DIR           ?= /code/tensorrt_llm
# Trailing srun arguments for run_sqsh; intentionally expanded unquoted so it
# can carry both srun options and the command (default: interactive bash).
RUN_CMD            ?= --pty bash

# Values forwarded to install.sh through the environment. The *_VERSION
# variables are exported under the *_VER names that install.sh reads.
PYTHON_VERSION     ?= 3.12.3
TORCH_INSTALL_TYPE ?= skip
GITHUB_MIRROR      ?=
CUDA_VERSION       ?=
CUDNN_VERSION      ?=
NCCL_VERSION       ?=
CUBLAS_VERSION     ?=
TRT_VERSION        ?=

# Neither target produces a file with its own name; always run the recipe even
# if a file called build_sqsh or run_sqsh exists in this directory.
.PHONY: build_sqsh run_sqsh

# Run `install.sh --all` inside the base image and save the result to SQSH_PATH.
build_sqsh:
	@echo "Building trtllm sqsh image."
	@echo "Base image: $(BASE_IMAGE):$(BASE_TAG)"
	@echo "Location: $(SQSH_PATH)"

	srun \
		--container-image "$(BASE_IMAGE):$(BASE_TAG)" \
		--container-save "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)/docker/common" \
		--container-mount-home --container-remap-root \
		--export PYTHON_VERSION=$(PYTHON_VERSION),GITHUB_MIRROR=$(GITHUB_MIRROR),TORCH_INSTALL_TYPE=$(TORCH_INSTALL_TYPE),CUDA_VER=$(CUDA_VERSION),CUDNN_VER=$(CUDNN_VERSION),NCCL_VER=$(NCCL_VERSION),CUBLAS_VER=$(CUBLAS_VERSION),TRT_VER=$(TRT_VERSION) \
		./install.sh --all

# Start a job step from the sqsh image with the source tree mounted at CODE_DIR.
run_sqsh:
	@echo "Running srun job step with:"
	@echo " sqsh image: $(SQSH_PATH)"
	@echo " run command: $(RUN_CMD)"

	srun \
		--container-image "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)" \
		--container-mount-home --container-remap-root \
		--export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \
		$(RUN_CMD)

endif
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383