Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3c8ac96
Add and use install.sh
v-shobhit Aug 29, 2025
4b5b5fa
Clear the pip constraints
v-shobhit Aug 30, 2025
ead95a6
fail on error
v-shobhit Aug 30, 2025
846ca6f
Make install.sh executable
v-shobhit Sep 1, 2025
1a2b1bc
update enroot/Makefile
v-shobhit Sep 2, 2025
be25635
Address comment
v-shobhit Sep 2, 2025
719d46a
fix end of files
v-shobhit Sep 2, 2025
d444aa4
allow unset variables
v-shobhit Sep 2, 2025
bc55715
Add run_sqsh target
v-shobhit Sep 2, 2025
3877e05
Document enroot flow
v-shobhit Sep 2, 2025
38409bd
fix typo
v-shobhit Sep 2, 2025
4f4bb4d
fix docker build layering
v-shobhit Sep 2, 2025
85bca7c
address comments
v-shobhit Sep 2, 2025
f65f93b
remove temp workaround
v-shobhit Sep 2, 2025
91fc4e1
add vars
v-shobhit Sep 2, 2025
8b0171f
clean up srun command
v-shobhit Sep 2, 2025
981d489
clean up
v-shobhit Sep 2, 2025
762a66b
fix variable export
v-shobhit Sep 2, 2025
82bddf6
Merge branch 'main' into dev/shobhitv/shell-build-ctx
nvzhihanj Sep 3, 2025
ec3cf54
address comment
v-shobhit Sep 3, 2025
67fcaba
attempt fix CI
v-shobhit Sep 3, 2025
134d831
Merge branch 'main' into dev/shobhitv/shell-build-ctx
v-shobhit Sep 6, 2025
349e17d
Merge branch 'main' into dev/shobhitv/shell-build-ctx
nvzhihanj Sep 8, 2025
3ee39e5
Update current_image_tags.properties
v-shobhit Sep 9, 2025
80b542e
Merge branch 'main' into dev/shobhitv/shell-build-ctx
v-shobhit Sep 9, 2025
faf0a66
Update current_image_tags.properties
v-shobhit Sep 9, 2025
6ba3720
address review
v-shobhit Sep 9, 2025
0148f97
review - keep pytorch comment
v-shobhit Sep 9, 2025
a387c11
Update install.sh
v-shobhit Sep 9, 2025
39d02b3
Update install.sh
v-shobhit Sep 9, 2025
42f44b0
Update Dockerfile.multi
v-shobhit Sep 9, 2025
b27a7b8
TensorRT-LLM -> TensorRT LLM
v-shobhit Sep 9, 2025
6c66a1c
Update Dockerfile.multi
v-shobhit Sep 9, 2025
a80a5c4
Update Dockerfile.multi
v-shobhit Sep 9, 2025
6318484
remove hyphen
v-shobhit Sep 10, 2025
39b1df3
add missing 'rm'
v-shobhit Sep 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,6 @@ compile_commands.json
.dir-locals.el
.devcontainer/devcontainer.env.user
.devcontainer/docker-compose.override.yml

# Enroot sqsh files
enroot/tensorrt_llm.devel.sqsh
65 changes: 28 additions & 37 deletions docker/Dockerfile.multi
Original file line number Diff line number Diff line change
Expand Up @@ -15,70 +15,61 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so
# The default values come from `nvcr.io/nvidia/pytorch`
ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
ENV ENV=${ENV:-/etc/shinit_v2}

ARG GITHUB_MIRROR=""
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]

# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true
ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"

SHELL ["/bin/bash", "-c"]

FROM base AS devel

ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"

COPY docker/common/install.sh install.sh

COPY docker/common/install_base.sh install_base.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
PYTHON_VERSION=${PYTHON_VERSION} \
bash ./install.sh --base && rm install_base.sh

COPY docker/common/install_cmake.sh install_cmake.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh

COPY docker/common/install_ccache.sh install_ccache.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache && rm install_ccache.sh

# Only take effect when the base image is Rocky Linux 8 with old CUDA version.
COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh
RUN bash ./install_cuda_toolkit.sh && rm install_cuda_toolkit.sh
RUN bash ./install.sh --cuda_toolkit && rm install_cuda_toolkit.sh

# Download & install latest TRT release
ARG TRT_VER
ARG CUDA_VER
ARG CUDNN_VER
ARG NCCL_VER
ARG CUBLAS_VER
COPY docker/common/install_tensorrt.sh install_tensorrt.sh
RUN bash ./install_tensorrt.sh \
--TRT_VER=${TRT_VER} \
--CUDA_VER=${CUDA_VER} \
--CUDNN_VER=${CUDNN_VER} \
--NCCL_VER=${NCCL_VER} \
--CUBLAS_VER=${CUBLAS_VER} && \
rm install_tensorrt.sh

# Install latest Polygraphy
RUN TRT_VER=${TRT_VER} \
CUDA_VER=${CUDA_VER} \
CUDNN_VER=${CUDNN_VER} \
NCCL_VER=${NCCL_VER} \
CUBLAS_VER=${CUBLAS_VER} \
bash ./install.sh --tensorrt && rm install_tensorrt.sh

COPY docker/common/install_polygraphy.sh install_polygraphy.sh
RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh
RUN bash ./install.sh --polygraphy && rm install_polygraphy.sh

# Install mpi4py
COPY docker/common/install_mpi4py.sh install_mpi4py.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4py.sh

# Install PyTorch
ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"

# Install OpenCV with FFMPEG support
RUN pip3 uninstall -y opencv && \
rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
RUN pip3 install --upgrade --no-cache-dir \
"protobuf>=4.25.8"
RUN bash ./install.sh --opencv && bash ./install.sh --protobuf && rm install.sh

FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton

Expand Down
144 changes: 144 additions & 0 deletions docker/common/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/bin/bash
# Strict mode: abort on any failing command (errexit), let the ERR trap fire
# inside functions and subshells too (errtrace), and make a pipeline fail when
# any stage fails (pipefail). Unset variables are intentionally still allowed.
set -o errexit -o errtrace -o pipefail
# Globs that match nothing expand to nothing instead of themselves.
shopt -s nullglob
trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR

# Absolute directory containing this script; the sibling install_*.sh helpers
# are looked up relative to it regardless of the caller's working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

# Every component starts deselected (0); command-line flags switch them to 1.
base=0 cmake=0 ccache=0 cuda_toolkit=0 tensorrt=0
polygraphy=0 mpi4py=0 pytorch=0 opencv=0 protobuf=0

# Parse command-line flags.
#
# Each --<component> flag selects that component by setting its variable to 1;
# --all selects every component. Flags may be combined and repeated.
# An unrecognized flag is reported on stderr (the same stream the ERR trap
# uses) and terminates the script with exit status 1.
_parse_install_flags() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --base)         base=1 ;;
            --cmake)        cmake=1 ;;
            --ccache)       ccache=1 ;;
            --cuda_toolkit) cuda_toolkit=1 ;;
            --tensorrt)     tensorrt=1 ;;
            --polygraphy)   polygraphy=1 ;;
            --mpi4py)       mpi4py=1 ;;
            --pytorch)      pytorch=1 ;;
            --opencv)       opencv=1 ;;
            --protobuf)     protobuf=1 ;;
            --all)
                base=1 cmake=1 ccache=1 cuda_toolkit=1 tensorrt=1
                polygraphy=1 mpi4py=1 pytorch=1 opencv=1 protobuf=1
                ;;
            *)
                # Diagnostics go to stderr so they are not mixed into stdout.
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
        # Every recognized flag consumes exactly one argument.
        shift
    done
}

_parse_install_flags "$@"

# Component installers.
#
# Each selected component is installed by the matching install_<component>.sh
# located next to this script. Script paths and option values are quoted so
# that directories or values containing whitespace or glob characters are
# passed through intact. Optional positional arguments use ${VAR:+"$VAR"} so
# that, as before, no argument at all is passed when the variable is empty.

if [ "$base" -eq 1 ]; then
    echo "Installing base dependencies..."
    # Clean up the pip constraint file from the base NGC PyTorch image.
    [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true

    echo "Using Python version: $PYTHON_VERSION"
    GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_base.sh" ${PYTHON_VERSION:+"$PYTHON_VERSION"}
fi

if [ "$cmake" -eq 1 ]; then
    echo "Installing CMake..."
    GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_cmake.sh"
fi

if [ "$ccache" -eq 1 ]; then
    echo "Installing ccache..."
    GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_ccache.sh"
fi

if [ "$cuda_toolkit" -eq 1 ]; then
    echo "Installing CUDA toolkit..."
    bash "$SCRIPT_DIR/install_cuda_toolkit.sh"
fi

if [ "$tensorrt" -eq 1 ]; then
    echo "Installing TensorRT..."
    # Versions come from the environment; an unset variable yields an empty
    # "--X_VER=" option, exactly as in the unquoted form.
    bash "$SCRIPT_DIR/install_tensorrt.sh" \
        "--TRT_VER=${TRT_VER}" \
        "--CUDA_VER=${CUDA_VER}" \
        "--CUDNN_VER=${CUDNN_VER}" \
        "--NCCL_VER=${NCCL_VER}" \
        "--CUBLAS_VER=${CUBLAS_VER}"
fi

if [ "$polygraphy" -eq 1 ]; then
    echo "Installing Polygraphy..."
    bash "$SCRIPT_DIR/install_polygraphy.sh"
fi

if [ "$mpi4py" -eq 1 ]; then
    echo "Installing mpi4py..."
    GITHUB_MIRROR=$GITHUB_MIRROR bash "$SCRIPT_DIR/install_mpi4py.sh"
fi

if [ "$pytorch" -eq 1 ]; then
    echo "Installing PyTorch..."
    bash "$SCRIPT_DIR/install_pytorch.sh" ${TORCH_INSTALL_TYPE:+"$TORCH_INSTALL_TYPE"}
fi

# OpenCV: uninstall the existing "opencv" package, delete any leftover cv2
# directories under the system dist-packages, then force-reinstall the
# headless wheel without pulling in its dependencies.
if (( opencv == 1 )); then
    echo "Installing OpenCV..."
    pip3 uninstall -y opencv
    rm -rf /usr/local/lib/python3*/dist-packages/cv2/
    pip3 install --force-reinstall --no-deps --no-cache-dir opencv-python-headless
fi

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
if (( protobuf == 1 )); then
    pip3 install --upgrade --no-cache-dir "protobuf>=4.25.8"
fi
27 changes: 27 additions & 0 deletions docs/source/installation/build-from-source-linux.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,33 @@ Follow the linked catalog entry to enter a new container based on the pre-built
make -C docker run LOCAL_USER=1
```

If you prefer to use enroot instead of Docker, you can build a squash (`.sqsh`) file that provides the same environment as the development image `tensorrt_llm/devel:latest`, as follows.

1. Allocate a compute node:
```bash
salloc --nodes=1
```

2. Create a sqsh file with the essential TensorRT LLM dependencies installed:
```bash
# Using default sqsh filename (enroot/tensorrt_llm.devel.sqsh)
make -C enroot build_sqsh

# Or specify a custom path (optional)
make -C enroot build_sqsh SQSH_PATH=/path/to/dev_trtllm_image.sqsh
```

3. Once this squash file is ready, you can follow the steps under [Build TensorRT LLM](#build-tensorrt-llm) by launching an enroot sandbox from `dev_trtllm_image.sqsh`. To do this, proceed as follows:
```bash
export SQSH_PATH=/path/to/dev_trtllm_image.sqsh

# Start a pseudo terminal for interactive session
make -C enroot run_sqsh

# Or, you could run commands directly
make -C enroot run_sqsh RUN_CMD="python3 scripts/build_wheel.py"
```

**On systems without GNU `make`**

1. Create a Docker image for development.
Expand Down
45 changes: 45 additions & 0 deletions enroot/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Include guard: the body below is evaluated only the first time this file is
# read.
ifndef MAKEFILE_PYXIS_INCLUDED
MAKEFILE_PYXIS_INCLUDED := 1

# Base image and tag default to the ARG values declared in
# ../docker/Dockerfile.multi.
BASE_IMAGE         ?= $(shell grep '^ARG BASE_IMAGE=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
BASE_TAG           ?= $(shell grep '^ARG BASE_TAG=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
# Squash file written by build_sqsh and used as the image by run_sqsh.
SQSH_PATH          ?= tensorrt_llm.devel.sqsh
# Repository root on the host and the path it is mounted at in the container.
SOURCE_DIR         ?= $(shell readlink -f ..)
CODE_DIR           ?= /code/tensorrt_llm
# Trailing srun arguments for run_sqsh; intentionally expanded unquoted in the
# recipe because it holds several words.
RUN_CMD            ?= --pty bash

# Values exported to install.sh; empty ones are passed through as empty.
PYTHON_VERSION     ?= 3.12.3
TORCH_INSTALL_TYPE ?= skip
GITHUB_MIRROR      ?=
CUDA_VERSION       ?=
CUDNN_VERSION      ?=
NCCL_VERSION       ?=
CUBLAS_VERSION     ?=
TRT_VERSION        ?=

# Neither target produces a file named after itself; declare them phony so a
# stray file called build_sqsh or run_sqsh cannot make the target look
# up to date.
.PHONY: build_sqsh run_sqsh

# Run docker/common/install.sh --all inside $(BASE_IMAGE):$(BASE_TAG), passing
# $(SQSH_PATH) to --container-save. The --export list and paths are quoted so
# values containing shell metacharacters (e.g. a mirror URL) reach srun intact.
build_sqsh:
	@echo "Building trtllm sqsh image."
	@echo "Base image: $(BASE_IMAGE):$(BASE_TAG)"
	@echo "Location: $(SQSH_PATH)"

	srun \
		--container-image "$(BASE_IMAGE):$(BASE_TAG)" \
		--container-save "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)/docker/common" \
		--container-mount-home --container-remap-root \
		--export "PYTHON_VERSION=$(PYTHON_VERSION),GITHUB_MIRROR=$(GITHUB_MIRROR),TORCH_INSTALL_TYPE=$(TORCH_INSTALL_TYPE),CUDA_VER=$(CUDA_VERSION),CUDNN_VER=$(CUDNN_VERSION),NCCL_VER=$(NCCL_VERSION),CUBLAS_VER=$(CUBLAS_VERSION),TRT_VER=$(TRT_VERSION)" \
		./install.sh --all

# Launch $(RUN_CMD) in a container started from $(SQSH_PATH), with the source
# tree mounted at $(CODE_DIR) and used as the working directory.
run_sqsh:
	@echo "Running srun job step with:"
	@echo " sqsh image: $(SQSH_PATH)"
	@echo " run command: $(RUN_CMD)"

	srun \
		--container-image "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)" \
		--container-mount-home --container-remap-root \
		--export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \
		$(RUN_CMD)

endif
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383