Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions .github/workflows/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,17 @@ jobs:
cudnn_version: ""
python_version: "3.11"
pytorch: 2.9.1
torchvision: 0.24.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base"
platforms: "linux/amd64,linux/arm64"
# arm64 disabled: torchvision 0.24.1+cu128 has no aarch64 wheel
platforms: "linux/amd64"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.12"
pytorch: 2.10.0
torchvision: 0.25.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -51,6 +54,7 @@ jobs:
# cudnn_version: ""
# python_version: "3.12"
# pytorch: 2.9.1
# torchvision: 0.24.1
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
# dockerfile: "Dockerfile-base"
# platforms: "linux/amd64,linux/arm64"
Expand All @@ -59,14 +63,17 @@ jobs:
cudnn_version: ""
python_version: "3.11"
pytorch: 2.9.1
torchvision: 0.24.1
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-base"
platforms: "linux/amd64,linux/arm64"
# arm64 disabled: torchvision 0.24.1+cu130 has no aarch64 wheel
platforms: "linux/amd64"
- cuda: "130"
cuda_version: 13.0.0
cudnn_version: ""
python_version: "3.12"
pytorch: 2.10.0
torchvision: 0.25.0
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -75,6 +82,7 @@ jobs:
# cudnn_version: ""
# python_version: "3.11"
# pytorch: nightly
# torchvision: nightly
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
# dockerfile: "Dockerfile-base-nightly"
# # "next" is for release candidates of pytorch
Expand All @@ -83,6 +91,7 @@ jobs:
# cudnn_version: ""
# python_version: "3.11"
# pytorch: next
# torchvision: next
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
# dockerfile: "Dockerfile-base-next"
steps:
Expand Down Expand Up @@ -117,6 +126,7 @@ jobs:
CUDA=${{ matrix.cuda }}
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch }}
TORCHVISION_VERSION=${{ matrix.torchvision }}
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
build-base-uv:
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
Expand All @@ -133,6 +143,7 @@ jobs:
cudnn_version: ""
python_version: "3.11"
pytorch: 2.9.1
torchvision: 0.24.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -141,6 +152,7 @@ jobs:
cudnn_version: ""
python_version: "3.12"
pytorch: 2.9.1
torchvision: 0.24.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -149,6 +161,7 @@ jobs:
cudnn_version: ""
python_version: "3.11"
pytorch: 2.10.0
torchvision: 0.25.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -157,6 +170,7 @@ jobs:
cudnn_version: ""
python_version: "3.12"
pytorch: 2.10.0
torchvision: 0.25.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -165,6 +179,7 @@ jobs:
# cudnn_version: ""
# python_version: "3.12"
# pytorch: 2.9.1
# torchvision: 0.24.1
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
# dockerfile: "Dockerfile-uv-base"
# platforms: "linux/amd64,linux/arm64"
Expand All @@ -173,6 +188,7 @@ jobs:
cudnn_version: ""
python_version: "3.11"
pytorch: 2.9.1
torchvision: 0.24.1
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -181,6 +197,7 @@ jobs:
cudnn_version: ""
python_version: "3.12"
pytorch: 2.9.1
torchvision: 0.24.1
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -189,6 +206,7 @@ jobs:
cudnn_version: ""
python_version: "3.12"
pytorch: 2.10.0
torchvision: 0.25.0
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -197,6 +215,7 @@ jobs:
cudnn_version: ""
python_version: "3.12"
pytorch: 2.11.0
torchvision: 0.26.0
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand All @@ -205,6 +224,7 @@ jobs:
cudnn_version: ""
python_version: "3.12"
pytorch: 2.12.0
torchvision: 0.27.0
torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
Expand Down Expand Up @@ -240,4 +260,5 @@ jobs:
CUDA=${{ matrix.cuda }}
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch }}
TORCHVISION_VERSION=${{ matrix.torchvision }}
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
4 changes: 4 additions & 0 deletions .github/workflows/multi-gpu-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,23 @@ jobs:
# cuda_version: 12.9.1
# python_version: "3.12"
# pytorch: 2.9.1
# torchvision: 0.24.1
# axolotl_extras: "fbgemm-gpu"
# num_gpus: 2
# dockerfile: "Dockerfile-uv.jinja"
- cuda: 130
cuda_version: 13.0.0
python_version: "3.12"
pytorch: 2.12.0
torchvision: 0.27.0
axolotl_extras:
# axolotl_extras: fbgemm-gpu
num_gpus: 2
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.10.0
torchvision: 0.25.0
axolotl_extras: "fbgemm-gpu"
num_gpus: 2
runs-on: [self-hosted, modal]
Expand All @@ -68,6 +71,7 @@ jobs:
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "TORCHVISION_VERSION=${{ matrix.torchvision}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/tests-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,19 +119,22 @@ jobs:
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.9.1
torchvision: 0.24.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.10.0
torchvision: 0.25.0
num_gpus: 1
axolotl_extras:
- cuda: 130
cuda_version: 13.0.0
python_version: "3.12"
pytorch: 2.9.1
torchvision: 0.24.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
Expand All @@ -150,6 +153,7 @@ jobs:
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "TORCHVISION_VERSION=${{ matrix.torchvision}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
Expand All @@ -176,6 +180,7 @@ jobs:
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.9.1
torchvision: 0.24.1
num_gpus: 2
axolotl_extras:
nightly_build: "true"
Expand All @@ -194,6 +199,7 @@ jobs:
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "TORCHVISION_VERSION=${{ matrix.torchvision}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ jobs:
cuda_version: 13.0.0
python_version: "3.12"
pytorch: 2.12.0
torchvision: 0.27.0
num_gpus: 1
axolotl_extras:
steps:
Expand All @@ -305,6 +306,7 @@ jobs:
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "TORCHVISION_VERSION=${{ matrix.torchvision}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
Expand Down Expand Up @@ -337,12 +339,14 @@ jobs:
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.10.0
torchvision: 0.25.0
num_gpus: 1
axolotl_extras:
- cuda: 130
cuda_version: 13.0.0
python_version: "3.12"
pytorch: 2.11.0
torchvision: 0.26.0
num_gpus: 1
axolotl_extras:
steps:
Expand All @@ -360,6 +364,7 @@ jobs:
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "TORCHVISION_VERSION=${{ matrix.torchvision}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
Expand Down
13 changes: 8 additions & 5 deletions cicd/Dockerfile-uv.jinja
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}

ENV VIRTUAL_ENV="/workspace/axolotl-venv"
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV TORCHVISION_VERSION="{{ TORCHVISION_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
Expand All @@ -23,13 +25,14 @@ RUN git fetch origin +$GITHUB_REF && \
git checkout FETCH_HEAD

RUN uv pip install packaging==26.0 setuptools==78.1.1
RUN uv pip install torchvision
RUN uv pip uninstall causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
uv pip install --no-build-isolation -e .[deepspeed,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
RUN uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt && \
if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
uv pip install --no-build-isolation -e .[deepspeed,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS --override /tmp/torch-pin.txt; \
else \
uv pip install --no-build-isolation -e .[deepspeed,optimizers,ray] $AXOLOTL_ARGS; \
fi
uv pip install --no-build-isolation -e .[deepspeed,optimizers,ray] $AXOLOTL_ARGS --override /tmp/torch-pin.txt; \
fi && \
python -c "import torch, torchvision; torchvision.ops.nms; print('OK', torch.__version__, torchvision.__version__)"

# Override with nightly HF packages for nightly builds
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
Expand Down
1 change: 1 addition & 0 deletions cicd/multigpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
"TORCHVISION_VERSION": os.environ.get("TORCHVISION_VERSION", "0.21.0"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
"CUDA": os.environ.get("CUDA", "126"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
Expand Down
1 change: 1 addition & 0 deletions cicd/single_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
"TORCHVISION_VERSION": os.environ.get("TORCHVISION_VERSION", "0.21.0"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
"CUDA": os.environ.get("CUDA", "126"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
Expand Down
8 changes: 5 additions & 3 deletions docker/Dockerfile-base
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ ENV PATH="/root/miniconda3/bin:${PATH}"

ARG TARGETARCH
ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="2.1.2"
ARG PYTORCH_VERSION="2.9.1"
ARG TORCHVISION_VERSION="0.24.1"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

From the official PyTorch compatibility table, which torchvision version is compatible with torch 2.1.2 (CUDA wheels)?

💡 Result:

torchvision version 0.16.2 is compatible with torch 2.1.2 (CUDA wheels).

Citations:


TORCHVISION_VERSION default is incompatible with torch 2.1.2.

Line 14 defaults to 0.24.1, but the compatible version for torch 2.1.2 (Line 13) is 0.16.2. This breaks default/local builds when TORCHVISION_VERSION is not explicitly passed, resulting in an incompatible PyTorch + torchvision installation.

Suggested fix
 ARG PYTORCH_VERSION="2.1.2"
-ARG TORCHVISION_VERSION="0.24.1"
+ARG TORCHVISION_VERSION="0.16.2"
 ARG CUDA="128"

Also applies to: 48-49

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@docker/Dockerfile-base` at line 14, the default ARG TORCHVISION_VERSION
("0.24.1") is incompatible with the specified torch version (ARG PYTORCH_VERSION
"2.1.2"); update the ARG TORCHVISION_VERSION default to "0.16.2" wherever it
appears (the top-level ARG TORCHVISION_VERSION and the later occurrences around
lines 48-49) so the Dockerfile installs a torchvision version compatible with
torch 2.1.2; ensure any build ARG references or install steps that use
TORCHVISION_VERSION keep the new value.

ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

Expand Down Expand Up @@ -44,8 +45,9 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
python3 -m pip cache purge
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision==${TORCHVISION_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
python3 -m pip cache purge && \
python3 -c "import torch, torchvision; torchvision.ops.nms; print('OK', torch.__version__, torchvision.__version__)"

RUN if [ "$CUDA" != "130" ] ; then \
CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
Expand Down
2 changes: 2 additions & 0 deletions docker/Dockerfile-uv
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base-uv:$BASE_TAG

ENV VIRTUAL_ENV="/workspace/axolotl-venv"

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG AXOLOTL_ARGS=""
Expand Down
7 changes: 5 additions & 2 deletions docker/Dockerfile-uv-base
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION A
ARG TARGETARCH
ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="2.6.0"
ARG TORCHVISION_VERSION="0.21.0"
ARG CUDA="126"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

Expand All @@ -30,10 +31,12 @@ WORKDIR /workspace
RUN uv venv --no-project --relocatable axolotl-venv

ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
ENV VIRTUAL_ENV="/workspace/axolotl-venv"

RUN uv pip install packaging setuptools wheel psutil \
&& uv pip install torch==${PYTORCH_VERSION} torchvision \
&& uv pip install awscli pydantic
&& uv pip install torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} \
&& uv pip install awscli pydantic \
&& python -c "import torch, torchvision; torchvision.ops.nms; print('OK', torch.__version__, torchvision.__version__)"

RUN if [ "$TARGETARCH" = "amd64" ]; then \
MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ requires-python = ">=3.10"
dependencies = [
# Core ML stack
"torch>=2.9.1",
"torchvision>=0.24.1",
"packaging==26.0",
"huggingface_hub>=1.1.7",
"peft>=0.19.1,<0.20.0",
Expand Down
Loading
Loading