axolotl-ai-cloud · ved1beta · Aug 15, 2025 · Aug 18, 2025 · Sep 4, 2025 · Sep 4, 2025
diff --git a/.bandit b/.bandit
@@ -1,3 +1,3 @@
 [bandit]
 exclude = tests
-skips = B101,B615,B102,B110
+skips = B101,B615
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
@@ -12,6 +12,5 @@ reviews:
   auto_review:
     enabled: true
     drafts: false
-    auto_incremental_review: false
 chat:
   auto_reply: true
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 88
+
+select = C,E,F,W,B,B950
+extend-ignore = E203, E501, W503
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -57,13 +57,6 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o
 5. Push your branch to your fork on GitHub.
 6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.
 
-#### Skipping CI Checks
-
-You can skip certain CI checks by including specific keywords in your commit messages:
-
-- `[skip ci]` or `skip ci` - Skips all CI checks for that commit
-- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR.
-
 ## Style Guidelines
 
 ### Code Style

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -36,11 +36,6 @@ jobs:
             python_version: "3.11"
             pytorch: 2.7.1
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -115,11 +110,6 @@ jobs:
             python_version: "3.11"
             pytorch: 2.7.1
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -179,12 +169,6 @@ jobs:
             pytorch: 2.7.1
             axolotl_extras: vllm
             is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
-            is_latest:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
@@ -36,15 +36,15 @@ jobs:
           - cuda: 126
             cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras: vllm
+            pytorch: 2.7.0
+            axolotl_extras:
             num_gpus: 2
             nightly_build: "true"
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
+            pytorch: 2.7.1
+            axolotl_extras: vllm
             num_gpus: 2
             nightly_build: "true"
     runs-on: [self-hosted, modal]

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -55,7 +55,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
+        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
     timeout-minutes: 20
 
     steps:
@@ -130,7 +130,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
+        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
     timeout-minutes: 20
 
     steps:
@@ -188,44 +188,13 @@ jobs:
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
-  gate-skip-e2e:
-    needs: [pre-commit, pytest, pytest-sdist]
-    runs-on: ubuntu-latest
-    outputs:
-      skip: ${{ steps.compute.outputs.skip }}
-    steps:
-      - uses: actions/github-script@v7
-        id: compute
-        with:
-          script: |
-            const token = /\[skip-e2e\]/i;
-            let msg = '';
-            if (context.eventName === 'push') {
-              msg = context.payload.head_commit?.message || '';
-            } else if (context.eventName === 'pull_request') {
-              const { owner, repo } = context.repo;
-              const prNumber = context.payload.pull_request.number;
-              const commits = await github.paginate(
-                github.rest.pulls.listCommits,
-                { owner, repo, pull_number: prNumber, per_page: 100 }
-              );
-              msg = commits.at(-1)?.commit?.message || '';
-            }
-            const title = context.payload.pull_request?.title || '';
-            const body  = context.payload.pull_request?.body  || '';
-            const skip = token.test(msg) || token.test(title) || token.test(body);
-            core.setOutput('skip', String(skip));
-
   docker-e2e-tests-1st:
     # Run this job first as a gate for running the remainder of the test matrix
-    if: >
-      github.repository_owner == 'axolotl-ai-cloud' &&
-      (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
-      needs.gate-skip-e2e.outputs.skip != 'true'
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
-    needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]
+    needs: [pre-commit, pytest, pytest-sdist]
 
     strategy:
       fail-fast: false
@@ -240,7 +209,7 @@ jobs:
           - cuda: 126
             cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
             num_gpus: 1
             axolotl_extras:
             dockerfile: "Dockerfile-uv.jinja"
@@ -271,16 +240,13 @@ jobs:
           modal run cicd.e2e_tests
 
   docker-e2e-tests:
-    if: >
-      github.repository_owner == 'axolotl-ai-cloud' &&
-      (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
-      needs.gate-skip-e2e.outputs.skip != 'true'
+    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     # Only run the remainder of the matrix if the first e2e check passed;
     # this is to save on wasted compute costs for known failures that get caught in the first run
-    needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st]
+    needs: [pre-commit, pytest, docker-e2e-tests-1st]
 
     strategy:
       fail-fast: false
@@ -298,13 +264,6 @@ jobs:
             pytorch: 2.7.1
             num_gpus: 1
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            num_gpus: 1
-            gpu_type: "B200"
-            axolotl_extras:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -325,7 +284,6 @@ jobs:
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
           echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
           echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
@@ -342,10 +300,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
             num_gpus: 1
             axolotl_extras:
     steps:

diff --git a/.isort.cfg b/.isort.cfg
@@ -0,0 +1,4 @@
+[settings]
+profile=black
+known_third_party=wandb,comet_ml
+known_local_folder=src,tests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -10,12 +10,22 @@ repos:
     -   id: trailing-whitespace
     -   id: no-commit-to-branch
         args: ['--branch', 'main']
--   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.11
+-   repo: https://github.com/psf/black
+    rev: 25.1.0
     hooks:
-    -   id: ruff
-        args: [--fix]
-    -   id: ruff-format
+    -   id: black
+-   repo: https://github.com/pycqa/isort
+    rev: 6.0.1
+    hooks:
+      - id: isort
+-   repo: https://github.com/PyCQA/flake8
+    rev: 7.3.0
+    hooks:
+    - id: flake8
+-   repo: https://github.com/pylint-dev/pylint
+    rev: v3.3.8
+    hooks:
+    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.17.1
     hooks:

diff --git a/.pylintrc b/.pylintrc
@@ -0,0 +1,15 @@
+[MASTER]
+init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"
+
+[TYPECHECK]
+
+# List of members which are set dynamically and missed by Pylint inference
+# system, and so shouldn't trigger E1101 when accessed.
+generated-members=numpy.*, torch.*
+
+
+[pylint.messages_control]
+disable=missing-function-docstring, line-too-long, import-error,
+    too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
+    too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
+    too-many-positional-arguments, possibly-used-before-assignment
diff --git a/README.md b/README.md
@@ -17,7 +17,6 @@
     <br/>
     <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
     <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
-    <a href="https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="google-colab" style="height: 20px;"></a>
     <br/>
     <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
     <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
@@ -71,10 +70,6 @@ Features:
 - Python 3.11
 - PyTorch ≥2.6.0
 
-### Google Colab
-
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)
-
 ### Installation
 
 #### Using pip

diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,10 @@
+# todo list
+
+- [ ] Validation of parameters for combinations that won't work
+
+
+
+## things that are known not to work
+
+- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
+- adamw_bnb_8bit doesn't play well with FSDP offload
diff --git a/_quarto.yml b/_quarto.yml
@@ -153,7 +153,7 @@ quartodoc:
         - utils.distributed
         - utils.dict
         - utils.optimizers.adopt
-        - utils.data.streaming
+        - utils.data.pretraining
         - utils.data.sft
         - utils.quantization
     - title: Schemas
@@ -272,7 +272,6 @@ website:
           contents:
             - docs/batch_vs_grad.qmd
             - docs/dataset_preprocessing.qmd
-            - docs/streaming.qmd
             - docs/multipack.qmd
             - docs/mixed_precision.qmd
             - docs/optimizers.qmd

diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja
@@ -9,6 +9,8 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
 ENV HF_HOME="{{ HF_HOME }}"
+ENV AXOLOTL_DATASET_NUM_PROC="8"
+# Deprecated: kept for backward-compat. Prefer AXOLOTL_DATASET_NUM_PROC; remove after deprecation window.
 ENV AXOLOTL_DATASET_PROCESSES="8"
 
 RUN apt-get update && \

diff --git a/cicd/multigpu.py b/cicd/multigpu.py
@@ -2,6 +2,8 @@
 modal application to run axolotl gpu tests in Modal
 """
 
+# pylint: disable=duplicate-code
+
 import os
 import pathlib
 import tempfile
@@ -61,7 +63,7 @@ def run_cmd(cmd: str, run_folder: str):
 
     # Propagate errors from subprocess.
     if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
 
 
 @app.function(

diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py
@@ -1,5 +1,7 @@
 """Modal app to run axolotl GPU tests"""
 
+# pylint: disable=duplicate-code
+
 import os
 import pathlib
 import tempfile
@@ -57,16 +59,16 @@
 }
 
 N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
-GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"
+GPU_CONFIG = f"L40S:{N_GPUS}"
 
 
 def run_cmd(cmd: str, run_folder: str):
     import subprocess  # nosec
 
     sp_env = os.environ.copy()
+    sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
     sp_env["AXOLOTL_DATASET_PROCESSES"] = "8"
 
     # Propagate errors from subprocess.
     if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env):  # nosec
-        exit(exit_code)
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
diff --git a/devtools/dev_chat_template.yml b/devtools/dev_chat_template.yml
@@ -13,7 +13,7 @@ datasets:
 val_set_size: 0
 output_dir: temp_debug/axolotl_outputs/model
 dataset_prepared_path: temp_debug/axolotl_outputs/data
-dataset_processes: 1
+dataset_num_proc: 1
 
 sequence_len: 4096
 sample_packing: false

diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
@@ -37,7 +37,7 @@ WORKDIR /workspace
 
 RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
     python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
-    CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \
+    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
     python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
     python3 -m pip cache purge