Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion .bandit
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[bandit]
exclude = tests
skips = B101,B615,B102,B110
skips = B101,B615
1 change: 0 additions & 1 deletion .coderabbit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,5 @@ reviews:
auto_review:
enabled: true
drafts: false
auto_incremental_review: false
chat:
auto_reply: true
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88

select = C,E,F,W,B,B950
extend-ignore = E203, E501, W503
7 changes: 0 additions & 7 deletions .github/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,6 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o
5. Push your branch to your fork on GitHub.
6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.

#### Skipping CI Checks

You can skip certain CI checks by including specific keywords in your commit messages:

- `[skip ci]` or `skip ci` - Skips all CI checks for that commit
- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR.

## Style Guidelines

### Code Style
Expand Down
16 changes: 0 additions & 16 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@ jobs:
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.8.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
Expand Down Expand Up @@ -115,11 +110,6 @@ jobs:
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.8.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
Expand Down Expand Up @@ -179,12 +169,6 @@ jobs:
pytorch: 2.7.1
axolotl_extras: vllm
is_latest: true
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.8.0
axolotl_extras:
is_latest:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/multi-gpu-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras: vllm
pytorch: 2.7.0
axolotl_extras:
num_gpus: 2
nightly_build: "true"
- cuda: 128
cuda_version: 12.8.1
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.8.0
axolotl_extras:
pytorch: 2.7.1
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
Expand Down
62 changes: 10 additions & 52 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
timeout-minutes: 20

steps:
Expand Down Expand Up @@ -130,7 +130,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
timeout-minutes: 20

steps:
Expand Down Expand Up @@ -188,44 +188,13 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

gate-skip-e2e:
needs: [pre-commit, pytest, pytest-sdist]
runs-on: ubuntu-latest
outputs:
skip: ${{ steps.compute.outputs.skip }}
steps:
- uses: actions/github-script@v7
id: compute
with:
script: |
const token = /\[skip-e2e\]/i;
let msg = '';
if (context.eventName === 'push') {
msg = context.payload.head_commit?.message || '';
} else if (context.eventName === 'pull_request') {
const { owner, repo } = context.repo;
const prNumber = context.payload.pull_request.number;
const commits = await github.paginate(
github.rest.pulls.listCommits,
{ owner, repo, pull_number: prNumber, per_page: 100 }
);
msg = commits.at(-1)?.commit?.message || '';
}
const title = context.payload.pull_request?.title || '';
const body = context.payload.pull_request?.body || '';
const skip = token.test(msg) || token.test(title) || token.test(body);
core.setOutput('skip', String(skip));

docker-e2e-tests-1st:
# Run this job first as a gate for running the remainder of the test matrix
if: >
github.repository_owner == 'axolotl-ai-cloud' &&
(github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
needs.gate-skip-e2e.outputs.skip != 'true'
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]
needs: [pre-commit, pytest, pytest-sdist]

strategy:
fail-fast: false
Expand All @@ -240,7 +209,7 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.6.0
num_gpus: 1
axolotl_extras:
dockerfile: "Dockerfile-uv.jinja"
Expand Down Expand Up @@ -271,16 +240,13 @@ jobs:
modal run cicd.e2e_tests

docker-e2e-tests:
if: >
github.repository_owner == 'axolotl-ai-cloud' &&
(github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
needs.gate-skip-e2e.outputs.skip != 'true'
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
# Only run the remainder of the matrix if the first e2e check passed;
# this is to save on wasted compute costs for known failures that get caught in the first run
needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st]
needs: [pre-commit, pytest, docker-e2e-tests-1st]

strategy:
fail-fast: false
Expand All @@ -298,13 +264,6 @@ jobs:
pytorch: 2.7.1
num_gpus: 1
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.8.0
num_gpus: 1
gpu_type: "B200"
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -325,7 +284,6 @@ jobs:
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
- name: Run tests job on Modal
Expand All @@ -342,10 +300,10 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 126
cuda_version: 12.6.3
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.6.0
num_gpus: 1
axolotl_extras:
steps:
Expand Down
4 changes: 4 additions & 0 deletions .isort.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[settings]
profile=black
known_third_party=wandb,comet_ml
known_local_folder=src,tests
20 changes: 15 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,22 @@ repos:
- id: trailing-whitespace
- id: no-commit-to-branch
args: ['--branch', 'main']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.11
- repo: https://github.com/psf/black
rev: 25.1.0
hooks:
- id: ruff
args: [--fix]
- id: ruff-format
- id: black
- repo: https://github.com/pycqa/isort
rev: 6.0.1
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 7.3.0
hooks:
- id: flake8
- repo: https://github.com/pylint-dev/pylint
rev: v3.3.8
hooks:
- id: pylint
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.1
hooks:
Expand Down
15 changes: 15 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[MASTER]
init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"

[TYPECHECK]

# List of members which are set dynamically and missed by Pylint inference
# system, and so shouldn't trigger E1101 when accessed.
generated-members=numpy.*, torch.*


[pylint.messages_control]
disable=missing-function-docstring, line-too-long, import-error,
too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
too-many-positional-arguments, possibly-used-before-assignment
5 changes: 0 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
<br/>
<a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
<a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
<a href="https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="google-colab" style="height: 20px;"></a>
<br/>
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
Expand Down Expand Up @@ -71,10 +70,6 @@ Features:
- Python 3.11
- PyTorch ≥2.6.0

### Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)

### Installation

#### Using pip
Expand Down
10 changes: 10 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# todo list

- [ ] Validation of parameters for combinations that won't work



## things that are known not to work

- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
- adamw_bnb_8bit doesn't play well with FSDP offload
3 changes: 1 addition & 2 deletions _quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ quartodoc:
- utils.distributed
- utils.dict
- utils.optimizers.adopt
- utils.data.streaming
- utils.data.pretraining
- utils.data.sft
- utils.quantization
- title: Schemas
Expand Down Expand Up @@ -272,7 +272,6 @@ website:
contents:
- docs/batch_vs_grad.qmd
- docs/dataset_preprocessing.qmd
- docs/streaming.qmd
- docs/multipack.qmd
- docs/mixed_precision.qmd
- docs/optimizers.qmd
Expand Down
1 change: 1 addition & 0 deletions cicd/Dockerfile.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"
ENV AXOLOTL_DATASET_NUM_PROC="8"
ENV AXOLOTL_DATASET_PROCESSES="8"
Comment on lines +12 to 13

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Add deprecation note or plan for the legacy env var

Setting both AXOLOTL_DATASET_NUM_PROC and AXOLOTL_DATASET_PROCESSES is fine for a transition period. Consider annotating the legacy variable to avoid future confusion and to make the removal plan explicit.

Apply this diff to add a brief deprecation note:

 ENV AXOLOTL_DATASET_NUM_PROC="8"
-ENV AXOLOTL_DATASET_PROCESSES="8"
+# Deprecated: kept for backward-compat. Prefer AXOLOTL_DATASET_NUM_PROC; remove after deprecation window.
+ENV AXOLOTL_DATASET_PROCESSES="8"

Run this to verify the precedence in code (NUM_PROC should be preferred over PROCESSES):


🏁 Script executed:

#!/bin/bash
# Expect: A code path that reads AXOLOTL_DATASET_NUM_PROC first, then falls back to AXOLOTL_DATASET_PROCESSES.
rg -nP -C3 'AXOLOTL_DATASET_(NUM_PROC|PROCESSES)' src | sed -n '1,200p'

# Also check for any lingering config key usages of `dataset_processes` outside of deprecation/alias handling.
rg -nP -C2 '\bdataset_processes\b' | sed -n '1,200p'

Length of output: 2466


Annotate legacy AXOLOTL_DATASET_PROCESSES in cicd/Dockerfile.jinja

Verified: AXOLOTL_DATASET_NUM_PROC is preferred at runtime and dataset_processes is already deprecated/migrated in the config — please add a deprecation note in the Dockerfile to avoid confusion.

Files to note:

  • cicd/Dockerfile.jinja — lines 12–13: add deprecation comment for the legacy env var.
  • Verified precedence and deprecation handling in:
    • src/axolotl/utils/datasets.py — get_default_process_count() checks AXOLOTL_DATASET_NUM_PROC then falls back to AXOLOTL_DATASET_PROCESSES (lines ~7–11).
    • src/axolotl/utils/schemas/config.py — dataset_processes is declared deprecated (line ~225) and migrated into dataset_num_proc with warnings (lines ~1268–1279).

Apply this diff:

 ENV AXOLOTL_DATASET_NUM_PROC="8"
-ENV AXOLOTL_DATASET_PROCESSES="8"
+# Deprecated: kept for backward-compat. Prefer AXOLOTL_DATASET_NUM_PROC; remove after deprecation window.
+ENV AXOLOTL_DATASET_PROCESSES="8"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
ENV AXOLOTL_DATASET_NUM_PROC="8"
ENV AXOLOTL_DATASET_PROCESSES="8"
ENV AXOLOTL_DATASET_NUM_PROC="8"
# Deprecated: kept for backward-compat. Prefer AXOLOTL_DATASET_NUM_PROC; remove after deprecation window.
ENV AXOLOTL_DATASET_PROCESSES="8"
🤖 Prompt for AI Agents
In cicd/Dockerfile.jinja around lines 12–13, the legacy
AXOLOTL_DATASET_PROCESSES env var is set without any deprecation note; add a
concise comment next to that ENV line stating that AXOLOTL_DATASET_NUM_PROC is
the preferred runtime variable, AXOLOTL_DATASET_PROCESSES is deprecated and kept
only for backward compatibility, and that code now prefers
AXOLOTL_DATASET_NUM_PROC (see src/axolotl/utils/datasets.py and
src/axolotl/utils/schemas/config.py for migration/deprecation handling). Ensure
the comment is short, mentions backward compatibility, and does not remove the
existing ENV assignment.

Comment thread
winglian marked this conversation as resolved.

RUN apt-get update && \
Expand Down
4 changes: 3 additions & 1 deletion cicd/multigpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
modal application to run axolotl gpu tests in Modal
"""

# pylint: disable=duplicate-code

import os
import pathlib
import tempfile
Expand Down Expand Up @@ -61,7 +63,7 @@ def run_cmd(cmd: str, run_folder: str):

# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code)
exit(exit_code) # pylint: disable=consider-using-sys-exit


@app.function(
Expand Down
8 changes: 5 additions & 3 deletions cicd/single_gpu.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Modal app to run axolotl GPU tests"""

# pylint: disable=duplicate-code

import os
import pathlib
import tempfile
Expand Down Expand Up @@ -57,16 +59,16 @@
}

N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"
GPU_CONFIG = f"L40S:{N_GPUS}"


def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec

sp_env = os.environ.copy()
sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
sp_env["AXOLOTL_DATASET_PROCESSES"] = "8"

# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec
exit(exit_code)
exit(exit_code) # pylint: disable=consider-using-sys-exit
2 changes: 1 addition & 1 deletion devtools/dev_chat_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ datasets:
val_set_size: 0
output_dir: temp_debug/axolotl_outputs/model
dataset_prepared_path: temp_debug/axolotl_outputs/data
dataset_processes: 1
dataset_num_proc: 1

sequence_len: 4096
sample_packing: false
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile-base
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
python3 -m pip cache purge

Expand Down
Loading
Loading