thad0ctor · thad0ctor · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
@@ -38,14 +38,6 @@ jobs:
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -70,14 +62,6 @@ jobs:
             torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "130"
             cuda_version: 13.0.0
             cudnn_version: ""
@@ -208,19 +192,19 @@ jobs:
             torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
+          - cuda: "130"
+            cuda_version: 13.0.0
             cudnn_version: ""
             python_version: "3.12"
             pytorch: 2.11.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
           - cuda: "130"
             cuda_version: 13.0.0
             cudnn_version: ""
             python_version: "3.12"
-            pytorch: 2.11.0
+            pytorch: 2.12.0
             torch_cuda_arch_list: "9.0 10.0 10.3 12.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
@@ -40,8 +40,8 @@ jobs:
           #            dockerfile: "Dockerfile-uv.jinja"
           - cuda: 130
             cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+            python_version: "3.12"
+            pytorch: 2.12.0
             axolotl_extras:
             #            axolotl_extras: fbgemm-gpu
             num_gpus: 2

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -8,9 +8,6 @@ on:
 
 permissions: {}
 
-env:
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
   setup_release:
     name: Create Release
@@ -24,7 +21,10 @@ jobs:
       - name: Create release
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: gh release create "$GITHUB_REF_NAME" --generate-notes
+        # idempotent: don't fail a re-run if the release already exists
+        run: |
+          gh release view "$GITHUB_REF_NAME" >/dev/null 2>&1 \
+            || gh release create "$GITHUB_REF_NAME" --generate-notes
   pypi-publish:
     name: Upload release to PyPI
     runs-on: ubuntu-latest
@@ -47,13 +47,6 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v7
 
-      - name: Install dependencies
-        run: |
-          uv pip install wheel packaging
-          uv pip install --no-build-isolation -e .
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
       - name: Extract tag name
         id: tag
         run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT"
@@ -62,9 +55,10 @@ jobs:
         run: |
           echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
 
-      - name: Build a source dist
-        run: |
-          python setup.py sdist
+      - name: Build sdist and wheel
+        # PEP 517 build via uv (setuptools backend reads the version from VERSION);
+        # replaces the removed `python setup.py sdist` after the pyproject migration.
+        run: uv build
 
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
@@ -160,7 +160,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
   docker-e2e-multigpu-tests:
     if: github.repository_owner == 'axolotl-ai-cloud'
     # this job needs to be run on self-hosted GPU runners...
@@ -203,4 +203,4 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.multigpu
+          modal run -m cicd.multigpu
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -68,11 +68,11 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0", "2.11.0", "2.12.0"]
         exclude:
           - python_version: "3.14"
             pytorch_version: "2.9.1"
-    timeout-minutes: 25
+    timeout-minutes: 30
 
     steps:
       - name: cleanup node
@@ -155,7 +155,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0", "2.11.0", "2.12.0"]
         exclude:
           - python_version: "3.14"
             pytorch_version: "2.9.1"
@@ -274,7 +274,7 @@ jobs:
           - cuda: 130
             cuda_version: 13.0.0
             python_version: "3.12"
-            pytorch: 2.9.1
+            pytorch: 2.12.0
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -302,7 +302,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
 
   docker-e2e-tests:
     if: >
@@ -320,12 +320,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -334,8 +328,8 @@ jobs:
             axolotl_extras:
           - cuda: 130
             cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+            python_version: "3.12"
+            pytorch: 2.11.0
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -364,7 +358,7 @@ jobs:
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
 
   docker-e2e-cleanup:
     runs-on: [self-hosted, modal]
@@ -404,4 +398,4 @@ jobs:
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
-          modal run cicd.cleanup
+          modal run -m cicd.cleanup
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+uv.lock
 
 # PyInstaller
 #  Usually these files are written by a python script from a template

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.16.2.dev0
+0.17.0.dev0
diff --git a/cicd/cicd.sh b/cicd/cicd.sh
@@ -43,11 +43,20 @@ pytest --full-trace -vvv --durations=10 \
   --cov-append
 
 # Run solo tests with coverage append
+# test_rm_lora (test_reward_model_smollm2.py) is run in its own process below: it
+# fails on py3.11 when sharing the solo process with other tests (cross-test state).
 pytest -v --durations=10 -n1 \
+  --ignore=/workspace/axolotl/tests/e2e/solo/test_reward_model_smollm2.py \
   /workspace/axolotl/tests/e2e/solo/ \
   --cov=axolotl \
   --cov-append
 
+# Run reward-model test isolated in its own process
+pytest -v --durations=10 -s \
+  /workspace/axolotl/tests/e2e/solo/test_reward_model_smollm2.py \
+  --cov=axolotl \
+  --cov-append
+
 # Run integration tests with coverage append
 pytest -v --durations=10 \
   /workspace/axolotl/tests/e2e/integrations/ \

diff --git a/docs/mixed_precision.qmd b/docs/mixed_precision.qmd
@@ -54,6 +54,26 @@ bf16: true
 bf16: full  # Equivalent to bf16_full_eval in the HF trainer
 ```
 
+### Keeping norms in fp32 (FSDP2) {#sec-fp32-norms}
+
+Some models declare RMSNorm/LayerNorm layers as fp32 for training
+stability — the variance computation in RMSNorm is numerically poor in
+bf16, and the learned gain γ loses precision when stored in bf16. FSDP1
+requires a uniform dtype within each flat parameter, which conflicts with
+this; with FSDP2 each norm can have its own `MixedPrecisionPolicy`. Enable with:
+
+```{.yaml}
+fsdp_version: 2
+fp32_norms: true
+# fp32_norm_classes:        # optional override
+#   - RMSNorm
+#   - LayerNorm
+```
+
+Defaults match any class whose name ends in `RMSNorm` or `LayerNorm`. Use
+fully qualified names (`module.path.ClassName`) to pin a specific
+implementation.
+
 ## FP8 Mixed Precision {#sec-fp8}
 
 ::: {.callout-note}