diff --git a/.github/workflows/consolidated-tests-ci.yml b/.github/workflows/consolidated-tests-ci.yml
deleted file mode 100644
index 6b008d4bb1..0000000000
--- a/.github/workflows/consolidated-tests-ci.yml
+++ /dev/null
@@ -1,2265 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# One consolidated CPU-only job that runs every test_* function the existing
-# CI does not already cover from this repo plus the full unsloth_zoo@main
-# CPU test suite plus unsloth_zoo.compiler.test_apply_fused_lm_head.
-#
-# Why a separate workflow:
-#   - studio-backend-ci.yml's "Repo tests (CPU)" job already auto-discovers
-#     tests/ minus tests/qlora, tests/saving, tests/utils, tests/sh. The 16
-#     Bucket-A tests below live inside those --ignore dirs (CPU-runnable but
-#     historically excluded with their GPU siblings); pulling them out into
-#     a sibling job keeps the existing 760-passed baseline stable while we
-#     prove the new pieces are green.
-#   - unsloth_zoo has no CI on main today (.github/workflows/ is empty
-#     upstream as of HEAD 030e4ba). 106 of its 111 test_* functions are
-#     CPU-runnable; the 5 GPU/vLLM ones are deselected here.
-#   - test_apply_fused_lm_head lives at unsloth_zoo/compiler.py:1983, not
-#     under tests/, so it is not picked up by `pytest tests/`. It is a
-#     plain function with no fixtures: pure regex over transformers source
-#     strings, ~5-15 s wall, no GPU.
-#
-# Strict mode: every test step is gating (no `continue-on-error`). The
-# upstream patch fixes that previously caused per-cell red have landed:
-#   - unslothai/unsloth#5319 (patch_fast_lora import, patch_sft_trainer
-#     Union, openenv OSError graceful skip).
-#   - unslothai/unsloth-zoo#628 (MoE coverage canary so old transformers
-#     skips legitimately while real discovery regressions still fail).
-# After those merges every observed cell failure was one of these two
-# things; if they regress we want a red cell, not a green-with-fail-prints
-# cell.
-
-name: Core
-
-on:
-  pull_request:
-    paths:
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'studio/**'
-      - 'tests/**'
-      - 'pyproject.toml'
-      - '.github/workflows/consolidated-tests-ci.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-    inputs:
-      unsloth_zoo_ref:
-        description: 'unsloth_zoo git ref to test against (default main)'
-        required: false
-        default: 'main'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  consolidated:
-    # Matrix: three (transformers, TRL) combos cover the failure surface the
-    # PR cares about:
-    #   1. transformers==4.57.6 + TRL latest <1.0.0 (the just-before-5.x line)
-    #   2. transformers latest 5.x + TRL latest 1.x (the absolute upstream tip;
-    #      currently 5.8.0 + 1.3.0, both BEYOND the unsloth/unsloth_zoo
-    #      <=5.5.0 / <=0.24.0 caps -- the cell exists explicitly to surface
-    #      drift signal)
-    #   3. transformers + TRL pinned by pyproject.toml's dependency entries
-    #      (resolved dynamically at job time via tomllib)
-    # fail-fast: false so each cell runs independently and a transformers /
-    # TRL drift signal in one cell does not cancel the others. No
-    # job-level or per-step `continue-on-error` -- real test failures now
-    # fail the cell. Patches with legitimate CPU-runner preconditions
-    # (real CUDA dispatcher, runtime args) are explicitly skipped via
-    # NEEDS_PRECONDITION in the runtime check shim below.
-    strategy:
-      fail-fast: false
-      matrix:
-        combo:
-          - id: t4576-trl0latest
-            label: "HF=4.57.6 + TRL<1"
-            transformers_spec: "transformers==4.57.6"
-            trl_spec: "trl>=0.18.2,<1.0.0"
-          - id: tlatest5-trl1latest
-            label: "HF=latest + TRL=latest"
-            transformers_spec: "transformers>=5,<6"
-            trl_spec: "trl>=1,<2"
-          - id: pyproject
-            label: "HF=default + TRL=default"
-            transformers_spec: "__from_pyproject__"
-            trl_spec: "__from_pyproject__"
-    name: "Core (${{ matrix.combo.label }})"
-    runs-on: ubuntu-latest
-    timeout-minutes: 35
-    # No job-level or per-step `continue-on-error`. Earlier iterations
-    # masked real test failures behind green check icons; that lie is
-    # gone. A failing test step fails the cell. NEEDS_PRECONDITION in
-    # the runtime check shim handles patches that legitimately cannot
-    # run on a CPU-only runner (real CUDA dispatcher, runtime args).
-    env:
-      UNSLOTH_ZOO_REF: ${{ inputs.unsloth_zoo_ref || 'main' }}
-      MATRIX_TRANSFORMERS_SPEC: ${{ matrix.combo.transformers_spec }}
-      MATRIX_TRL_SPEC: ${{ matrix.combo.trl_spec }}
-      MATRIX_COMBO_ID: ${{ matrix.combo.id }}
-      # Hoisted to job-level so every step (Sanity, Bucket-A, unsloth_zoo
-      # pytest, test_apply_fused_lm_head) inherits it. transformers' bundled
-      # *_pb2.py was generated against an older protoc; the C++ protobuf
-      # 4+/5+/6 implementation rejects them with "Descriptors cannot be
-      # created directly". The pure-Python parser bypasses the check; the
-      # speed cost is negligible for these tests.
-      PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
-      PYTHONPATH: ${{ github.workspace }}/studio
-      UNSLOTH_COMPILE_DISABLE: '1'
-      # unsloth_zoo/__init__.py:314 raises ImportError unless UNSLOTH_IS_PRESENT
-      # is set — normally it is set by unsloth.__init__ when unsloth is imported
-      # first. In this job we sometimes import unsloth_zoo.* (e.g.
-      # unsloth_zoo.saving_utils, unsloth_zoo.temporary_patches) without going
-      # through `import unsloth` first; pin the env var to 1 so unsloth_zoo's
-      # bootstrap accepts it. Setting it has no effect on unsloth itself.
-      UNSLOTH_IS_PRESENT: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      # Node 22 unblocks tests/studio/test_chat_preset_builtin_invariants.py's
-      # `node --experimental-strip-types` subprocess. Cheap to install; keeps
-      # the consolidated job self-sufficient even if studio-backend-ci.yml
-      # changes its node setup.
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - name: Install uv (some unsloth_zoo dev tooling expects it on PATH)
-        run: pip install uv
-
-      - name: Resolve matrix specs (handle __from_pyproject__ sentinel)
-        # The pyproject cell uses a sentinel; resolve the real `transformers`
-        # and `trl` constraints from the project's pyproject.toml at job time.
-        # unsloth's pyproject puts the LLM stack pins in
-        # [project.optional-dependencies] under the `huggingfacenotorch`
-        # extra (top-level [project.dependencies] is just typer/pydantic/etc.),
-        # so we walk every optional extra and pick the first matching spec.
-        # Other cells pass their spec through unchanged.
-        run: |
-          set -euxo pipefail
-          python <<'PY' >> "$GITHUB_ENV"
-          import os, re, tomllib
-          spec_t = os.environ["MATRIX_TRANSFORMERS_SPEC"]
-          spec_r = os.environ["MATRIX_TRL_SPEC"]
-
-          def _pkg_name(spec: str) -> str:
-              m = re.match(r"\s*([A-Za-z0-9_.-]+)", spec)
-              return (m.group(1).lower() if m else "")
-
-          if spec_t == "__from_pyproject__" or spec_r == "__from_pyproject__":
-              with open("pyproject.toml", "rb") as f:
-                  doc = tomllib.load(f)
-              proj = doc.get("project", {})
-              # Try top-level deps first, then all optional extras.
-              all_deps: list[str] = list(proj.get("dependencies", []))
-              for _name, dep_list in proj.get("optional-dependencies", {}).items():
-                  all_deps.extend(dep_list)
-
-              if spec_t == "__from_pyproject__":
-                  spec_t = next((x for x in all_deps if _pkg_name(x) == "transformers"),
-                                "transformers")
-              if spec_r == "__from_pyproject__":
-                  spec_r = next((x for x in all_deps if _pkg_name(x) == "trl"),
-                                "trl")
-          print(f"RESOLVED_TRANSFORMERS_SPEC={spec_t}")
-          print(f"RESOLVED_TRL_SPEC={spec_r}")
-          PY
-          # Echo to logs so the matrix cell label maps cleanly to a spec.
-          grep RESOLVED_ "$GITHUB_ENV" || true
-
-      - name: Install runtime deps (mirrors studio-backend-ci.yml + mlx-ci.yml)
-        # The shape matches studio-backend-ci.yml's "Repo tests (CPU)" install
-        # so we inherit the same CPU-spoof harness in tests/conftest.py and
-        # the same import-chain guarantees, plus the extra deps that the
-        # tests/saving + tests/utils Bucket-A files transitively need but
-        # which Repo tests (CPU) does not require because it --ignores
-        # those directories:
-        #   - protobuf + sentencepiece: tests/saving/test_fix_sentencepiece_gguf_robustness.py
-        #     does `from transformers.utils import sentencepiece_model_pb2`,
-        #     which imports `google.protobuf`. Not pulled by transformers'
-        #     base install.
-        #   - triton: unsloth/_gpu_init.py:232 does an unconditional
-        #     `import triton`. The triton PyPI wheel installs cleanly on
-        #     Linux x86_64 even without CUDA (the import succeeds; runtime
-        #     GPU work is what would fail, which we never do here).
-        # transformers + trl are matrix-parameterized.
-        run: |
-          set -euxo pipefail
-          python -m pip install --upgrade pip
-          pip install -r studio/backend/requirements/studio.txt
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests typer \
-            'numpy<3' pytest==9.0.3 pytest-asyncio httpx \
-            protobuf sentencepiece triton \
-            psutil packaging tqdm safetensors datasets \
-            'peft>=0.18,<0.20' 'accelerate>=0.34,<2' \
-            ipython
-          # torchvision: unsloth_zoo.vision_utils imports it at module scope.
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch>=2.4,<2.11' 'torchvision<0.26'
-          # transformers + trl from the matrix combo.
-          pip install "$RESOLVED_TRANSFORMERS_SPEC"
-          pip install "$RESOLVED_TRL_SPEC"
-          # bitsandbytes: hard import in unsloth/models/_utils.py. Recent
-          # versions ship a CPU build that imports cleanly on Linux.
-          pip install 'bitsandbytes>=0.45'
-          # unsloth itself, editable, no-deps so pip does not fight the
-          # explicit torch CPU-index install above.
-          pip install -e . --no-deps
-          echo "::group::Installed transformers + trl + torch + unsloth versions"
-          pip show transformers
-          pip show trl
-          pip show torch
-          pip show unsloth
-          echo "::endgroup::"
-
-      - name: Clone unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }}
-        # We need the repository tree (the wheel does not ship tests/), so
-        # clone shallow then editable-install so unsloth_zoo.* imports
-        # resolve to the cloned tree. We use `pip show` for the location
-        # check rather than `import unsloth_zoo` because the latter calls
-        # device_type.get_device_type() at module load and raises on a
-        # GPU-less runner; pytest steps below route through the existing
-        # tests/conftest.py spoof which handles that.
-        run: |
-          set -euxo pipefail
-          # github.com occasionally 500s on the git fetch; retry so a
-          # single upstream blip does not fail CI.
-          for attempt in 1 2 3; do
-            rm -rf "$RUNNER_TEMP/unsloth-zoo"
-            if git clone --depth=1 --branch="$UNSLOTH_ZOO_REF" \
-                https://github.com/unslothai/unsloth-zoo \
-                "$RUNNER_TEMP/unsloth-zoo"; then
-              break
-            fi
-            if [ "$attempt" -eq 3 ]; then
-              echo "::error::git clone unsloth-zoo failed after 3 attempts"
-              exit 1
-            fi
-            delay=$((5 * attempt))
-            echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..."
-            sleep "$delay"
-          done
-          pip install -e "$RUNNER_TEMP/unsloth-zoo" --no-deps
-          pip show unsloth_zoo
-
-      - name: Sanity — collection only (both repos)
-        # Catches import-time breakage before we run the suite. Cheap; bails
-        # the job out fast if a transformers/torch resolution went sideways.
-        # Inherits PYTHONPATH / UNSLOTH_COMPILE_DISABLE / PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION
-        # from the job-level env block.
-        run: |
-          set -euxo pipefail
-          python -m pytest --collect-only -q \
-            tests/saving/test_save_shell_injection.py \
-            tests/saving/test_patch_saving_none_tokenizer.py \
-            tests/saving/test_fix_sentencepiece_gguf_robustness.py \
-            tests/utils/test_attention_masks.py \
-            tests/utils/test_trunc_normal_patch.py
-          python -m pytest --collect-only -q "$RUNNER_TEMP/unsloth-zoo/tests/"
-
-      - name: import_fixes drift detectors (18 tests, HARD GATE)
-        # One drift detector per fix_* / patch_* function in
-        # unsloth/import_fixes.py. The detectors assert the *healthy*
-        # upstream shape that the fix expects ABSENT the regression;
-        # ANY DRIFT DETECTED -> pytest.fail (NEVER skip) so the
-        # matrix cell goes red and the maintainer triages on the
-        # next PR, not in a downstream user's crash report.
-        #
-        # Pathologies covered by the suite (each maps to one fix
-        # function with the line range cited in the test docstring):
-        #   * protobuf MessageFactory GetPrototype / GetMessageClass
-        #   * datasets 4.4.x recursion range
-        #   * TRL tuple-vs-bool _*_available caching
-        #   * transformers PreTrainedModel.enable_input_require_grads
-        #     source pattern flip
-        #   * transformers torchcodec / causal_conv1d availability
-        #     flags
-        #   * transformers + accelerate is_wandb_available
-        #   * peft.utils.transformers_weight_conversion importability
-        #     + build_peft_weight_mapping signature
-        #   * triton 3.6+ CompiledKernel num_ctas / cluster_dims
-        #   * torch / torchvision pinned compatibility table
-        #   * vllm guided_decoding_params / structured_outputs +
-        #     aimv2 ovis config version
-        #   * huggingface_hub is_offline_mode / HF_HUB_OFFLINE
-        #   * torch.nn.init.trunc_normal_ presence (patch site for
-        #     patch_trunc_normal_precision_issue)
-        #   * xformers post-num_splits-key fix version
-        # HARD GATE: a red cell here is a real upstream regression
-        # without a corresponding zoo / unsloth-side workaround.
-        run: |
-          python -m pytest -v --tb=short tests/test_import_fixes_drift.py
-
-      - name: public-api surface drift detectors (9 tests, HARD GATE)
-        # Companion to test_import_fixes_drift.py: that file catches
-        # third-party drift; this one catches drift in unsloth's OWN
-        # public surface (FastLanguageModel / FastVisionModel /
-        # FastModel + their classmethods + is_bf16_supported). A
-        # rename here would silently break the unslothai/notebooks tree
-        # one PR cycle later -- this gate catches it BEFORE the
-        # breakage reaches users.
-        run: |
-          python -m pytest -v --tb=short tests/test_public_api_surface.py
-
-      - name: unsloth Bucket-A — CPU tests not in Repo tests (CPU)
-        # 16 tests across 5 files. They live inside tests/saving/ and
-        # tests/utils/, both of which Repo tests (CPU) excludes via --ignore
-        # because their sibling files need real GPUs / real HF weights.
-        # The five files below are pure-Python + AST/protobuf/regex tests
-        # that run cleanly on CPU. Env inherited from the job block.
-        run: |
-          python -m pytest -q --tb=short \
-            tests/saving/test_save_shell_injection.py \
-            tests/saving/test_patch_saving_none_tokenizer.py \
-            tests/saving/test_fix_sentencepiece_gguf_robustness.py \
-            tests/utils/test_attention_masks.py \
-            tests/utils/test_trunc_normal_patch.py \
-            --deselect 'tests/utils/test_attention_masks.py::test_run_attention_flash_varlen_receives_window_and_softcap'
-          # The deselected test monkeypatches flash_attn_varlen_func, which is
-          # only bound on the module when `flash_attn` is importable. flash_attn
-          # requires CUDA + dev toolchain, which the CPU-only ubuntu-latest
-          # runner does not have. The other 15 Bucket-A tests pass cleanly.
-
-      - name: unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }} — full pytest (CPU)
-        # 106 of 111 test_* in unsloth_zoo are CPU-only. The two CUDA-skip
-        # cases below auto-skip on a GPU-less runner; deselect them
-        # explicitly so the no-CUDA outcome is "deselected", not "skipped",
-        # making intent visible in the report. Env inherited from job block.
-        working-directory: ${{ runner.temp }}/unsloth-zoo
-        run: |
-          python -m pytest -q --tb=short tests/ \
-            --deselect tests/test_unsloth_zoo_lora_merge.py::test_active_merge_device_returns_string_on_cuda_host \
-            --deselect tests/test_unsloth_zoo_lora_merge.py::test_merge_lora_moves_cpu_inputs_to_active_device
-
-      - name: unsloth_zoo — test_apply_fused_lm_head (lives in compiler.py)
-        # `test_apply_fused_lm_head` lives at unsloth_zoo/compiler.py:1983,
-        # not under tests/, so pytest's default discovery does not pick it up.
-        # We route it through pytest by writing a one-shot shim test file
-        # inside the unsloth checkout's tests/ — pytest then walks UP and
-        # picks up tests/conftest.py, whose GPU-spoof harness (lines 84-141)
-        # patches torch.cuda.is_available, torch.cuda.memory.mem_get_info,
-        # torch.cuda.get_device_capability, and is_bf16_supported. That full
-        # spoof is required because unsloth_zoo/temporary_patches/gpt_oss.py
-        # at module load reads torch.cuda.memory.mem_get_info(0), which
-        # bare `is_available = True` doesn't cover. Env inherited.
-        run: |
-          set -euxo pipefail
-          cat > tests/_zoo_apply_fused_lm_head_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          # Wraps unsloth_zoo.compiler.test_apply_fused_lm_head so that
-          # tests/conftest.py's GPU-spoof harness applies before the import.
-          # _zoo_aggressive_cuda_spoof extends conftest's harness with deeper
-          # patches (see tests/_zoo_aggressive_cuda_spoof.py).
-          import sys, pathlib
-          sys.path.insert(0, str(pathlib.Path(__file__).parent))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-          from unsloth_zoo.compiler import test_apply_fused_lm_head as _zoo_test
-          def test_zoo_apply_fused_lm_head_runs():
-              _zoo_test()
-          PY
-          python -m pytest -q --tb=short tests/_zoo_apply_fused_lm_head_shim.py
-          rm -f tests/_zoo_apply_fused_lm_head_shim.py
-
-      - name: Static checks — unsloth/trainer.py + unsloth/models/rl.py against latest pip TRL
-        # AST-only sanity: confirm both files parse and that every TRL symbol
-        # they reference still exists in the installed `trl`. Catches API
-        # drift (renamed / removed TRL classes) without running training.
-        # Checks the matrix-resolved trl from the runtime-deps step as-is;
-        # upgrading here would defeat the cell's (transformers, trl) combo.
-        run: |
-          set -euxo pipefail
-          python <<'PY'
-          import ast, importlib, pathlib, sys
-          paths = [pathlib.Path("unsloth/trainer.py"),
-                   pathlib.Path("unsloth/models/rl.py")]
-          failed = False  # exit only after every file is reported, so one log shows all drift
-          for p in paths:
-              tree = ast.parse(p.read_text(), filename=str(p))
-              missing = []
-              for node in ast.walk(tree):
-                  # Absolute `from trl[.sub] import X` only; a bare prefix test also matches e.g. `trlx`.
-                  if not (isinstance(node, ast.ImportFrom) and node.level == 0 and node.module
-                          and (node.module == "trl" or node.module.startswith("trl."))):
-                      continue
-                  try:
-                      mod = importlib.import_module(node.module)
-                  except Exception as e:  # a vanished module is drift to report, not a crash
-                      missing.append(f"{node.module} (import failed: {type(e).__name__})")
-                      continue
-                  missing += [f"{node.module}.{a.name}" for a in node.names
-                              if a.name != "*" and not hasattr(mod, a.name)]
-              print(f"{p}: TRL symbols referenced and resolved -> {'OK' if not missing else 'MISSING ' + ', '.join(missing)}")
-              failed = failed or bool(missing)
-          sys.exit(1 if failed else 0)
-          PY
-
-      - name: Static checks — unsloth_zoo/tiled_mlp.py against latest pip transformers
-        # AST parse + transformers symbol-resolution. Tiled MLP patching was
-        # flagged as the path that breaks first when transformers ships an
-        # MLP class rename; this step is the canary against whatever
-        # transformers version the matrix cell selected.
-        working-directory: ${{ runner.temp }}/unsloth-zoo
-        run: |
-          set -euxo pipefail
-          python <<'PY'
-          import ast, importlib, pathlib, sys
-          p = pathlib.Path("unsloth_zoo/tiled_mlp.py")
-          src = p.read_text()
-          tree = ast.parse(src, filename=str(p))
-          missing = []
-          for node in ast.walk(tree):
-              if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith("transformers"):
-                  try:
-                      mod = importlib.import_module(node.module)
-                  except Exception as e:
-                      missing.append(f"{node.module} (import failed: {type(e).__name__})")
-                      continue
-                  for alias in node.names:
-                      if alias.name == "*":
-                          continue
-                      if not hasattr(mod, alias.name):
-                          missing.append(f"{node.module}.{alias.name}")
-          print(f"{p}: transformers symbols referenced -> {'OK' if not missing else 'MISSING ' + ', '.join(missing)}")
-          if missing:
-              sys.exit(1)
-          PY
-
-      - name: Static checks — unsloth_zoo/hf_utils.py syntax + import-graph
-        working-directory: ${{ runner.temp }}/unsloth-zoo
-        run: |
-          set -euxo pipefail
-          python <<'PY'
-          import ast, pathlib
-          p = pathlib.Path("unsloth_zoo/hf_utils.py")
-          tree = ast.parse(p.read_text(), filename=str(p))
-          # Surface every public function + class so the PR check log shows
-          # what's covered, not just OK/FAIL.
-          public = []
-          for node in tree.body:
-              if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)) and not node.name.startswith("_"):
-                  public.append(f"{type(node).__name__.replace('Def','').lower()}:{node.name}")
-          print(f"hf_utils.py public surface ({len(public)}): " + ", ".join(public))
-          PY
-
-      - name: Runtime checks — invoke every zero-arg patch_* across both repos (via pytest shim)
-        # Routed through pytest so tests/conftest.py's GPU-spoof harness
-        # applies before any unsloth_zoo.temporary_patches.* import.
-        # Strict: the shim prints the full ledger and fails the step when any
-        # invoked zero-arg patch_* raises or when one of the two `required`
-        # helpers is absent. The former lone known failure (patch_fast_lora
-        # raising NameError: name 'fast_lora_forward' is not defined) is
-        # covered by unslothai/unsloth#5319; NEEDS_PRECONDITION is skipped.
-        run: |
-          set -euxo pipefail
-          cat > tests/_runtime_patch_check_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          # Wraps the runtime patch_* validation into a pytest test so the
-          # tests/conftest.py GPU-spoof harness applies. No continue-on-error
-          # is set at the workflow level: this shim fails when any invoked
-          # patch raises or when either `required` helper is unreachable.
-          import sys, pathlib
-          sys.path.insert(0, str(pathlib.Path(__file__).parent))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-          import importlib, inspect
-
-          MODULES = [
-              "unsloth.models._utils", "unsloth.models.rl", "unsloth.import_fixes",
-              "unsloth.kernels.cross_entropy_loss", "unsloth.kernels.rms_layernorm",
-              "unsloth.tokenizer_utils", "unsloth.save",
-              "unsloth_zoo.patching_utils", "unsloth_zoo.gradient_checkpointing",
-              "unsloth_zoo.loss_utils", "unsloth_zoo.tokenizer_utils",
-              "unsloth_zoo.tiled_mlp", "unsloth_zoo.dataset_utils",
-              "unsloth_zoo.patch_torch_functions",
-              "unsloth_zoo.temporary_patches.gemma",
-              "unsloth_zoo.temporary_patches.ministral",
-              "unsloth_zoo.temporary_patches.pixtral",
-              "unsloth_zoo.temporary_patches.deepseek_v3_moe",
-              "unsloth_zoo.temporary_patches.qwen3_5_moe",
-              "unsloth_zoo.temporary_patches.mxfp4",
-              "unsloth_zoo.temporary_patches.bitsandbytes",
-              "unsloth_zoo.temporary_patches.flex_attention_bwd",
-          ]
-          REQUIRED = {
-              "patch_unsloth_smart_gradient_checkpointing",
-              "patch_gradient_accumulation_fix",
-          }
-          # Patches whose signature looks zero-arg (`()` or all-defaulted)
-          # but which actually require either runtime args or real CUDA.
-          # Calling these in isolation is meaningless, so skip the
-          # invocation. Symbol presence (REQUIRED above) is still verified.
-          #   patch_linear_scaling / patch_llama_rope_scaling: defaults are
-          #     None placeholders; the bodies start with
-          #     `assert <param> is not None`.
-          #   patch_unsloth_smart_gradient_checkpointing: legitimately
-          #     allocates CUDA tensors via aten::empty.memory_format inside
-          #     initialize_unsloth_gradient_checkpointing(); the
-          #     torch.cuda.* spoof can't intercept that at the dispatcher
-          #     level.
-          NEEDS_PRECONDITION = {
-              "patch_linear_scaling",
-              "patch_llama_rope_scaling",
-              "patch_unsloth_smart_gradient_checkpointing",
-          }
-
-          def test_zero_arg_patch_invocations():
-              ok, fail, args, skipped, miss_imports = 0, [], [], [], {}
-              seen_required = set()
-              for mod_name in MODULES:
-                  try:
-                      mod = importlib.import_module(mod_name)
-                  except Exception as e:
-                      miss_imports[mod_name] = f"{type(e).__name__}: {e}"
-                      continue
-                  for name in sorted(dir(mod)):
-                      if not name.startswith("patch_"): continue
-                      fn = getattr(mod, name, None)
-                      if not callable(fn): continue
-                      if name in REQUIRED: seen_required.add(name)
-                      try:
-                          sig = inspect.signature(fn)
-                          need = [p.name for p in sig.parameters.values()
-                                  if p.default is inspect.Parameter.empty
-                                  and p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD,
-                                                 inspect.Parameter.POSITIONAL_ONLY)]
-                      except (TypeError, ValueError):
-                          need = []
-                      if need:
-                          args.append((mod_name, name, need)); continue
-                      if name in NEEDS_PRECONDITION:
-                          skipped.append(f"{mod_name}.{name}")
-                          print(f"  SKIP {mod_name}.{name} (needs precondition / CUDA)")
-                          continue
-                      try:
-                          fn()
-                          ok += 1
-                          print(f"  OK   {mod_name}.{name}")
-                      except Exception as e:
-                          fail.append((mod_name, name, type(e).__name__, str(e)[:200]))
-                          print(f"  FAIL {mod_name}.{name} -> {type(e).__name__}: {str(e)[:200]}")
-              print(f"\nzero-arg patch_*: ok={ok} fail={len(fail)} skipped={len(skipped)}")
-              print(f"arg-required patch_* (skipped, listed for review): {len(args)}")
-              for m, n, r in args:
-                  print(f"    needs={r}: {m}.{n}")
-              if skipped:
-                  print(f"explicitly skipped (needs precondition / CUDA): {skipped}")
-              if miss_imports:
-                  print("\nmodules failed to import (skipped):")
-                  for k, v in miss_imports.items():
-                      print(f"    {k}: {v}")
-              print(f"required patch_* helpers seen: {sorted(seen_required)}")
-              missing = REQUIRED - seen_required
-              assert not missing, f"required patch_* helpers MISSING: {sorted(missing)}"
-              # Strict: any zero-arg patch that raises is a real
-              # regression now that #5319 has landed (the three previously
-              # known-broken patches are fixed; legitimate
-              # CPU-precondition skips are recorded in NEEDS_PRECONDITION
-              # above, not in `fail`). Print all failures and re-raise
-              # them as one assertion message.
-              if fail:
-                  raise AssertionError(
-                      f"zero-arg patch_* invocation failures (ok={ok}, "
-                      f"fail={len(fail)}, skipped={len(skipped)}):\n  "
-                      + "\n  ".join(
-                          f"{m}.{n} -> {ec}: {msg}" for m, n, ec, msg in fail
-                      )
-                  )
-          PY
-          python -m pytest -q --tb=short tests/_runtime_patch_check_shim.py -s
-          rm -f tests/_runtime_patch_check_shim.py
-
-      - name: Runtime checks — patch_tiled_mlp on a synthetic MLP module (via pytest shim)
-        # Same shim pattern: pytest picks up tests/conftest.py before importing
-        # unsloth_zoo.tiled_mlp, so the GPU-spoof harness covers
-        # unsloth_zoo.temporary_patches.gpt_oss's mem_get_info call.
-        run: |
-          set -euxo pipefail
-          cat > tests/_tiled_mlp_check_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          import sys, pathlib
-          sys.path.insert(0, str(pathlib.Path(__file__).parent))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-          import torch
-          import torch.nn as nn
-          from unsloth_zoo.tiled_mlp import patch_tiled_mlp, patch_mlp
-
-          class _MLP(nn.Module):
-              """Minimal gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x))."""
-              def __init__(self, hidden=64, intermediate=128):
-                  super().__init__()
-                  # Layers are created in gate / up / down order so seeded
-                  # weight initialisation consumes the RNG in a fixed sequence.
-                  self.gate_proj = nn.Linear(hidden, intermediate, bias=False)
-                  self.up_proj = nn.Linear(hidden, intermediate, bias=False)
-                  self.down_proj = nn.Linear(intermediate, hidden, bias=False)
-                  self.act_fn = nn.SiLU()
-              def forward(self, x):
-                  gated = self.act_fn(self.gate_proj(x))
-                  lifted = self.up_proj(x)
-                  return self.down_proj(gated * lifted)
-
-          class _FakeModel(nn.Module):
-              """Two residual blocks; each block is a ModuleDict with one `mlp` entry."""
-              def __init__(self):
-                  super().__init__()
-                  blocks = []
-                  for _ in range(2):
-                      blocks.append(nn.ModuleDict({"mlp": _MLP()}))
-                  self.layers = nn.ModuleList(blocks)
-              def forward(self, x):
-                  hidden_states = x
-                  for block in self.layers:
-                      hidden_states = hidden_states + block["mlp"](hidden_states)
-                  return hidden_states
-
-          def test_patch_tiled_mlp_numerical_equivalence():
-              """The patched (tiled) forward must reproduce the un-patched
-              forward to within 1e-3 max-abs error on a sequence long enough
-              to need more than one shard."""
-              # `patch_mlp(target_arctic=True)` sets `chunk_size = max(1, H)`
-              # and shards the SEQUENCE dim with `n_shards = max(1, S //
-              # chunk_size)`. Pick S > H so the tiled path actually runs
-              # multi-shard (n_shards = 192 // 64 = 3, no remainder shard)
-              # rather than degenerating to n_shards = 1 which is
-              # bit-exact and only confirms patching installed something.
-              # If the tiled implementation is correct, multi-shard output
-              # must still match the un-tiled reference within FP32 noise.
-              torch.manual_seed(0)
-              m = _FakeModel().eval()
-              hidden = 64
-              # 192 = 3 * hidden, so divmod(192, 64) = (3, 0) -> 3 shards,
-              # no remainder; gives a clean multi-shard verification.
-              x = torch.randn(2, 192, hidden)
-              # Reference output, taken before any patching is applied.
-              with torch.no_grad():
-                  y_before = m(x).clone()
-              patch_mlp(m.layers[0]["mlp"])
-              patch_tiled_mlp(m)
-              # Sanity-check we are actually exercising the multi-shard
-              # path: poke chunk_size by re-deriving it the same way
-              # `tiled_forward_arctic_size` does.
-              # NOTE(review): this recomputes the shard count from the local
-              # constants above; it reads nothing back from the patched
-              # module, so it guards the test inputs, not the implementation.
-              S = x.shape[1]
-              chunk = max(1, hidden)
-              n_shards_expected = max(1, S // chunk)
-              assert n_shards_expected > 1, (
-                  "tiled MLP shim is not exercising multi-shard: "
-                  f"S={S}, chunk={chunk}, n_shards={n_shards_expected}"
-              )
-              with torch.no_grad():
-                  y_after = m(x).clone()
-              # Largest element-wise deviation between reference and patched run.
-              err = (y_before - y_after).abs().max().item()
-              print(
-                  f"patch_tiled_mlp multi-shard (n_shards={n_shards_expected}) "
-                  f"output diff = {err:.3e}"
-              )
-              assert err < 1e-3, f"tiled MLP output drifted: {err}"
-          PY
-          # Delete the generated shim on every exit path: under `set -e` a
-          # failing pytest would otherwise abort before a trailing `rm -f`.
-          trap 'rm -f tests/_tiled_mlp_check_shim.py' EXIT
-          python -m pytest -q --tb=short tests/_tiled_mlp_check_shim.py -s
-
-      - name: Compiler cache hygiene + source-rewriter invariants (synthetic inputs)
-        # Lightweight pipeline coverage for unsloth_zoo.compiler. Pure regex
-        # / tokenize / ast paths driven by tiny synthetic source strings:
-        #   - higher_precision_softmax (basic + idempotent)
-        #   - fix_rotary_embedding_dtype (no-op + active under
-        #     UNSLOTH_FORCE_CUSTOM_DTYPE)
-        #   - fix_attention_dtype_consistency (insert + idempotent)
-        #   - convert_attention_masks_to_bool (rewrite + no-op)
-        #   - create_new_function happy-path (versioning block, license
-        #     header, AST parse, importlib re-import)
-        #   - create_new_function **kwargs collision (exercises
-        #     _rewrite_kwargs_param + _insert_kwargs_alias)
-        #   - UNSLOTH_COMPILE_OVERWRITE=0 forced-recompile on transformers
-        #     version mismatch (compiler.py:947-963)
-        #   - matching short-circuit when versions are equal
-        # No real transformers modeling module is loaded; complements the
-        # heavier real-class round-trip step below. Wall-time ~10-25s.
-        run: |
-          set -euxo pipefail
-          cat > tests/_compiler_cache_invariants_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          # Cache-hygiene + source-rewriter invariants for unsloth_zoo.compiler.
-          import sys, pathlib, os, ast, importlib, importlib.util, time
-          sys.path.insert(0, str(pathlib.Path(__file__).parent))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-          import pytest
-          import torch  # noqa: F401  (compiler.py imports torch at module load)
-
-
-          def _isolate_cache(tmp_path, monkeypatch):
-              """Return a freshly imported unsloth_zoo.compiler bound to tmp_path.
-              UNSLOTH_COMPILE_LOCATION is exported first because compiler.py
-              captures it at module load (line 75/179); any cached module is
-              dropped so that load happens again, and the two module globals
-              are then pinned explicitly."""
-              cache_dir = str(tmp_path)
-              monkeypatch.setenv("UNSLOTH_COMPILE_LOCATION", cache_dir)
-              sys.modules.pop("unsloth_zoo.compiler", None)
-              import unsloth_zoo.compiler as compiler
-              compiler.UNSLOTH_COMPILE_LOCATION = cache_dir
-              compiler.UNSLOTH_COMPILE_USE_TEMP = False
-              return compiler
-
-
-          def test_higher_precision_softmax_basic_and_idempotent(tmp_path, monkeypatch):
-              """Both softmax spellings gain a float32 cast, and a second pass
-              over the rewritten text leaves it unchanged."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              original = "".join([
-                  "y = nn.functional.softmax(x, dim=-1)\n",
-                  "z = F.softmax(a, dim=1, dtype=torch.bfloat16)\n",
-              ])
-              rewritten = compiler.higher_precision_softmax(original)
-              expected_casts = (
-                  "dtype = torch.float32).to(x.dtype)",
-                  "dtype = torch.float32).to(a.dtype)",
-              )
-              for cast in expected_casts:
-                  assert cast in rewritten
-              # Idempotency landed in unslothai/unsloth-zoo#631
-              # (negative-lookahead on `.to(<var>.dtype)` so a second
-              # pass does not append another cast).
-              second_pass = compiler.higher_precision_softmax(rewritten)
-              assert second_pass == rewritten
-
-
-          def test_fix_rotary_dtype_no_op_without_env(tmp_path, monkeypatch):
-              """With UNSLOTH_FORCE_CUSTOM_DTYPE unset the rotary rewriter must
-              hand back its input untouched."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              monkeypatch.delenv("UNSLOTH_FORCE_CUSTOM_DTYPE", raising=False)
-              rotary_line = "out = cos.to(dtype=x.dtype) + sin.to(dtype=x.dtype)\n"
-              result = compiler.fix_rotary_embedding_dtype(rotary_line)
-              assert result == rotary_line
-
-
-          def test_fix_rotary_dtype_active(tmp_path, monkeypatch):
-              """With the custom-dtype env vars set, neither original cast
-              expression survives the rewrite."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              forced_env = {
-                  "UNSLOTH_FORCE_CUSTOM_DTYPE":
-                      "float16;torch.float32;torch.bfloat16;torch.float16;pass",
-                  "UNSLOTH_FORCE_FLOAT32": "1",
-              }
-              for key, value in forced_env.items():
-                  monkeypatch.setenv(key, value)
-              rotary_line = "out = cos.to(dtype=x.dtype) + sin.to(dtype=x.dtype)\n"
-              rewritten = compiler.fix_rotary_embedding_dtype(rotary_line)
-              # Active form rewrites cos.to / sin.to. Either the conditional
-              # form or the cast form is acceptable -- different transformers
-              # versions surface slightly different outputs from the rewriter.
-              for stale_cast in ("cos.to(dtype=x.dtype)", "sin.to(dtype=x.dtype)"):
-                  assert stale_cast not in rewritten
-
-
-          def test_fix_attention_dtype_consistency_insert_then_idempotent(tmp_path, monkeypatch):
-              """The value_states cast is inserted exactly once, and re-running
-              the rewriter on its own output is a no-op."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              rotary_call = (
-                  "    query_states, key_states = apply_rotary_pos_emb("
-                  "query_states, key_states, cos, sin)\n"
-              )
-              body = rotary_call + "    attn = q @ k.T\n"
-              patched = compiler.fix_attention_dtype_consistency(body)
-              cast_line = "value_states = value_states.to(query_states.dtype)"
-              assert patched.count(cast_line) == 1
-              repatched = compiler.fix_attention_dtype_consistency(patched)
-              assert repatched == patched
-
-
-          def test_convert_attention_masks_to_bool_rewrites(tmp_path, monkeypatch):
-              """A finfo-min style mask builder is rewritten to include a
-              `!=torch.finfo` comparison."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              mask_fn = "".join([
-                  "def make_mask(x):\n",
-                  "    out = torch.finfo(x.dtype).min * x\n",
-                  "    return out\n",
-              ])
-              converted = compiler.convert_attention_masks_to_bool("make_mask", mask_fn)
-              # Loose match: rewriter inserts a `!=torch.finfo(...).min` check
-              # somewhere on the return path. Tightening to an exact
-              # last-line match is brittle across transformers versions.
-              assert "!=torch.finfo" in converted
-
-
-          def test_convert_attention_masks_to_bool_no_op(tmp_path, monkeypatch):
-              """A function without finfo-min masking is passed through verbatim."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              plain_fn = "def make_mask(x):\n    return x\n"
-              converted = compiler.convert_attention_masks_to_bool("make_mask", plain_fn)
-              assert converted == plain_fn
-
-
-          def _versioning_lines(file_text):
-              """Return the non-empty lines that precede __UNSLOTH_VERSIONING__,
-              excluding triple-quote lines (four version strings in a
-              well-formed cache file). Asserts the text opens with a
-              triple-quote line."""
-              assert file_text.startswith('"""\n'), "missing opening triple-quote"
-              preamble, _, _ = file_text.partition("__UNSLOTH_VERSIONING__")
-              kept = []
-              for row in preamble.splitlines():
-                  if not row or row == '"""':
-                      continue
-                  kept.append(row)
-              return kept
-
-
-          def test_create_new_function_happy_path(tmp_path, monkeypatch):
-              """create_new_function writes f_happy.py with a four-line version
-              header and a single license header; the file parses, re-imports,
-              and its `f` carries the float32 softmax cast."""
-              import inspect as _inspect
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              softmax_fn = "def f(x):\n    return nn.functional.softmax(x, dim=-1)\n"
-              compiler.create_new_function(
-                  name="f_happy",
-                  new_source=softmax_fn,
-                  model_location="builtins",
-                  functions=[],
-                  overwrite=True,
-              )
-              emitted = tmp_path / "f_happy.py"
-              assert emitted.exists()
-              emitted_text = emitted.read_text(encoding="utf-8")
-              header_versions = _versioning_lines(emitted_text)
-              assert len(header_versions) == 4, header_versions
-              assert emitted_text.count(compiler._full_license_header) == 1
-              ast.parse(emitted_text)
-              reimport_spec = importlib.util.spec_from_file_location("f_happy_reimport", emitted)
-              reloaded = importlib.util.module_from_spec(reimport_spec)
-              reimport_spec.loader.exec_module(reloaded)
-              assert callable(reloaded.f)
-              # higher_precision_softmax should have promoted to float32.
-              assert "dtype = torch.float32" in _inspect.getsource(reloaded.f)
-
-
-          def test_create_new_function_overwrite_zero_recompiles_on_version_mismatch(
-              tmp_path, monkeypatch,
-          ):
-              """A cached file whose header carries a stub transformers version
-              must be regenerated even with UNSLOTH_COMPILE_OVERWRITE=0, and
-              the new header must record the installed transformers version."""
-              import importlib.metadata as _md
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              fn_name = "vmismatch"
-              cache_file = tmp_path / f"{fn_name}.py"
-              stale_header = '"""\n0.0.0\n0.0.0\n0.0.0-stub\n0.0.0\n__UNSLOTH_VERSIONING__\n"""\n'
-              stale_body = "def vmismatch(x):\n    return x\n"
-              cache_file.write_text(
-                  stale_header + compiler._full_license_header + stale_body,
-                  encoding="utf-8",
-              )
-              monkeypatch.setenv("UNSLOTH_COMPILE_OVERWRITE", "0")
-              compiler.create_new_function(
-                  name=fn_name,
-                  new_source="def vmismatch(x):\n    return x + 1\n",
-                  model_location="builtins",
-                  functions=[],
-                  overwrite=False,
-              )
-              regenerated = cache_file.read_text(encoding="utf-8")
-              assert "0.0.0-stub" not in regenerated, (
-                  "OVERWRITE=0 + transformers-version-mismatch did NOT recompile"
-              )
-              # Index 2 is the third header line, where the stub version sat.
-              header_versions = _versioning_lines(regenerated)
-              assert header_versions[2] == _md.version("transformers")
-
-
-          def test_create_new_function_overwrite_zero_short_circuits_when_versions_match(
-              tmp_path, monkeypatch,
-          ):
-              """After one forced compile, a second call with
-              UNSLOTH_COMPILE_OVERWRITE=0 and overwrite=False must leave the
-              cached file's mtime untouched."""
-              compiler = _isolate_cache(tmp_path, monkeypatch)
-              fn_name = "vmatch"
-              fn_source = "def vmatch(x):\n    return x\n"
-              def _compile(overwrite):
-                  # A fresh `functions` list is passed on every call.
-                  compiler.create_new_function(
-                      name=fn_name, new_source=fn_source, model_location="builtins",
-                      functions=[], overwrite=overwrite,
-                  )
-              _compile(True)
-              cache_file = tmp_path / f"{fn_name}.py"
-              first_mtime = cache_file.stat().st_mtime_ns
-              # Pause so that a rewrite would land on a later timestamp.
-              time.sleep(0.05)
-              monkeypatch.setenv("UNSLOTH_COMPILE_OVERWRITE", "0")
-              _compile(False)
-              assert cache_file.stat().st_mtime_ns == first_mtime, (
-                  "OVERWRITE=0 + matching versions should NOT rewrite the file"
-              )
-          PY
-          # Delete the generated shim on every exit path: under `set -e` a
-          # failing pytest would otherwise abort before a trailing `rm -f`.
-          trap 'rm -f tests/_compiler_cache_invariants_shim.py' EXIT
-          python -m pytest -q --tb=short tests/_compiler_cache_invariants_shim.py
-
-      - name: Compiler full-model-sweep (every transformers.models.*) + SFT trainer round-trip
-        # Calls `unsloth_compile_transformers(model_type=...)` against EVERY
-        # `transformers.models.<x>` package the matrix's transformers ships
-        # (pkgutil.iter_modules walk -- 383 packages on 4.57.6, similar on
-        # latest), then ast.parse / importlib-load / introspect the
-        # generated unsloth_compiled_cache/*.py file per model. Catches
-        # regex / source-rewriter drift across the matrix's (transformers,
-        # trl) combination -- the dominant failure mode of
-        # `unsloth_compile_transformers` after a transformers point release.
-        #
-        # The model_types that currently break the compiler are listed in
-        # KNOWN_BROKEN_COMPILE below with their failure mode (the comment
-        # on that dict records which transformers line they were seen on),
-        # so the sweep stays green and any NEW breakage surfaces as red.
-        # Each entry is tracked for an individual fix PR on unsloth-zoo. The
-        # list is split by failure category so follow-up PRs can target one
-        # bug at a time.
-        #
-        # Hermetic cache dir per pytest invocation; we override the
-        # job-level UNSLOTH_COMPILE_DISABLE=1 inside the shim so
-        # compilation actually runs here. Wall-time estimate ~2-3 min
-        # warm (mean ~0.3s/model, 383 models = ~110s on the runner).
-        run: |
-          set -euxo pipefail
-          cat > tests/_zoo_compiler_cache_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          import os, sys, ast, pathlib, importlib.util, tempfile
-          _HERE = pathlib.Path(__file__).parent
-          sys.path.insert(0, str(_HERE))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-
-          # Hermetic cache dir + force compile path. The compiler's
-          # globals (UNSLOTH_COMPILE_LOCATION, UNSLOTH_COMPILE_USE_TEMP)
-          # are captured at module load; an earlier conftest `import
-          # unsloth` may have already imported unsloth_zoo.compiler with
-          # the default "unsloth_compiled_cache" path. Mutate the live
-          # module globals after import so this shim is robust to that
-          # ordering. Otherwise the compiler silently writes to the
-          # default cache and the per-model file assertion fails.
-          _CACHE = pathlib.Path(tempfile.mkdtemp(prefix="unsloth_cache_"))
-          os.environ["UNSLOTH_COMPILE_LOCATION"] = str(_CACHE)
-          os.environ["UNSLOTH_COMPILE_OVERWRITE"] = "1"
-          os.environ.pop("UNSLOTH_COMPILE_DISABLE", None)
-
-          import pytest
-          import unsloth_zoo.compiler as _zoo_compiler
-          _zoo_compiler.UNSLOTH_COMPILE_LOCATION = str(_CACHE)
-          _zoo_compiler.UNSLOTH_COMPILE_USE_TEMP = False
-          from unsloth_zoo.compiler import unsloth_compile_transformers
-
-
-          def _verify_file(path: pathlib.Path, must_expose):
-              """Check that `path` exists, parses as Python, executes as a
-              module, and defines every name in `must_expose`. A missing file
-              or attribute is an AssertionError; parse and import errors
-              propagate unchanged."""
-              assert path.exists(), f"compiler did not write {path}"
-              ast.parse(path.read_text(encoding="utf-8"), filename=str(path))
-              module_spec = importlib.util.spec_from_file_location(path.stem, path)
-              loaded = importlib.util.module_from_spec(module_spec)
-              module_spec.loader.exec_module(loaded)
-              for wanted in must_expose:
-                  # The message lists up to 25 public names to aid debugging.
-                  assert hasattr(loaded, wanted), (
-                      f"{path.name} missing expected attr {wanted!r}; "
-                      f"found: {sorted(n for n in dir(loaded) if not n.startswith('_'))[:25]}"
-                  )
-
-
-          # ---------- Full transformers.models.* compile sweep ----------
-          # Track the model_types that currently break the compiler on
-          # transformers >=5,<6. After unsloth-zoo#632 landed, transformers
-          # 4.57.6 has zero failures across all model_types; the 30 entries
-          # below are the residual failures on the tf 5.x line. New breakage
-          # on any OTHER model_type fails the cell. Each entry is a
-          # tracking item for a follow-up unsloth-zoo PR.
-          # Only the keys are consulted by the sweep (membership test); the
-          # values are human-readable notes on the observed failure.
-          KNOWN_BROKEN_COMPILE = {
-              # Category A: `string index out of range` in source rewriter.
-              "colpali":         "string index out of range",
-              "colqwen2":        "string index out of range",
-              "colmodernvbert":  "string index out of range",
-              "dpr":             "string index out of range",
-              "gemma4_assistant":"string index out of range",
-              "rag":             "string index out of range",
-              "shieldgemma2":    "string index out of range",
-              "timm_backbone":   "string index out of range",
-              # Category B: rewriter emits invalid Python source.
-              "clvp":            "emitted file: unexpected indent",
-              "falcon_mamba":    "emitted file: unexpected indent",
-              "gpt2":            "emitted file: unexpected indent",
-              "imagegpt":        "emitted file: unexpected indent",
-              "mamba":           "emitted file: unexpected indent",
-              "tapas":           "emitted file: expected ':'",
-              "xlstm":           "emitted file: unexpected indent",
-              # Category B-2: emit unterminated string literal (latest tf).
-              "audioflamingo3":  "emitted file: unterminated string literal",
-              "musicflamingo":   "emitted file: unterminated string literal",
-              "voxtral":         "emitted file: unterminated string literal",
-              "voxtral_realtime":"emitted file: unterminated string literal",
-              # Category C: rewriter emits unclosed paren.
-              "kosmos2":         "emitted file: '(' was never closed",
-              "kosmos2_5":       "emitted file: '(' was never closed",
-              # Category D: imports list builder picks up a non-exported name.
-              "auto":            "module has no attribute _BaseModelWithGenerate",
-              "bit":             "module has no attribute Linear",
-              "regnet":          "module has no attribute Linear",
-              "resnet":          "module has no attribute Linear",
-              # Category E: undefined name in emitted file.
-              "perceiver":       "name 'AbstractPreprocessor' is not defined",
-              "sam3_lite_text":  "name 'Sam3LiteTextLayerScaledResidual' is not defined",
-              # Category F: compile exceeds 60s budget on the runner.
-              # First seen on transformers >=5,<6; each represents a slow
-              # or recursive source-rewriter path the zoo can address.
-              "beit":            "TimeoutError: compile exceeds per-model budget",
-              "sam":             "TimeoutError: compile exceeds per-model budget",
-              "sam_hq":          "TimeoutError: compile exceeds per-model budget",
-          }
-
-
-          def _all_model_types():
-              """Sorted names of every sub-package found directly under
-              transformers.models."""
-              import pkgutil
-              import transformers.models as tm
-              package_names = []
-              for entry in pkgutil.iter_modules(tm.__path__):
-                  if entry.ispkg:
-                      package_names.append(entry.name)
-              package_names.sort()
-              return package_names
-
-
-          def test_compile_every_transformers_model_type():
-              """Run unsloth_compile_transformers across every model_type
-              the matrix's transformers ships. Allowed outcomes:
-                ok      -> unsloth_compile_transformers returned without
-                           raising (this test does not itself re-parse or
-                           import the emitted cache file)
-                skipped -> `modeling_<x>` could not be imported (no such
-                           file, e.g. umbrella packages like `deprecated`,
-                           or any other ImportError)
-                known   -> in KNOWN_BROKEN_COMPILE; tracked for follow-up.
-              Any uncaught failure fails the cell.
-
-              Per-model SIGALRM cap so one infinite-looping model_type
-              cannot wedge the whole sweep + nuke the job timeout
-              (observed on transformers >=5,<6 -- 30+ min hang before
-              this guard landed)."""
-              import importlib as _il
-              import signal
-              ok = 0
-              skipped = []
-              known = []
-              new_failures = []
-              models = _all_model_types()
-              def _on_timeout(signum, frame):
-                  raise TimeoutError("compile exceeded per-model budget")
-              # Install the timeout handler for the whole sweep; the previous
-              # handler is put back in the `finally` below.
-              prev_handler = signal.signal(signal.SIGALRM, _on_timeout)
-              try:
-                  for i, model_type in enumerate(models):
-                      if i % 25 == 0:
-                          print(f"  sweep progress: {i}/{len(models)} -> {model_type}", flush=True)
-                      modeling_path = f"transformers.models.{model_type}.modeling_{model_type}"
-                      try:
-                          _il.import_module(modeling_path)
-                      except (ModuleNotFoundError, ImportError):
-                          # Recorded as skipped whatever the ImportError cause.
-                          skipped.append((model_type, "no modeling file"))
-                          continue
-                      # Arm a 60 s budget for this model_type; it is disarmed
-                      # on both the failure path and the success path below.
-                      signal.alarm(60)
-                      try:
-                          unsloth_compile_transformers(
-                              model_type=model_type, fast_lora_forwards=False,
-                          )
-                      except Exception as e:
-                          # The handler's TimeoutError is an Exception subclass,
-                          # so budget overruns are classified here as well.
-                          signal.alarm(0)
-                          msg = f"{type(e).__name__}: {str(e)[:200]}"
-                          if model_type in KNOWN_BROKEN_COMPILE:
-                              known.append((model_type, msg))
-                          else:
-                              new_failures.append((model_type, msg))
-                          continue
-                      signal.alarm(0)
-                      if model_type in KNOWN_BROKEN_COMPILE:
-                          # Came back green unexpectedly -- that's GOOD news,
-                          # the bug was fixed. Surface it so we can drop the
-                          # entry from KNOWN_BROKEN_COMPILE.
-                          print(
-                              f"  UNEXPECTED-OK {model_type}: was in "
-                              "KNOWN_BROKEN_COMPILE, now compiles cleanly. "
-                              "Drop the entry."
-                          )
-                      ok += 1
-              finally:
-                  # Always disarm any pending alarm and restore the handler.
-                  signal.alarm(0)
-                  signal.signal(signal.SIGALRM, prev_handler)
-              print(f"\nCompile sweep: ok={ok} skipped={len(skipped)} "
-                    f"known-broken={len(known)} new-failures={len(new_failures)}")
-              for m, r in known:
-                  print(f"  KNOWN  {m}: {r}")
-              # Only the first 30 new failures are printed in full; the assert
-              # message below still names every failing model_type.
-              for m, r in new_failures[:30]:
-                  print(f"  NEW    {m}: {r}")
-              if len(new_failures) > 30:
-                  print(f"  ...and {len(new_failures)-30} more new failures")
-              assert not new_failures, (
-                  f"unsloth_compile_transformers introduced new failures on "
-                  f"{len(new_failures)} model_types not in the known-broken "
-                  f"list: {[m for m, _ in new_failures]}"
-              )
-              # Sanity floor: at least 200 model_types should compile cleanly
-              # (we observed 362 ok / 383 total on transformers 4.57.6).
-              assert ok >= 200, (
-                  f"only {ok} model_types compiled cleanly; expected >=200. "
-                  "Possible transformers-version-induced regression."
-              )
-
-
-          @pytest.mark.parametrize("model_type,rms_class", [
-              ("llama", "LlamaRMSNorm"),
-              ("qwen3", "Qwen3RMSNorm"),
-              ("gemma3", "Gemma3RMSNorm"),
-          ])
-          def test_compile_real_modeling_module(model_type, rms_class):
-              """Spot-check three production-relevant families: the emitted
-              cache file must expose the model-specific RMSNorm class, not
-              merely parse and import.
-
-              ``unsloth_compile_transformers`` is not idempotent in-process:
-              a second call on an already rewritten modeling module
-              corrupts the inspect source/line cache and emits malformed
-              Python. The cache file left behind by the full sweep is
-              therefore reused when present, and a compile is triggered
-              only when this test runs in isolation (no sweep preceded)."""
-              import importlib as _il
-              module_name = f"transformers.models.{model_type}.modeling_{model_type}"
-              try:
-                  modeling = _il.import_module(module_name)
-              except ModuleNotFoundError:
-                  pytest.skip(
-                      f"transformers build lacks model_type={model_type}"
-                  )
-              cache_file = _CACHE / f"unsloth_compiled_module_{model_type}.py"
-              needs_compile = not cache_file.exists()
-              if needs_compile:
-                  unsloth_compile_transformers(
-                      model_type=model_type, fast_lora_forwards=False,
-                  )
-                  # Look the module up again after the compile.
-                  modeling = _il.import_module(module_name)
-              patched_flag = getattr(modeling, "__UNSLOTH_PATCHED__", False)
-              assert patched_flag is True
-              _verify_file(cache_file, must_expose=[rms_class])
-
-
-          def test_compile_disable_writes_nothing():
-              """Negative control: with UNSLOTH_COMPILE_DISABLE=1 a compile
-              call must leave the cache directory listing unchanged. The env
-              var is removed again afterwards, whatever the outcome."""
-              import importlib as _il
-              os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
-              try:
-                  before = set(_CACHE.iterdir())
-                  # Pick a model_type that still resolves on this transformers.
-                  probe = None
-                  for candidate in ("llama", "mistral", "qwen2"):
-                      try:
-                          _il.import_module(
-                              f"transformers.models.{candidate}.modeling_{candidate}"
-                          )
-                      except ModuleNotFoundError:
-                          continue
-                      probe = candidate
-                      break
-                  if probe is None:
-                      pytest.skip("no probe model_type available")
-                  unsloth_compile_transformers(
-                      model_type=probe, fast_lora_forwards=False,
-                  )
-                  after = set(_CACHE.iterdir())
-                  assert after == before, (
-                      f"DISABLE=1 still wrote: {[p.name for p in after - before]}"
-                  )
-              finally:
-                  os.environ.pop("UNSLOTH_COMPILE_DISABLE", None)
-
-
-          def test_compile_sft_trainer_patch():
-              """Push TRL's SFTTrainer through the rl.py patch path and, when
-              UnslothSFTTrainer.py is emitted, verify it parses, imports and
-              exposes UnslothSFTTrainer. Every earlier obstacle is a skip."""
-              pytest.importorskip("trl")
-              try:
-                  from unsloth.models.rl import _patch_trl_rl_trainers as patch_trainers
-              except ImportError:
-                  pytest.skip("unsloth.models.rl._patch_trl_rl_trainers absent")
-              try:
-                  patch_trainers("sft_trainer")
-              except Exception as err:
-                  # TRL 1.x renames break the patch helper internally; we
-                  # accept that here and skip rather than fail the cell.
-                  pytest.skip(f"_patch_trl_rl_trainers raised: {type(err).__name__}: {err}")
-              generated = _CACHE / "UnslothSFTTrainer.py"
-              if generated.exists():
-                  _verify_file(generated, must_expose=["UnslothSFTTrainer"])
-                  return
-              pytest.skip(
-                  "_patch_trl_rl_trainers ran but did not emit "
-                  "UnslothSFTTrainer.py on this TRL version."
-              )
-          PY
-          # Delete the generated shim on every exit path: under `set -e` a
-          # failing pytest would otherwise abort before a trailing `rm -f`.
-          trap 'rm -f tests/_zoo_compiler_cache_shim.py' EXIT
-          python -m pytest -q --tb=short tests/_zoo_compiler_cache_shim.py
-
-      - name: TRL trainer + Config auto-discovery + dynamic patch coverage
-        # Mirror unsloth/models/rl.py:patch_trl_rl_trainers AND verify the
-        # dynamic per-version patch surface:
-        #   1. AST-parse every *_trainer / *_config submodule.
-        #   2. Apply the same *Trainer / *Config discovery rules
-        #      _patch_trl_rl_trainers uses (rl.py:553-620).
-        #   3. Orphan check: every <x>_trainer must have a sibling
-        #      <x>_config OR an inline *Config.
-        #   4. Dynamic count: enumerate every canonical trainer that
-        #      imports cleanly, run patch_trl_rl_trainers(), assert
-        #      every one ends up Unsloth-prefixed in-place. Floor matches
-        #      the cohort sizes from the version sweep:
-        #        TRL 0.22-0.23 -> 18 canonical trainers
-        #        TRL 0.24-0.28 -> 15 canonical trainers
-        #        TRL 0.29-1.x  ->  6 canonical (rest are experimental
-        #                          thin-wrappers; covered next)
-        #   5. Experimental coverage (TRL 0.29+): walk trl.experimental.*,
-        #      find every *Trainer class, verify the umbrella patch
-        #      reaches them via the thin-wrapper MRO walk in
-        #      _patch_trl_rl_trainers (rl.py:677-702).
-        # Per-cell wall-time ~30-60s.
-        run: |
-          set -euxo pipefail
-          cat > tests/_trl_trainer_discovery_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          # Walks every *_trainer / *_config module in trl.trainer and
-          # validates that unsloth's auto-discovery rules in
-          # unsloth/models/rl.py:_patch_trl_rl_trainers (lines 542-620,
-          # 1934-1949) still pick out exactly one *Trainer and one
-          # *Config per module on the matrix's TRL version.
-          import sys, pathlib, importlib, importlib.util, ast, inspect
-
-          sys.path.insert(0, str(pathlib.Path(__file__).parent))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-
-          import pytest
-          pytest.importorskip("trl")
-          import trl  # noqa: F401  (forces lazy-module init)
-          import trl.trainer
-
-
-          def _is_real_submodule(qual_name: str) -> bool:
-              """Report whether `qual_name` resolves to a module spec with a
-              non-empty `origin`; lookup errors count as "no".
-
-              TRL re-exports utility FUNCTIONS into `trl.trainer.__init__`
-              whose names end with `_config` (e.g. `get_peft_config`,
-              `get_quantization_config`). A plain `endswith` filter would
-              treat them as submodules and the AST stage would fail on
-              `no spec`; this check screens them out. The same applies,
-              defensively, to names ending in `_trainer`.
-              """
-              try:
-                  found = importlib.util.find_spec(qual_name)
-              except (ImportError, ValueError):
-                  return False
-              if found is None:
-                  return False
-              return bool(getattr(found, "origin", None))
-
-
-          # Replicate rl.py:1939-1943 verbatim, then filter to actual
-          # submodules so re-exported utility functions (e.g.
-          # `get_peft_config`) do not pollute the AST sweep.
-          def _trainer_files():
-              return [
-                  x for x in dir(trl.trainer)
-                  if x.islower()
-                  and x.endswith("_trainer")
-                  and x != "base_trainer"
-                  and _is_real_submodule(f"trl.trainer.{x}")
-              ]
-
-
-          def _config_files():
-              return [
-                  x for x in dir(trl.trainer)
-                  if x.islower()
-                  and x.endswith("_config")
-                  and _is_real_submodule(f"trl.trainer.{x}")
-              ]
-
-
-          def _ast_parse_module_via_spec(qual_name: str):
-              """AST-parse a module's source on disk WITHOUT importing it.
-              `trl.trainer` uses _LazyModule so `find_spec` resolves the
-              file path without firing the module-level `__init__`. This
-              dodges optional-dep ImportErrors (e.g. grpo_trainer's vllm
-              import) and still surfaces real syntax drift in the file."""
-              spec = importlib.util.find_spec(qual_name)
-              if spec is None or not spec.origin:
-                  return None, "no spec"
-              path = pathlib.Path(spec.origin)
-              if not path.is_file():
-                  return None, f"spec.origin not a file: {path}"
-              src = path.read_text(encoding="utf-8")
-              ast.parse(src, filename=str(path))
-              return path, None
-
-
-          def test_every_trl_trainer_and_config_module_ast_parses():
-              """Stage 1: pure file-on-disk AST parse. Catches a TRL
-              source-level syntax issue on any matrix cell without
-              triggering optional-dep imports."""
-              fail = []
-              ok = 0
-              for name in _trainer_files() + _config_files():
-                  qual = f"trl.trainer.{name}"
-                  try:
-                      path, err = _ast_parse_module_via_spec(qual)
-                      if err:
-                          fail.append((qual, err))
-                      else:
-                          ok += 1
-                  except SyntaxError as e:
-                      fail.append((qual, f"SyntaxError: {e}"))
-                  except Exception as e:
-                      fail.append((qual, f"{type(e).__name__}: {e}"))
-              print(f"AST-parsed {ok} TRL trainer+config modules; failed={len(fail)}")
-              for q, e in fail:
-                  print(f"  AST FAIL {q}: {e}")
-              assert not fail, f"AST parse failed for {len(fail)} TRL modules"
-
-
-          def _apply_unsloth_discovery_rules(mod, trainer_file):
-              """Replicate the four endswith filters in
-              rl.py:553-569 verbatim."""
-              prefix = trainer_file.split("_")[0]
-              names = [
-                  x for x in dir(mod)
-                  if x.endswith("Trainer") and x != "Trainer"
-                  and not x.startswith("_") and prefix in x.lower()
-              ]
-              configs = [
-                  x for x in dir(mod)
-                  if x.endswith("Config") and x != "Config"
-                  and not x.startswith("_") and prefix in x.lower()
-              ]
-              return names, configs
-
-
-          def _resolve_config_via_fallbacks(trainer_file, name_list, mod):
-              """Replicate rl.py:575-615: try the sibling *_config.py
-              module, then the MRO walk fallback. Returns the resolved
-              config-name list (length 0 or 1)."""
-              # Fallback 1: <prefix>_config.py module sibling.
-              cfg_module_name = trainer_file.replace("_trainer", "_config")
-              try:
-                  cfg_mod = getattr(trl.trainer, cfg_module_name)
-              except Exception:
-                  cfg_mod = None
-              if cfg_mod is not None:
-                  prefix = trainer_file.split("_")[0]
-                  hits = [
-                      x for x in dir(cfg_mod)
-                      if x.endswith("Config") and x != "Config"
-                      and not x.startswith("_") and prefix in x.lower()
-                  ]
-                  if len(hits) == 1:
-                      return hits
-              # Fallback 2: MRO walk into experimental parent module.
-              if len(name_list) != 1:
-                  return []
-              try:
-                  trainer_cls = getattr(mod, name_list[0])
-              except Exception:
-                  return []
-              prefix = trainer_file.split("_")[0]
-              for parent in trainer_cls.__mro__[1:]:
-                  if parent is object:
-                      continue
-                  parent_mod = inspect.getmodule(parent)
-                  if parent_mod is None:
-                      continue
-                  if parent_mod.__name__ == f"trl.trainer.{trainer_file}":
-                      continue
-                  hits = [
-                      x for x in dir(parent_mod)
-                      if x.endswith("Config") and x != "Config"
-                      and not x.startswith("_") and prefix in x.lower()
-                  ]
-                  if len(hits) == 1:
-                      return hits
-              return []
-
-
-          def test_unsloth_auto_discovery_finds_trainer_and_config_per_module():
-              """Stage 2: drive the same unsloth rules over every trainer
-              file. import-failures (optional deps) are recorded as
-              `import-skipped`, mirroring rl.py:1944-1948 try/except."""
-              # Outcome buckets: `ok` counts modules where both a trainer
-              # and a config resolved; each list holds (qualified_name,
-              # reason) pairs that feed the summary printed below.
-              ok = 0
-              import_skipped = []
-              discovery_skipped = []
-              fail = []
-              for trainer_file in _trainer_files():
-                  qual = f"trl.trainer.{trainer_file}"
-                  try:
-                      # Attribute access on trl.trainer is what loads the
-                      # module here; any exception is treated as a skip.
-                      mod = getattr(trl.trainer, trainer_file)
-                  except Exception as e:
-                      import_skipped.append((qual, f"{type(e).__name__}: {e}"))
-                      continue
-                  trainers, configs = _apply_unsloth_discovery_rules(
-                      mod, trainer_file,
-                  )
-                  # Zero or several *Trainer names -> tolerated skip, not
-                  # a failure.
-                  if len(trainers) != 1:
-                      discovery_skipped.append(
-                          (qual, f"trainers={trainers}")
-                      )
-                      continue
-                  # The in-module config lookup was not unique: try the
-                  # sibling-module and MRO fallbacks before giving up.
-                  if len(configs) != 1:
-                      configs = _resolve_config_via_fallbacks(
-                          trainer_file, trainers, mod,
-                      )
-                  if len(configs) != 1:
-                      fail.append(
-                          (qual,
-                           f"trainer={trainers[0]} but config not found "
-                           "(checked module, *_config sibling, and MRO)")
-                      )
-                      continue
-                  ok += 1
-                  print(f"  OK {qual}: trainer={trainers[0]}, config={configs[0]}")
-              # Summary is printed before the assertions so the breakdown
-              # is available in the log even when the test fails.
-              print(
-                  f"\nDiscovery: ok={ok} import_skipped={len(import_skipped)} "
-                  f"discovery_skipped={len(discovery_skipped)} fail={len(fail)}"
-              )
-              for q, r in import_skipped:
-                  print(f"  IMPORT-SKIP {q}: {r}")
-              for q, r in discovery_skipped:
-                  print(f"  DISC-SKIP   {q}: {r}")
-              for q, r in fail:
-                  print(f"  FAIL        {q}: {r}")
-              # Hard contract: every TRAINER that imports cleanly AND has
-              # exactly one *Trainer must also resolve exactly one *Config
-              # via one of the three rules. import-skipped + discovery-
-              # skipped (no/multiple *Trainer) are tolerated.
-              assert not fail, (
-                  f"unsloth discovery rules failed for {len(fail)} trainers"
-              )
-              # Sanity: at least 3 trainers should fully discover on any
-              # matrix cell (sft + reward + dpo are the historical core).
-              assert ok >= 3, (
-                  f"only {ok} trainers fully discovered; expected >=3 "
-                  "(sft/reward/dpo). Possible TRL surface regression."
-              )
-
-
-          def test_orphan_trainer_modules_do_not_exist():
-              """Stage 3: every <x>_trainer module should have a sibling
-              <x>_config (TRL 0.26+ convention) OR an inline *Config. An
-              ORPHAN <x>_trainer with neither is a TRL refactor we want
-              to know about: it would silently break unsloth's
-              auto-discovery without raising."""
-              orphans = []
-              for trainer_file in _trainer_files():
-                  cfg_module_name = trainer_file.replace("_trainer", "_config")
-                  has_sibling_cfg = (
-                      importlib.util.find_spec(
-                          f"trl.trainer.{cfg_module_name}"
-                      ) is not None
-                  )
-                  if has_sibling_cfg:
-                      continue
-                  # No sibling -> require an inline *Config in the
-                  # trainer module itself (resolved via discovery rules).
-                  try:
-                      mod = getattr(trl.trainer, trainer_file)
-                  except Exception:
-                      # Optional-dep failure -> skip; the AST-parse stage
-                      # already covered the file.
-                      continue
-                  _, configs = _apply_unsloth_discovery_rules(
-                      mod, trainer_file,
-                  )
-                  if not configs:
-                      orphans.append(trainer_file)
-              assert not orphans, (
-                  "Orphan TRL trainer modules with neither sibling "
-                  f"<x>_config.py nor an inline *Config: {orphans}. "
-                  "unsloth auto-discovery would silently skip these."
-              )
-
-
-          # ---- Dynamic patch coverage: count + verify Unsloth-prefixed ----
-
-          def _enumerate_canonical_trainer_classes():
-              """Walk trl.trainer/*_trainer.py on disk (the source of
-              truth for what `dir(trl.trainer)` should expose) and return
-              [(trainer_file, TrainerClass), ...] for every entry that
-              imports + has exactly-one resolvable *Trainer per the
-              unsloth rules. Skips optional-dep ImportErrors."""
-              out = []
-              for trainer_file in _trainer_files():
-                  try:
-                      mod = getattr(trl.trainer, trainer_file)
-                  except Exception:
-                      continue
-                  trainers, _ = _apply_unsloth_discovery_rules(mod, trainer_file)
-                  if len(trainers) != 1:
-                      continue
-                  try:
-                      cls = getattr(mod, trainers[0])
-                  except Exception:
-                      continue
-                  out.append((trainer_file, cls))
-              return out
-
-
-          def _enumerate_experimental_trainer_packages():
-              """TRL 0.29+ moved many trainers (bco, cpo, gkd, nash_md,
-              online_dpo, orpo, ppo, prm, xpo, ...) to `trl.experimental.<pkg>`,
-              re-exposing them via thin-wrapper deprecation shims in
-              `trl.trainer.<x>_trainer`. List every `trl.experimental.<pkg>`
-              that defines at least one *Trainer class, parsed by AST so we
-              do NOT trigger the optional-dep imports on the package init."""
-              spec = importlib.util.find_spec("trl.experimental")
-              if spec is None or not spec.submodule_search_locations:
-                  return []
-              import re as _re
-              hits = []
-              for root in spec.submodule_search_locations:
-                  rp = pathlib.Path(root)
-                  for sub in sorted(rp.iterdir()):
-                      if not sub.is_dir() or sub.name.startswith("_"):
-                          continue
-                      classes = []
-                      for py in sub.rglob("*.py"):
-                          try:
-                              src = py.read_text(encoding="utf-8")
-                          except Exception:
-                              continue
-                          for m in _re.finditer(
-                              r"^class\s+([A-Za-z0-9_]+Trainer)\b", src, _re.M,
-                          ):
-                              classes.append(m.group(1))
-                      if classes:
-                          hits.append((sub.name, sorted(set(classes))))
-              return hits
-
-
-          def _is_unsloth_patched(cls) -> bool:
-              return getattr(cls, "__name__", "").startswith("Unsloth")
-
-
-          def test_unsloth_patches_every_canonical_trainer_in_this_trl_version():
-              """Verify the count + identity of canonically-patched trainers
-              matches the trainer surface this TRL version actually ships.
-
-              For TRL 0.22.x-0.23.x: ~18 canonical trainers expected.
-              For TRL 0.24.x-0.28.x: ~15 canonical trainers expected.
-              For TRL 0.29.x-1.x:    6 canonical (rest are experimental
-              thin-wrappers; covered by the next test)."""
-              from unsloth.models.rl import patch_trl_rl_trainers
-              before = _enumerate_canonical_trainer_classes()
-              before_count = len(before)
-              before_unpatched = [
-                  (tf, cls.__name__) for tf, cls in before
-                  if not _is_unsloth_patched(cls)
-              ]
-              # Apply unsloth's umbrella patch.
-              patch_trl_rl_trainers()
-              # Re-enumerate (some classes may have been replaced in-module).
-              after = _enumerate_canonical_trainer_classes()
-              after_count = len(after)
-              patched = [(tf, cls.__name__) for tf, cls in after
-                         if _is_unsloth_patched(cls)]
-              unpatched = [(tf, cls.__name__) for tf, cls in after
-                           if not _is_unsloth_patched(cls)]
-              print(
-                  f"\nCanonical trainer surface for TRL {trl.__version__}: "
-                  f"discoverable_before={before_count} "
-                  f"discoverable_after={after_count} "
-                  f"patched={len(patched)} unpatched={len(unpatched)}"
-              )
-              for tf, n in patched:
-                  print(f"  PATCHED   {tf}: {n}")
-              for tf, n in unpatched:
-                  print(f"  UNPATCHED {tf}: {n}")
-              # Hard contract: every canonical trainer that imports
-              # cleanly must end up Unsloth-prefixed after the umbrella
-              # patch. If a trainer was discoverable BEFORE the patch but
-              # is missing from `after`, that is a separate (rare) issue
-              # we surface as failure.
-              assert before_count == after_count, (
-                  f"trainer-class set changed across patching: "
-                  f"before={[n for _, n in before_unpatched]} "
-                  f"after={[n for _, n in unpatched]}"
-              )
-              assert not unpatched, (
-                  "unsloth.models.rl.patch_trl_rl_trainers did NOT patch: "
-                  + ", ".join(f"{tf}:{n}" for tf, n in unpatched)
-              )
-              # Floor matches the cohort sizes from the TRL version sweep:
-              # 18 (0.22-0.23), 15 (0.24-0.28), 6 (0.29+ canonical only).
-              assert len(patched) >= 6, (
-                  f"only {len(patched)} canonical trainers patched; "
-                  "expected >= 6 (the smallest production cohort)."
-              )
-
-
-          def test_unsloth_patches_experimental_trainers_via_thin_wrappers():
-              """TRL 0.29+ ships canonical-`trl.trainer.<x>_trainer` modules
-              for many trainers as deprecation thin-wrappers that forward
-              to `trl.experimental.<x>`. unsloth's
-              `_patch_trl_rl_trainers` (rl.py:677-702) detects
-              `trl.experimental` in the trainer source and resolves to
-              the parent class -- so patching the canonical entry should
-              also Unsloth-prefix the experimental class via in-module
-              setattr.
-
-              Verify by walking trl.experimental.* AST for every *Trainer
-              class, then checking whether it (or any class with the same
-              name in the experimental package) carries the Unsloth
-              prefix after the umbrella patch."""
-              from unsloth.models.rl import patch_trl_rl_trainers
-              # Patch first; everything below inspects the post-patch state.
-              patch_trl_rl_trainers()
-              experimental_pkgs = _enumerate_experimental_trainer_packages()
-              if not experimental_pkgs:
-                  pytest.skip(
-                      f"TRL {trl.__version__} has no trl.experimental.* "
-                      "trainer surface (pre-0.29 cohort). The canonical "
-                      "test above already covers patching here."
-                  )
-              # `found` only feeds the summary count; `missing` is the
-              # list the final assertion gates on.
-              found = []
-              missing = []
-              for pkg_name, class_names in experimental_pkgs:
-                  qual = f"trl.experimental.{pkg_name}"
-                  try:
-                      pkg_mod = importlib.import_module(qual)
-                  except Exception as e:
-                      # Optional-dep ImportError: experimental package
-                      # could not be loaded. Match unsloth's runtime
-                      # tolerance: this would also be silently skipped
-                      # by `_patch_trl_rl_trainers`. Record but do not
-                      # fail.
-                      print(
-                          f"  IMPORT-SKIP {qual}: "
-                          f"{type(e).__name__}: {str(e)[:120]}"
-                      )
-                      continue
-                  for cls_name in class_names:
-                      # First look on the package itself, then fall back
-                      # to its submodules.
-                      cls = getattr(pkg_mod, cls_name, None)
-                      if cls is None:
-                          # Class is defined inside the package but not
-                          # re-exported on the package init. Walk
-                          # submodules to find it.
-                          import pkgutil as _pku
-                          for sub in _pku.walk_packages(
-                              pkg_mod.__path__, prefix=qual + "."
-                          ):
-                              try:
-                                  sub_mod = importlib.import_module(sub.name)
-                              except Exception:
-                                  # Unimportable submodule: keep looking
-                                  # in the remaining ones.
-                                  continue
-                              cls = getattr(sub_mod, cls_name, None)
-                              if cls is not None:
-                                  break
-                      if cls is None:
-                          missing.append((pkg_name, cls_name))
-                          continue
-                      if _is_unsloth_patched(cls):
-                          found.append((pkg_name, cls_name))
-                          print(f"  PATCHED   trl.experimental.{pkg_name}.{cls_name}")
-                      else:
-                          # Not Unsloth-prefixed: either unsloth chose
-                          # not to patch this surface (e.g. the canonical
-                          # thin-wrapper module did not exist) or the
-                          # patch silently failed. Record both
-                          # outcomes; the assertion below tolerates the
-                          # gap as informational, not failure -- the
-                          # canonical test enforces the hard contract.
-                          print(
-                              f"  NOT-PATCHED trl.experimental.{pkg_name}."
-                              f"{cls_name} (no Unsloth-prefix on the "
-                              "experimental surface)"
-                          )
-              # The total counts every class name found on disk, including
-              # those in packages that were IMPORT-SKIPped above.
-              total_experimental = sum(len(cs) for _, cs in experimental_pkgs)
-              print(
-                  f"\nExperimental trainer surface (TRL {trl.__version__}): "
-                  f"{len(experimental_pkgs)} packages, "
-                  f"{total_experimental} *Trainer classes; "
-                  f"unsloth-patched={len(found)} class-missing={len(missing)}"
-              )
-              # Hard contract: a *Trainer class declared in a python
-              # source file must be locatable in its package after import.
-              # If we saw the class definition but cannot find the symbol
-              # at runtime, the package's public surface drifted.
-              assert not missing, (
-                  "experimental *Trainer classes declared in source but "
-                  f"not importable: {missing}"
-              )
-          PY
-          # Capture pytest's exit status instead of letting `set -e` abort
-          # the script, so the generated shim is removed even when the
-          # tests fail; the step still exits with pytest's status.
-          rc=0
-          python -m pytest -q --tb=short -s tests/_trl_trainer_discovery_shim.py || rc=$?
-          rm -f tests/_trl_trainer_discovery_shim.py
-          exit "$rc"
-
-      - name: MoE per-family coverage + GRPO patches + grouped_gemm AST
-        # Catches the recurring class of bugs that PR #624 (gemma4 missing
-        # extractor), PR #612 (gemma4 GRPO patch silently dropped), PR #607
-        # (gate_up LoRA dropped from grad graph), PR #601 (qwen MoE shape
-        # mismatch), unsloth#4934 (TRL disable_gradient_checkpointing
-        # corrupts unsloth GC), and unsloth#3598 (gradient_accumulation
-        # double-scale on accepts_loss_kwargs=False) targeted. Coverage:
-        #
-        #   1. Per-MoE-family side-effect contract: for every patch_*_moe
-        #      function in unsloth_zoo.temporary_patches, if its target
-        #      transformers class is importable on this matrix cell, the
-        #      patch must mark the class with `_unsloth_already_patched=True`
-        #      after running. This is exactly what unsloth_zoo's existing
-        #      test_moe_lora_extractor_coverage walks at the registration
-        #      level; here we tie each patch fn to its declared target so a
-        #      silent early-return (PR #612 style) surfaces as red rather
-        #      than a coverage skip.
-        #
-        #   2. PR #4934 (GRPO + TRL 1.0): patch_trl_disable_gradient_checkpointing
-        #      must rebind trl.models.utils.disable_gradient_checkpointing to
-        #      the unsloth no-op AND propagate the rebinding to every trl.*
-        #      module that imported the symbol by reference.
-        #
-        #   3. PR #3598 (gradient_accumulation): patch_gradient_accumulation_fix
-        #      must run cleanly on a synthetic Trainer whose training_step
-        #      signature carries `num_items_in_batch`. The original bug was
-        #      that `accepts_loss_kwargs=False` (Qwen3VL, Gemma3 in t-4.57)
-        #      caused double loss-scaling; here we verify the rewrite path
-        #      itself does not raise on a CPU-resolvable shape.
-        #
-        #   4. unsloth/kernels/moe/grouped_gemm AST smoke: the Triton kernels
-        #      are GPU-only at runtime, but a SyntaxError or stray
-        #      string-literal in the source still surfaces as a test-time
-        #      ImportError on every install. ast.parse the .py files without
-        #      executing.
-        #
-        # Wall-time per cell ~30-60s. Routed through pytest for the spoof
-        # harness so unsloth_zoo.temporary_patches imports are clean.
-        run: |
-          set -euxo pipefail
-          cat > tests/_moe_coverage_shim.py <<'PY'
-          # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
-          import sys, pathlib, ast, importlib, importlib.util, contextlib, os
-          sys.path.insert(0, str(pathlib.Path(__file__).parent))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-
-          import pytest
-
-          # Map each MoE patch function to the transformers classes it is
-          # contractually responsible for marking with _unsloth_already_patched
-          # after a successful run. Sourced from
-          # unsloth_zoo/temporary_patches/<family>_moe.py:
-          #   - qwen3_moe.py:382-398 patches Qwen3MoeExperts (new path) or
-          #     Qwen3MoeSparseMoeBlock (old path).
-          #   - qwen3_5_moe.py + qwen3_next_moe.py + qwen3_vl_moe.py register
-          #     extractors on Qwen3_5MoeExperts / Qwen3NextExperts /
-          #     Qwen3VLMoeTextExperts respectively.
-          #   - gemma4_moe.py marks Gemma4TextExperts (current) or
-          #     Gemma4TextMoEBlock (legacy).
-          #   - glm4_moe.py marks Glm4MoeLiteNaiveMoe.
-          #   - deepseek_v3_moe.py marks DeepseekV3NaiveMoe.
-          #   - gpt_oss.py:patch_gpt_oss_moe_for_lora marks GptOssExperts.
-          # Each cell skips a target if the transformers version lacks it
-          # (legitimate version-skew); only patches with at least one
-          # importable target are exercised.
-          # Each entry is a dict with these keys: "module" and "fn" (where
-          # the patch function lives), "targets" ((module_path, class_name)
-          # pairs), "env" (environment variables set before the patch fn
-          # runs, e.g. UNSLOTH_MODEL_NAME for gpt_oss), "gate" (a callable
-          # returning True when the patch SHOULD run on this transformers;
-          # if False, the test skips) and "gate_reason" (the documented
-          # reason reported with that skip).
-          def _v5_or_later():
-              try:
-                  import transformers
-                  major = int(transformers.__version__.split(".")[0])
-                  return major >= 5
-              except Exception:
-                  return False
-
-          MOE_PATCHES = [
-              {
-                  "module": "unsloth_zoo.temporary_patches.qwen3_moe",
-                  "fn": "patch_qwen3_moe",
-                  "targets": [
-                      ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeExperts"),
-                      ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeSparseMoeBlock"),
-                  ],
-                  "env": {},
-                  "gate": lambda: True,
-                  "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.qwen3_5_moe",
-                  "fn": "patch_qwen3_5_moe",
-                  "targets": [
-                      ("transformers.models.qwen3_5_moe.modeling_qwen3_5_moe", "Qwen3_5MoeExperts"),
-                  ],
-                  "env": {}, "gate": lambda: True, "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.qwen3_next_moe",
-                  "fn": "patch_qwen3_next_moe",
-                  "targets": [
-                      ("transformers.models.qwen3_next.modeling_qwen3_next", "Qwen3NextExperts"),
-                  ],
-                  "env": {}, "gate": lambda: True, "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.qwen3_vl_moe",
-                  "fn": "patch_qwen3_vl_moe",
-                  "targets": [
-                      ("transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe", "Qwen3VLMoeTextExperts"),
-                  ],
-                  "env": {}, "gate": lambda: True, "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.gemma4_moe",
-                  "fn": "patch_gemma4_moe",
-                  "targets": [
-                      ("transformers.models.gemma4.modeling_gemma4", "Gemma4TextExperts"),
-                  ],
-                  "env": {}, "gate": lambda: True, "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.glm4_moe",
-                  "fn": "patch_glm4_moe",
-                  "targets": [
-                      ("transformers.models.glm4_moe.modeling_glm4_moe", "Glm4MoeLiteNaiveMoe"),
-                  ],
-                  "env": {}, "gate": lambda: True, "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.deepseek_v3_moe",
-                  "fn": "patch_deepseek_v3_moe",
-                  "targets": [
-                      ("transformers.models.deepseek_v3.modeling_deepseek_v3", "DeepseekV3NaiveMoe"),
-                  ],
-                  "env": {}, "gate": lambda: True, "gate_reason": "",
-              },
-              {
-                  "module": "unsloth_zoo.temporary_patches.gpt_oss",
-                  "fn": "patch_gpt_oss_moe_for_lora",
-                  "targets": [
-                      ("transformers.models.gpt_oss.modeling_gpt_oss", "GptOssExperts"),
-                  ],
-                  # The patch reads UNSLOTH_MODEL_NAME and only runs when
-                  # "gpt_oss" is in the normalized form. Set it explicitly
-                  # so the gate at gpt_oss.py:1387 passes; otherwise the
-                  # patch silently early-returns and the test would
-                  # spuriously fail.
-                  "env": {"UNSLOTH_MODEL_NAME": "gpt_oss"},
-                  # Additionally only runs on transformers >= 5
-                  # (gpt_oss.py:1392 `_is_transformers_v5()` gate).
-                  "gate": _v5_or_later,
-                  "gate_reason": (
-                      "patch_gpt_oss_moe_for_lora gates on "
-                      "transformers >= 5 (split-LoRA grouped_mm path)"
-                  ),
-              },
-          ]
-
-
-          def _resolve_target_classes(targets):
-              """Return [(qual, cls), ...] for every importable target."""
-              out = []
-              for mod_path, cls_name in targets:
-                  try:
-                      mod = importlib.import_module(mod_path)
-                  except Exception:
-                      continue
-                  cls = getattr(mod, cls_name, None)
-                  if cls is None:
-                      continue
-                  out.append((f"{mod_path}.{cls_name}", cls))
-              return out
-
-
-          @pytest.mark.parametrize(
-              "spec",
-              MOE_PATCHES,
-              ids=lambda s: s["fn"],
-          )
-          def test_moe_patch_marks_its_target_when_class_present(spec, monkeypatch):
-              """If at least one target class is importable AND the
-              version gate passes, run the patch fn and assert at least
-              one target is marked patched afterwards. Skips when the
-              transformers version lacks every target or when the
-              version gate blocks the patch (legitimate). Fails on
-              silent patch-fn early-returns (PR #612 class of bug)."""
-              targets = spec["targets"]
-              patch_module = spec["module"]
-              patch_name = spec["fn"]
-              importable = _resolve_target_classes(targets)
-              if not importable:
-                  pytest.skip(
-                      f"{patch_name}: no target class importable on this "
-                      f"transformers (looked for {[c for _, c in targets]})."
-                  )
-              if not spec["gate"]():
-                  pytest.skip(
-                      f"{patch_name}: version gate blocks this cell. "
-                      f"Reason: {spec['gate_reason']}"
-                  )
-              for k, v in spec["env"].items():
-                  monkeypatch.setenv(k, v)
-              try:
-                  pmod = importlib.import_module(patch_module)
-              except Exception as e:
-                  pytest.skip(
-                      f"{patch_module} import failed (likely optional dep): "
-                      f"{type(e).__name__}: {e}"
-                  )
-              fn = getattr(pmod, patch_name, None)
-              if fn is None or not callable(fn):
-                  pytest.skip(f"{patch_module} has no callable {patch_name}")
-              try:
-                  fn()
-              except Exception as e:
-                  raise AssertionError(
-                      f"{patch_name}() raised on a transformers that "
-                      f"DOES ship at least one target class ({importable}). "
-                      f"This is the silent-failure mode PR #612 fixed: "
-                      f"{type(e).__name__}: {e}"
-                  )
-              # At least one importable target must now carry SOME marker
-              # showing unsloth touched it. Accepted signals (each is set
-              # by a different patch flow in unsloth_zoo):
-              #   - `_unsloth_already_patched=True`            (gemma4, deepseek_v3, glm4)
-              #   - `_unsloth_lora_patched=True`               (gpt_oss_moe_for_lora)
-              #   - `_unsloth_lora_extractor_fn` is callable   (qwen3_*, glm4_moe)
-              #   - `_original_<modeling_tail>_<ClassName>_forward` attr
-              #     (set by patch_function: qwen3_moe SparseMoeBlock, etc.)
-              #   - `_original_forward` attribute              (gpt_oss in-place patch)
-              # Accept any one as "patched".
-              def _is_patched(cls) -> bool:
-                  if getattr(cls, "_unsloth_already_patched", False) is True:
-                      return True
-                  if getattr(cls, "_unsloth_lora_patched", False) is True:
-                      return True
-                  if callable(getattr(cls, "_unsloth_lora_extractor_fn", None)):
-                      return True
-                  if "_original_forward" in dir(cls):
-                      return True
-                  cls_name = cls.__name__
-                  for attr in dir(cls):
-                      if attr.startswith("_original_") and attr.endswith(
-                          f"_{cls_name}_forward"
-                      ):
-                          return True
-                  return False
-
-              after = _resolve_target_classes(targets)
-              marked = [qual for qual, cls in after if _is_patched(cls)]
-              if not marked:
-                  raise AssertionError(
-                      f"{patch_name}() ran without exception but no target "
-                      f"in {importable} carries any of the unsloth markers "
-                      "(_unsloth_already_patched / _unsloth_lora_patched / "
-                      "_unsloth_lora_extractor_fn / _original_*_forward). "
-                      "Patch silently no-op'd (PR #612 class of bug)."
-                  )
-              print(f"  {patch_name}: marked {marked}")
-
-
-          # ---- PR #4934 (TRL 1.0+ GRPO disable_gradient_checkpointing) ----
-
-          def test_patch_trl_disable_gradient_checkpointing():
-              """unsloth/models/rl.py:patch_trl_disable_gradient_checkpointing
-              must rebind trl.models.utils.disable_gradient_checkpointing to
-              the unsloth no-op when TRL >= 1.0. Pre-1.0 TRL has no such
-              symbol -> the patch returns early."""
-              try:
-                  import trl.models.utils as _tmu
-              except ImportError:
-                  pytest.skip("trl not installed")
-              had_symbol = hasattr(_tmu, "disable_gradient_checkpointing")
-              try:
-                  from unsloth.models.rl import patch_trl_disable_gradient_checkpointing
-              except ImportError:
-                  pytest.skip(
-                      "unsloth.models.rl.patch_trl_disable_gradient_checkpointing "
-                      "absent (older unsloth than #4934)"
-                  )
-              patch_trl_disable_gradient_checkpointing()
-              if not had_symbol:
-                  # Pre-1.0 TRL: patch is a no-op early-return. Verify
-                  # nothing broke.
-                  pytest.skip(
-                      "TRL pre-1.0 has no disable_gradient_checkpointing; "
-                      "patch correctly early-returned."
-                  )
-              fn = getattr(_tmu, "disable_gradient_checkpointing", None)
-              assert fn is not None, (
-                  "trl.models.utils.disable_gradient_checkpointing missing "
-                  "after patch -- patch removed the symbol entirely?"
-              )
-              assert getattr(fn, "_unsloth_noop_patched", False) is True, (
-                  "trl.models.utils.disable_gradient_checkpointing was NOT "
-                  "rebound to the unsloth no-op. PR #4934 regression."
-              )
-              # PR #4934 also walks sys.modules to rebind trl.* modules
-              # that imported the symbol by reference. Verify at least the
-              # canonical trainer modules picked up the rebinding when
-              # they re-export it.
-              import sys
-              checked = 0
-              missed = []
-              for mod_name, mod in list(sys.modules.items()):
-                  if not mod_name.startswith("trl."):
-                      continue
-                  bound = getattr(mod, "disable_gradient_checkpointing", None)
-                  if bound is None:
-                      continue
-                  checked += 1
-                  if not getattr(bound, "_unsloth_noop_patched", False):
-                      missed.append(mod_name)
-              print(f"  rebound disable_gradient_checkpointing in {checked} trl.* modules")
-              assert not missed, (
-                  "trl.* modules that imported disable_gradient_checkpointing "
-                  f"by reference but did not get rebound: {missed}"
-              )
-
-
-          # ---- PR #3598 (gradient_accumulation loss-scaling rewrite) ----
-
-          def test_patch_gradient_accumulation_fix_runs_on_synthetic_trainer():
-              """patch_gradient_accumulation_fix rewrites a Trainer's
-              `training_step` source via inspect+exec when the signature
-              carries `num_items_in_batch`. PR #3598 fixed the rewrite
-              path to not double-scale for trainers with
-              `accepts_loss_kwargs=False`. Verify the patch fn runs
-              without raising on a synthetic Trainer carrying that
-              signature."""
-              try:
-                  from unsloth.models._utils import patch_gradient_accumulation_fix
-              except ImportError:
-                  pytest.skip(
-                      "unsloth.models._utils.patch_gradient_accumulation_fix absent"
-                  )
-              try:
-                  from transformers import Trainer
-              except ImportError:
-                  pytest.skip("transformers.Trainer absent")
-              # The patch reads the live Trainer.training_step source. We
-              # exercise the standard transformers.Trainer here -- if the
-              # bug is reintroduced in the source rewriter (e.g. broken
-              # exec, missing import injection), the patch fn raises.
-              try:
-                  patch_gradient_accumulation_fix(Trainer)
-              except Exception as e:
-                  raise AssertionError(
-                      "patch_gradient_accumulation_fix raised on a vanilla "
-                      f"transformers.Trainer: {type(e).__name__}: {e}"
-                  )
-              # Idempotency: second call must not raise either (the rewrite
-              # adds `_unsloth_training_step` marker so the second call
-              # short-circuits per _utils.py:1692-1693).
-              patch_gradient_accumulation_fix(Trainer)
-
-
-          # ---- unsloth/kernels/moe/grouped_gemm AST smoke ----
-
-          def _walk_py_files(root: pathlib.Path):
-              for p in root.rglob("*.py"):
-                  if "__pycache__" in p.parts:
-                      continue
-                  yield p
-
-
-          def test_unsloth_kernels_moe_grouped_gemm_ast_parses():
-              """unsloth/kernels/moe/grouped_gemm hosts the Triton MoE
-              kernels (GPU-only at runtime). A SyntaxError or stray token
-              at the SOURCE level still surfaces as ImportError on every
-              install, so AST-parse the .py files without executing."""
-              # Locate `unsloth/kernels/moe/grouped_gemm` via the installed
-              # `unsloth` package.
-              import unsloth as _unsloth
-              kernel_root = (
-                  pathlib.Path(_unsloth.__file__).parent
-                  / "kernels" / "moe" / "grouped_gemm"
-              )
-              if not kernel_root.exists():
-                  pytest.skip(
-                      f"{kernel_root} not present in this unsloth checkout."
-                  )
-              fail = []
-              ok = 0
-              for p in _walk_py_files(kernel_root):
-                  try:
-                      ast.parse(p.read_text(encoding="utf-8"), filename=str(p))
-                      ok += 1
-                  except SyntaxError as e:
-                      fail.append((str(p), f"SyntaxError: {e}"))
-                  except Exception as e:
-                      fail.append((str(p), f"{type(e).__name__}: {e}"))
-              print(f"AST-parsed {ok} grouped_gemm files; failed={len(fail)}")
-              for path, err in fail:
-                  print(f"  AST FAIL {path}: {err}")
-              assert not fail, (
-                  f"AST parse failed for {len(fail)} grouped_gemm files"
-              )
-              # Sanity: the directory MUST contain at least the interface
-              # + kernels + reference subtrees as documented.
-              expected = [
-                  "interface.py",
-                  "kernels/forward.py",
-                  "kernels/backward.py",
-                  "reference/moe_block.py",
-                  "reference/moe_ops.py",
-              ]
-              missing = [e for e in expected if not (kernel_root / e).is_file()]
-              assert not missing, (
-                  "grouped_gemm directory layout regressed; missing: "
-                  f"{missing}"
-              )
-          PY
-          python -m pytest -q --tb=short -s tests/_moe_coverage_shim.py
-          rm -f tests/_moe_coverage_shim.py
-
-      - name: Summary
-        if: always()
-        run: |
-          echo "::group::Versions"
-          python -c "import sys, platform; print(sys.version); print(platform.platform())"
-          python -c "import torch; print('torch', torch.__version__, 'cuda?', torch.cuda.is_available())"
-          python -c "import transformers; print('transformers', transformers.__version__)"
-          # `pip show` instead of `import unsloth_zoo` — its __init__ raises
-          # without an accelerator and the spoof harness only kicks in under
-          # pytest. Cheap and accurate.
-          pip show unsloth_zoo
-          echo "::endgroup::"
-          echo "Consolidated job done. Coverage:"
-          echo "  - 16 unsloth Bucket-A tests under tests/saving/ + tests/utils/"
-          echo "  - unsloth_zoo @ ${UNSLOTH_ZOO_REF} pytest tests/ (5 GPU cases deselected)"
-          echo "  - unsloth_zoo.compiler.test_apply_fused_lm_head"
-
-  llama-cpp-smoke:
-    # Standalone llama.cpp build + smoke. Earlier this lived inside every
-    # consolidated matrix cell and re-cmake'd llama.cpp ~5 min per cell --
-    # 3 cells x 275 s = ~14 min of duplicated CPU on every PR for an
-    # artefact that has nothing to do with the (transformers, TRL) combo.
-    # `install_llama_cpp` clones ggml-org/llama.cpp at a pinned commit and
-    # builds the LLAMA_CPP_TARGETS list; the result is independent of the
-    # HF stack version. Run once, gate the PR.
-    name: llama.cpp build + smoke
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    env:
-      UNSLOTH_ZOO_REF: ${{ inputs.unsloth_zoo_ref || 'main' }}
-      # Same env contract the matrix cells use: protobuf python parser
-      # (transformers' bundled *_pb2.py needs it), studio on PYTHONPATH,
-      # compile-disable + UNSLOTH_IS_PRESENT so unsloth_zoo's __init__
-      # bootstrap accepts a pure-import.
-      PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
-      PYTHONPATH: ${{ github.workspace }}/studio
-      UNSLOTH_COMPILE_DISABLE: '1'
-      UNSLOTH_IS_PRESENT: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Install runtime deps for unsloth_zoo.llama_cpp
-        # unsloth_zoo's `__init__` imports `temporary_patches`, which
-        # in turn pulls per-architecture submodules (gemma3n, gemma4,
-        # qwen3_*_moe, glm4_moe, deepseek_v3_moe, pixtral, ministral,
-        # mxfp4, bitsandbytes, flex_attention_bwd) -- many of those
-        # transitively touch transformers and peft / accelerate. Mirror
-        # the matrix job's install minus the heavy bits that have no
-        # bearing on `install_llama_cpp` itself: studio.txt's FastAPI
-        # stack, bitsandbytes (CUDA-only build dependency), triton,
-        # mammoth/unpdf (PDF tools), datasets, sqlalchemy/cryptography,
-        # pytest (we run no tests). The remaining pin shape matches
-        # studio-backend-ci.yml's "Repo tests (CPU)" baseline.
-        run: |
-          set -euxo pipefail
-          python -m pip install --upgrade pip
-          # Match the matrix job's torch path so unsloth_zoo's
-          # `import torch` resolves to the same CPU build.
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch>=2.4,<2.11' 'torchvision<0.26'
-          pip install \
-            'numpy<3' protobuf sentencepiece \
-            requests tqdm psutil packaging safetensors \
-            'peft>=0.18,<0.20' 'accelerate>=0.34,<2'
-          # transformers + trl come from pyproject.toml's pinned line
-          # so this job stays in sync with whatever the consolidated
-          # `__from_pyproject__` matrix cell is using.
-          pip install transformers trl
-          pip install -e . --no-deps
-
-      - name: Clone unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }}
-        # Same shallow clone as the matrix job; we install editable so
-        # `unsloth_zoo.llama_cpp` resolves to the cloned tree (and any
-        # main-branch fixes flow into the smoke without a release).
-        run: |
-          set -euxo pipefail
-          # github.com occasionally 500s on the git fetch; retry so a
-          # single upstream blip does not fail CI.
-          for attempt in 1 2 3; do
-            rm -rf "$RUNNER_TEMP/unsloth-zoo"
-            if git clone --depth=1 --branch="$UNSLOTH_ZOO_REF" \
-                https://github.com/unslothai/unsloth-zoo \
-                "$RUNNER_TEMP/unsloth-zoo"; then
-              break
-            fi
-            if [ "$attempt" -eq 3 ]; then
-              echo "::error::git clone unsloth-zoo failed after 3 attempts"
-              exit 1
-            fi
-            delay=$((5 * attempt))
-            echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..."
-            sleep "$delay"
-          done
-          pip install -e "$RUNNER_TEMP/unsloth-zoo" --no-deps
-          pip show unsloth_zoo
-
-      - name: llama.cpp install via unsloth_zoo.llama_cpp + `llama-cli --help` smoke
-        # Exercise the canonical `unsloth_zoo.llama_cpp.install_llama_cpp`
-        # flow that GGUF export uses at runtime: clone ggml-org/llama.cpp
-        # into ~/.unsloth/llama.cpp, build the LLAMA_CPP_TARGETS list
-        # (llama-quantize, llama-cli, llama-mtmd-cli, llama-gguf-split,
-        # llama-server) via cmake, then run `llama-cli --help`.
-        #
-        # This replaces the previous "download upstream prebuilt zip"
-        # approach, which silently exited 0 with the message
-        # "no ubuntu-x64 prebuilt asset" when ggml-org's release-asset
-        # naming drifted (the regex `bin-ubuntu-x64.*\.zip$` no longer
-        # matched their current asset names). The build path is the same
-        # one Unsloth users hit in production via `model.save_pretrained_gguf`.
-        #
-        # Wall-time budget: ~3-5 min cold, dominated by cmake build of
-        # 5 targets on the runner's 4 cores. Apt-package install is
-        # handled by `install_llama_cpp` itself via its
-        # `check_build_requirements` -> `install_package` chain.
-        run: |
-          set -euxo pipefail
-          # libssl-dev / libcurl4-openssl-dev are needed by llama.cpp's
-          # cmake build for HTTPS support; install up-front so the
-          # `install_llama_cpp` requirement-check is a no-op.
-          sudo apt-get update -qq
-          sudo apt-get install -y -qq build-essential cmake git curl \
-            libgomp1 libssl-dev libcurl4-openssl-dev
-          python <<'PY'
-          import os, shutil, subprocess, sys, pathlib
-          # Apply the same CPU spoof the pytest shims use BEFORE any
-          # unsloth_zoo import: unsloth_zoo/__init__.py calls
-          # device_type.get_device_type() at module load and raises
-          # `NotImplementedError: Unsloth cannot find any torch
-          # accelerator` on a GPU-less runner. The spoof flips
-          # torch.cuda.is_available() to True so the device probe takes
-          # the cuda branch; we never actually run CUDA tensor ops in
-          # this step (just clone+cmake+--help on the binaries).
-          sys.path.insert(0, str(pathlib.Path("tests").resolve()))
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-          from unsloth_zoo.llama_cpp import (
-              install_llama_cpp,
-              LLAMA_CPP_DEFAULT_DIR,
-              LLAMA_CPP_TARGETS,
-          )
-          print(f"Unsloth llama.cpp default dir: {LLAMA_CPP_DEFAULT_DIR}")
-          print(f"Build targets: {LLAMA_CPP_TARGETS}")
-          # install_llama_cpp returns (quantizer_path, converter_script_path).
-          # The quantizer's directory is the `llama.cpp` install root, which
-          # also holds llama-cli after build/bin/llama-* gets copied up
-          # (llama_cpp.py:867-871).
-          quantizer, converter = install_llama_cpp(print_output=True)
-          assert quantizer and os.path.exists(quantizer), (
-              f"install_llama_cpp returned quantizer={quantizer!r} but file missing"
-          )
-          assert converter and os.path.isfile(converter), (
-              f"install_llama_cpp returned converter={converter!r} but missing"
-          )
-          install_root = os.path.dirname(quantizer)
-          cli = os.path.join(install_root, "llama-cli")
-          assert os.path.exists(cli), (
-              f"llama-cli not found at {cli!r} after build. Build root contents: "
-              f"{sorted(p for p in os.listdir(install_root) if p.startswith('llama-'))[:20]}"
-          )
-          assert os.access(cli, os.X_OK), f"{cli!r} not executable"
-          # `llama-cli --help` exits non-zero on some builds; the contract
-          # is that recognizable help text appears on stdout/stderr.
-          proc = subprocess.run(
-              [cli, "--help"], capture_output=True, text=True, timeout=30,
-          )
-          combined = (proc.stdout or "") + (proc.stderr or "")
-          print("--- llama-cli --help (first 30 lines) ---")
-          print("\n".join(combined.splitlines()[:30]))
-          assert any(
-              tok in combined.lower()
-              for tok in ("usage", "--help", "--model", "-m,")
-          ), (
-              f"llama-cli --help produced no recognizable help text. "
-              f"exit={proc.returncode}\nstdout: {proc.stdout[:400]!r}\n"
-              f"stderr: {proc.stderr[:400]!r}"
-          )
-          # Also exercise the quantizer the way GGUF export does: --help
-          # round-trip on the binary that does the actual heavy lifting.
-          q = subprocess.run(
-              [quantizer, "--help"], capture_output=True, text=True, timeout=15,
-          )
-          q_combined = (q.stdout or "") + (q.stderr or "")
-          assert "usage" in q_combined.lower() or "type" in q_combined.lower(), (
-              f"llama-quantize --help produced no help text. "
-              f"exit={q.returncode}\nstdout: {q.stdout[:400]!r}\n"
-              f"stderr: {q.stderr[:400]!r}"
-          )
-          print(
-              f"\nOK: install_llama_cpp produced a working llama-cli at {cli} "
-              f"and llama-quantize at {quantizer}."
-          )
-          PY
diff --git a/.github/workflows/lint-ci.yml b/.github/workflows/lint-ci.yml
deleted file mode 100644
index 00e6e357e2..0000000000
--- a/.github/workflows/lint-ci.yml
+++ /dev/null
@@ -1,321 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Whole-repo, multi-language source-lint gate. Runs on every PR
-# (no path filter) because each step is sub-second to a few seconds
-# and together they catch a class of breakage the focused build
-# workflows would miss:
-#
-#   - Python syntax + ruff + leftover debugger calls (across 350+
-#     committed .py files, not just studio/backend).
-#   - Shell `bash -n` parse for every committed *.sh.
-#   - `yaml.safe_load` and `json.loads` round-trip for every
-#     committed YAML / JSON config.
-#
-# TypeScript and Rust are NOT duplicated here on purpose:
-#   - Studio Frontend CI runs `npm run typecheck` (= `tsc --noEmit`)
-#     and `npm run build` (vite/swc) on every studio/frontend/**
-#     change, which is a full TS AST + type check.
-#   - Studio Tauri CI runs `tauri build --debug --no-bundle` on
-#     every studio/src-tauri/** or studio/frontend/** change, which
-#     compiles the Rust crate (= cargo check + cargo build).
-# Each is a stricter check than a parse-only step would be, so a
-# fast-fail duplicate here would only burn cache; the dedicated
-# workflows already block merges on Rust / TS regressions.
-
-name: Lint CI
-
-on:
-  pull_request:
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  source-lint:
-    name: Source lint (Python + shell + YAML + JSON + safety nets)
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      # Pin ruff to match .pre-commit-config.yaml so a CI-only ruff
-      # bump cannot disagree with what pre-commit accepted.
-      # codespell is pinned for the same reason: a reviewer should
-      # never see a typo report appear and disappear depending on
-      # which codespell version the runner happened to install.
-      - run: pip install 'ruff==0.15.12' 'pyyaml>=6' 'codespell>=2.3,<3'
-
-      - name: Linux deps for shellcheck
-        run: sudo apt-get update -qq && sudo apt-get install -y --no-install-recommends shellcheck
-
-      - name: Python AST/syntax check (every committed .py must compile)
-        # python -m compileall uses the same parser the interpreter
-        # uses, so anything broken here would also crash at
-        # `import X` on a user's machine. Sub-second across 350+
-        # files. Hard gate.
-        run: |
-          python -m compileall -q -j 0 \
-            unsloth unsloth_cli studio tests cli.py unsloth-cli.py
-
-      - name: Python ruff check (whole repo)
-        # The narrow rule set in pyproject.toml [tool.ruff.lint]
-        # selects E9 / F63 / F7 / F82 -- syntax errors, broken
-        # comparisons, undefined names. The whole repo passes today,
-        # so this is a hard gate.
-        run: |
-          ruff check unsloth unsloth_cli studio tests cli.py unsloth-cli.py
-
-      - name: No leftover debugger / pdb / breakpoint calls
-        # Catches the "I'll just stick a breakpoint() here" mistake
-        # before it ships. AST-based so commented-out debugger
-        # markers don't false-positive (a bare grep would; there
-        # are three commented `# breakpoint()` markers in
-        # unsloth/models/rl* today). Sub-second.
-        run: |
-          python <<'PY'
-          import ast, pathlib, sys
-
-          SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
-                        "unsloth_compiled_cache", "node_modules",
-                        "unsloth.egg-info"}
-
-          bad = []
-          scanned = 0
-          for path in sorted(pathlib.Path(".").rglob("*.py")):
-              if any(part in SKIP_PARTS for part in path.parts):
-                  continue
-              scanned += 1
-              try:
-                  tree = ast.parse(path.read_text(encoding="utf-8", errors="replace"))
-              except SyntaxError:
-                  continue   # compileall step above already failed this
-              for node in ast.walk(tree):
-                  if not isinstance(node, ast.Call):
-                      continue
-                  fn = node.func
-                  if isinstance(fn, ast.Name) and fn.id == "breakpoint":
-                      bad.append((path, node.lineno, "breakpoint()"))
-                  elif (isinstance(fn, ast.Attribute) and fn.attr == "set_trace"
-                        and isinstance(fn.value, ast.Name)
-                        and fn.value.id in {"pdb", "ipdb"}):
-                      bad.append((path, node.lineno, f"{fn.value.id}.set_trace()"))
-
-          if bad:
-              for path, lineno, what in bad:
-                  print(f"::error file={path},line={lineno}::leftover {what} -- remove before merging")
-              sys.exit(1)
-          print(f"no leftover debugger calls (scanned {scanned} files)")
-          PY
-
-      - name: License-header drift (informational; whole repo)
-        # Three header families are accepted across the repo:
-        #   1. SPDX one-liner: `# SPDX-License-Identifier: ...`
-        #      Used across studio/ (AGPL-3.0-only) and a few new
-        #      files elsewhere.
-        #   2. Apache-2.0 long form, marker phrase
-        #      "Licensed under the Apache License". Used across
-        #      unsloth/ and unsloth_cli/.
-        #   3. GNU long form, marker phrase "General Public License".
-        #      That single substring covers GPL, LGPL ("GNU Lesser
-        #      General Public License") and AGPL ("GNU Affero
-        #      General Public License") preambles, all three of
-        #      which appear in unsloth/kernels/* (LGPL/AGPL) without
-        #      the SPDX line.
-        # Empty files (mainly empty __init__.py) are skipped.
-        # Surfaced as a warning; cleaning up the actual misses is a
-        # follow-up PR, not a CI fix.
-        continue-on-error: true
-        run: |
-          python <<'PY'
-          import pathlib
-
-          ACCEPTED = (
-              "SPDX-License-Identifier",        # any SPDX line
-              "Licensed under the Apache License",  # Apache-2.0 long form
-              "General Public License",         # GPL / LGPL / AGPL long form
-          )
-          SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
-                        "unsloth_compiled_cache", "node_modules",
-                        "unsloth.egg-info"}
-
-          studio_missing = []
-          other_missing  = []
-          for path in sorted(pathlib.Path(".").rglob("*.py")):
-              if any(part in SKIP_PARTS for part in path.parts):
-                  continue
-              text = path.read_text(encoding="utf-8", errors="replace")
-              if not text.strip():
-                  continue  # empty __init__.py etc.
-              head = "\n".join(text.splitlines()[:25])
-              if any(marker in head for marker in ACCEPTED):
-                  continue
-              if "studio" in path.parts:
-                  studio_missing.append(path)
-              else:
-                  other_missing.append(path)
-
-          total = len(studio_missing) + len(other_missing)
-          if total == 0:
-              print("every committed .py has a recognised license header")
-          else:
-              print(f"::warning::{total} Python files have no recognised license "
-                    f"header (SPDX / Apache-2.0 / GNU long form): "
-                    f"studio={len(studio_missing)}, other={len(other_missing)}")
-              for path in (studio_missing + other_missing)[:30]:
-                  print(f"  {path}")
-              if total > 30:
-                  print(f"  ... and {total - 30} more")
-          PY
-
-      - name: Shell scripts parse cleanly (`bash -n`)
-        # Same idea as Python's compileall: parse-only check that
-        # every committed *.sh would not blow up at `bash script.sh`
-        # invocation time on a release box. tests/sh/ is the largest
-        # cluster (the install.sh shape tests).
-        run: |
-          shopt -s globstar
-          fail=0
-          for f in $(git ls-files '*.sh'); do
-              if ! bash -n "$f"; then
-                  echo "::error file=$f::shell parse error"
-                  fail=1
-              fi
-          done
-          if [ "$fail" -ne 0 ]; then
-              exit 1
-          fi
-          n=$(git ls-files '*.sh' | wc -l)
-          echo "$n shell scripts parse cleanly"
-
-      - name: YAML files parse cleanly (yaml.safe_load)
-        # Catches truncated workflow files, broken indents in
-        # dependabot.yml / pre-commit configs, etc. Includes
-        # .github/workflows/*.yml so a typo in the file we just
-        # added shows up immediately.
-        run: |
-          python <<'PY'
-          import pathlib, sys, yaml
-
-          SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
-                        "node_modules", "unsloth_compiled_cache",
-                        "unsloth.egg-info"}
-
-          bad = []
-          scanned = 0
-          for path in sorted(list(pathlib.Path(".").rglob("*.yml"))
-                             + list(pathlib.Path(".").rglob("*.yaml"))):
-              if any(part in SKIP_PARTS for part in path.parts):
-                  continue
-              scanned += 1
-              try:
-                  with path.open("r", encoding="utf-8") as fh:
-                      list(yaml.safe_load_all(fh))
-              except Exception as exc:
-                  bad.append((path, exc))
-
-          if bad:
-              for path, exc in bad:
-                  print(f"::error file={path}::YAML parse failed: {exc}")
-              sys.exit(1)
-          print(f"{scanned} YAML files parse cleanly")
-          PY
-
-      - name: JSON files parse cleanly (json.loads)
-        # Catches malformed package.json, biome.json, etc. Skips:
-        #   - huge npm/bun lockfiles (machine-generated, slow to
-        #     parse, no value).
-        #   - tsconfig*.json: TypeScript convention is JSONC (JSON
-        #     with `/* ... */` comments), which standard json.loads
-        #     rejects. Strip-and-validate would need json5 or a
-        #     hand-rolled comment scrubber for marginal value, since
-        #     `tsc --noEmit` already validates these in Frontend CI.
-        run: |
-          python <<'PY'
-          import fnmatch, json, pathlib, sys
-
-          SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
-                        "node_modules", "unsloth_compiled_cache",
-                        "unsloth.egg-info"}
-          SKIP_NAMES = {"package-lock.json", "bun.lock"}
-          SKIP_PATTERNS = ("tsconfig*.json",)
-
-          bad = []
-          scanned = 0
-          for path in sorted(pathlib.Path(".").rglob("*.json")):
-              if any(part in SKIP_PARTS for part in path.parts):
-                  continue
-              if path.name in SKIP_NAMES:
-                  continue
-              if any(fnmatch.fnmatch(path.name, pat) for pat in SKIP_PATTERNS):
-                  continue
-              scanned += 1
-              try:
-                  json.loads(path.read_text(encoding="utf-8"))
-              except Exception as exc:
-                  bad.append((path, exc))
-
-          if bad:
-              for path, exc in bad:
-                  print(f"::error file={path}::JSON parse failed: {exc}")
-              sys.exit(1)
-          print(f"{scanned} JSON files parse cleanly")
-          PY
-
-      - name: codespell typo check (informational)
-        # Catches typos in code, comments, and docs across the repo.
-        # Skips lockfiles, generated assets, binary artefacts, and
-        # the LICENSE files (US/UK spelling drift in legal text is
-        # not ours to second-guess). The ignore-words-list pulls
-        # out short identifiers + valid technical terms that
-        # codespell's default dictionary would otherwise flag
-        # (e.g. `ans` as a math-quiz variable name in
-        # tests/utils/aime_eval.py, `parm`/`parms` in PyTorch
-        # nn.Module idioms). Non-blocking until the surfaced typos
-        # are fixed; drop continue-on-error after the cleanup.
-        continue-on-error: true
-        run: |
-          codespell \
-            --skip='*.lock,*.lockb,*.json,*.svg,*.png,*.jpg,*.jpeg,*.gif,*.ico,*.woff*,*.ttf,*.eot,*.zip,*.gz,*.gguf,*.safetensors,*.bin,node_modules,.git,build,dist,unsloth_compiled_cache,unsloth.egg-info,target,studio/frontend/dist,*.pyc,*-licenses.txt,LICENSE*' \
-            --ignore-words-list='ans,bu,hel,fo,te,ot,hist,ned,sav,recurser,datas,nin,parm,parms,checkin,nd,fr,inout,donot,uint' \
-            --quiet-level=2
-
-      - name: shellcheck on committed *.sh (informational)
-        # Goes beyond `bash -n` (which only parses): catches subtle
-        # shell bugs like unquoted variable expansions, useless
-        # `cat`, command substitutions inside `[[`, etc. The
-        # install/setup scripts are critical-path so the signal is
-        # worth surfacing. Non-blocking until install.sh's
-        # hand-rolled patterns get cleaned up; drop continue-on-error
-        # afterwards.
-        continue-on-error: true
-        run: |
-          # Exclude SC1090 ("source not followable") -- legitimate
-          # for installer scripts that source files at runtime
-          # paths shellcheck cannot resolve statically.
-          # SC2034 ("variable assigned but never used") fires on
-          # the export-only assignment idiom we use in install.sh.
-          shellcheck -e SC1090,SC2034 $(git ls-files '*.sh')
-
-      - name: ruff format drift (informational)
-        # The canonical formatter is scripts/run_ruff_format.py
-        # = ruff format + scripts/enforce_kwargs_spacing.py, so plain
-        # `ruff format --check` reports the kwarg-spacing diff as
-        # drift. Surface the count for visibility but keep
-        # non-blocking until the custom pipeline is wired in here.
-        continue-on-error: true
-        run: |
-          ruff format --check unsloth unsloth_cli studio tests cli.py unsloth-cli.py
diff --git a/.github/workflows/mlx-ci.yml b/.github/workflows/mlx-ci.yml
deleted file mode 100644
index 75940832a0..0000000000
--- a/.github/workflows/mlx-ci.yml
+++ /dev/null
@@ -1,430 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Focused PR gate for the MLX dispatch surface, running on a real
-# Apple Silicon runner.
-#
-# Runner: macos-14 (M1, 3 vCPU / 7 GB / Apple Silicon standard runner
-# -- FREE for public repositories per the GitHub Actions billing
-# reference; larger variants like macos-14-large/-xlarge are paid so
-# we deliberately avoid those).
-#
-# Why a single Mac job (no Linux+spoof leg): the dispatch tests are
-# 100% spoofed monkeypatches and run identically on any host, so the
-# Linux leg was duplicating the matrix tests already covered on Mac
-# while missing everything Apple-specific. The Mac job runs the SAME
-# spoofed matrix PLUS three things only a real Apple Silicon host
-# can prove:
-#
-#   1. unsloth._IS_MLX flips True on Darwin+arm64 with mlx genuinely
-#      installed (no spoof).
-#   2. Every PR-A MLX-only unsloth_zoo module (mlx_loader, mlx_trainer,
-#      mlx_compile, mlx_utils, mlx_cce, gated_delta_vjp) imports
-#      against the real `mlx` + `mlx-lm` + `mlx-vlm` PyPI wheels --
-#      each does `import mlx.core as mx` at module top level, so this
-#      catches a future change that breaks the real wheels without
-#      needing a Mac developer in the loop.
-#   3. The hardware-dispatch spoofs do not collide with the real
-#      environment (the test fixture installs a MetaPathFinder that
-#      blocks `import mlx.core` for "no-mlx" profiles, faithfully
-#      simulating a Mac without mlx even when mlx IS installed).
-#   4. End-to-end MLX training + inference smoke test:
-#      run_real_mlx_smoke.py trains unsloth/gemma-3-270m-it for 7
-#      deterministic LoRA steps on a single repeated text row, then
-#      verifies the trained model can complete the prompt and that
-#      losses + grad norms are finite and well-behaved. This is the
-#      only place in CI that exercises a real MLX backward pass +
-#      optimizer step + inference call.
-#
-# Three dispatch test files documented in tests/studio/README.md:
-#   - test_hardware_dispatch_matrix.py    parametrized 7-profile matrix
-#                                         + 2 dispatch-priority canaries
-#   - test_is_mlx_dispatch_gate.py        AST + runtime guard on
-#                                         unsloth._IS_MLX
-#   - test_mlx_training_worker_behaviors.py  AST contract checks on
-#                                            studio/backend/core/training/worker.py
-#
-# Surfaces a single PR check ("MLX CI on Mac M1 / dispatch").
-#
-# Security audit footprint: every package this workflow installs is
-# already covered by .github/workflows/security-audit.yml -- the deps
-# come from studio/backend/requirements/studio.txt and unsloth-zoo's
-# pyproject (resolved transitively). The git+ install of unsloth-zoo
-# is intentionally skipped by the audit (pip-audit cannot resolve a
-# git URL through PyPI metadata; the audit comment in security-audit.yml
-# documents this). No new package is introduced solely by MLX CI.
-
-name: MLX CI on Mac M1
-
-on:
-  pull_request:
-    paths:
-      - 'unsloth/__init__.py'
-      - 'unsloth/_gpu_init.py'
-      - 'studio/backend/utils/hardware/**'
-      - 'studio/backend/core/training/worker.py'
-      - 'studio/backend/core/inference/mlx_inference.py'
-      - 'tests/studio/test_hardware_dispatch_matrix.py'
-      - 'tests/studio/test_is_mlx_dispatch_gate.py'
-      - 'tests/studio/test_mlx_training_worker_behaviors.py'
-      - 'tests/studio/run_real_mlx_smoke.py'
-      - 'tests/conftest.py'
-      - '.github/workflows/mlx-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  dispatch:
-    name: dispatch
-    runs-on: macos-14
-    # 25 min: dispatch + spoofed matrix + 7-step real LoRA training is
-    # under 2 min; GGUF export builds llama.cpp via cmake on Apple
-    # Silicon (~5-7 min), so we budget headroom.
-    timeout-minutes: 25
-    steps:
-      # harden-runner audit mode: macOS runners cannot use blocking mode
-      # today (eBPF egress enforcement is Linux-only), but audit mode is
-      # supported cross-platform and surfaces the egress destinations in
-      # the runner log. This produces the data needed to graduate this
-      # job to a block-mode allowlist once macOS support lands.
-      - name: Harden runner (audit)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: audit
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      # macOS install ladder, validated locally against a Linux
-      # mac-sim venv (platform spoofed + mlx_simulation shim + real
-      # datasets/transformers/structlog).
-      #
-      # 1. studio/backend/requirements/studio.txt brings structlog,
-      #    fastapi, etc. The hardware probe imports structlog at
-      #    module top level.
-      # 2. Same pytest / numpy / httpx stack the rest of the repo CI
-      #    uses.
-      # 3. torch is explicitly installed: unsloth-zoo's pyproject
-      #    deliberately excludes torch on darwin+arm64 (mlx replaces
-      #    it for runtime use), but the dispatch tests spoof
-      #    torch.cuda / torch.xpu / torch.backends.mps via monkeypatch
-      #    and so the test process needs torch importable. We pull
-      #    from the PyTorch CPU index so Apple Silicon gets the
-      #    explicit cpu+MPS arm64 wheel rather than something the
-      #    default PyPI resolver might pick up. The CPU index hosts
-      #    macosx_*_arm64 wheels alongside the Linux x86_64 ones.
-      # 4. unsloth-zoo from git main (NOT PyPI), WITH deps. PR-A's
-      #    MLX support landed after the most recent unsloth-zoo PyPI
-      #    release; the wheel still raises NotImplementedError on
-      #    Apple Silicon when device_type.get_device_type() runs
-      #    unguarded. Studio's own install.sh overlays unsloth-zoo
-      #    from git main for the same reason. Pulling deps lets pip
-      #    resolve the platform-conditional MLX-only wheels (mlx,
-      #    mlx-lm, mlx-vlm gated on darwin+arm64 in unsloth-zoo's
-      #    pyproject) AND the shared deps (datasets, transformers,
-      #    sentencepiece, ...) that unsloth's MLX branch loads via
-      #    dataprep/raw_text.py.
-      # 5. unsloth -e . --no-deps so the editable install does not
-      #    fight the unsloth-zoo dep set.
-      #
-      # All explicit pip installs are version-pinned to a single
-      # released version (the latest as of 2026-05-07 within each
-      # project's existing constraint range). bump alongside the rest
-      # of the security audit when a new release lands.
-      - name: Install deps
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r studio/backend/requirements/studio.txt
-          pip install \
-            'python-multipart==0.0.27' \
-            'aiofiles==25.1.0' \
-            'sqlalchemy==2.0.49' \
-            'cryptography==48.0.0' \
-            'pyyaml==6.0.3' \
-            'jinja2==3.1.6' \
-            'mammoth==1.12.0' \
-            'unpdf==1.0.0' \
-            'requests==2.33.1' \
-            'typer==0.25.1' \
-            'numpy==2.4.4' \
-            'pytest==9.0.3' \
-            'pytest-asyncio==1.3.0' \
-            'httpx==0.28.1'
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch==2.10.0'
-          # github.com occasionally 500s on the git fetch; retry the
-          # zoo install so a single upstream blip does not fail CI.
-          for attempt in 1 2 3; do
-            if pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo"; then
-              break
-            fi
-            if [ "$attempt" -eq 3 ]; then
-              echo "::error::pip install unsloth_zoo failed after 3 attempts"
-              exit 1
-            fi
-            delay=$((5 * attempt))
-            echo "::warning::unsloth_zoo install failed (attempt $attempt/3), retrying in ${delay}s..."
-            sleep "$delay"
-          done
-          pip install -e . --no-deps
-
-      # Real Apple Silicon sanity: confirm _IS_MLX activates on real
-      # hardware with no platform spoof.
-      - name: Verify _IS_MLX flips True on real Apple Silicon
-        run: |
-          python -c "
-          import platform
-          assert platform.system() == 'Darwin', platform.system()
-          assert platform.machine() == 'arm64', platform.machine()
-          import unsloth
-          assert unsloth._IS_MLX is True, f'expected _IS_MLX=True on real Apple Silicon, got {unsloth._IS_MLX}'
-          print('OK: _IS_MLX activated on real Apple Silicon')
-          "
-
-      # Real Apple Silicon sanity: confirm every PR-A MLX-only module
-      # loads against real mlx + mlx-lm + mlx-vlm wheels.
-      - name: Smoke-import every MLX-only unsloth_zoo module
-        run: |
-          python -c "
-          import importlib
-          for name in [
-              'unsloth_zoo.mlx_loader',
-              'unsloth_zoo.mlx_trainer',
-              'unsloth_zoo.mlx_compile',
-              'unsloth_zoo.mlx_utils',
-              'unsloth_zoo.mlx_cce',
-              'unsloth_zoo.gated_delta_vjp',
-          ]:
-              importlib.import_module(name)
-              print('OK:', name)
-          from unsloth_zoo.mlx_loader import FastMLXModel
-          from unsloth_zoo.mlx_trainer import MLXTrainer, MLXTrainingConfig
-          assert hasattr(FastMLXModel, 'from_pretrained')
-          print('OK: FastMLXModel + MLXTrainer surface present')
-          "
-
-      # Spoofed dispatch matrix. Runs on the real Mac too -- the
-      # test fixture installs a MetaPathFinder that blocks
-      # `import mlx.core` for "no-mlx" profiles, so the spoofs
-      # faithfully simulate every supported hardware combo regardless
-      # of whether mlx is installed for real.
-      - name: MLX dispatch tests (3 files, 36 tests)
-        env:
-          PYTHONPATH: ${{ github.workspace }}/studio
-          UNSLOTH_COMPILE_DISABLE: '1'
-        run: |
-          python -m pytest -v --tb=short \
-            tests/studio/test_hardware_dispatch_matrix.py \
-            tests/studio/test_is_mlx_dispatch_gate.py \
-            tests/studio/test_mlx_training_worker_behaviors.py
-
-      # Studio prebuilt llama.cpp install + GGUF inference. Drives the
-      # exact path Studio's setup.sh takes on macOS: invokes
-      # studio/install_llama_prebuilt.py with --published-repo
-      # ggml-org/llama.cpp and --published-release-tag b9049 (the
-      # latest llama.cpp release at the time this step was added; bump
-      # via UNSLOTH_LLAMA_TAG / DEFAULT_LLAMA_TAG when refreshing).
-      # The installer downloads llama-b9049-bin-macos-arm64.tar.gz,
-      # which is the universal Apple Silicon (arm64) build -- the
-      # same artifact works on M1/M2/M3/M4 because llama.cpp compiles
-      # against the ARMv8.2 baseline.
-      #
-      # The b9049 release also publishes:
-      #   - llama-b9049-bin-macos-arm64-kleidiai.tar.gz
-      #     KleidiAI dispatches at runtime; on M1 it falls back where
-      #     ISA features (e.g. I8MM) are missing, so this asset also
-      #     runs on M1 -- Studio just doesn't choose it by default.
-      #   - llama-b9049-bin-macos-x64.tar.gz
-      #     Intel-only; would only run on M1 via Rosetta 2 emulation,
-      #     which we explicitly avoid.
-      #   - iOS XCFramework
-      #     iOS-app build artifact, unrelated to a macOS desktop CI.
-      #
-      # After install, downloads a small published GGUF
-      # (unsloth/gemma-3-270m-it-GGUF, Q4_K_M) from HuggingFace and
-      # runs the prebuilt llama-cli on it. Asserts the prompt echo
-      # appears in stdout. If the install fails OR the binary exits
-      # non-zero, that's an Unsloth/Studio bug.
-      - name: Studio prebuilt llama.cpp install + GGUF inference (Mac M1)
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          # install_llama_prebuilt.py hits the GitHub releases API to
-          # resolve the asset URL. Anonymous calls share the runner-IP
-          # rate-limit bucket and 403 quickly -- pass the workflow's
-          # automatic GITHUB_TOKEN to bump us to the 5000/hr authenticated
-          # bucket.
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -euo pipefail
-          INSTALL_DIR="$HOME/.unsloth-studio-prebuilt-test/llama.cpp"
-          rm -rf "$INSTALL_DIR"
-          # --simple-policy is required when --published-repo points
-          # at upstream ggml-org/llama.cpp; that repo doesn't ship the
-          # llama-prebuilt-manifest.json asset Studio's default policy
-          # expects, so the simple platform-specific policy maps
-          # Darwin+arm64 -> bin-macos-arm64 directly. studio/setup.sh
-          # passes both --published-repo ggml-org/llama.cpp AND
-          # --simple-policy automatically on macOS, so this CI step
-          # exercises the same code path users hit when they run
-          # `curl -fsSL https://unsloth.ai/install.sh | sh`.
-          python studio/install_llama_prebuilt.py \
-            --install-dir "$INSTALL_DIR" \
-            --published-repo ggml-org/llama.cpp \
-            --published-release-tag b9049 \
-            --simple-policy
-
-          # Studio bundles only llama-server + llama-quantize from the
-          # prebuilt (not llama-cli) -- inference goes through
-          # llama-server's HTTP /completion endpoint. Validate both:
-          # llama-quantize --help proves the dynamic libs link, then
-          # spin up llama-server and POST a /completion request on a
-          # tiny published GGUF.
-          LLAMA_SERVER="$INSTALL_DIR/build/bin/llama-server"
-          LLAMA_QUANT="$INSTALL_DIR/build/bin/llama-quantize"
-          [ -x "$LLAMA_SERVER" ] || { echo "::error::llama-server missing at $LLAMA_SERVER"; find "$INSTALL_DIR/build" -type f | head -40; exit 1; }
-          [ -x "$LLAMA_QUANT" ]  || { echo "::error::llama-quantize missing at $LLAMA_QUANT"; exit 1; }
-          echo "llama-server : $LLAMA_SERVER"
-          echo "llama-quantize: $LLAMA_QUANT"
-          "$LLAMA_QUANT" --help >/dev/null && echo "  llama-quantize loads OK"
-
-          mkdir -p /tmp/ggufs
-          bash .github/scripts/hf-download-with-retry.sh \
-            'unsloth/gemma-3-270m-it-GGUF' \
-            'gemma-3-270m-it-Q4_K_M.gguf' \
-            /tmp/ggufs
-
-          PORT=18080
-          echo "=== starting llama-server on 127.0.0.1:$PORT ==="
-          "$LLAMA_SERVER" \
-            -m /tmp/ggufs/gemma-3-270m-it-Q4_K_M.gguf \
-            --host 127.0.0.1 \
-            --port "$PORT" \
-            -c 256 \
-            -n 16 \
-            --no-warmup \
-            > /tmp/llama-server.log 2>&1 &
-          SERVER_PID=$!
-          trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
-
-          # Wait for /health to come up
-          for i in $(seq 1 30); do
-            if curl -sf "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
-              echo "  server up after ${i}s"
-              break
-            fi
-            sleep 1
-          done
-          if ! curl -sf "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
-            echo "::error::llama-server never became healthy"
-            tail -40 /tmp/llama-server.log
-            exit 1
-          fi
-
-          PROMPT="Hello, my name is"
-          echo "=== POST /completion ==="
-          RESP=$(curl -sf -X POST "http://127.0.0.1:$PORT/completion" \
-            -H 'Content-Type: application/json' \
-            -d "{\"prompt\":\"$PROMPT\",\"n_predict\":16,\"temperature\":0,\"seed\":3407}")
-          echo "raw response (head): $(echo "$RESP" | head -c 600)"
-          CONTENT=$(echo "$RESP" | python -c "import json,sys; print(json.loads(sys.stdin.read()).get('content',''))")
-          echo "completion content: $CONTENT"
-
-          if [ -z "$CONTENT" ]; then
-            echo "::error::llama-server /completion returned empty content"
-            tail -40 /tmp/llama-server.log
-            exit 1
-          fi
-          echo "OK: Studio prebuilt llama.cpp on Mac M1 + GGUF /completion works"
-
-      # Real MLX training + inference smoke test. Trains
-      # unsloth/gemma-3-270m-it for 7 deterministic LoRA steps
-      # (batch_size=2, gradient_accumulation_steps=3) on a single
-      # repeated row ("<<HELLO!!>> My name is Unsloth!"), then saves
-      # the trained model in 3 export formats. The `train` subcommand
-      # captures per-phase timing + peak GPU + peak RSS into
-      # train_metrics.json so we can detect regressions across CI runs.
-      - name: MLX export round-trip — TRAIN + SAVE 3 formats
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          UNSLOTH_COMPILE_DISABLE: '1'
-        run: |
-          mkdir -p mlx_workdir
-          python tests/studio/run_real_mlx_smoke.py train \
-            --workdir "$PWD/mlx_workdir"
-
-      # Each reload step runs in a FRESH Python process to confirm
-      # the cold-start path users would hit in production also works
-      # (not just the in-memory continuation of a still-running
-      # trainer). FastMLXModel.from_pretrained gets called from
-      # scratch; mx.random is re-seeded; per-step timing + peak
-      # memory are emitted to {format}_reload_metrics.json next to
-      # the saved dir.
-      - name: MLX export round-trip — RELOAD LoRA (fresh process)
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          UNSLOTH_COMPILE_DISABLE: '1'
-        run: |
-          python tests/studio/run_real_mlx_smoke.py reload \
-            --format lora \
-            --dir "$PWD/mlx_workdir/lora"
-
-      - name: MLX export round-trip — RELOAD merged_16bit (fresh process)
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          UNSLOTH_COMPILE_DISABLE: '1'
-        run: |
-          python tests/studio/run_real_mlx_smoke.py reload \
-            --format merged \
-            --dir "$PWD/mlx_workdir/merged_16bit"
-
-      # GGUF reload uses the llama-cli binary that save_pretrained_gguf
-      # built. If save_pretrained_gguf was skipped during train (e.g.
-      # llama.cpp's convert_hf_to_gguf asserts on the model's tokenizer
-      # vocab -- a downstream llama.cpp limitation, not an unsloth_zoo
-      # bug), this step emits a workflow warning and exits 0 so the
-      # LoRA + merged_16bit assertions remain the gating signal.
-      - name: MLX export round-trip — RELOAD GGUF via llama-cli (fresh process)
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          if python -c "import json,sys; m=json.load(open('mlx_workdir/train_metrics.json')); sys.exit(0 if m.get('gguf_supported') else 1)"; then
-            python tests/studio/run_real_mlx_smoke.py reload \
-              --format gguf \
-              --dir "$PWD/mlx_workdir/gguf"
-          else
-            REASON=$(python -c "import json; m=json.load(open('mlx_workdir/train_metrics.json')); print(m.get('gguf_skip_reason') or 'unknown')")
-            echo "::warning title=GGUF round-trip skipped::${REASON}"
-            echo "GGUF export was skipped during the train phase. Reason:"
-            echo "  ${REASON}"
-            echo "Continuing without failing the job; the LoRA + merged_16bit"
-            echo "reload assertions are still gating this PR."
-          fi
-
-      # Print all metrics JSON files so regressions are visible in the
-      # job log. always() so we get telemetry even if a reload step
-      # asserted gibberish.
-      - name: MLX export round-trip — aggregate metrics
-        if: always()
-        run: |
-          for f in mlx_workdir/train_metrics.json \
-                   mlx_workdir/lora_reload_metrics.json \
-                   mlx_workdir/merged_reload_metrics.json \
-                   mlx_workdir/gguf_reload_metrics.json; do
-            echo "=== $f ==="
-            cat "$f" 2>/dev/null || echo "(missing)"
-            echo
-          done
diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml
new file mode 100644
index 0000000000..b46808b374
--- /dev/null
+++ b/.github/workflows/mlx-parity-probe.yml
@@ -0,0 +1,492 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
+#
+# MLX vs HF parity bisection probes -- one Mac M1 job per (probe, seed) matrix entry.
+#
+# Why parallel matrix: each probe is fully independent (different
+# subprocess, different pip state matters not at all because the
+# install layer is the same for each). Fanning out lets:
+#   * a single failing probe NOT block the diagnostic data from the
+#     remaining probes (already had continue-on-error, but matrix gives
+#     each its own job log + artifact + duration);
+#   * total wall-time = max(probe_install + probe_run) instead of
+#     sum across probes;
+#   * future probes added without touching the existing ones.
+#
+# Each probe job:
+#   1. installs the common dep set (MLX + torch CPU + transformers + zoo)
+#   2. runs ONE probe script with ONE seed
+#   3. always uploads its tests/mlx_parity/.out/ directory as an artifact
+#
+# A final aggregate job downloads all artifacts and prints every probe
+# JSON to its log so the human reader can see all probes without
+# clicking into each matrix job's log separately.
+
+name: MLX parity probes
+
+on:
+  pull_request:
+    paths:  # only PRs that touch the probes or this workflow trigger a run
+      - 'tests/mlx_parity/**'
+      - '.github/workflows/mlx-parity-probe.yml'
+  push:
+    branches: [mlx-parity-probes]  # pushes to the dedicated probe branch
+  workflow_dispatch: {}  # manual runs, no inputs
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one group per workflow + ref
+  cancel-in-progress: true  # a newer run in the same group cancels the older one
+
+permissions:
+  contents: read  # the only token permission this workflow requests
+
+jobs:
+  probe:
+    name: probe-${{ matrix.id }}  # e.g. probe-34_42 (probe number, then seed)
+    runs-on: macos-14
+    timeout-minutes: 25  # per matrix job
+    strategy:
+      fail-fast: false  # one failing cell must not cancel the remaining cells
+      matrix:
+        include:  # every cell is listed explicitly as {id, script, seed}
+          # ---- Extended-investigation matrix (one Mac M1 job per include entry) ----
+          # Question: is MLX itself broken, or does post-#634 just need more
+          # steps / different seeds to memorize?
+          #
+          # Each probe_17 entry runs the unsloth-zoo HEAD trainer
+          # (broken default until #663 merges) with a different
+          # (steps, seed) combination so we can rule in/out a
+          # convergence-horizon effect.
+
+          # Round AW: binary-search the 20% pass-rate gap between
+          # mlx-lm native (80%, Round AU) and unsloth-zoo MLXTrainer
+          # (60%, Round AV) at the same effective config. CCE and
+          # gradient checkpointing already eliminated (probe forces
+          # use_cce=False, gradient_checkpointing=False). Two axes
+          # remain live:
+          #   * clip: max_grad_value=1.0 (B) vs None (A)
+          #   * accum: bs=2 * accum=3 (B's smoke default) vs native
+          #            bs=6 * accum=1 (A's native batching)
+          # 2x2 factorial x 5 seeds (including known failing 22222
+          # and 12345, plus controls 42, 999, 3407).
+          # Cell A (baseline, repeat AV)         : clip=1.0, bs=2 acc=3
+          # Cell B (drop clip)                   : clip=off, bs=2 acc=3
+          # Cell C (drop accum, native bs=6)     : clip=1.0, bs=6 acc=1
+          # Cell D (drop both, full mlx-lm match): clip=off, bs=6 acc=1
+          # If D ~= 4-5/5 and A=3/5, both axes contribute. If only D
+          # is high, interaction effect. If C high & B not, accum is
+          # the dominant cause.
+
+          # ---- Round AY: disambiguate LOADER vs TRAINER as source of the gap ----
+          # Round AX (n=15) confirmed the gap is real:
+          #   mlx-lm native      10/15 = 67%
+          #   zoo Cell A (smoke)  7/15 = 47%
+          #   zoo Cell D (match)  6/15 = 40%
+          # mlx-lm strictly dominates zoo across paired seeds. The gap is
+          # NOT from clip or grad-accum (Round AW). Remaining candidates
+          # split into two buckets:
+          #   - LOADER: FastMLXModel.from_pretrained (dtype cast +
+          #             mx.eval(params)) + get_peft_model (different
+          #             freeze/unfreeze order, sets memory limits)
+          #   - TRAINER: data sampler RNG, extra mx.eval(grad_norm),
+          #             optimizer wiring, callback overhead
+          #
+          # Probe 21 builds a HYBRID: mlx-lm's load() + linear_to_lora_layers()
+          # then drives training via unsloth-zoo MLXTrainer (clip=off,
+          # bs=6, acc=1 -- matching the closest possible config).
+          # Reading:
+          #   pass_rate ~67% -> gap is in unsloth-zoo's LOADER
+          #   pass_rate ~40% -> gap is in unsloth-zoo's TRAINER
+          # Same 15 seeds used in AX for direct paired comparison.
+
+          # ---- Round AZ: numpy-reset hypothesis + triple-confirm mlx-lm ----
+          # Round AY proved gap is in TRAINER (probe_21 hybrid 47% =
+          # zoo 47%, not mlx-lm 67%). Leading suspect: numpy RNG state.
+          # mlx-lm calls np.random.seed(args.seed) at lora.py:320
+          # immediately before the training loop. Probe 22 = same hybrid
+          # as 21 but with np.random.seed reset right before train().
+          # If 22 ~= 67%, numpy RNG is the cause. If ~= 47%, RNG isn't.
+          # Also re-runs probe_20 (mlx-lm native) on the same 15 seeds
+          # to triple-confirm the original mlx-lm 67% number.
+          # Same 15 seeds as AX/AY.
+
+          # (probe 22 -- hybrid + np.seed reset just before train() -- has no entries left in this matrix.)
+          # ---- Round BF: THE FIX — seed mx.random AFTER model load ----
+          # CRITICAL DISCOVERY: nn.Linear.__init__ (mlx-src/python/mlx/nn/
+          # layers/linear.py:51) calls mx.random.uniform. So every Linear
+          # module constructed during model load consumes mx.random state.
+          # mlx-lm CLI seeds AFTER load (lora.py:223); my inline probes
+          # seeded BEFORE load. Result: lora_a init from different RNG
+          # positions, leading to different basins.
+          # Probe 30 reseeds AFTER load + adds set_wired_limit. If 67%,
+          # the seed-order is the bug; the FIX is to seed AFTER load.
+
+          # Round BR drops probes 30/35/37/38 from this matrix (they
+          # targeted the mlx-lm CLI path and earlier zoo variants that are
+          # no longer the live suspect after PR #674). Git history retains
+          # them.
+
+          # ---- Round BA: compile-mode hypothesis ----
+          # Round AZ rejected numpy-RNG (probe 22 = probe 21). The
+          # biggest remaining structural diff inside the trainer:
+          # mlx-lm always wraps step with @mx.compile (trainer.py:248);
+          # zoo only does so when args.compile=True (and we set it
+          # False in prior probes). Probe 23 = probe 22 + compile=True
+          # to test if compile-mode is the missing piece.
+
+          # ---- Round BB: loss-fn dtype-propagation hypothesis ----
+          # Round BA rejected compile (probe 23 = 43% = zoo, not 67%).
+          # Next live suspect: backward through zoo's
+          # `mask.astype(float32) * ce_fp16` carries gradients in fp32,
+          # while mlx-lm's `bool_mask * ce_fp16` keeps them in fp16.
+          # Probe 24 monkey-patches make_baseline_loss_fn with a
+          # verbatim copy of mlx-lm's default_loss (bool mask, no
+          # astype(fp32) on the mask).
+
+          # ---- Round BC: complement of probe 24 to isolate loop vs loss ----
+          # Probe 24 (mlx-lm loss in zoo trainer): 50% — barely above zoo.
+          # Probe 25 inverts: mlx-lm verbatim training loop using ZOO's
+          # make_baseline_loss_fn. If 67% — loss is irrelevant, loop is
+          # the cause. If 47% — loss IS the cause. Combined with 24, this
+          # bracket-tests both sides of the trainer/loss boundary.
+
+          # ---- Round BD: control with no unsloth_zoo imports ----
+          # Probes 22-25 all import unsloth_zoo and all hit 40-50%.
+          # Probe 20 (mlx-lm CLI subprocess) hits 67%. Probe 26 runs
+          # identical mlx-lm-style training INLINE (no subprocess) but
+          # imports NO unsloth_zoo modules. If 67% — the unsloth_zoo
+          # import side effect is the cause. If 47% — subprocess
+          # isolation in probe 20 was the actual cause.
+
+          # ---- Round BE: subprocess boundary, set_wired_limit, train() ----
+          # Probe 26 (pure mlx-lm inline) hits 47%. Probe 20 (mlx-lm CLI
+          # via subprocess.run) hits 67%. Three candidate isolations:
+          #   27 = probe 26 + subprocess.run wrap     (subprocess boundary)
+          #   28 = probe 26 + mx.set_wired_limit      (mlx-lm train() side effect)
+          #   29 = probe 26 but call train() directly (uses train()'s actual setup)
+          # If any hits 67%, that isolation IS the cause.
+
+          # ---- Round BG: THE FIX — num_layers=16 (mlx-lm CLI default) ----
+          # CRITICAL DISCOVERY: gemma-3-270m-it has 18 hidden layers.
+          # mlx-lm CLI's CONFIG_DEFAULTS['num_layers']=16 -> probe 20
+          # trains LoRA on the LAST 16 layers only. My inline probes
+          # 22-26+30 used len(model.layers)=18, training all 18 layers.
+          # The extra 2 layers x 7 modules = 14 extra LoRA modules
+          # consume mx.random state during init AND add trainable
+          # parameters that differ from mlx-lm CLI's behavior.
+          # Probe 31 = probe 30 + num_layers=16. If 67%, this IS the
+          # fix. The unsloth-zoo fix is to default num_layers to 16
+          # (or expose it as an arg with sensible default) in
+          # get_peft_model / linear_to_lora_layers calls.
+
+          # ---- Round BH: end-to-end fix verification via zoo's API ----
+          # Probe 32 = FastMLXModel.from_pretrained + get_peft_model(
+          # finetune_last_n_layers=16) + MLXTrainer. If 67% with the
+          # same per-seed pattern as probe 20 (mlx-lm CLI), the PR
+          # works end-to-end through zoo's public API.
+
+          # ---- Round BI: bisect zoo loader vs zoo trainer at num_layers=16 ----
+          # Probe 31 (mlx_lm.load + manual loop + 16): 67%
+          # Probe 32 (FastMLXModel    + MLXTrainer  + 16): 15%
+          # Probe 33 (mlx_lm.load     + MLXTrainer  + 16): ? — bisects.
+          # If 33 = 67%, zoo's LOADER (FastMLXModel.from_pretrained
+          # + get_peft_model) adds the extra basin instability.
+          # If 33 ~= 15%, zoo's TRAINER (MLXTrainer.train) does.
+
+          # ---- Round BJ: dtype hypothesis (FastMLXModel cast bf16->fp16) ----
+          # Probe 32 (FastMLXModel(dtype='float16') + MLXTrainer + nl=16): 15%.
+          # Probe 33 (mlx_lm.load                  + MLXTrainer + nl=16): 53%.
+          # Hypothesis: gemma-3-270m-it is bf16 on HF. FastMLXModel forces
+          # fp16 cast via _convert_mlx_dtype. fp16 has 5-bit exponent vs
+          # bf16's 8-bit, so any param outside fp16 range gets clamped.
+          # Probe 34 uses FastMLXModel(dtype=None) -- keeps storage dtype.
+          # If 34 ~= 53%, the dtype cast is the offender.
+
+          - {id: '34_1',     seed: '1',     script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_7',     seed: '7',     script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_42',    seed: '42',    script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_123',   seed: '123',   script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_456',   seed: '456',   script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_789',   seed: '789',   script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_999',   seed: '999',   script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_1234',  seed: '1234',  script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_3407',  seed: '3407',  script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_5678',  seed: '5678',  script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_9012',  seed: '9012',  script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_12345', seed: '12345', script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_22222', seed: '22222', script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_31415', seed: '31415', script: 'probe_34_zoo_loader_no_dtype.py'}
+          - {id: '34_65535', seed: '65535', script: 'probe_34_zoo_loader_no_dtype.py'}
+
+          # ---- Round BK: MLXTrainer compile-flag hypothesis ----
+          # Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False): 53%
+          # Probe 31 (mlx_lm.load + manual loop + nl=16 + @mx.compile): 67%
+          # Hypothesis: the -14pp gap between zoo MLXTrainer and the
+          # manual loop at the same loader / layer count is purely the
+          # compile flag. Probe 33 disabled compile via `compile=False`
+          # while probe 31's manual loop always uses `@mx.compile`. If
+          # probe 35 (= probe 33 verbatim + compile=True) recovers to
+          # ~67%, the -14pp is a probe-configuration artifact, not a
+          # MLXTrainer defect.
+
+          # (probe 35 dropped for Round BR — see header note.)
+
+          # ---- Round BK: zoo loader + MLXTrainer(compile=True) ----
+          # Probe 34 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=False): ~47%
+          # Probe 35 (mlx_lm.load            + MLXTrainer + nl=16 + compile=True ): ?
+          # Probe 36 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=True ): ?
+          # If 35 ~= 67% closing the trainer gap, 36 isolates loader-only delta:
+          #   36 ~= 67% -> compile alone explains the trainer+loader gap;
+          #               FastMLXModel loader patches add no real basin drift.
+          #   36 ~= 47% -> compile fixes the trainer half, but loader patches
+          #               independently add a -10pp drift to bisect next.
+
+          # (probe 36 dropped for Round BS -- Round BR established it has
+          # an identical pass/fail pattern to probe 34, so compile flag is
+          # a no-op for the basin.)
+
+          # ---- Round BL: bypass the max_grad_value=None silent-clip bug ----
+          # Probe 35 (compile=True) hit 53% (same as probe 33's 53%) -- so
+          # compile flag is NOT the trainer-side gap. Next live candidate:
+          # MLXTrainer's resolver rebinds `max_grad_value=None` to the
+          # default 1.0 (fixed in PR #671), so the no-clip intent in
+          # probes 33/35 was silently overridden to clip at +/-1.0.
+          # Probe 37 = probe 33 but explicit max_grad_value=0.0 (always
+          # disabled, regardless of PR #671). If 37 ~= 67%, elementwise
+          # clipping at 1.0 was the entire trainer-side gap.
+
+          # (probe 37 dropped for Round BR — see header note.)
+
+          # ---- Round BM: re-baseline probe 31 to reset confidence ----
+          # Round BL data (probe 37 at 40%, probe 30 at 47%, probe 34/36 at 47%,
+          # probe 35 at 53%) suggests probe 31's earlier 67% may have been
+          # within seed noise. Re-run probe 31 (manual loop + nl=16 + no clip)
+          # on the SAME 15 seeds and the same run as probes 30/34/35/36/37 to
+          # get a paired fresh number. If probe 31 reproduces ~67%, the
+          # trainer DOES add a real -20pp gap (MLXTrainer 47-53% << manual
+          # loop 67%). If probe 31 lands at 47-53%, the entire 'gap' is
+          # within noise and there is no trainer defect to chase.
+
+          - {id: '31_1',     seed: '1',     script: 'probe_31_num_layers_16.py'}
+          - {id: '31_7',     seed: '7',     script: 'probe_31_num_layers_16.py'}
+          - {id: '31_42',    seed: '42',    script: 'probe_31_num_layers_16.py'}
+          - {id: '31_123',   seed: '123',   script: 'probe_31_num_layers_16.py'}
+          - {id: '31_456',   seed: '456',   script: 'probe_31_num_layers_16.py'}
+          - {id: '31_789',   seed: '789',   script: 'probe_31_num_layers_16.py'}
+          - {id: '31_999',   seed: '999',   script: 'probe_31_num_layers_16.py'}
+          - {id: '31_1234',  seed: '1234',  script: 'probe_31_num_layers_16.py'}
+          - {id: '31_3407',  seed: '3407',  script: 'probe_31_num_layers_16.py'}
+          - {id: '31_5678',  seed: '5678',  script: 'probe_31_num_layers_16.py'}
+          - {id: '31_9012',  seed: '9012',  script: 'probe_31_num_layers_16.py'}
+          - {id: '31_12345', seed: '12345', script: 'probe_31_num_layers_16.py'}
+          - {id: '31_22222', seed: '22222', script: 'probe_31_num_layers_16.py'}
+          - {id: '31_31415', seed: '31415', script: 'probe_31_num_layers_16.py'}
+          - {id: '31_65535', seed: '65535', script: 'probe_31_num_layers_16.py'}
+
+          # ---- Round BP: strict step-by-step parity diagnostic ----
+          # Round BO per-step loss data showed probe 31 (manual loop) and
+          # zoo probes 35/37 diverge from step 2 onwards by 0.01-0.06,
+          # even though step 1's forward loss is identical. The gradient
+          # applied at step 1 differs. Probe 38 runs both paths back-to-back
+          # in a single process and captures per-step loss AND per-step
+          # grad_norm so we can pin where the numerical divergence starts.
+          # Only 5 seeds needed for a value-for-value diagnostic.
+
+          # (probe 38 dropped for Round BR — already proved per-step parity
+          # for the non-FastMLXModel path; Round BR rotates focus onto the
+          # FastMLXModel path's probe 39 strict diagnostic.)
+
+          # ---- Round BQ: FastMLXModel.get_peft_model parity ----
+          # Probe 38 v2 proved mlx_lm.load + linear_to_lora_layers matches
+          # zoo MLXTrainer step-for-step at the loss level. But probes that
+          # went through FastMLXModel + get_peft_model (32 / 34 / 36) still
+          # diverge from mlx-lm CLI's basin family (47% vs 67% greedy pass).
+          # Probe 39 isolates the LoRA-init pipeline by running BOTH paths
+          # through the same manual training loop. If the per-step loss
+          # diff is non-zero, the divergence is in
+          # FastMLXModel.from_pretrained or .get_peft_model upstream of
+          # the trainer. If zero, the LoRA init matches and the basin
+          # gap must come from somewhere we haven't bisected yet.
+
+          # (probe 39 dropped for Round BS -- Round BR already verified
+          # dloss=0 step-for-step under PR #674; no need to re-check.)
+
+          # ---- Round BS: bisect residual MLXTrainer vs manual-loop gap ----
+          # PR #674 verified by probe 39 (dloss = 0 step-for-step across 5 seeds).
+          # But probes 34/36 (FastMLXModel + MLXTrainer) still hit 47% greedy
+          # pass vs probe 31's (mlx_lm.load + manual loop) 67% on 15 seeds, and
+          # probes 34/36 share an identical pass/fail pattern (compile flag is
+          # a no-op for the basin). Probe 40 = FastMLXModel loader + probe 31's
+          # exact manual @mx.compile loop. If 67%, MLXTrainer.train IS the
+          # remaining gap. If 47%, FastMLXModel.from_pretrained adds drift
+          # downstream of get_peft_model that probe 39's 5-seed diagnostic
+          # missed -- bisect the loader next round.
+
+          # ---- Round BT: test whether elementwise clip-at-1 IS the
+          # ---- residual MLXTrainer gap ----
+          # Reading trainer.py:731-732, MLXTrainer reinterprets
+          # `max_grad_value=None` as 1.0 (clip at +/-1.0 elementwise).
+          # PR #671 (mlx: honor max_grad_value=None as a disable signal,
+          # OPEN, head 265534b) would fix this. Probe 34 sets
+          # max_grad_value=None expecting "disable" -- actually gets
+          # clipped. Probe 41 = probe 34 with max_grad_value=0.0
+          # (explicit zero hits the disable branch on the current build).
+          # If 67% (matching probes 31 / 40), elementwise clip-at-1 IS
+          # the entire residual basin gap and PR #671 is the missing
+          # piece. Same 15 seeds for direct paired comparison.
+
+          - {id: '41_1',     seed: '1',     script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_7',     seed: '7',     script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_42',    seed: '42',    script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_123',   seed: '123',   script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_456',   seed: '456',   script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_789',   seed: '789',   script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_999',   seed: '999',   script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_1234',  seed: '1234',  script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_3407',  seed: '3407',  script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_5678',  seed: '5678',  script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_9012',  seed: '9012',  script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_12345', seed: '12345', script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_22222', seed: '22222', script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_31415', seed: '31415', script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+          - {id: '41_65535', seed: '65535', script: 'probe_41_zoo_trainer_clip_explicit_zero.py'}
+
+          # ---- Round BS: bisect residual MLXTrainer vs manual-loop gap ----
+          # (probe 40 dropped for Round BT -- already verified in BS that
+          # FastMLXModel + manual loop reproduces probe 31's 67% on
+          # 15/15 seeds. Round BT focuses on probe 41 to isolate clip.)
+
+          # ---- Round BR: verify PR #674 seed-ordering fix end-to-end ----
+          # PR #674 (unsloth-zoo) moves _seed_mlx_random_state(random_state)
+          # in FastMLXModel.get_peft_model from the top of the function
+          # (~165 lines above linear_to_lora_layers) to immediately before
+          # each linear_to_lora_layers call. Hypothesis: lazy mx.* state
+          # advances between the old seed call and lora_a init were causing
+          # lora_a to draw from a different RNG position than mlx-lm CLI
+          # despite both paths re-seeding to the same int.
+          #
+          # ZOO_SPEC is now pinned to 0124424 (PR #674 HEAD). Re-run the
+          # FastMLXModel-path probes that were divergent in Rounds BJ-BQ:
+          #   * probe 34 x 15 seeds: FastMLXModel(dtype=None) + MLXTrainer
+          #                          + nl=16. Was 47%. Expected: ~67%.
+          #   * probe 36 x 15 seeds: same + compile=True. Was 47%. Expected: ~67%.
+          #   * probe 39 x 5 seeds : strict step-by-step diff vs mlx-lm CLI.
+          #                          Expected: dloss = 0 step-for-step.
+          #   * probe 31 x 15 seeds: mlx-lm CLI manual loop. Unchanged
+          #                          control. Expected: ~67% as before.
+          # If probes 34/36 now match probe 31's pass rate and probe 39's
+          # diff drops to zero, the seed-ordering fix closes the basin gap
+          # end-to-end through the public FastMLXModel API.
+    steps:
+      - name: Harden runner (audit)
+        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
+        with:
+          egress-policy: audit  # audit mode, per the step name -- presumably logs egress without blocking; confirm in harden-runner docs
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false  # later steps only read the checkout; no git credentials are kept for them
+
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+        with:
+          python-version: '3.12'  # quoted so YAML keeps it a string
+          cache: 'pip'  # reuse pip's download cache between runs
+
+      - name: Install deps
+        run: |
+          python -m pip install --upgrade pip
+          pip install \
+            "mlx==0.30.0" \
+            "mlx-lm==0.30.0" \
+            "numpy==2.4.4" \
+            "pytest==9.0.3"
+          pip install --index-url https://download.pytorch.org/whl/cpu \
+            "torch==2.10.0"
+          pip install \
+            "transformers==4.57.6" \
+            "peft==0.18.0" \
+            "datasets==4.3.0" \
+            "accelerate==1.13.0" \
+            "sentencepiece==0.2.1" \
+            "huggingface-hub==0.36.2" \
+            "trl==0.27.0"
+          # unsloth_zoo is installed from git, pinned to commit 0124424
+          # (PR #674, branch fix-mlx-get-peft-model-seed). Per the Round BR
+          # notes that commit stacks on PR #669's b137b40, so it carries both
+          # the finetune_last_n_layers parameter and the ordering that seeds
+          # immediately before linear_to_lora_layers inside
+          # FastMLXModel.get_peft_model. The install is attempted up to three
+          # times, sleeping 5 s then 10 s between attempts; a third failure
+          # fails the step.
+          ZOO_SPEC="unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@012442488894bea07b045c12fcfb27f9f691095d"
+          for try in 1 2 3; do
+            pip install "$ZOO_SPEC" && break
+            [ "$try" -lt 3 ] || exit 1
+            sleep "$((try * 5))"
+          done
+
+      - name: Run probe ${{ matrix.id }} (${{ matrix.script || 'probe_17_curve_param.py' }})
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          UNSLOTH_COMPILE_DISABLE: '1'
+          # Shared smoke config for every cell: steps=30, lr=1e-3,
+          # max_grad_norm=0 (disabled), dtype=float16, bias_correction left
+          # empty (probe default). No cell sets max_grad_value/bs/accum -> ''.
+          MLX_STEPS: '30'
+          MLX_SEED: ${{ matrix.seed }}
+          MLX_DTYPE: 'float16'
+          MLX_BIAS_CORRECTION: ''
+          MLX_LR: '1e-3'
+          MLX_MAX_GRAD_NORM: '0'
+          MLX_MAX_GRAD_VALUE: ${{ matrix.max_grad_value }}
+          MLX_BS: ${{ matrix.bs }}
+          MLX_ACCUM: ${{ matrix.accum }}
+          # Script name travels through env instead of being inlined into the shell text.
+          PROBE_SCRIPT: ${{ matrix.script || 'probe_17_curve_param.py' }}
+        run: |
+          # Cells that omit `script` fall back to probe_17_curve_param.py (see PROBE_SCRIPT).
+          cd tests/mlx_parity && python "$PROBE_SCRIPT"
+
+      - name: Show JSON output
+        if: always()  # print whatever the probe wrote, even when the run step failed
+        run: |
+          echo "=== probe ${{ matrix.id }} JSON output(s) ==="
+          for f in tests/mlx_parity/.out/probe_*.json; do
+            echo "--- ${f} ---"
+            cat "$f" 2>/dev/null || echo "(missing)"  # an unmatched glob stays literal; say so instead of printing nothing
+          done
+
+      - name: Upload probe artifact
+        if: always()
+        uses: actions/upload-artifact@v4  # NOTE(review): tag-pinned, unlike the SHA-pinned actions earlier in this job
+        with:
+          name: probe-${{ matrix.id }}
+          # The whole .out/ directory is uploaded, so a probe may write any
+          # filename (probe 17 writes per-config JSONs like probe_17__s30_d42_bc1.json).
+          path: tests/mlx_parity/.out/
+          include-hidden-files: true
+          if-no-files-found: warn
+
+  aggregate:
+    name: aggregate
+    needs: probe
+    if: always()  # summarize even when some probe cells failed
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download all probe artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: probes
+          pattern: probe-*
+          merge-multiple: false  # one probes/<artifact>/ dir per cell, so same-named JSONs from different seeds cannot overwrite each other
+
+      - name: Summary
+        run: |
+          echo "=== probe artifacts ==="
+          ls -laR probes/ || true
+          echo
+          for f in probes/*/probe_*.json; do
+            echo "--- ${f} ---"
+            cat "$f" 2>/dev/null || echo "(missing)"
+            echo
+          done
diff --git a/.github/workflows/notebooks-ci.yml b/.github/workflows/notebooks-ci.yml
deleted file mode 100644
index 673b2f3cc5..0000000000
--- a/.github/workflows/notebooks-ci.yml
+++ /dev/null
@@ -1,440 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-#
-# Cross-repo notebook validator. Lives in unslothai/unsloth (this repo)
-# and inspects every notebook in unslothai/notebooks at HEAD (or the
-# ref dispatched in via repository_dispatch).
-#
-# Catches the bug classes that landed in:
-#   - unslothai/notebooks#258  Colab torchao 0.10 vs peft 0.19 floor
-#   - unslothai/notebooks#260  DONT_UPDATE_EXCEPTIONS coverage drift
-#   - unslothai/notebooks#261  torch/torchcodec ABI; --no-deps tokenizers
-#   - unslothai/notebooks#264  --no-deps transformers + Colab tokenizers drift
-#   - unslothai/notebooks#221  git+ HEAD installs in install cells
-#   - unslothai/notebooks  commit 51b1462  template/notebook drift
-#
-# CPU-only by design. Layer 2 (api-introspect) reuses the existing
-# tests/_zoo_aggressive_cuda_spoof.py harness so `import unsloth`
-# succeeds on a GPU-less ubuntu-latest runner.
-
-name: Notebooks CI
-
-on:
-  pull_request:
-    paths:
-      - 'unsloth/**'
-      - 'scripts/notebook_validator.py'
-      - 'scripts/notebook_to_python.py'
-      - 'scripts/data/colab_pip_freeze.gpu.txt'
-      - 'scripts/data/colab_to_cpu_pin.json'
-      - 'tests/notebooks/**'
-      - 'tests/_zoo_aggressive_cuda_spoof.py'
-      - '.github/workflows/notebooks-ci.yml'
-  schedule:
-    # Daily 06:17 UTC. Catches Colab preinstall bumps (the upstream image
-    # is rebuilt roughly weekly) without us waiting on a PR. Off the
-    # :00/:30 fleet-collision spots.
-    - cron: '17 6 * * *'
-  workflow_dispatch:
-    inputs:
-      notebooks_ref:
-        description: 'unslothai/notebooks ref to lint (branch / SHA / tag)'
-        default: 'main'
-      include_smoke:
-        description: 'Also run the install-cell smoke matrix (longer)'
-        type: boolean
-        default: false
-  repository_dispatch:
-    # Fired by a tiny companion workflow on unslothai/notebooks.
-    types: [notebooks_pr_opened, notebooks_main_pushed]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-env:
-  NOTEBOOKS_REF: >-
-    ${{ github.event.inputs.notebooks_ref ||
-        github.event.client_payload.ref ||
-        'main' }}
-
-jobs:
-  static:
-    name: static (drift + lint + exceptions)
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    steps:
-      # Validate the dispatched ref before it reaches actions/checkout's `ref:`
-      # input. Reading via env (NOT direct ${{ ... }} interpolation in the
-      # regex test) closes the GitHub-Actions-injection class where a
-      # client_payload.ref like `main"; rm -rf / #` would be embedded into the
-      # shell command. NOTEBOOKS_REF defaults to 'main' on non-dispatch
-      # events, but only repository_dispatch can supply attacker-controlled
-      # values, so we gate this check on that event type.
-      - name: Validate client_payload.ref shape
-        if: github.event_name == 'repository_dispatch'
-        env:
-          NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
-        run: |
-          if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
-            echo "::error::client_payload.ref contains disallowed characters" >&2
-            exit 1
-          fi
-
-      - name: Checkout unsloth (this PR)
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          path: unsloth
-          persist-credentials: false
-
-      - name: Checkout unslothai/notebooks @ ${{ env.NOTEBOOKS_REF }}
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          repository: unslothai/notebooks
-          ref: ${{ env.NOTEBOOKS_REF }}
-          path: notebooks
-          fetch-depth: 0  # drift check needs git status / diff
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Install validator deps
-        run: |
-          python -m pip install --upgrade pip
-          # nbformat + nbconvert come from the converter's requirements;
-          # spellchecker + huggingface_hub are imported at module top of
-          # update_all_notebooks.py.
-          pip install \
-            'nbformat>=5.10' 'nbconvert>=7.16' 'pyspellchecker>=0.8' \
-            'huggingface_hub>=0.34' 'tqdm>=4.66'
-
-      - name: Refresh Colab pip-freeze (best-effort; falls back to snapshot)
-        run: |
-          python unsloth/scripts/notebook_validator.py refresh-colab \
-              --out unsloth/scripts/data/colab_pip_freeze.gpu.txt \
-            || echo "::warning::refresh-colab failed; using committed snapshot"
-
-      - name: Diff Colab oracle vs committed snapshots (advisory)
-        # Pulls pip-freeze.gpu.txt + apt-list-gpu.txt + os-info-gpu.txt
-        # from googlecolab/backend-info and prints NEW / REMOVED /
-        # CHANGED entries against scripts/data/colab_*.txt. Non-blocking
-        # on PRs; the daily cron job below runs the same step with
-        # --strict so upstream rotations surface within ~24h.
-        continue-on-error: true
-        working-directory: ${{ github.workspace }}
-        run: |
-          python unsloth/scripts/notebook_validator.py colab-diff \
-              --snapshot-dir unsloth/scripts/data
-
-      - name: Drift check (re-run update_all_notebooks.py + git diff)
-        working-directory: ${{ github.workspace }}
-        # Reported as non-blocking until the upstream `unslothai/notebooks`
-        # tree is regenerated. The first run on @main surfaces ~463 files
-        # of drift (7359 / 9634 line delta), which is a real backlog the
-        # notebooks-side maintainers need to clear in their own repo --
-        # this PR's role is to surface the count, not auto-fix it.
-        continue-on-error: true
-        run: |
-          python unsloth/scripts/notebook_validator.py drift \
-              --notebooks-dir notebooks
-
-      - name: Convert sanity (every nb / kaggle / original_template -> .py)
-        # Same rationale as Drift: a handful of upstream notebooks fail
-        # the converter (custom magics, malformed JSON, etc). Surface
-        # the count without blocking; the team triages in unslothai/notebooks.
-        continue-on-error: true
-        run: |
-          python unsloth/scripts/notebook_validator.py convert \
-              --notebooks-dir notebooks \
-              --out _converted
-
-      - name: Lint (install cells + AST scan, env-scoped)
-        # Reported as non-blocking (continue-on-error: true) until the
-        # backlog of pre-existing findings on unslothai/notebooks@main is
-        # cleared. Same pattern PR #5298 used for biome:check on the
-        # frontend. As of this commit the live tree surfaces 27 errors +
-        # 6 warnings, all real (peft/torchao floor missing in 6 nb/
-        # notebooks, 14 git+ HEAD installs in hand-tuned exception
-        # notebooks, 6 torch/torchcodec ABI mismatches, 1
-        # transformers/tokenizers --no-deps drift). The count surfaces
-        # in the PR check UI. Drop continue-on-error once it hits zero.
-        continue-on-error: true
-        run: |
-          python unsloth/scripts/notebook_validator.py lint \
-              --notebooks-dir notebooks \
-              --colab-pin unsloth/scripts/data/colab_pip_freeze.gpu.txt \
-              --no-pypi
-        # --no-pypi skips R-INST-002 (transitive resolve via PyPI metadata).
-        # Layer 1 keeps PR-time wall-clock predictable; the daily cron run
-        # below drops --no-pypi and refreshes the cache.
-
-      - name: DONT_UPDATE_EXCEPTIONS coverage
-        run: |
-          python unsloth/scripts/notebook_validator.py exceptions \
-              --notebooks-dir notebooks
-
-  static-with-pypi:
-    name: static + transitive resolve (cron / dispatch only)
-    if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      # See `static.Validate client_payload.ref shape` for rationale. This
-      # job's `if:` excludes repository_dispatch today, so the validation
-      # step is a defence-in-depth no-op until that gate ever relaxes.
-      - name: Validate client_payload.ref shape
-        if: github.event_name == 'repository_dispatch'
-        env:
-          NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
-        run: |
-          if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
-            echo "::error::client_payload.ref contains disallowed characters" >&2
-            exit 1
-          fi
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-          path: unsloth
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          repository: unslothai/notebooks
-          ref: ${{ env.NOTEBOOKS_REF }}
-          path: notebooks
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with: { python-version: '3.12', cache: 'pip' }
-      - name: Install
-        run: pip install -U pip
-      - name: Refresh Colab oracle
-        run: |
-          python unsloth/scripts/notebook_validator.py refresh-colab \
-              --out unsloth/scripts/data/colab_pip_freeze.gpu.txt
-      - name: Diff Colab oracle vs committed snapshots (--strict on cron)
-        # Cron-only escalation of the advisory PR-time check. Fails if
-        # any of pip-freeze.gpu.txt / apt-list-gpu.txt / os-info-gpu.txt
-        # has drifted from scripts/data/colab_*.txt; refresh the
-        # snapshots in this repo to acknowledge.
-        run: |
-          python unsloth/scripts/notebook_validator.py colab-diff \
-              --snapshot-dir unsloth/scripts/data --strict
-      - name: Lint with live PyPI metadata
-        run: |
-          python unsloth/scripts/notebook_validator.py lint \
-              --notebooks-dir notebooks \
-              --colab-pin unsloth/scripts/data/colab_pip_freeze.gpu.txt
-
-  api-introspect:
-    name: api surface (under CUDA spoof)
-    runs-on: ubuntu-latest
-    timeout-minutes: 12
-    steps:
-      - name: Validate client_payload.ref shape
-        if: github.event_name == 'repository_dispatch'
-        env:
-          NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
-        run: |
-          if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
-            echo "::error::client_payload.ref contains disallowed characters" >&2
-            exit 1
-          fi
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-          path: unsloth
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          repository: unslothai/notebooks
-          ref: ${{ env.NOTEBOOKS_REF }}
-          path: notebooks
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with: { python-version: '3.12', cache: 'pip' }
-
-      - name: Install CPU torch + pinned unsloth + trl + converter deps
-        run: |
-          python -m pip install --upgrade pip
-          # CPU torch + torchvision. torchvision is required because
-          # unsloth_zoo.vision_utils imports PIL at module top, and the
-          # easiest way to get a torch-compatible PIL on a CPU runner is
-          # to let torchvision pull the right Pillow version.
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-                      'torch>=2.8,<2.11' 'torchvision<0.26'
-          # Pin to the same versions update_all_notebooks.py installs in
-          # generated notebooks. Keep these in lockstep with PIN_TRL /
-          # PIN_TRANSFORMERS in unslothai/notebooks/update_all_notebooks.py.
-          # `triton` is added because unsloth/_gpu_init.py:232 does an
-          # unconditional `import triton`; the PyPI wheel installs cleanly
-          # on Linux x86_64 even without CUDA (same rationale as
-          # consolidated-tests-ci.yml line 192-205).
-          # Pillow is listed explicitly as a defensive belt-and-braces
-          # next to torchvision (vision_utils crashes ModuleNotFoundError
-          # if torchvision skipped its Pillow dep for any reason).
-          pip install 'transformers>=4.56,<5.6' 'trl>=0.22,<0.26' 'accelerate>=1.0' \
-                      'datasets>=3.4,<5' 'peft>=0.15,<0.20' \
-                      'bitsandbytes>=0.43' 'sentencepiece' 'protobuf' triton \
-                      Pillow safetensors tqdm packaging psutil
-          # Converter deps (nbformat for notebook_to_python.py).
-          pip install 'nbformat>=5.10' 'nbconvert>=7.16'
-          # Install unsloth from the LOCAL checkout (the PR head), not PyPI.
-          # The PR-time CI must validate the code in this PR; PyPI unsloth
-          # may lag the in-repo CPU-torch fallback in unsloth/kernels/utils.py
-          # (lines 162-170) that handles missing torch._C._cuda_getCurrentRawStream.
-          pip install --no-deps unsloth_zoo
-          pip install --no-deps -e ./unsloth
-
-      - name: Convert notebooks for AST scan
-        # Same upstream-conversion-error tolerance as the static job.
-        continue-on-error: true
-        run: |
-          python unsloth/scripts/notebook_validator.py convert \
-              --notebooks-dir notebooks --out _converted
-
-      - name: Dump unsloth + trl API surface (under CUDA spoof)
-        run: |
-          PYTHONPATH=unsloth/tests python -u - <<'PY'
-          import sys, json, inspect
-          import _zoo_aggressive_cuda_spoof as _spoof
-          _spoof.apply()
-          import unsloth
-          import trl
-          surface = {}
-          for cls_name in ("FastLanguageModel", "FastVisionModel", "FastModel"):
-              cls = getattr(unsloth, cls_name, None)
-              if cls is None:
-                  continue
-              surface[cls_name] = sorted(n for n in dir(cls) if not n.startswith("_"))
-          surface["SFTConfig_kwargs"] = sorted(inspect.signature(trl.SFTConfig.__init__).parameters)
-          json.dump(surface, open("_api_surface.json", "w"), indent=2)
-          print("dumped surface for:", list(surface))
-          PY
-
-      - name: Run API rule against converted notebooks
-        run: |
-          python unsloth/scripts/notebook_validator.py api \
-              --converted-dir _converted \
-              --surface _api_surface.json
-
-  smoke-install:
-    name: smoke install (Colab-shaped venv, opt-in)
-    if: ${{ github.event.inputs.include_smoke == 'true' || github.event_name == 'schedule' }}
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    strategy:
-      fail-fast: false
-      matrix:
-        # One representative notebook per installation_*_content template.
-        # Add rows when a new install template lands in update_all_notebooks.py.
-        notebook:
-          - 'nb/Llama3.1_(8B)-Alpaca.ipynb'           # installation_content
-          - 'nb/Gemma3_(4B)-Vision.ipynb'             # installation_content + vision
-          - 'nb/Llama3.1_(8B)-GRPO.ipynb'             # installation_extra_grpo_content
-          - 'nb/gpt-oss-(20B)-Fine-tuning.ipynb'      # installation_gpt_oss_content
-          - 'nb/Qwen3_5_(4B)_Vision.ipynb'            # installation_qwen3_5_content
-          - 'nb/Nemotron-3-Nano-30B-A3B_A100.ipynb'   # installation_nemotron_nano_content
-          - 'nb/Whisper.ipynb'                         # installation_whisper_content
-          - 'nb/Synthetic_Data_Hackathon.ipynb'        # installation_synthetic_data_content
-    steps:
-      - name: Validate client_payload.ref shape
-        if: github.event_name == 'repository_dispatch'
-        env:
-          NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
-        run: |
-          if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
-            echo "::error::client_payload.ref contains disallowed characters" >&2
-            exit 1
-          fi
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-          path: unsloth
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          repository: unslothai/notebooks
-          ref: ${{ env.NOTEBOOKS_REF }}
-          path: notebooks
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with: { python-version: '3.12' }
-
-      - name: Seed Colab-shaped venv from pip-freeze (CPU-mapped)
-        run: |
-          # Strip cu128 local versions, route torch/torchvision to the CPU
-          # wheel index, drop CUDA-specific deps the runner can't use.
-          python -u - <<'PY' > /tmp/seed_pins.txt
-          import json, re
-          mapping = json.load(open("unsloth/scripts/data/colab_to_cpu_pin.json"))
-          rewrite = mapping["rewrite"]
-          skip = set(mapping["skip"])
-          spoof = set(mapping["module_spoof"])
-          out = []
-          for line in open("unsloth/scripts/data/colab_pip_freeze.gpu.txt"):
-              line = line.strip()
-              if not line or line.startswith("#"):
-                  continue
-              m = re.match(r"^([A-Za-z0-9._-]+)\s*==\s*(.+)$", line)
-              if not m:
-                  continue
-              name, ver = m.group(1).lower(), m.group(2)
-              if name in skip:
-                  continue
-              if name in spoof:
-                  continue
-              if name in rewrite:
-                  ver = re.sub(r"[+\-].+$", "", ver)
-                  out.append(f"{name}=={ver}")
-              else:
-                  ver = re.sub(r"[+\-].+$", "", ver)
-                  out.append(f"{name}=={ver}")
-          print("\n".join(out))
-          PY
-          head -5 /tmp/seed_pins.txt
-          wc -l /tmp/seed_pins.txt
-
-      - name: Install Colab-shaped venv
-        run: |
-          python -m pip install --upgrade pip
-          # Best-effort: any single line that fails to resolve on CPU is
-          # tolerated; the smoke contract is "the install cell + the unsloth
-          # import works", not "the entire Colab venv reproduces."
-          while IFS= read -r spec; do
-            pip install "$spec" --index-url https://download.pytorch.org/whl/cpu \
-              --extra-index-url https://pypi.org/simple || \
-              echo "::warning::pin failed: $spec"
-          done < /tmp/seed_pins.txt
-
-      - name: Run install cell
-        run: |
-          python unsloth/scripts/notebook_validator.py convert \
-              --notebooks-dir notebooks --out _converted
-          # Take the converted .py and run the install cell only.
-          BASE="$(basename '${{ matrix.notebook }}' .ipynb | tr -d '()' | tr -c '[:alnum:]_' _)"
-          PY="_converted/${BASE}.py"
-          [ -f "$PY" ] || { echo "::error::$PY not found"; ls _converted | head; exit 1; }
-          # Truncate at the first `from unsloth import` so we run install +
-          # core imports only.
-          awk '/^from unsloth import/ { print "import sys; sys.exit(0)"; exit } { print }' "$PY" > _smoke.py
-          PYTHONPATH=unsloth/tests python -u - <<'PY'
-          import _zoo_aggressive_cuda_spoof as _s; _s.apply()
-          # Stub torchcodec for cells that import it — no CPU wheel exists.
-          import sys, types
-          if "torchcodec" not in sys.modules:
-              sys.modules["torchcodec"] = types.ModuleType("torchcodec")
-          exec(open("_smoke.py").read(), {"__name__": "__main__"})
-          PY
-
-      - name: Verify imports under spoof
-        run: |
-          PYTHONPATH=unsloth/tests python -u - <<'PY'
-          import sys, types
-          if "torchcodec" not in sys.modules:
-              sys.modules["torchcodec"] = types.ModuleType("torchcodec")
-          import _zoo_aggressive_cuda_spoof as _s; _s.apply()
-          import unsloth, peft, torch, torchao, transformers, tokenizers
-          print("OK: imports pass under CUDA spoof")
-          PY
diff --git a/.github/workflows/release-desktop.yml b/.github/workflows/release-desktop.yml
deleted file mode 100644
index 810bb644ba..0000000000
--- a/.github/workflows/release-desktop.yml
+++ /dev/null
@@ -1,902 +0,0 @@
-name: Release Desktop App
-
-on:
-  workflow_dispatch:
-    inputs:
-      studio_version:
-        description: 'Studio version tag to release (for example, v0.1.39-beta)'
-        type: string
-        required: true
-      pypi_version:
-        description: 'Exact PyPI unsloth version just published/stamped (for example, 2026.5.3); leave blank to use MIN_DESKTOP_BACKEND_VERSION'
-        type: string
-        required: false
-      draft:
-        description: 'Create as draft release; draft runs do not advance desktop-latest updater channel'
-        type: boolean
-        default: true
-
-permissions:
-  contents: read
-
-concurrency:
-  group: release-desktop-${{ github.repository }}
-  cancel-in-progress: false
-
-jobs:
-  prepare-version:
-    name: Prepare release versions
-    runs-on: ubuntu-latest
-    outputs:
-      studio_version: ${{ steps.prepare.outputs.studio_version }}
-      app_version: ${{ steps.prepare.outputs.app_version }}
-      desktop_release_tag: ${{ steps.prepare.outputs.desktop_release_tag }}
-      prerelease: ${{ steps.prepare.outputs.prerelease }}
-      pypi_version: ${{ steps.prepare.outputs.pypi_version }}
-
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          persist-credentials: false
-
-      - name: Validate release versions
-        id: prepare
-        shell: bash
-        env:
-          INPUT_STUDIO_VERSION: ${{ inputs.studio_version }}
-          INPUT_PYPI_VERSION: ${{ inputs.pypi_version }}
-        run: |
-          python3 <<'PY'
-          import os
-          import pathlib
-          import re
-          import sys
-
-          studio_version = os.environ['INPUT_STUDIO_VERSION'].strip()
-          if not studio_version:
-              sys.exit('studio_version is required, for example v0.1.39-beta')
-          if re.fullmatch(r'v?20\d{2}\.\d+\.\d+(?:[-+][0-9A-Za-z.-]+)?', studio_version):
-              sys.exit(f'studio_version must be a Studio SemVer tag, not a date-style backend version: {studio_version}')
-
-          semver_tag = re.compile(
-              r'^v(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
-              r'(?:-[0-9A-Za-z.][0-9A-Za-z.-]*)?$'
-          )
-          if not semver_tag.fullmatch(studio_version):
-              sys.exit(f'studio_version must be a SemVer tag with leading v, for example v0.1.39-beta: {studio_version}')
-
-          app_version = studio_version.removeprefix('v')
-          desktop_release_tag = f'desktop-v{app_version}'
-          prerelease = 'true' if '-' in app_version.split('+', 1)[0] else 'false'
-
-          def parse_backend_version(version):
-              match = re.fullmatch(
-                  r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
-                  r'(?:([a-zA-Z]|\.dev|dev|\.rc|rc|\.post|post)(\d*))?'
-                  r'(?:[-+]([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?',
-                  version,
-              )
-              if not match:
-                  return None
-              major, minor, patch, suffix_name, suffix_number, suffix_text = match.groups()
-              if suffix_name:
-                  normalized = suffix_name.lower().lstrip('.')
-                  order = {'dev': 0, 'a': 1, 'b': 2, 'rc': 3, 'post': 5}.get(normalized)
-                  if order is None:
-                      return None
-                  number = int(suffix_number or '0')
-              elif suffix_text:
-                  order = 3 if version[version.find(suffix_text) - 1] == '-' else 4
-                  number = 0
-              else:
-                  order = 4
-                  number = 0
-              return (int(major), int(minor), int(patch), order, number)
-
-          preflight = pathlib.Path('studio/src-tauri/src/preflight/version.rs').read_text()
-          match = re.search(r'MIN_DESKTOP_BACKEND_VERSION:\s*&str\s*=\s*"([^"]+)"', preflight)
-          if not match:
-              sys.exit('Could not read MIN_DESKTOP_BACKEND_VERSION')
-          min_backend_version = match.group(1)
-
-          input_pypi_version = os.environ.get('INPUT_PYPI_VERSION', '').strip()
-          parsed_min_backend = parse_backend_version(min_backend_version)
-          if parsed_min_backend is None:
-              sys.exit(f'MIN_DESKTOP_BACKEND_VERSION is not a supported backend package version: {min_backend_version}')
-
-          pypi_version = input_pypi_version or min_backend_version
-          parsed_pypi = parse_backend_version(pypi_version)
-          if parsed_pypi is None:
-              sys.exit(f'pypi_version is not a supported backend package version: {pypi_version}')
-          if parsed_pypi < parsed_min_backend:
-              sys.exit(
-                  f'pypi_version {pypi_version} is lower than desktop minimum '
-                  f'MIN_DESKTOP_BACKEND_VERSION {min_backend_version}'
-              )
-
-          if input_pypi_version:
-              print(
-                  'Using exact PyPI unsloth version from pypi_version input: '
-                  f'{pypi_version} (desktop minimum: {min_backend_version})'
-              )
-          else:
-              print(
-                  'Using exact PyPI unsloth version from MIN_DESKTOP_BACKEND_VERSION: '
-                  f'{pypi_version}'
-              )
-
-          with open(os.environ['GITHUB_OUTPUT'], 'a', encoding='utf-8') as output:
-              print(f'studio_version={studio_version}', file=output)
-              print(f'app_version={app_version}', file=output)
-              print(f'desktop_release_tag={desktop_release_tag}', file=output)
-              print(f'prerelease={prerelease}', file=output)
-              print(f'pypi_version={pypi_version}', file=output)
-          PY
-
-      - name: Verify PyPI package and Studio stamp
-        shell: bash
-        env:
-          STUDIO_VERSION: ${{ steps.prepare.outputs.studio_version }}
-          PYPI_VERSION: ${{ steps.prepare.outputs.pypi_version }}
-        run: |
-          set -euo pipefail
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import sys
-          import time
-          import urllib.error
-          import urllib.request
-
-          pypi_version = os.environ['PYPI_VERSION']
-          dist_dir = pathlib.Path(os.environ['RUNNER_TEMP'], 'pypi-unsloth-dist')
-          dist_dir.mkdir(parents=True, exist_ok=True)
-          metadata_url = f'https://pypi.org/pypi/unsloth/{pypi_version}/json'
-
-          last_error = None
-          for attempt in range(1, 6):
-              try:
-                  with urllib.request.urlopen(metadata_url, timeout=30) as response:
-                      metadata = json.load(response)
-                  break
-              except Exception as exc:
-                  last_error = exc
-                  if attempt < 5:
-                      time.sleep(10 * attempt)
-          else:
-              sys.exit(f'Publish unsloth=={pypi_version} to PyPI before the desktop release ({last_error})')
-
-          files = metadata.get('urls') or []
-          if not files:
-              sys.exit(f'PyPI returned no distribution files for unsloth=={pypi_version}')
-
-          for file_info in files:
-              filename = file_info.get('filename')
-              url = file_info.get('url')
-              if not filename or '/' in filename or not url:
-                  sys.exit(f'Unexpected PyPI file entry for unsloth=={pypi_version}: {file_info!r}')
-              target = dist_dir / filename
-              for attempt in range(1, 4):
-                  try:
-                      with urllib.request.urlopen(url, timeout=60) as response:
-                          target.write_bytes(response.read())
-                      break
-                  except Exception as exc:
-                      last_error = exc
-                      if attempt < 3:
-                          time.sleep(5 * attempt)
-              else:
-                  sys.exit(f'Could not download {filename} from PyPI ({last_error})')
-          PY
-
-          if [ -f scripts/stamp_studio_release.py ]; then
-            mapfile -t dists < <(find "$RUNNER_TEMP/pypi-unsloth-dist" -type f \( -name '*.whl' -o -name '*.tar.gz' \) | sort)
-            if [ "${#dists[@]}" -eq 0 ]; then
-              echo "No PyPI wheel/sdist artifacts downloaded for unsloth==$PYPI_VERSION" >&2
-              exit 1
-            fi
-            python3 scripts/stamp_studio_release.py --verify-dist "$RUNNER_TEMP/pypi-unsloth-dist" --expected "$STUDIO_VERSION"
-          else
-            echo "scripts/stamp_studio_release.py not found; release-desktop requires #5308 to verify the PyPI Studio stamp." >&2
-            exit 1
-          fi
-
-      - name: Guard public updater channel version
-        if: ${{ !inputs.draft }}
-        shell: bash
-        env:
-          GH_REPO: ${{ github.repository }}
-          GH_TOKEN: ${{ github.token }}
-          APP_VERSION: ${{ steps.prepare.outputs.app_version }}
-        run: |
-          set -euo pipefail
-          mkdir -p "$RUNNER_TEMP/desktop-current"
-          if ! gh release download desktop-latest --pattern latest.json --dir "$RUNNER_TEMP/desktop-current" --clobber 2>/dev/null; then
-            echo "No existing desktop-latest latest.json found; allowing first channel publish."
-            exit 0
-          fi
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import re
-          import sys
-
-          def parse(value: str):
-              value = value.removeprefix('v')
-              match = re.fullmatch(
-                  r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
-                  r'(?:-([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?'
-                  r'(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?',
-                  value,
-              )
-              if not match:
-                  sys.exit(f'desktop-latest latest.json has invalid version: {value}')
-              major, minor, patch, prerelease = match.groups()
-              return (int(major), int(minor), int(patch), prerelease)
-
-          def numeric_tail(identifier: str) -> tuple[str, int] | None:
-              match = re.fullmatch(r'([A-Za-z-]+)(\d+)', identifier)
-              if not match:
-                  return None
-              return (match.group(1).lower(), int(match.group(2)))
-
-          def compare_identifier(left: str, right: str) -> int:
-              left_num = left.isdigit()
-              right_num = right.isdigit()
-              if left_num and right_num:
-                  return (int(left) > int(right)) - (int(left) < int(right))
-              if left_num:
-                  return -1
-              if right_num:
-                  return 1
-
-              left_tail = numeric_tail(left)
-              right_tail = numeric_tail(right)
-              if left_tail and right_tail and left_tail[0] == right_tail[0]:
-                  return (left_tail[1] > right_tail[1]) - (left_tail[1] < right_tail[1])
-
-              return (left > right) - (left < right)
-
-          def compare_prerelease(left: str | None, right: str | None) -> int:
-              if left == right:
-                  return 0
-              if left is None:
-                  return 1
-              if right is None:
-                  return -1
-              left_parts = left.split('.')
-              right_parts = right.split('.')
-              for left_part, right_part in zip(left_parts, right_parts):
-                  order = compare_identifier(left_part, right_part)
-                  if order:
-                      return order
-              return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts))
-
-          def compare(left: str, right: str) -> int:
-              left_major, left_minor, left_patch, left_pre = parse(left)
-              right_major, right_minor, right_patch, right_pre = parse(right)
-              left_core = (left_major, left_minor, left_patch)
-              right_core = (right_major, right_minor, right_patch)
-              if left_core != right_core:
-                  return (left_core > right_core) - (left_core < right_core)
-              return compare_prerelease(left_pre, right_pre)
-
-          current_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-current', 'latest.json')
-          current = json.loads(current_path.read_text()).get('version')
-          next_version = os.environ['APP_VERSION']
-          if not isinstance(current, str):
-              sys.exit('desktop-latest latest.json has missing version')
-          if compare(next_version, current) < 0:
-              sys.exit(
-                  f'Refusing to publish {next_version}; desktop-latest currently points at newer version {current}.'
-              )
-          PY
-
-  build:
-    # TODO: split into a "build (no secrets)" + "publish (secrets)" job pair
-    # with actions/upload-artifact handoff so the matrix build cannot
-    # publish a Release on its own. The current matrix runs across
-    # Linux/macOS/Windows in a single job, so the split needs artefact
-    # collection across the OS matrix and is out of scope for this
-    # hardening pass.
-    permissions:
-      contents: write  # tauri-apps/tauri-action creates / uploads a GitHub Release
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        include:
-          - platform: macos-latest
-            args: '--target aarch64-apple-darwin'
-            label: macOS (Apple Silicon)
-          # - platform: macos-latest
-          #   args: '--target x86_64-apple-darwin'
-          #   label: macOS (Intel)
-          - platform: ubuntu-22.04
-            args: ''
-            label: Linux (x64)
-          - platform: windows-latest
-            args: ''
-            label: Windows (x64)
-
-    name: Build ${{ matrix.label }}
-    needs: prepare-version
-    runs-on: ${{ matrix.platform }}
-
-    env:
-      FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
-      APP_VERSION: ${{ needs.prepare-version.outputs.app_version }}
-      STUDIO_VERSION: ${{ needs.prepare-version.outputs.studio_version }}
-      DESKTOP_RELEASE_TAG: ${{ needs.prepare-version.outputs.desktop_release_tag }}
-      DESKTOP_PRERELEASE: ${{ needs.prepare-version.outputs.prerelease }}
-
-    steps:
-      # harden-runner in audit mode: surfaces every egress destination in
-      # the runner log so the allowlist for a future `egress-policy: block`
-      # promotion can be derived from observed traffic. Audit mode is
-      # cross-platform (Linux / macOS / Windows runners); blocking mode is
-      # currently Linux-only, so we deliberately stay in audit until the
-      # macOS + Windows codesign paths have been observed.
-      - name: Harden runner (audit)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: audit
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          persist-credentials: false
-
-      # ── Linux dependencies ──
-      - name: Install Linux dependencies
-        if: matrix.platform == 'ubuntu-22.04'
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y libwebkit2gtk-4.1-dev libayatana-appindicator3-dev librsvg2-dev libxdo-dev libssl-dev patchelf
-
-      # ── Node.js ──
-      - name: Setup Node.js
-        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e
-        with:
-          node-version: 24
-
-      - name: Install pinned Tauri CLI
-        # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
-        # required for `vite build`. The pre-install lockfile structural
-        # audit (lockfile_supply_chain_audit.py) is the practical defence
-        # against the npm postinstall-dropper class -- it fires BEFORE any
-        # tarball runs, on the injection pattern itself rather than an
-        # advisory-DB lookup.
-        run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 --no-fund --no-audit
-
-      - name: Verify pinned Tauri CLI
-        shell: bash
-        run: |
-          # Print what the CLI reports, then reject anything but the pinned release.
-          reported="$(npx --prefix studio tauri --version)"
-          echo "$reported"
-          case "$reported" in
-            "tauri-cli 2.10.1") ;;
-            *)
-              echo "Expected tauri-cli 2.10.1, got $reported" >&2
-              exit 1
-              ;;
-          esac
-
-      - name: Verify desktop updater and Linux package config
-        shell: bash
-        run: |
-          node <<'JS'
-          const { readFileSync } = require('node:fs');
-
-          // The Tauri config must list exactly one updater endpoint: the desktop-latest asset.
-          const expected = 'https://github.com/unslothai/unsloth/releases/download/desktop-latest/latest.json';
-          const config = JSON.parse(readFileSync('studio/src-tauri/tauri.conf.json', 'utf8'));
-          const endpoints = config.plugins?.updater?.endpoints;
-          if (!Array.isArray(endpoints) || endpoints.length !== 1) {
-            throw new Error('Expected exactly one desktop updater endpoint');
-          }
-          if (endpoints[0] !== expected) {
-            throw new Error('Desktop updater endpoint must be ' + expected + ', got ' + endpoints[0]);
-          }
-          // Explicit guard against the repo-wide /releases/latest/ path.
-          if (endpoints.some((endpoint) => endpoint.includes('/releases/latest/'))) {
-            throw new Error('Desktop updater endpoint must not use repo-wide /releases/latest/');
-          }
-
-          // RPM must not appear in the bundle targets or in the Linux bundle settings.
-          const targets = config.bundle?.targets;
-          if (Array.isArray(targets) && targets.some((target) => String(target).toLowerCase() === 'rpm')) {
-            throw new Error('Desktop release must not target RPM packages');
-          }
-          if (config.bundle?.linux?.rpm) {
-            throw new Error('bundle.linux.rpm must not be configured');
-          }
-
-          // Collect the text of every releaseBody block scalar in release-desktop.yml
-          // so the published release notes can be checked as well.
-          const workflow = readFileSync('.github/workflows/release-desktop.yml', 'utf8');
-          const lines = workflow.split(/\r?\n/);
-          const releaseBodies = [];
-          for (let i = 0; i < lines.length; i += 1) {
-            const match = lines[i].match(/^(\s*)releaseBody:\s*\|\s*$/);
-            if (!match) continue;
-            const baseIndent = match[1].length;
-            const bodyLines = [];
-            i += 1;
-            for (; i < lines.length; i += 1) {
-              const line = lines[i];
-              // Blank lines are kept as part of the body.
-              if (line.trim() === '') {
-                bodyLines.push('');
-                continue;
-              }
-              // A line indented no deeper than the key ends the block; step back
-              // so the outer loop examines that line again.
-              const indent = line.match(/^\s*/)[0].length;
-              if (indent <= baseIndent) {
-                i -= 1;
-                break;
-              }
-              // Strip the key's indent plus two columns of block indentation.
-              bodyLines.push(line.slice(baseIndent + 2));
-            }
-            releaseBodies.push(bodyLines.join('\n'));
-          }
-          if (releaseBodies.length === 0) {
-            throw new Error('Expected at least one desktop release body');
-          }
-          // Reject the word "rpm" or a ".rpm" extension in any release body, case-insensitively.
-          for (const body of releaseBodies) {
-            if (/\brpm\b|\.rpm/i.test(body)) {
-              throw new Error('Desktop release body must not advertise RPM packages');
-            }
-          }
-          JS
-
-      - name: Install frontend dependencies
-        working-directory: studio/frontend
-        # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
-        # required for `vite build`. The pre-install lockfile structural
-        # audit (lockfile_supply_chain_audit.py) is the practical defence
-        # against the npm postinstall-dropper class -- it fires BEFORE any
-        # tarball runs, on the injection pattern itself rather than an
-        # advisory-DB lookup.
-        run: npm install --no-fund --no-audit
-
-      # ── Rust ──
-      - name: Install Rust stable
-        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable @ 2026-03-27
-        with:
-          targets: ${{ matrix.platform == 'macos-latest' && 'aarch64-apple-darwin,x86_64-apple-darwin' || '' }}
-
-      - name: Patch desktop app version
-        shell: bash
-        working-directory: studio/src-tauri
-        run: |
-          set -euo pipefail
-          # Use python3 when it is on PATH, otherwise fall back to python.
-          if command -v python3 >/dev/null 2>&1; then
-            PYTHON=python3
-          else
-            PYTHON=python
-          fi
-          "$PYTHON" <<'PY'
-          import os
-          import pathlib
-          import re
-          import sys
-
-          app_version = os.environ['APP_VERSION']
-          if not app_version:
-              sys.exit('APP_VERSION is required')
-
-          # Replace the first version line found inside the [package] table of Cargo.toml.
-          cargo_toml = pathlib.Path('Cargo.toml')
-          lines = cargo_toml.read_text().splitlines(keepends=True)
-          in_package = False
-          patched = False
-          for index, line in enumerate(lines):
-              stripped = line.strip()
-              if stripped == '[package]':
-                  in_package = True
-                  continue
-              # Any other table header ends the [package] section.
-              if stripped.startswith('[') and stripped.endswith(']'):
-                  in_package = False
-              if in_package and re.fullmatch(r'version\s*=\s*"[^"]+"\s*', stripped):
-                  lines[index] = f'version = "{app_version}"\n'
-                  patched = True
-                  break
-          if not patched:
-              sys.exit('Could not patch [package] version in Cargo.toml')
-          cargo_toml.write_text(''.join(lines))
-
-          # Patch the unsloth-studio entry in Cargo.lock; exactly one entry must match.
-          cargo_lock = pathlib.Path('Cargo.lock')
-          lock_text = cargo_lock.read_text()
-          lock_text, count = re.subn(
-              r'(?m)(^\[\[package\]\]\nname = "unsloth-studio"\nversion = ")[^"]+(")',
-              lambda match: f'{match.group(1)}{app_version}{match.group(2)}',
-              lock_text,
-          )
-          if count != 1:
-              sys.exit(f'Could not patch unsloth-studio version in Cargo.lock (matches={count})')
-          cargo_lock.write_text(lock_text)
-          PY
-
-          # --locked makes cargo error out rather than rewrite Cargo.lock, so this
-          # call also checks that the two patched files agree with each other.
-          cargo metadata --locked --no-deps --format-version 1 > "$RUNNER_TEMP/cargo-metadata.json"
-          "$PYTHON" <<'PY'
-          import json
-          import os
-          import pathlib
-          import sys
-
-          # Confirm cargo itself reports the patched version for unsloth-studio.
-          app_version = os.environ['APP_VERSION']
-          metadata = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'cargo-metadata.json').read_text())
-          versions = [package['version'] for package in metadata.get('packages', []) if package.get('name') == 'unsloth-studio']
-          if versions != [app_version]:
-              sys.exit(f'cargo metadata unsloth-studio version mismatch: expected {app_version}, got {versions}')
-          PY
-
-          # Show the resulting edits in the job log.
-          git diff -- Cargo.toml Cargo.lock
-
-      - name: Rust cache
-        uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32
-        with:
-          workspaces: 'studio/src-tauri -> target'
-
-      # ── macOS: import signing certificate ──
-      - name: Import Apple certificate
-        if: matrix.platform == 'macos-latest'
-        env:
-          APPLE_CERTIFICATE: ${{ secrets.APPLE_CERTIFICATE }}
-          APPLE_CERTIFICATE_PASSWORD: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }}
-          KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }}
-        run: |
-          # Decode the signing certificate outside the checkout and delete it on
-          # every exit path, so a failed keychain command cannot leave the
-          # decoded .p12 behind for later steps.
-          cert_path="$RUNNER_TEMP/certificate.p12"
-          trap 'rm -f "$cert_path"' EXIT
-          # Quote the secret so the shell never word-splits or globs it.
-          printf '%s' "$APPLE_CERTIFICATE" | base64 --decode > "$cert_path"
-          # Create a dedicated build keychain, make it the default, and unlock it.
-          security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
-          security default-keychain -s build.keychain
-          security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
-          security set-keychain-settings -t 3600 -u build.keychain
-          # Import the identity and allow codesign to use the key.
-          security import "$cert_path" -k build.keychain -P "$APPLE_CERTIFICATE_PASSWORD" -T /usr/bin/codesign
-          security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain
-          # List the code-signing identities now available in the build keychain.
-          security find-identity -v -p codesigning build.keychain
-
-      # ── Windows: install Azure Trusted Signing CLI ──
-      - name: Install trusted-signing-cli
-        if: matrix.platform == 'windows-latest'
-        run: |
-          cargo install trusted-signing-cli --version 0.9.0 --locked
-          echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-
-      # ── Windows: verify signing CLI is accessible ──
-      - name: Verify trusted-signing-cli
-        if: matrix.platform == 'windows-latest'
-        run: |
-          # Print PATH, then report whether the CLI resolves and whether it runs.
-          # NOTE(review): the `||` fallbacks print a message instead of exiting;
-          # confirm whether this step is meant to gate the build or only to log.
-          Write-Output "PATH: $env:PATH"
-          Get-Command trusted-signing-cli -ErrorAction SilentlyContinue || Write-Output "trusted-signing-cli NOT in PATH"
-          trusted-signing-cli --version || Write-Output "trusted-signing-cli failed to run"
-
-      # ── Linux: build + sign + upload ──
-      - name: Build Linux app
-        if: matrix.platform == 'ubuntu-22.04'
-        uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
-          TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }}
-        with:
-          projectPath: studio
-          tauriScript: npx --prefix . tauri
-          tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }}
-          releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}'
-          releaseBody: |
-            Desktop app for Unsloth Studio.
-
-            **macOS**: Download the Apple Silicon `.dmg`.
-            **Windows**: Download the `-setup.exe` installer.
-            **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal).
-
-            > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package.
-            > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64`
-            > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually.
-          releaseDraft: ${{ inputs.draft }}
-          prerelease: ${{ needs.prepare-version.outputs.prerelease }}
-          args: -v ${{ matrix.args }}
-
-      # ── macOS: build + sign + notarize + upload ──
-      - name: Build macOS app
-        if: matrix.platform == 'macos-latest'
-        uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
-          TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }}
-          APPLE_SIGNING_IDENTITY: ${{ secrets.APPLE_SIGNING_IDENTITY }}
-          APPLE_ID: ${{ secrets.APPLE_ID }}
-          APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
-          APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
-        with:
-          projectPath: studio
-          tauriScript: npx --prefix . tauri
-          tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }}
-          releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}'
-          releaseBody: |
-            Desktop app for Unsloth Studio.
-
-            **macOS**: Download the Apple Silicon `.dmg`.
-            **Windows**: Download the `-setup.exe` installer.
-            **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal).
-
-            > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package.
-            > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64`
-            > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually.
-          releaseDraft: ${{ inputs.draft }}
-          prerelease: ${{ needs.prepare-version.outputs.prerelease }}
-          args: -v ${{ matrix.args }}
-
-      # ── Windows: build + sign + upload ──
-      - name: Build Windows app
-        if: matrix.platform == 'windows-latest'
-        uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
-          TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }}
-          AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
-          AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
-          AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
-          AZURE_TRUSTED_SIGNING_ACCOUNT_NAME: ${{ secrets.AZURE_TRUSTED_SIGNING_ACCOUNT_NAME }}
-          AZURE_CERTIFICATE_PROFILE_NAME: ${{ secrets.AZURE_CERTIFICATE_PROFILE_NAME }}
-        with:
-          projectPath: studio
-          tauriScript: npx --prefix . tauri
-          tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }}
-          releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}'
-          releaseBody: |
-            Desktop app for Unsloth Studio.
-
-            **macOS**: Download the Apple Silicon `.dmg`.
-            **Windows**: Download the `-setup.exe` installer.
-            **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal).
-
-            > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package.
-            > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64`
-            > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually.
-          releaseDraft: ${{ inputs.draft }}
-          prerelease: ${{ needs.prepare-version.outputs.prerelease }}
-          args: -v ${{ matrix.args }}
-
-  # Release process note: only non-draft workflow runs advance the public
-  # desktop-latest updater channel. Draft builds are for private review; if a
-  # draft is manually published later, this channel intentionally remains
-  # unchanged until a narrow manual channel-publish flow is added or a public
-  # desktop release is created by running this workflow with draft=false.
-  publish-updater-channel:
-    name: Publish desktop updater channel
-    needs: [prepare-version, build]
-    if: ${{ !inputs.draft }}
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    env:
-      GH_REPO: ${{ github.repository }}
-      APP_VERSION: ${{ needs.prepare-version.outputs.app_version }}
-      STUDIO_VERSION: ${{ needs.prepare-version.outputs.studio_version }}
-      DESKTOP_RELEASE_TAG: ${{ needs.prepare-version.outputs.desktop_release_tag }}
-      DESKTOP_PRERELEASE: ${{ needs.prepare-version.outputs.prerelease }}
-
-    steps:
-      - name: Download versioned updater metadata
-        shell: bash
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          set -euo pipefail
-          mkdir -p "$RUNNER_TEMP/desktop-updater"
-          # Fetch the release record for the versioned tag so it can be checked before use.
-          gh api "repos/${GITHUB_REPOSITORY}/releases/tags/${DESKTOP_RELEASE_TAG}" > "$RUNNER_TEMP/source-release.json"
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import sys
-
-          # The source release must carry the expected tag and must not be a draft.
-          source = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'source-release.json').read_text())
-          expected_tag = os.environ['DESKTOP_RELEASE_TAG']
-          if source.get('tag_name') != expected_tag:
-              sys.exit(f'Expected source release {expected_tag}, got {source.get("tag_name")}')
-          if source.get('draft'):
-              sys.exit(f'Source desktop release {expected_tag} is draft; refusing to publish public updater channel')
-          PY
-          # Download latest.json from the versioned release; `test -s` fails the
-          # step when the file is missing or empty.
-          gh release download "$DESKTOP_RELEASE_TAG" --pattern latest.json --dir "$RUNNER_TEMP/desktop-updater" --clobber
-          test -s "$RUNNER_TEMP/desktop-updater/latest.json"
-
-      - name: Validate versioned updater metadata
-        shell: bash
-        run: |
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import re
-          import sys
-
-          app_version = os.environ['APP_VERSION']
-          release_tag = os.environ['DESKTOP_RELEASE_TAG']
-          latest_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-updater', 'latest.json')
-          data = json.loads(latest_path.read_text())
-          if not isinstance(data, dict):
-              sys.exit('latest.json must be a JSON object')
-
-          # The version must look like SemVer and, ignoring a leading 'v', equal APP_VERSION.
-          version = data.get('version')
-          if not isinstance(version, str) or not version:
-              sys.exit('latest.json missing version')
-          if not re.fullmatch(r'v?\d+\.\d+\.\d+(?:[-+][0-9A-Za-z.-]+)?', version):
-              sys.exit(f'latest.json version is not SemVer-like: {version}')
-          if version.removeprefix('v') != app_version:
-              sys.exit(f'latest.json version {version} does not match desktop app version {app_version}')
-
-          platforms = data.get('platforms')
-          if not isinstance(platforms, dict) or not platforms:
-              sys.exit('latest.json missing platforms')
-
-          # Each family flips to True once a matching platform entry has been seen.
-          required_families = {
-              'darwin-aarch64': False,
-              'linux-x86_64': False,
-              'windows-x86_64': False,
-          }
-          # Every download URL must point at the versioned release tag, never at a moving channel.
-          expected_prefix = f'https://github.com/unslothai/unsloth/releases/download/{release_tag}/'
-          forbidden_fragments = ('/releases/latest/', '/releases/download/desktop-latest/')
-
-          for platform, entry in platforms.items():
-              if not isinstance(entry, dict):
-                  sys.exit(f'Platform {platform} must be an object')
-              url = entry.get('url')
-              signature = entry.get('signature')
-              if not isinstance(url, str) or not url.strip():
-                  sys.exit(f'Platform {platform} missing url')
-              if not isinstance(signature, str) or not signature.strip():
-                  sys.exit(f'Platform {platform} missing signature')
-              if any(fragment in url for fragment in forbidden_fragments):
-                  sys.exit(f'Platform {platform} points at a moving updater channel: {url}')
-              if not url.startswith(expected_prefix):
-                  sys.exit(f'Platform {platform} URL must point at {release_tag}: {url}')
-              # A platform key counts for a family when it equals the family name
-              # or extends it with a '-' suffix.
-              for family in required_families:
-                  if platform == family or platform.startswith(family + '-'):
-                      required_families[family] = True
-
-          missing = [family for family, found in required_families.items() if not found]
-          if missing:
-              sys.exit('latest.json missing required platform families: ' + ', '.join(missing))
-          PY
-
-      - name: Ensure desktop updater channel release
-        shell: bash
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          set -euo pipefail
-          channel_json="$RUNNER_TEMP/desktop-latest-release.json"
-          # When the desktop-latest release cannot be fetched, create it as a
-          # prerelease that is not marked latest, then fetch its record again.
-          if ! gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$channel_json" 2>/dev/null; then
-            gh release create desktop-latest \
-              --title "Unsloth Studio Desktop updater channel" \
-              --notes "Machine-managed desktop updater channel; latest.json is replaced by release-desktop.yml." \
-              --prerelease \
-              --latest=false \
-              --target "$GITHUB_SHA"
-            gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$channel_json"
-          fi
-
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import sys
-
-          # The channel release must be a published, mutable prerelease.
-          channel = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-latest-release.json').read_text())
-          if channel.get('draft'):
-              sys.exit('desktop-latest release is draft; refusing to publish updater channel')
-          if channel.get('immutable'):
-              sys.exit('desktop-latest release is immutable; cannot replace latest.json')
-          if not channel.get('prerelease'):
-              sys.exit('desktop-latest release must be a prerelease so it cannot compete with repo-wide latest')
-          PY
-
-      - name: Prevent updater channel downgrade
-        shell: bash
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          set -euo pipefail
-          mkdir -p "$RUNNER_TEMP/desktop-current"
-          # Decide "first publish" from the release's asset list rather than from
-          # a failed download: a transient download error must fail this guard,
-          # not silently skip the downgrade check.
-          channel_assets="$(gh release view desktop-latest --json assets --jq '.assets[].name')"
-          if ! grep -Fxq 'latest.json' <<<"$channel_assets"; then
-            echo "No existing desktop-latest latest.json found; allowing first channel publish."
-            exit 0
-          fi
-          gh release download desktop-latest --pattern latest.json --dir "$RUNNER_TEMP/desktop-current" --clobber
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import re
-          import sys
-
-          def parse(value: str, source: str):
-              """Split a SemVer string into (major, minor, patch, prerelease).
-
-              A leading 'v' is dropped; build metadata ('+...') is matched but not
-              returned. `source` names the file the version came from so that the
-              exit message blames the right input.
-              """
-              value = value.removeprefix('v')
-              match = re.fullmatch(
-                  r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
-                  r'(?:-([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?'
-                  r'(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?',
-                  value,
-              )
-              if not match:
-                  sys.exit(f'{source} has invalid version: {value}')
-              major, minor, patch, prerelease = match.groups()
-              return (int(major), int(minor), int(patch), prerelease)
-
-          def numeric_tail(identifier: str) -> tuple[str, int] | None:
-              """Split identifiers such as 'rc10' into ('rc', 10); None for any other shape."""
-              match = re.fullmatch(r'([A-Za-z-]+)(\d+)', identifier)
-              if not match:
-                  return None
-              return (match.group(1).lower(), int(match.group(2)))
-
-          def compare_identifier(left: str, right: str) -> int:
-              """Order two prerelease identifiers; returns -1, 0, or 1."""
-              left_num = left.isdigit()
-              right_num = right.isdigit()
-              if left_num and right_num:
-                  return (int(left) > int(right)) - (int(left) < int(right))
-              # A purely numeric identifier sorts before a non-numeric one.
-              if left_num:
-                  return -1
-              if right_num:
-                  return 1
-
-              # Same alphabetic prefix with numeric tails: compare the numbers,
-              # so 'rc10' ranks above 'rc9'.
-              left_tail = numeric_tail(left)
-              right_tail = numeric_tail(right)
-              if left_tail and right_tail and left_tail[0] == right_tail[0]:
-                  return (left_tail[1] > right_tail[1]) - (left_tail[1] < right_tail[1])
-
-              return (left > right) - (left < right)
-
-          def compare_prerelease(left: str | None, right: str | None) -> int:
-              """Order two prerelease strings; None (no prerelease) outranks any prerelease."""
-              if left == right:
-                  return 0
-              if left is None:
-                  return 1
-              if right is None:
-                  return -1
-              left_parts = left.split('.')
-              right_parts = right.split('.')
-              for left_part, right_part in zip(left_parts, right_parts):
-                  order = compare_identifier(left_part, right_part)
-                  if order:
-                      return order
-              # Shared identifiers are equal: the longer identifier list ranks higher.
-              return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts))
-
-          def compare(left, right) -> int:
-              """Compare two parsed version tuples: core numbers first, then prerelease."""
-              left_core, left_pre = left[:3], left[3]
-              right_core, right_pre = right[:3], right[3]
-              if left_core != right_core:
-                  return (left_core > right_core) - (left_core < right_core)
-              return compare_prerelease(left_pre, right_pre)
-
-          current_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-current', 'latest.json')
-          next_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-updater', 'latest.json')
-          current = json.loads(current_path.read_text()).get('version')
-          next_version = json.loads(next_path.read_text()).get('version')
-          if not isinstance(current, str) or not isinstance(next_version, str):
-              sys.exit('Could not compare desktop-latest channel versions')
-          next_parsed = parse(next_version, 'versioned release latest.json')
-          current_parsed = parse(current, 'desktop-latest latest.json')
-          # Equal versions are allowed through; only a strictly older version is refused.
-          if compare(next_parsed, current_parsed) < 0:
-              sys.exit(
-                  f'Refusing to move desktop-latest from {current} to older version {next_version}.'
-              )
-          PY
-
-      - name: Publish desktop updater channel metadata
-        shell: bash
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          set -euo pipefail
-          # Replace the channel's latest.json, then re-read the release to confirm the result.
-          gh release upload desktop-latest "$RUNNER_TEMP/desktop-updater/latest.json" --clobber
-          gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$RUNNER_TEMP/desktop-latest-release.json"
-          python3 <<'PY'
-          import json
-          import os
-          import pathlib
-          import sys
-
-          release_file = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-latest-release.json')
-          channel = json.loads(release_file.read_text())
-          # Download URLs of every asset named latest.json; exactly one is expected.
-          matching_urls = [
-              asset.get('browser_download_url')
-              for asset in channel.get('assets', [])
-              if asset.get('name') == 'latest.json'
-          ]
-          if len(matching_urls) != 1:
-              sys.exit(f'Expected exactly one desktop-latest latest.json asset, found {len(matching_urls)}')
-          expected_url = f'https://github.com/{os.environ["GITHUB_REPOSITORY"]}/releases/download/desktop-latest/latest.json'
-          actual_url = matching_urls[0]
-          if actual_url != expected_url:
-              sys.exit(f'desktop-latest latest.json URL mismatch: expected {expected_url}, got {actual_url}')
-          PY
diff --git a/.github/workflows/security-audit.yml b/.github/workflows/security-audit.yml
deleted file mode 100644
index a1e7b2efa6..0000000000
--- a/.github/workflows/security-audit.yml
+++ /dev/null
@@ -1,1126 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Multi-language supply-chain audit. Triggers:
-#   - PRs touching any dependency manifest (Python / npm / Cargo) or
-#     this workflow file,
-#   - push to main / pip,
-#   - nightly @ 04:13 UTC so newly-published advisories surface even
-#     when no PR opens,
-#   - workflow_dispatch for ad-hoc invocations.
-#
-# Two jobs:
-#   - advisory-audit:    one runner that runs pip-audit + npm audit +
-#                        cargo audit back-to-back. All three are
-#                        advisory-DB lookups -- fast, lockfile-driven,
-#                        no archive download. Setting up the python /
-#                        node / rust toolchains on one runner and
-#                        running the three commands serially is
-#                        cheaper than spinning up three runners.
-#   - pip-scan-packages: 3-shard matrix that downloads + pattern-scans
-#                        every PyPI archive in the transitive closure.
-#                        This is the expensive job (~6 min/shard,
-#                        running in parallel) and it must stay
-#                        independent so a CVE-DB hit in advisory-audit
-#                        does not block the supply-chain pattern scan
-#                        (or vice versa).
-#
-# All steps are non-blocking initially. The default branch already
-# carries a known-vuln backlog (the dependabot banner shows 17 today,
-# pip-audit catches 2 more, npm/cargo will catch their own); a hard
-# gate now would block every PR on a baseline we have not triaged.
-# As each baseline closes, drop continue-on-error per step.
-#
-# Dependency coverage:
-#   - unsloth core (pyproject.toml [project.dependencies])
-#   - unsloth `huggingfacenotorch` extras (the canonical install path
-#     for fine-tuning users; pulls transformers / peft / accelerate /
-#     trl / datasets / diffusers / sentence-transformers / etc.)
-#   - all six Studio backend requirements files
-#   - Studio frontend (npm) and Tauri shell (cargo)
-# Each Python step builds a filtered dep list from pyproject.toml +
-# requirements/*.txt before auditing. We do NOT install any of these
-# -- pip-audit resolves through PyPI metadata, scan_packages.py
-# downloads sdist/wheel archives and inspects them without running
-# install hooks, so an attacker who has compromised a transitive dep
-# cannot execute code in this workflow.
-
-name: Security audit
-
-# Triggers. The pull_request path filter covers the dependency
-# manifests, the audit scripts, and this workflow file, so a change to
-# any of them re-runs the audit. lockfile_supply_chain_audit.py is
-# listed because the advisory-audit job executes it directly; without
-# the entry a PR that only edits that script would skip this workflow.
-on:
-  pull_request:
-    paths:
-      - 'studio/backend/requirements/**'
-      - 'studio/frontend/package.json'
-      - 'studio/frontend/package-lock.json'
-      - 'studio/src-tauri/Cargo.toml'
-      - 'studio/src-tauri/Cargo.lock'
-      - 'pyproject.toml'
-      - 'scripts/scan_packages.py'
-      - 'scripts/scan_npm_packages.py'
-      - 'scripts/lockfile_supply_chain_audit.py'
-      - '.github/workflows/security-audit.yml'
-  push:
-    branches: [main, pip]
-  schedule:
-    - cron: '13 4 * * *'   # 04:13 UTC daily, off the cron rush
-  workflow_dispatch:
-
-# One active run per workflow + ref; a newer run cancels the one in
-# progress.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-# Default token scope for every job: read-only repository contents.
-permissions:
-  contents: read
-
-jobs:
-  # ─────────────────────────────────────────────────────────────────────
-  # Combined advisory-DB audit: pip-audit + npm audit + cargo audit
-  # all on one runner. Each step is continue-on-error so a finding in
-  # one toolchain does not suppress the others.
-  # ─────────────────────────────────────────────────────────────────────
-  advisory-audit:
-    name: advisory audit (pip + npm + cargo)
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    steps:
-      # step-security/harden-runner installs an eBPF-based egress
-      # firewall on the runner. In `audit` mode it logs every outbound
-      # connection without blocking; in `block` mode it rejects
-      # anything outside `allowed-endpoints`. This job runs in `block`
-      # mode with the allowlist below, so a compromised tool or
-      # archive cannot dial a host that is not listed. That would
-      # have *contained* the litellm exfil even if scan_packages had
-      # missed the .pth payload.
-      # SHA-pinned (not @v2): the litellm 1.82.7 attack chain hijacked
-      # mutable tags on aquasecurity/trivy-action and would have hit
-      # anyone using @v0 / @v2 / @latest references. Pinning to a 40-
-      # char SHA freezes this action at known-good code; Dependabot's
-      # github-actions ecosystem will auto-bump the SHA. The release
-      # tag the SHA corresponds to is noted on the `uses:` line.
-      # Per-job allowlist: advisory-audit hits PyPI, npm registry,
-      # crates.io advisories, GitHub release artefacts (osv-scanner
-      # binary), Semgrep registry, and TruffleHog's own GitHub action.
-      - name: Harden runner (egress block)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: block
-          disable-sudo: true
-          allowed-endpoints: >
-            api.github.com:443
-            github.com:443
-            codeload.github.com:443
-            objects.githubusercontent.com:443
-            raw.githubusercontent.com:443
-            release-assets.githubusercontent.com:443
-            registry.npmjs.org:443
-            pypi.org:443
-            files.pythonhosted.org:443
-            static.rust-lang.org:443
-            index.crates.io:443
-            static.crates.io:443
-            crates.io:443
-            semgrep.dev:443
-
-      # Source checkout. Every action below is pinned to a commit SHA
-      # with the matching release tag noted in the trailing comment.
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          # Full history so TruffleHog can diff base..head; without
-          # this it sees only the latest commit and reports nothing.
-          fetch-depth: 0
-          persist-credentials: false
-
-      # Python toolchain for pip-audit, semgrep, cyclonedx-bom and the
-      # inline verifier scripts further down.
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      # Node toolchain; provides the `npm audit` command used below.
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      # Rust toolchain; needed to `cargo install` and run cargo-audit.
-      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable @ 2026-03-27
-
-      # Cargo build cache scoped to the Tauri crate's workspace.
-      - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32  # v2.9.1
-        with:
-          workspaces: studio/src-tauri -> target
-
-      - name: Install pip-audit + cargo-audit
-        # cargo-audit pulls advisories from the RustSec advisory-db on
-        # first run and caches them under ~/.cargo/advisory-db.
-        # `--locked` makes `cargo install` build cargo-audit with the
-        # Cargo.lock published with that crate instead of re-resolving
-        # its dependencies, so the installed tool is reproducible.
-        # cargo-audit 0.22 supports the CVSS 4.0 schema used in 2026
-        # advisories (e.g. RUSTSEC-2026-0073); 0.21 crashes with a
-        # TOML parse error on that file.
-        # npm audit is bundled with the node toolchain, no install.
-        run: |
-          python -m pip install --upgrade pip 'pip-audit>=2.7'
-          cargo install --locked --version '^0.22' cargo-audit
-
-      # ─────────────────────────────────────────────────────────────
-      # Python: pip-audit
-      # ─────────────────────────────────────────────────────────────
-      - name: Build filtered Python requirements set
-        # Produces the audit-reqs/ directory consumed by the later
-        # audit steps. Two inputs feed it:
-        #   (1) pyproject.toml -> audit-reqs/unsloth-deps.txt, holding
-        #       the unsloth package's core dependencies plus the
-        #       `huggingfacenotorch` extra (transformers / peft /
-        #       accelerate / trl / datasets / diffusers /
-        #       sentence-transformers / huggingface_hub / hf_transfer /
-        #       etc.).
-        #   (2) each studio/backend/requirements/*.txt listed below ->
-        #       audit-reqs/<same name>, with every line that carries a
-        #       `git+` spec replaced by a marker comment. pip-audit's
-        #       `-r` mode does a dry-run resolve against PyPI metadata;
-        #       a `git+https://...` spec forces it to clone, which is
-        #       both slow and outside the threat model (we audit
-        #       PyPI-served archives; a git ref is whatever HEAD says
-        #       on the runner). The marker keeps the skipped specs
-        #       obvious in the artifact.
-        # The `huggingface` extra is `huggingfacenotorch` plus torch /
-        # torchvision / triton, deliberately skipped: Studio backend
-        # already pins a torch and the +cu* / +cpu local-version tags
-        # trip up the PyPI resolver in `-r` mode.
-        run: |
-          mkdir -p audit-reqs
-          python <<'PY' > audit-reqs/unsloth-deps.txt
-          import tomllib
-          with open("pyproject.toml", "rb") as fh:
-              project = tomllib.load(fh)["project"]
-          specs = [
-              *project["dependencies"],
-              *project["optional-dependencies"]["huggingfacenotorch"],
-          ]
-          banner = (
-              "# Auto-generated from pyproject.toml by security-audit.yml.",
-              "# core deps + huggingfacenotorch extras.",
-          )
-          for entry in (*banner, *specs):
-              print(entry)
-          PY
-          req_files=(
-            studio.txt extras.txt extras-no-deps.txt
-            no-torch-runtime.txt overrides.txt triton-kernels.txt
-          )
-          for f in "${req_files[@]}"; do
-            python <<PY > "audit-reqs/$f"
-          src = "studio/backend/requirements/$f"
-          with open(src) as fh:
-              for line in fh:
-                  stripped = line.strip()
-                  if "git+" not in stripped.partition("#")[0]:
-                      print(line.rstrip("\n"))
-                  else:
-                      print(f"# [security-audit] skipped git+ spec: {stripped}")
-          PY
-          done
-
-      - name: pip-audit (declared Python deps, no install)
-        # `-r requirements.txt` resolves the requirements through pip's
-        # dependency resolver against PyPI metadata and audits the
-        # resolved tree without ever executing setup.py / install
-        # hooks. Way faster than installing the full Studio runtime
-        # and -- critically -- safer: an attacker who has compromised
-        # a transitive dep cannot run code in this job.
-        #
-        # extras.txt + extras-no-deps.txt have legacy setup.py
-        # packages (notably openai-whisper) whose setup.py imports
-        # `pkg_resources`, which the isolated build env's current
-        # setuptools no longer ships. PIP_CONSTRAINT pins an older
-        # setuptools into the build env so those builds resolve.
-        # Per-file loop so one bad file doesn't take out the whole
-        # audit.
-        continue-on-error: true
-        env:
-          PIP_CONSTRAINT: ${{ github.workspace }}/audit-reqs/build-constraints.txt
-        run: |
-          # Keep going when an individual pip-audit invocation fails.
-          set +e
-          # Write the constraints file that PIP_CONSTRAINT points at.
-          cat > audit-reqs/build-constraints.txt <<'CONSTRAINTS'
-          setuptools<78
-          wheel
-          CONSTRAINTS
-          # Start from an empty log; each file's output is appended.
-          : > logs-pip-audit.txt
-          for f in unsloth-deps studio extras extras-no-deps \
-                   no-torch-runtime overrides triton-kernels; do
-            # Skip files that contain only comments / blank lines.
-            if ! grep -qE '^[^#[:space:]]' "audit-reqs/$f.txt"; then
-              echo "[security-audit] $f.txt has no PyPI specs after git+ filter, skipping" \
-                | tee -a logs-pip-audit.txt
-              continue
-            fi
-            echo "::group::pip-audit -r audit-reqs/$f.txt"
-            {
-              echo
-              echo "=== $f ==="
-              # The echo right after this command reports its exit
-              # status via $?, so nothing may run in between.
-              pip-audit -r "audit-reqs/$f.txt" --format=columns
-              echo "=== end $f (rc=$?) ==="
-            } 2>&1 | tee -a logs-pip-audit.txt
-            echo "::endgroup::"
-          done
-          # Publish coverage notes and the combined log to the job summary.
-          {
-            echo "## pip-audit (Python)"
-            echo
-            echo '### Coverage'
-            echo '- unsloth core + `huggingfacenotorch` extras (pyproject.toml)'
-            echo '- studio/backend/requirements/{studio,extras,extras-no-deps,no-torch-runtime,overrides,triton-kernels}.txt'
-            echo '- `git+` specs are stripped before audit (out of scope: we audit PyPI archives)'
-            echo
-            echo '### Findings'
-            echo '```'
-            cat logs-pip-audit.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # Pre-install lockfile supply-chain audit (npm + cargo).
-      # Catches structural anomalies (non-registry resolved URLs,
-      # missing integrity hashes, known IOC strings) BEFORE `npm
-      # audit` or OSV-Scanner consult the advisory DB. The advisory
-      # path is reactive -- there is a window between a malicious
-      # publication and the GHSA landing. This step fires on the
-      # injection pattern itself so it catches the same class of
-      # attack the moment the lockfile shape becomes wrong.
-      # ─────────────────────────────────────────────────────────────
-      - name: Lockfile supply-chain audit (pre-install scan)
-        # Unlike the advisory steps, this step has no
-        # `continue-on-error`. NOTE(review): with the default `run`
-        # shell (bash -e) a non-zero exit from the script should fail
-        # the job, and the "no anomalies" summary below is then never
-        # written -- confirm that gating is intended.
-        run: |
-          python3 scripts/lockfile_supply_chain_audit.py
-          {
-            echo "## Lockfile supply-chain audit"
-            echo
-            echo "Scanned: studio/frontend/package-lock.json + studio/src-tauri/Cargo.lock"
-            echo
-            echo "No structural anomalies or known IOC strings."
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # npm: Studio frontend
-      # ─────────────────────────────────────────────────────────────
-      - name: npm audit (Studio frontend)
-        # `npm audit` checks the lockfile against the npm registry's
-        # advisory data. `--audit-level=high` only sets the severity
-        # at which npm exits non-zero; it does not filter the printed
-        # report, so lower-severity findings still show up in the log.
-        # The exit status is discarded here anyway (`set +e`, and the
-        # pipeline reports tee's status), so findings never fail the
-        # step. We do NOT pass --omit=dev: a malicious dev-only dep
-        # can still steal secrets from a CI runner, so dev deps need
-        # to be in the audit surface.
-        continue-on-error: true
-        working-directory: studio/frontend
-        run: |
-          set +e
-          npm audit --audit-level=high | tee ../../logs-npm-audit.txt
-          # Always also write the full JSON for grep-ability.
-          npm audit --json > ../../logs-npm-audit.json || true
-          # Last 200 lines of the text report go to the job summary.
-          {
-            echo "## npm audit (Studio frontend)"
-            echo
-            echo '```'
-            tail -200 ../../logs-npm-audit.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # cargo: Studio Tauri shell
-      # ─────────────────────────────────────────────────────────────
-      - name: cargo audit (Studio Tauri)
-        # `--deny warnings` would make the job fail on any advisory.
-        # Keep non-blocking initially; drop continue-on-error after
-        # the baseline closes.
-        # NOTE(review): cargo audit's own exit status is discarded
-        # below (`set +e`, and the pipeline reports tee's status), so
-        # dropping continue-on-error alone will not make this step
-        # gating -- the pipeline would also need `set -o pipefail`.
-        continue-on-error: true
-        working-directory: studio/src-tauri
-        run: |
-          set +e
-          cargo audit | tee ../../logs-cargo-audit.txt
-          # Last 200 lines of the report go to the job summary.
-          {
-            echo "## cargo audit (Studio Tauri)"
-            echo
-            echo '```'
-            tail -200 ../../logs-cargo-audit.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # OSV-Scanner: cross-ecosystem advisory DB (PyPI + npm + cargo)
-      # ─────────────────────────────────────────────────────────────
-      - name: OSV-Scanner (PyPI + npm + cargo, cross-ecosystem advisories)
-        # OSV's advisory feed is a superset of GitHub-Advisory + RustSec
-        # + npm advisories; running it alongside the per-ecosystem audit
-        # tools catches CVEs that haven't propagated to the per-ecosystem
-        # DBs yet (e.g. langchain-core CVE-2025-68664 was on OSV before
-        # GitHub Advisory). Single binary, one transitive resolver, all
-        # three lockfile types in one pass. Non-blocking until baselines
-        # close.
-        # Inputs: the npm and cargo lockfiles plus six of the filtered
-        # audit-reqs/*.txt files; triton-kernels.txt is not passed.
-        continue-on-error: true
-        run: |
-          set +e
-          # OSV-Scanner ships a raw binary (no tarball) in v2.x.
-          # NOTE(review): the binary is fetched by release tag and run
-          # without a checksum check, unlike the SHA-pinned actions in
-          # this workflow -- consider verifying a pinned SHA-256.
-          curl -fsSL -o /tmp/osv-scanner \
-            https://github.com/google/osv-scanner/releases/download/v2.0.2/osv-scanner_linux_amd64
-          chmod +x /tmp/osv-scanner
-          /tmp/osv-scanner --version
-          /tmp/osv-scanner scan source \
-            --lockfile=studio/frontend/package-lock.json \
-            --lockfile=studio/src-tauri/Cargo.lock \
-            --lockfile=requirements.txt:audit-reqs/unsloth-deps.txt \
-            --lockfile=requirements.txt:audit-reqs/studio.txt \
-            --lockfile=requirements.txt:audit-reqs/no-torch-runtime.txt \
-            --lockfile=requirements.txt:audit-reqs/overrides.txt \
-            --lockfile=requirements.txt:audit-reqs/extras.txt \
-            --lockfile=requirements.txt:audit-reqs/extras-no-deps.txt \
-            --format=table 2>&1 | tee logs-osv-scanner.txt
-          # Last 200 lines of the table go to the job summary.
-          {
-            echo "## OSV-Scanner (cross-ecosystem)"
-            echo
-            echo '```'
-            tail -200 logs-osv-scanner.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # Semgrep: design-flaw detection (catches what regex-pattern
-      # scanning of malicious authors cannot — first-party logic bugs
-      # like langchain-core CVE-2025-68664 dumps/dumpd injection,
-      # n8n CVE-2025-68668 _pyodide.eval_code sandbox escape, marimo
-      # CVE-2026-39987 unauth WebSocket).
-      # ─────────────────────────────────────────────────────────────
-      - name: Semgrep (supply-chain + python rule packs)
-        # Installs semgrep from PyPI, scans studio/backend, unsloth and
-        # scripts with four registry rule packs, and keeps only ERROR
-        # and WARNING findings. Non-blocking: the exit status is
-        # discarded (`set +e`, pipeline through tee).
-        # NOTE(review): `p/javascript` is enabled but studio/frontend
-        # is not among the scanned paths -- confirm whether the
-        # frontend sources were meant to be included.
-        continue-on-error: true
-        run: |
-          set +e
-          python -m pip install --quiet 'semgrep>=1.95'
-          semgrep --version
-          semgrep scan \
-            --config p/supply-chain \
-            --config p/python \
-            --config p/javascript \
-            --config p/security-audit \
-            --severity ERROR --severity WARNING \
-            --metrics off \
-            --timeout 120 \
-            studio/backend unsloth scripts \
-            2>&1 | tee logs-semgrep.txt
-          # Last 200 lines of the report go to the job summary.
-          {
-            echo "## Semgrep (supply-chain + python + javascript rules)"
-            echo
-            echo '```'
-            tail -200 logs-semgrep.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # Lockfile pin verifier. The litellm 1.82.7 attack window was
-      # ~40 minutes; anyone resolving with `>=` got the malicious
-      # version automatically. Flag every spec in the requirements
-      # files that does not pin to an exact `==` (or `@` for git
-      # refs, or `===` for arbitrary equality). Warning-only for now;
-      # graduate to blocking once the baseline is clean.
-      # ─────────────────────────────────────────────────────────────
-      - name: Lockfile pin verifier (Python requirements)
-        continue-on-error: true
-        run: |
-          python <<'PY' | tee logs-pin-verifier.txt
-          # Lists every spec in studio/backend/requirements/*.txt that is
-          # not pinned to one exact version. Warning-only: the script
-          # always exits 0 and just prints its findings.
-          import re
-          from pathlib import Path
-
-          # Project name with an optional `[extra,...]` suffix, so that
-          # `pkg[extra]==1.2.3` is recognised as pinned instead of being
-          # reported as a false positive.
-          NAME = r"[A-Za-z0-9_.\-]+\s*(?:\[[^\]]*\]\s*)?"
-          # Specs that look like `pkg==1.2.3` or `pkg @ git+...` or
-          # bare comments / -r lines are pinned-or-not-applicable.
-          # `*` is excluded from the version so a prefix match such as
-          # `pkg==1.2.*` is not mistaken for an exact pin.
-          PINNED = re.compile(r"^\s*" + NAME + r"(?:===|==)\s*[^,;*]+\s*$")
-          GIT_OR_URL = re.compile(r"^\s*" + NAME + r"@\s*(?:git\+|https?://)")
-
-          unpinned = []
-          for f in sorted(Path("studio/backend/requirements").glob("*.txt")):
-              for i, raw in enumerate(f.read_text().splitlines(), 1):
-                  line = raw.strip()
-                  # Blank lines, comments and pip options (-r, -c, --hash, ...)
-                  # carry no version spec of their own.
-                  if not line or line.startswith("#") or line.startswith("-"):
-                      continue
-                  # Drop the trailing comment, then the environment marker.
-                  spec = line.split("#", 1)[0].strip().split(";", 1)[0].strip()
-                  if not spec:
-                      continue
-                  if "git+" in spec or PINNED.match(spec) or GIT_OR_URL.match(spec):
-                      continue
-                  unpinned.append((str(f), i, line))
-
-          print("::group::Lockfile pin status")
-          if unpinned:
-              print(f"WARN: {len(unpinned)} non-`==` specs across requirements/*.txt")
-              print("(litellm 1.82.7 wave hit anyone on `>=`; tighten when feasible.)")
-              # Cap the listing so the step summary stays readable.
-              for f, i, line in unpinned[:80]:
-                  print(f"  {f}:{i}: {line}")
-              if len(unpinned) > 80:
-                  print(f"  ... and {len(unpinned) - 80} more")
-          else:
-              print("OK: every spec is exact-pinned.")
-          print("::endgroup::")
-          PY
-          {
-            echo "## Lockfile pin verifier"
-            echo
-            echo '```'
-            cat logs-pin-verifier.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # Trivy is deliberately NOT installed here. Trivy was the entry
-      # point for the litellm 1.82.7 supply-chain compromise (March
-      # 2026): attackers force-rewrote 76 of 77 tags in
-      # aquasecurity/trivy-action to point at malicious commits;
-      # anyone running the action with a tag ref auto-pulled a
-      # credential-harvesting payload. By design a security scanner
-      # has broad read access to runner secrets, which is exactly
-      # what made it the ideal pivot. We pick up Trivy's CVE coverage
-      # from OSV-Scanner (NVD + GHSA + GitLab) and its secret
-      # detection from TruffleHog. IaC misconfig detection (Trivy's
-      # one unique value-add) is unfilled for now -- revisit with
-      # checkov / kics when we ship a Dockerfile or k8s manifests.
-      # See https://docs.litellm.ai/blog/security-update-march-2026
-      # and the Microsoft / Trend Micro / Snyk incident write-ups.
-      # ─────────────────────────────────────────────────────────────
-
-      # ─────────────────────────────────────────────────────────────
-      # TruffleHog secret-leak scan on the PR diff. Catches API keys
-      # / tokens / cred files committed accidentally. --only-verified
-      # filters out probabilistic findings, so we only flag tokens
-      # that the source provider confirmed are live. On push to main
-      # / pip we scan the full repo; on PR we scan base..head.
-      # SHA-pinned for the same reason as harden-runner above.
-      # The pinned commit's release tag is noted on the `uses:` line.
-      # ─────────────────────────────────────────────────────────────
-      - name: TruffleHog (secrets in diff)
-        continue-on-error: true
-        uses: trufflesecurity/trufflehog@37b77001d0174ebec2fcca2bd83ff83a6d45a3ab  # v3.95.3
-        with:
-          path: ./
-          # On pull_request events these resolve to the PR's base and
-          # head commits. On any other event `base` is the empty string
-          # and `head` falls back to the triggering commit (github.sha).
-          base: ${{ github.event.pull_request.base.sha || '' }}
-          head: ${{ github.event.pull_request.head.sha || github.sha }}
-          # The action passes --no-update internally; passing it here
-          # too triggers `flag 'no-update' cannot be repeated`. Stick
-          # with --only-verified so we only flag tokens the source
-          # provider confirmed are live (no probabilistic findings).
-          extra_args: --only-verified
-
-      # ─────────────────────────────────────────────────────────────
-      # CycloneDX SBOM. Lets downstream consumers audit what's
-      # actually shipped in unsloth wheels and the Studio backend
-      # runtime. Generates one JSON file per requirements input plus
-      # a combined SBOM keyed off pyproject.toml; uploads as a build
-      # artifact (and a future step can attest it via SLSA).
-      # ─────────────────────────────────────────────────────────────
-      - name: Generate CycloneDX SBOM
-        # Best-effort: every cyclonedx-py call is followed by `|| true`
-        # and the step is continue-on-error, so a failed SBOM never
-        # fails the job.
-        continue-on-error: true
-        run: |
-          set +e
-          python -m pip install --quiet 'cyclonedx-bom>=4.6'
-          mkdir -p sbom
-          # Per-requirements-file SBOM (the audit-reqs/ files are the
-          # filtered, git+-stripped views built earlier in this job).
-          # cyclonedx-py 4.x uses `--sv` for spec version and `-o` for
-          # the output file; the older `--schema-version`/`--outfile`
-          # spellings are not accepted.
-          for f in audit-reqs/*.txt; do
-            base=$(basename "$f" .txt)
-            # Skip files that contain only comments / blank lines.
-            if grep -qE '^[^#[:space:]]' "$f"; then
-              cyclonedx-py requirements "$f" \
-                --sv 1.6 \
-                --of JSON \
-                -o "sbom/sbom-$base.json" 2>&1 | tail -5 || true
-            fi
-          done
-          # Environment-level SBOM: `cyclonedx-py environment` inventories
-          # the packages installed in the Python environment it runs in.
-          # NOTE(review): this job never installs the project itself, so
-          # this file presumably describes the runner's tooling
-          # environment rather than pyproject.toml -- confirm the intent.
-          cyclonedx-py environment \
-            --sv 1.6 \
-            --of JSON \
-            -o sbom/sbom-environment.json 2>&1 | tail -5 || true
-          ls -la sbom/
-          # List the generated files in the job summary.
-          {
-            echo "## CycloneDX SBOM"
-            echo
-            echo "Generated SBOM files:"
-            ls sbom/ | sed 's/^/- sbom\//'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # GitHub Actions pinning verifier. tj-actions/changed-files
-      # was compromised in March 2025; anyone using `@v4` (a mutable
-      # ref) auto-shipped the malicious version. Catch every
-      # non-SHA-pinned `uses:` across the workflows tree. Warn-only
-      # initially so the existing baseline doesn't block PRs.
-      # ─────────────────────────────────────────────────────────────
-      - name: GitHub Actions pinning verifier
-        continue-on-error: true
-        run: |
-          python <<'PY' | tee logs-actions-pinning.txt
-          # Reports every `uses:` reference under .github/workflows whose
-          # ref is not a full 40-character commit SHA. Warning-only: the
-          # script always exits 0 and just prints its findings.
-          import re
-          from pathlib import Path
-          # SHA pin = the ref itself is exactly 40 hex chars. Testing the
-          # ref rather than the whole line keeps a SHA quoted in a
-          # trailing comment from masking a mutable ref.
-          SHA_REF = re.compile(r"[0-9a-f]{40}")
-          # First-party / GitHub-published actions get a softer pass
-          # (still recommended to pin; not a security gate).
-          FIRST_PARTY = re.compile(r"(actions|github)/")
-          # `uses:` can open a step (`- uses: x@ref`), follow another key
-          # such as `name:` (no leading dash), or sit at job level for a
-          # reusable workflow call. All three forms must be checked; the
-          # dash-only pattern skipped every step that starts with `name:`.
-          # Optional quotes around the value are tolerated.
-          USES = re.compile(r"""^\s*(?:-\s*)?uses:\s*["']?([^@\s"']+)@([^\s"']+)""")
-          workflows = Path(".github/workflows")
-          unpinned_third = []
-          unpinned_first = []
-          # Both workflow file extensions accepted by GitHub are scanned.
-          for f in sorted([*workflows.glob("*.yml"), *workflows.glob("*.yaml")]):
-              for i, line in enumerate(f.read_text().splitlines(), 1):
-                  m = USES.match(line)
-                  if not m:
-                      continue
-                  name, ref = m.group(1), m.group(2)
-                  if SHA_REF.fullmatch(ref):
-                      continue
-                  bucket = unpinned_first if FIRST_PARTY.match(name) else unpinned_third
-                  bucket.append((str(f), i, name, ref))
-          print("::group::Action pinning status")
-          print(f"third-party actions on mutable refs: {len(unpinned_third)}")
-          for f, i, n, r in unpinned_third:
-              print(f"  HIGH  {f}:{i}: {n}@{r}")
-          print()
-          print(f"first-party (actions/* | github/*) on mutable refs: {len(unpinned_first)}")
-          # Cap the first-party listing so the summary stays readable.
-          for f, i, n, r in unpinned_first[:30]:
-              print(f"  WARN  {f}:{i}: {n}@{r}")
-          if len(unpinned_first) > 30:
-              print(f"  ... and {len(unpinned_first) - 30} more")
-          print()
-          print("Recommendation: pin third-party actions to a 40-char SHA.")
-          print("Dependabot's github-actions ecosystem will auto-bump them.")
-          print("::endgroup::")
-          PY
-          {
-            echo "## GitHub Actions pinning verifier"
-            echo
-            echo '```'
-            cat logs-actions-pinning.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # ─────────────────────────────────────────────────────────────
-      # Hash-pin verifier. `==` pinning protects against version
-      # drift but not against different bytes being served for the
-      # same version (PyPI rejects re-uploading an existing filename,
-      # but new files can be added to a release and mirrors can drift).
-      # `pip install --require-hashes` rejects any download whose
-      # SHA-256 doesn't match. Inspector step that reports how many
-      # specs would gain from a hash pin -- conversion is a roadmap
-      # item (needs pip-tools / uv pip compile --generate-hashes).
-      # ─────────────────────────────────────────────────────────────
-      - name: Hash-pin verifier (Python requirements)
-        continue-on-error: true
-        run: |
-          python <<'PY' | tee logs-hash-verifier.txt
-          # Counts the exact `==` pins in studio/backend/requirements/*.txt
-          # and how many of them carry a `--hash=sha256:` option.
-          # Report-only: the script always exits 0.
-          import re
-          from pathlib import Path
-          # Project name with an optional `[extra,...]` suffix; `*` is
-          # excluded from the version so `pkg==1.2.*` is not an exact pin.
-          PINNED = re.compile(
-              r"^\s*[A-Za-z0-9_.\-]+\s*(?:\[[^\]]*\]\s*)?==\s*[^,;*]+\s*$"
-          )
-          HASH_LINE = re.compile(r"--hash=sha256:[0-9a-f]{64}")
-
-          def logical_lines(text):
-              """Yield requirement lines with backslash continuations joined.
-
-              Hash-locked files usually put each `--hash=` option on its
-              own continuation line; joining them lets the hash be found
-              on the same logical line as the spec it belongs to.
-              """
-              pending = ""
-              for raw in text.splitlines():
-                  chunk = raw.rstrip()
-                  if chunk.endswith("\\"):
-                      pending += chunk[:-1] + " "
-                      continue
-                  yield pending + raw
-                  pending = ""
-              if pending:
-                  yield pending
-
-          total_pinned = 0
-          with_hash = 0
-          for f in sorted(Path("studio/backend/requirements").glob("*.txt")):
-              for raw in logical_lines(f.read_text()):
-                  line = raw.strip()
-                  # Blank lines, comments and standalone pip options
-                  # carry no version spec of their own.
-                  if not line or line.startswith("#") or line.startswith("-"):
-                      continue
-                  # Drop the trailing comment, then the environment marker,
-                  # then any per-requirement options such as --hash.
-                  spec = line.split("#", 1)[0].strip().split(";", 1)[0]
-                  spec = re.split(r"\s--", spec, maxsplit=1)[0]
-                  if PINNED.match(spec):
-                      total_pinned += 1
-                      if HASH_LINE.search(raw):
-                          with_hash += 1
-          print("::group::Hash-pin status")
-          print(f"  exact == pins:       {total_pinned}")
-          print(f"  with --hash=sha256:  {with_hash}")
-          print(f"  without --hash:      {total_pinned - with_hash}")
-          print()
-          print("Roadmap: convert to hash-locked installs via")
-          print("`uv pip compile --generate-hashes` and `pip install --require-hashes`.")
-          print("Hash-locked installs would have refused a republished")
-          print("malicious litellm 1.82.7 wheel even at the same version.")
-          print("::endgroup::")
-          PY
-          {
-            echo "## Hash-pin verifier"
-            echo
-            echo '```'
-            cat logs-hash-verifier.txt
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      # Upload every log, the filtered requirements and the SBOMs as one
-      # artifact. `if: always()` keeps the upload running even when an
-      # earlier step failed, so the logs survive a red job.
-      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        if: always()
-        with:
-          name: advisory-audit-logs
-          path: |
-            logs-pip-audit.txt
-            logs-npm-audit.txt
-            logs-npm-audit.json
-            logs-cargo-audit.txt
-            logs-osv-scanner.txt
-            logs-semgrep.txt
-            logs-pin-verifier.txt
-            logs-actions-pinning.txt
-            logs-hash-verifier.txt
-            audit-reqs/
-            sbom/
-          retention-days: 30
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Python: pre-install package scan (no install, no execution)
-  # ─────────────────────────────────────────────────────────────────────
-  pip-scan-packages:
-    # Downloads each declared dep WITHOUT installing it and inspects
-    # the archive contents for known malicious patterns: weaponized
-    # .pth files, credential stealers, obfuscated payloads,
-    # install-time droppers, suspicious subprocess / network /
-    # base64-blob combinations.
-    #
-    # This is the kind of check that would have caught:
-    #   - litellm 1.82.7 / 1.82.8  (March 2026, supply-chain compromise)
-    #   - the typo-squat campaign against PyTorch Lightning
-    # before either landed in the install path. pip-audit only knows
-    # about CVE-published vulnerabilities, so it does NOT see novel
-    # malicious uploads. scan_packages.py runs deterministic regex
-    # pattern matching, no LLM calls.
-    #
-    # `--with-deps` makes the scan transitive: every package the
-    # declared set resolves to gets fetched and pattern-scanned, not
-    # just the top-level pins. Resolving the full transitive closure
-    # of the unsloth + Studio dep tree downloads several hundred
-    # archives, hence the longer timeout.
-    #
-    # Sharded across runners for wall-clock parallelism. Each shard
-    # runs scan_packages.py once with --with-deps so its own slice
-    # benefits from pip's deduped transitive resolve. Shard
-    # composition tries to balance load:
-    #   - hf-stack: pyproject extras + no-torch-runtime
-    #               (~150 archives, transformers/peft/accelerate/...)
-    #   - studio:   FastAPI/Studio backend + overrides + extras-no-deps
-    #               (~150 archives, smaller scientific stack)
-    #   - extras:   the heavy openai-whisper / scikit-learn / librosa
-    #               stack (~250 archives, dominant cost)
-    # triton-kernels.txt is git+-only, fully skipped.
-    name: ${{ matrix.shard.name }}
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    strategy:
-      fail-fast: false
-      matrix:
-        shard:
-          - name: 'pip scan-packages :: hf-stack'
-            id: hf-stack
-            files: 'unsloth-deps no-torch-runtime'
-          - name: 'pip scan-packages :: studio'
-            id: studio
-            files: 'studio overrides extras-no-deps'
-          - name: 'pip scan-packages :: extras'
-            id: extras
-            files: 'extras'
-    steps:
-      # Egress block on every shard. Each shard pulls hundreds of
-      # PyPI archives -- if a malicious wheel ever phones home from
-      # within the scanner sandbox (it shouldn't; we never execute
-      # the archive), harden-runner now rejects the connect outright.
-      # Per-job allowlist: pip-scan-packages only fetches PyPI archives
-      # via scan_packages.py + pip download. No npm or cargo traffic.
-      - name: Harden runner (egress block)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: block
-          disable-sudo: true
-          allowed-endpoints: >
-            api.github.com:443
-            github.com:443
-            codeload.github.com:443
-            objects.githubusercontent.com:443
-            pypi.org:443
-            files.pythonhosted.org:443
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Install scan_packages.py runtime deps
-        # scan_packages.py imports requests + packaging at runtime to
-        # talk to PyPI's JSON API and to parse version specifiers. We
-        # do not install the packages it scans -- those are downloaded
-        # raw and inspected without ever touching `pip install`.
-        run: python -m pip install --upgrade pip requests packaging
-
-      - name: Build filtered requirements set
-        # Mirrors the advisory-audit job's input transform: pyproject.toml
-        # extraction + git+ stripping. scan_packages.py downloads
-        # PyPI archives without building, so it tolerates legacy
-        # setup.py packages (no resolver dry-run); but `--with-deps`
-        # delegates resolution to a single `pip download` call that
-        # cannot satisfy `git+` specs without git operations, so we
-        # strip them here too.
-        run: |
-          mkdir -p audit-reqs
-          python <<'PY' > audit-reqs/unsloth-deps.txt
-          import tomllib
-          with open("pyproject.toml", "rb") as f:
-              d = tomllib.load(f)
-          core = d["project"]["dependencies"]
-          extras = d["project"]["optional-dependencies"]["huggingfacenotorch"]
-          print("# Auto-generated from pyproject.toml by security-audit.yml.")
-          print("# core deps + huggingfacenotorch extras.")
-          for spec in core + extras:
-              print(spec)
-          PY
-          for f in studio.txt extras.txt extras-no-deps.txt \
-                   no-torch-runtime.txt overrides.txt triton-kernels.txt; do
-            python <<PY > "audit-reqs/$f"
-          src = "studio/backend/requirements/$f"
-          with open(src) as fh:
-              for line in fh:
-                  stripped = line.strip()
-                  before_comment = stripped.split("#", 1)[0]
-                  if "git+" in before_comment:
-                      print(f"# [security-audit] skipped git+ spec: {stripped}")
-                      continue
-                  print(line.rstrip("\n"))
-          PY
-          done
-
-      - name: Sanity-check scan_packages.py
-        # The scanner lives at scripts/scan_packages.py in this repo
-        # so we don't depend on a network fetch at job time.
-        run: |
-          test -f scripts/scan_packages.py
-          head -3 scripts/scan_packages.py
-          grep -q "Standalone pre-install package scanner" scripts/scan_packages.py
-
-      - name: Scan declared + transitive Python deps
-        # scan_packages.py exits 1 on CRITICAL/HIGH findings, 0 on
-        # clean. We swallow the exit because the baseline isn't
-        # triaged yet; surface the findings in the workflow summary.
-        # Switch `set +e` to `set -eo pipefail` after the first clean run on main.
-        #
-        # `--with-deps` walks PyPI metadata to enumerate every
-        # transitive dep the declared set would install, then scans
-        # them all. Without this flag, we'd only catch a malicious
-        # *direct* dep -- and supply-chain attacks usually land
-        # several hops down (litellm 1.82.7 was a dep of a dep for
-        # most users).
-        #
-        # This step runs once per matrix shard. Within a shard, every
-        # -r file is fed to a single `pip download` call so pip
-        # intersects version constraints and yields a deduped
-        # transitive set (no point fetching the same transformers
-        # wheel five times). Across shards we accept some redundant
-        # downloads in exchange for wall-clock parallelism.
-        env:
-          SHARD_FILES: ${{ matrix.shard.files }}
-        run: |
-          set +e
-          mkdir -p logs
-          LOG="logs-scan-packages-${{ matrix.shard.id }}.txt"
-          echo "::group::shard ${{ matrix.shard.id }} input files"
-          REQ_ARGS=()
-          for f in $SHARD_FILES; do
-            if grep -qE '^[^#[:space:]]' "audit-reqs/$f.txt"; then
-              echo "  + audit-reqs/$f.txt"
-              REQ_ARGS+=( -r "audit-reqs/$f.txt" )
-            else
-              echo "  - audit-reqs/$f.txt (empty after git+ filter, skipping)"
-            fi
-          done
-          echo "::endgroup::"
-          if [ ${#REQ_ARGS[@]} -eq 0 ]; then
-            echo "[security-audit] shard ${{ matrix.shard.id }}: no PyPI specs, nothing to scan" \
-              | tee "$LOG"
-          else
-            python scripts/scan_packages.py --with-deps "${REQ_ARGS[@]}" \
-              2>&1 | tee "$LOG"
-          fi
-          {
-            echo "## scan_packages :: shard ${{ matrix.shard.id }}"
-            echo
-            echo "### Files in this shard"
-            for f in $SHARD_FILES; do echo "- audit-reqs/$f.txt"; done
-            echo
-            echo '### Findings (tail)'
-            echo '```'
-            tail -200 "$LOG"
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-
-      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        if: always()
-        with:
-          name: scan-packages-log-${{ matrix.shard.id }}
-          path: |
-            logs-scan-packages-${{ matrix.shard.id }}.txt
-            audit-reqs/
-          retention-days: 30
-
-  # ─────────────────────────────────────────────────────────────────────
-  # npm: pre-install tarball content scan.
-  # ─────────────────────────────────────────────────────────────────────
-  npm-scan-packages:
-    # Counterpart to pip-scan-packages for the npm side. Reads
-    # studio/frontend/package-lock.json, downloads each resolved
-    # tarball DIRECTLY from registry.npmjs.org (never via `npm
-    # install` -- no lifecycle scripts ever run), verifies the
-    # lockfile integrity hash, unpacks each tarball into a sandboxed
-    # temp dir behind size / count / path-escape / symlink guards,
-    # and pattern-scans the extracted file contents for the
-    # signatures common to npm supply-chain attacks:
-    #
-    #   - lifecycle (preinstall / install / postinstall / prepare)
-    #     scripts in any package.json that fetch + execute external
-    #     code,
-    #   - C2 / exfiltration hosts (getsession.org, AWS IMDS,
-    #     Kubernetes ServiceAccount token paths, GitHub Actions OIDC,
-    #     HashiCorp Vault endpoints),
-    #   - credential-stealing references (.npmrc, .aws/credentials,
-    #     GITHUB_TOKEN / NPM_TOKEN in JS sources),
-    #   - known IOC filenames (router_init.js, tanstack_runner.js,
-    #     router_runtime.js),
-    #   - obfuscation shapes (Function/eval against base64 blobs).
-    #
-    # Threat model: every tarball is hostile. Safety guarantees are
-    # documented at scripts/scan_npm_packages.py top-of-file. The
-    # script is stdlib-only so adding it does not increase the
-    # transitive supply-chain surface.
-    name: npm scan-packages (Studio frontend tarballs)
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    needs: []
-    steps:
-      # Per-job allowlist: npm-scan-packages only fetches tarballs from
-      # registry.npmjs.org. GitHub endpoints retained for checkout +
-      # setup-python action machinery.
-      - name: Harden runner (egress block)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: block
-          disable-sudo: true
-          allowed-endpoints: >
-            api.github.com:443
-            github.com:443
-            codeload.github.com:443
-            objects.githubusercontent.com:443
-            registry.npmjs.org:443
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      - name: Sanity-check scan_npm_packages.py
-        run: |
-          test -f scripts/scan_npm_packages.py
-          python3 -c "import ast; ast.parse(open('scripts/scan_npm_packages.py').read())"
-
-      - name: Scan npm tarballs (declared + transitive, no install)
-        # The script exits 1 on HIGH/CRITICAL findings. The pipeline's
-        # status is held in `rc` so the log tail always reaches the
-        # step summary, then re-raised: this step gates the job. The
-        # scan never runs `npm install`, never executes anything from
-        # a downloaded tarball, and only fetches from registry.npmjs.org.
-        run: |
-          set -o pipefail
-          LOG=logs-scan-npm.txt
-          python3 scripts/scan_npm_packages.py 2>&1 | tee "$LOG" || rc=$?
-          {
-            echo "## scan_npm_packages"
-            echo
-            echo '### Findings (tail)'
-            echo '```'
-            tail -300 "$LOG"
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-          exit "${rc:-0}"
-
-      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        if: always()
-        with:
-          name: scan-npm-packages-log
-          path: logs-scan-npm.txt
-          retention-days: 30
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Workflow-trigger lint. Refuses two patterns that together powered the
-  # TanStack GHSA-g7cv-rxg3-hmpx supply-chain compromise:
-  #
-  #   1. `pull_request_target` -- runs a fork's workflow YAML against
-  #      the base repository's secrets. There is no safe use of this
-  #      trigger for a public open-source project.
-  #
-  #   2. Shared cache keys between PR-triggered workflows and the
-  #      publish workflow. A fork PR can poison the cache; the publish
-  #      workflow then restores the poisoned cache on next run.
-  #
-  # Cheap pure-Python lint, runs in seconds. Fail-closed.
-  # ─────────────────────────────────────────────────────────────────────
-  workflow-trigger-lint:
-    name: workflow-trigger lint (pull_request_target / cache-poisoning)
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-    steps:
-      - name: Harden runner (egress block)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: block
-          disable-sudo: true
-          allowed-endpoints: >
-            api.github.com:443
-            github.com:443
-            codeload.github.com:443
-            objects.githubusercontent.com:443
-            pypi.org:443
-            files.pythonhosted.org:443
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      - name: Install PyYAML
-        run: pip install pyyaml==6.0.2  # same pin as the tests-security job
-
-      - name: Lint workflow triggers + cache keys
-        run: python3 scripts/lint_workflow_triggers.py
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Regression tests: pin scanner IOC tables and pre-install fixtures.
-  # Hard gate (no continue-on-error) so future drift in the IOC tables
-  # or scanner exit semantics fails this PR at review time.
-  # ─────────────────────────────────────────────────────────────────────
-  tests-security:
-    name: pytest tests/security
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    steps:
-      - name: Harden runner (egress block)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: block
-          disable-sudo: true
-          allowed-endpoints: >
-            api.github.com:443
-            github.com:443
-            codeload.github.com:443
-            objects.githubusercontent.com:443
-            pypi.org:443
-            files.pythonhosted.org:443
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      - name: Install pytest + PyYAML
-        # PyYAML is imported by scripts/lint_workflow_triggers.py, which the
-        # `tests/security/test_lint_workflow_triggers.py` regression suite
-        # exercises as a subprocess. Without it the lint script bails with
-        # `ERROR: PyYAML is required` (exit 2) and the 5 lint regression
-        # tests fail. Pinned the same way pytest is pinned.
-        run: pip install pytest==9.0.3 pyyaml==6.0.2
-
-      - name: Run security regression tests
-        run: python3 -m pytest tests/security -v
-
-  # ─────────────────────────────────────────────────────────────────────
-  # npm provenance + new install-script diff. Catches the two npm
-  # supply-chain levers we don't yet gate on:
-  #
-  #   1. `npm audit signatures` validates the registry-signed
-  #      provenance of every tarball laid down in node_modules. Pulled
-  #      from the public npm transparency log; surfaces unsigned or
-  #      mis-signed deps. Informational for now (continue-on-error)
-  #      while the baseline settles.
-  #
-  #   2. `check_new_install_scripts.py` diffs the PR's lockfile
-  #      against the base ref and refuses any newly-added dep that
-  #      ships a postinstall hook. Every recent npm supply-chain
-  #      compromise leveraged a postinstall as the execution lever, so
-  #      blocking new ones at PR time is a small, high-signal gate.
-  # ─────────────────────────────────────────────────────────────────────
-  npm-provenance-and-install-scripts:
-    name: npm provenance + new install-script diff
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - name: Harden runner (egress audit)
-        uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450  # v2.19.1
-        with:
-          egress-policy: audit  # audit mode only logs egress; the allowlist below is not enforced
-          disable-sudo: true
-          allowed-endpoints: >
-            api.github.com:443
-            github.com:443
-            codeload.github.com:443
-            objects.githubusercontent.com:443
-            registry.npmjs.org:443
-
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          # Need the base commit accessible for `git show
-          # <base-sha>:studio/frontend/package-lock.json` below.
-          fetch-depth: 0
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      - name: Install Studio frontend deps (--ignore-scripts)
-        # `npm audit signatures` requires node_modules to be populated.
-        # `--ignore-scripts` is mandatory: this is exactly the lever the
-        # new-install-script gate below protects against, and we must
-        # not run any third-party hook to set up the audit.
-        working-directory: studio/frontend
-        run: npm ci --ignore-scripts
-
-      - name: npm audit signatures (informational)
-        # Surfaces unsigned / mis-signed packages from the npm
-        # transparency log. continue-on-error while the baseline builds;
-        # promote to a hard gate once the lockfile is fully signed.
-        working-directory: studio/frontend
-        continue-on-error: true
-        run: |
-          set -o pipefail
-          LOG=logs-audit-signatures.txt
-          npm audit signatures 2>&1 | tee "$LOG" || rc=$?
-          {
-            echo "## npm audit signatures"
-            echo
-            echo '```'
-            tail -200 "$LOG"
-            echo '```'
-          } >> "$GITHUB_STEP_SUMMARY"
-          exit "${rc:-0}"
-
-      - name: Extract base-ref lockfile (PR triggers only)
-        if: github.event_name == 'pull_request'
-        run: |
-          set -e
-          BASE_SHA="${{ github.event.pull_request.base.sha }}"
-          git show "$BASE_SHA:studio/frontend/package-lock.json" \
-            > /tmp/base-package-lock.json
-
-      - name: Diff for newly-added install-script deps
-        if: github.event_name == 'pull_request'
-        run: |
-          python3 scripts/check_new_install_scripts.py \
-            --base /tmp/base-package-lock.json \
-            --head studio/frontend/package-lock.json
-
-      - name: Skip install-script diff (non-PR trigger)
-        if: github.event_name != 'pull_request'
-        run: |
-          echo "Not a pull_request event; install-script diff requires a base ref."
-          echo "This step is intentionally a no-op outside PR triggers."
-
-      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        if: always()
-        with:
-          name: npm-audit-signatures-log
-          path: studio/frontend/logs-audit-signatures.txt
-          if-no-files-found: ignore
-          retention-days: 30
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
deleted file mode 100644
index 1a4cf841d0..0000000000
--- a/.github/workflows/stale.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: 'Inactive Issue Pinger'
-
-on:
-  schedule:
-    - cron: '30 5 * * *' # Runs at 5:30 UTC every day
-
-jobs:
-  stale:
-    runs-on: ubuntu-latest
-    permissions:
-      issues: write
-
-    steps:
-      - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f  # v10.2.0
-        with:
-          # The message to post on stale issues.
-          # This message will ping the issue author.
-          # Note: The stale bot action does not currently support a direct placeholder for the last commenter.
-          # As a workaround, this message encourages any participant to reply.
-          stale-issue-message: >
-            Is this issue still important to you?
-            Apologies in advance we might have missed this issue as well.
-            For faster response times, please post on our Reddit server - https://www.reddit.com/r/unsloth or our Discord - https://discord.com/invite/unsloth 
-
-          # The number of days of inactivity before an issue is considered stale.
-          days-before-issue-stale: 9999
-
-          # Set to -1 to never close stale issues.
-          days-before-issue-close: -1
-
-          # A label to apply to stale issues.
-          stale-issue-label: 'inactive'
-
-          # The number of operations to perform per run to avoid rate limiting.
-          operations-per-run: 500
-
-          enable-statistics: false
diff --git a/.github/workflows/studio-api-smoke.yml b/.github/workflows/studio-api-smoke.yml
deleted file mode 100644
index 53514e2ce1..0000000000
--- a/.github/workflows/studio-api-smoke.yml
+++ /dev/null
@@ -1,166 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Studio API & Auth Tests -- HTTP-level integration tests for the
-# FastAPI surface. No Playwright, no model UI; tests/studio/studio_api_smoke.py
-# runs ~30 s and asserts:
-#   - CORS hardening (no wildcard + credentials, no bootstrap leak)
-#   - /api/system + /api/system/hardware require auth
-#   - Auth state machine + JWT expiry
-#   - API key lifecycle E2E (create / list / use / delete / reject)
-#   - Auth file-mode hardening (Linux only)
-#   - Inference lifecycle (force reload, bogus variant, /v1/models, /v1/embeddings, /v1/responses)
-#   - Endpoint-by-endpoint auth audit
-#
-# Reuses the GGUF cache key from studio-ui-smoke.yml so the model
-# download is one cache-hit on the second job.
-
-name: Studio API CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - 'tests/studio/**'
-      - '.github/workflows/studio-api-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  api-smoke:
-    name: Studio API & Auth Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 12
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18893'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux deps
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          # Same key as studio-ui-smoke.yml so the two jobs share a
-          # single GGUF download across CI.
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Install pyjwt for the JWT-expiry forge test
-        run: pip install 'pyjwt>=2.6'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-
-      - name: Pass bootstrap password + rotated targets to the test
-        # The test does its own bootstrap-login + rotation to exercise
-        # the auth state machine; we just pre-mint two random rotated
-        # passwords for it. Mask them so the log is clean.
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "::add-mask::$NEW2"
-          echo "STUDIO_OLD_PW=$OLD"  >> "$GITHUB_ENV"
-          echo "STUDIO_NEW_PW=$NEW"  >> "$GITHUB_ENV"
-          echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
-      - name: Run Studio API & Auth tests
-        # The script is named WITHOUT a `test_` prefix so it isn't
-        # auto-collected by pytest in Backend CI's `tests/` walk
-        # (which doesn't set BASE_URL and would crash at import).
-        env:
-          BASE_URL: http://127.0.0.1:${{ env.STUDIO_PORT }}  # same port Studio was booted on
-          STUDIO_AUTH_DIR: /home/runner/.unsloth/studio/auth
-        run: python tests/studio/studio_api_smoke.py
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Upload API smoke logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: studio-api-smoke-log
-          path: |
-            logs/install.log
-            logs/studio.log
-          retention-days: 7
diff --git a/.github/workflows/studio-backend-ci.yml b/.github/workflows/studio-backend-ci.yml
deleted file mode 100644
index 63eb70f7f1..0000000000
--- a/.github/workflows/studio-backend-ci.yml
+++ /dev/null
@@ -1,221 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Runs the existing studio/backend/tests/ suite (~860 tests, all CPU-friendly)
-# on every PR that touches the backend or unsloth library. Until this lands,
-# none of those tests run automatically. Verified locally on Python 3.13 with
-# the surgical exclusions below: 861 pass, 4 skipped.
-#
-# Exclusions:
-#   - tests/test_studio_api.py: end-to-end against a live model + GGUF download,
-#     too heavy for free runners. Run separately when GPU CI is available.
-#   - -k 'not llama_cpp_load_progress_live': spawns a real llama.cpp process,
-#     not appropriate for CPU-only runners.
-#
-# Two jobs:
-#   - pytest matrix (3.10/3.11/3.12/3.13) over studio/backend/tests
-#   - repo-cpu-tests: auto-discovered tests/ + state-isolated spoof files
-#
-# Whole-repo Python lint (syntax + ruff + debugger-leftover scan)
-# moved to the dedicated `Lint CI` workflow (.github/workflows/lint-ci.yml)
-# so it fires on every PR rather than only on studio/unsloth/tests
-# path changes.
-
-name: Backend CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'tests/**'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-backend-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  pytest:
-    name: (Python ${{ matrix.python }})
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    strategy:
-      fail-fast: false
-      matrix:
-        python: ['3.10', '3.11', '3.12', '3.13']
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '${{ matrix.python }}'
-          cache: 'pip'
-
-      - name: Install backend test dependencies (CPU only)
-        run: |
-          python -m pip install --upgrade pip
-          # Studio's declared backend deps:
-          pip install -r studio/backend/requirements/studio.txt
-          # Extras that studio.txt does not list but the import chain needs
-          # (python-multipart for FastAPI form/file uploads, sqlalchemy/cryptography
-          #  for the auth DB, yaml/jinja2 for utils.models.model_config, etc.):
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests \
-            'numpy<3' pytest pytest-asyncio httpx
-          # Torch CPU + transformers are required by a chunk of the backend test
-          # suite (gpu_selection, kv_cache_estimation, utils). CPU-only torch
-          # keeps the install ~250 MB / ~1 min on a clean runner.
-          pip install --index-url https://download.pytorch.org/whl/cpu 'torch>=2.4,<2.11'
-          pip install 'transformers>=4.51,<5.5'
-
-      - name: Backend tests
-        working-directory: studio/backend
-        # Locally validated against this dep set: 831 passed, 5 skipped, 35 deselected.
-        # Deselections (all environment-specific, would never pass on a GPU-less
-        # `ubuntu-latest` runner regardless of code correctness):
-        #   - llama_cpp_load_progress_live: spawns a real llama.cpp process
-        #   - TestGpuAutoSelection / TestPreSpawnGpuResolution / TestPerGpuFitGuardAllCounts:
-        #       require live transformers config introspection on real GPUs
-        #   - TestTransformersIntrospection: same
-        #   - test_returns_cuda_when_cuda_available / test_calls_cuda_cache_when_cuda:
-        #       assume CUDA-capable GPU
-        run: |
-          python -m pytest tests/ -q --tb=short \
-            --ignore=tests/test_studio_api.py \
-            -k 'not llama_cpp_load_progress_live and not TestGpuAutoSelection and not TestPreSpawnGpuResolution and not TestPerGpuFitGuardAllCounts and not TestTransformersIntrospection and not test_returns_cuda_when_cuda_available and not test_calls_cuda_cache_when_cuda'
-
-  repo-cpu-tests:
-    # Auto-discover everything under tests/ that is not GPU-bound by
-    # design. New tests added in covered directories are picked up
-    # without a workflow edit. Locally validated: 760 passed, 1 skipped,
-    # 23 deselected. tests/conftest.py (mirroring unsloth-zoo PR #624)
-    # pre-loads unsloth_zoo.device_type and unsloth.device_type under a
-    # mocked torch.cuda.is_available so the unsloth import chain
-    # succeeds on CPU.
-    name: Repo tests (CPU)
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      # node + uv unlock ~60 tests that previously skipped on CI:
-      #   - 9 tests in test_chat_preset_builtin_invariants.py need node to
-      #     compile a tiny TS harness against the frontend chat sources.
-      #   - tests/python/* spawn fresh `uv venv`s to verify the no-torch
-      #     install path; they self-skip when uv is missing.
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - name: Install uv (for tests/python/* sandboxed venvs)
-        run: pip install uv
-
-      - name: Install deps (shared shape with backend pytest job)
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r studio/backend/requirements/studio.txt
-          pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests typer \
-            'numpy<3' pytest pytest-asyncio httpx
-          # torchvision: unsloth_zoo.vision_utils imports it at module scope.
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch>=2.4,<2.11' 'torchvision<0.26'
-          pip install 'transformers>=4.51,<5.5'
-          # bitsandbytes: hard import in unsloth/models/_utils.py. Recent
-          # versions ship a CPU build that imports cleanly on Linux.
-          pip install 'bitsandbytes>=0.45'
-          # unsloth.device_type imports unsloth_zoo.utils.Version at module
-          # scope, so the conftest preload needs unsloth_zoo even though
-          # it is an optional dep of unsloth.
-          pip install 'unsloth_zoo>=2026.5.1'
-          pip install -e . --no-deps
-
-      - name: Repo tests (CPU, auto-discovered)
-        env:
-          # tests/python/* import install_python_stack from studio/.
-          PYTHONPATH: ${{ github.workspace }}/studio
-          # Skip lazy compilation work the unsloth import chain wants to
-          # do at import time on a real GPU.
-          UNSLOTH_COMPILE_DISABLE: '1'
-        # --ignore: GPU-bound directories (qlora/saving need real weights;
-        #   tests/sh is the shell suite the next step handles; tests/utils
-        #   is a helpers folder); tests/vllm_compat + tests/version_compat
-        #   are dedicated multi-version drift canaries with their own job
-        #   in version-compat-ci.yml that installs the heavier dep set
-        #   (torchcodec, full transformers/peft/bnb pins) those tests need.
-        # State-sensitive hardware-spoofing files run in isolation in the
-        # next step because they mutate hardware.py module globals.
-        # -m: honour markers from tests/python/conftest.py (`server` =
-        #   needs studio venv, `e2e` = needs network).
-        # --deselect:
-        #   - test_model_registration / test_all_model_registration:
-        #     hit huggingface_hub for live model existence checks.
-        #   - test_autoconfig_works_with_no_torch_runtime / test_autoconfig_succeeds:
-        #     fail because no-torch-runtime.txt does not pin tokenizers
-        #     and the latest tokenizers (0.23.1) is incompatible with the
-        #     transformers it resolves to. Tracked separately; this is a
-        #     real bug in the no-torch install path, not a CI issue.
-        run: |
-          python -m pytest tests/ -q --tb=short \
-            --ignore=tests/qlora \
-            --ignore=tests/saving \
-            --ignore=tests/utils \
-            --ignore=tests/sh \
-            --ignore=tests/studio/test_hardware_dispatch_matrix.py \
-            --ignore=tests/studio/test_is_mlx_dispatch_gate.py \
-            --ignore=tests/vllm_compat \
-            --ignore=tests/version_compat \
-            -m 'not server and not e2e' \
-            --deselect tests/test_model_registry.py::test_model_registration \
-            --deselect tests/test_model_registry.py::test_all_model_registration \
-            --deselect 'tests/python/test_tokenizers_and_torch_constraint.py::TestE2ETokenizersFix::test_autoconfig_works_with_no_torch_runtime' \
-            --deselect 'tests/python/test_tokenizers_and_torch_constraint.py::TestE2EFullNoTorchSandbox::test_autoconfig_succeeds'
-
-      - name: Hardware-spoof tests (state-sensitive, run in isolation)
-        env:
-          PYTHONPATH: ${{ github.workspace }}/studio
-          UNSLOTH_COMPILE_DISABLE: '1'
-        # These two files mutate hardware.py module globals at runtime
-        # via the spoof fixtures, which leaks state into any other test
-        # that imports hardware. Run them in their own pytest invocation
-        # so the leak does not cross file boundaries.
-        run: |
-          python -m pytest -q --tb=short \
-            tests/studio/test_hardware_dispatch_matrix.py \
-            tests/studio/test_is_mlx_dispatch_gate.py
-
-      - name: Shell installer tests
-        # Subset that does not depend on a writable / pristine install.sh
-        # tree; test_install_host_defaults.sh checks install.ps1 layout
-        # which has drifted (separate followup).
-        run: |
-          set -e
-          for s in \
-              tests/sh/test_get_torch_index_url.sh \
-              tests/sh/test_mac_intel_compat.sh \
-              tests/sh/test_tauri_install_exit_order.sh \
-              tests/sh/test_torch_constraint.sh; do
-              echo "::group::$s"
-              bash "$s"
-              echo "::endgroup::"
-          done
-
diff --git a/.github/workflows/studio-frontend-ci.yml b/.github/workflows/studio-frontend-ci.yml
deleted file mode 100644
index 1270a57ef6..0000000000
--- a/.github/workflows/studio-frontend-ci.yml
+++ /dev/null
@@ -1,151 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Frontend PR gate: lockfile freshness, typecheck, build, and a bundle grep
-# that catches the 2026.5.1 chat-history regression at the JS level.
-#
-# biome runs as non-blocking for now: the codebase currently has accumulated
-# ~470 errors and ~1650 warnings against the existing biome config. Surfacing
-# the count in CI lets us drive it down without forcing a fleet-wide cleanup
-# in the same PR. Drop `continue-on-error` once that number is zero.
-
-name: Frontend CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/frontend/**'
-      - 'scripts/check_frontend_dep_removal.py'
-      - 'tests/studio/test_frontend_dep_removal.py'
-      - '.github/workflows/studio-frontend-ci.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  build:
-    name: Frontend build + bundle sanity
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    defaults:
-      run:
-        working-directory: studio/frontend
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      # FIXME: drop this step once @assistant-ui/* and assistant-stream
-      # leave 0.x -- on 1.x, caret ranges are conventional. Until then,
-      # every 0.minor on this surface is a SemVer-major (this is exactly
-      # how 2026.5.1 shipped a broken chat runtime: ^0.12.19 quietly
-      # resolved to 0.12.28).
-      - name: '@assistant-ui must be pinned exactly (no caret/tilde)'
-        working-directory: ${{ github.workspace }}
-        run: |
-          set -e
-          if grep -nE '"(@assistant-ui/[a-z-]+|assistant-stream)":[[:space:]]*"[\^~]' studio/frontend/package.json; then
-            echo "::error file=studio/frontend/package.json::These packages must be pinned to exact versions until they leave 0.x. Drop the leading ^ or ~."
-            exit 1
-          fi
-          echo "All assistant-ui packages are pinned exactly."
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      # Run the structural lockfile scan BEFORE npm ci. A compromised
-      # tarball runs its `prepare` / `postinstall` during `npm ci`,
-      # so any catch has to fire upstream of that. The scanner is
-      # pure-Python read-only; safe to call ahead of every install.
-      - name: Lockfile supply-chain audit (pre-install scan)
-        working-directory: ${{ github.workspace }}
-        run: python3 scripts/lockfile_supply_chain_audit.py
-
-      - name: Lockfile must agree with package.json (npm ci is strict)
-        # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
-        # required for `vite build`. The pre-install lockfile structural
-        # audit (lockfile_supply_chain_audit.py) is the practical defence
-        # against the npm postinstall-dropper class -- it fires BEFORE any
-        # tarball runs, on the injection pattern itself rather than an
-        # advisory-DB lookup.
-        run: npm ci --no-fund --no-audit
-
-      - name: npm ci must not have modified the working tree
-        working-directory: ${{ github.workspace }}
-        run: |
-          if ! git diff --quiet -- studio/frontend; then
-            echo "::error::npm ci modified files; commit the updated lockfile"
-            git status -- studio/frontend
-            exit 1
-          fi
-
-      # Catch the common foot-gun: a dep dropped from package.json that is
-      # still imported somewhere. The script walks the lockfile dep graph
-      # from the new top-level deps and only counts top-level node_modules
-      # paths as valid resolution targets for bare src/ imports.
-      #
-      # actions/checkout uses fetch-depth: 1 by default, so the base branch
-      # is not available locally. Fetch the single base commit with an
-      # explicit refspec so origin/<base> is reliably created (a bare
-      # `git fetch origin <ref>` only updates FETCH_HEAD in some configs).
-      - name: Dependency removal safety check
-        if: github.event_name == 'pull_request'
-        working-directory: ${{ github.workspace }}
-        run: |
-          git fetch --no-tags --depth=1 origin \
-            "${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}"
-          python3 scripts/check_frontend_dep_removal.py \
-            --base "origin/${{ github.base_ref }}" \
-            --enumerate-dead
-          python3 tests/studio/test_frontend_dep_removal.py
-
-      - name: Typecheck
-        run: npm run typecheck
-
-      - name: Build
-        run: npm run build
-
-      - name: Built bundle must not contain Studio's unstable_Provider call site
-        run: |
-          set -e
-          JS=$(ls dist/assets/index-*.js | head -1)
-          HITS=$(grep -c 'unstable_Provider:' "$JS" || echo 0)
-          echo "main bundle: $JS"
-          echo "unstable_Provider: hits=$HITS (assistant-ui internals contribute up to 3)"
-          if [ "$HITS" -gt 3 ]; then
-            echo "::error file=studio/frontend/src/features/chat/runtime-provider.tsx::Studio bundle still passes unstable_Provider through useRemoteThreadListRuntime; this is the 2026.5.1 chat-history regression. Pass adapters directly into useLocalRuntime instead."
-            exit 1
-          fi
-
-      - name: Bundle size budget (75 MB)
-        run: |
-          SIZE=$(du -sb dist | cut -f1)
-          BUDGET=$((75 * 1024 * 1024))
-          echo "dist size: $SIZE bytes ($((SIZE/1024/1024)) MB), budget: $BUDGET bytes (75 MB)"
-          if [ "$SIZE" -gt "$BUDGET" ]; then
-            echo "::error::studio/frontend/dist/ exceeded the 75 MB budget. Drop dead deps (e.g. the unused next dep) or split chunks."
-            exit 1
-          fi
-
-      - name: Biome (non-blocking until accumulated drift is cleared)
-        continue-on-error: true
-        run: npm run biome:check
-
-      - name: Upload built dist
-        # Always upload so a green run is reviewable too -- the dist
-        # output catches "tests passed but bundle changed unexpectedly"
-        # regressions that would be invisible if we only kept artifacts
-        # on failure.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: studio-frontend-dist
-          path: studio/frontend/dist
-          retention-days: 3
diff --git a/.github/workflows/studio-inference-smoke.yml b/.github/workflows/studio-inference-smoke.yml
deleted file mode 100644
index 775363e73c..0000000000
--- a/.github/workflows/studio-inference-smoke.yml
+++ /dev/null
@@ -1,887 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Three end-to-end smoke jobs that boot a freshly-installed Studio and
-# exercise the surfaces real users hit through the OpenAI / Anthropic
-# SDKs and curl. Each job picks the smallest model that exercises the
-# behaviour under test, primes HF_HOME via actions/cache, and shares
-# the install.sh --local --no-torch bootstrap.
-#
-#   1. OpenAI, Anthropic API tests
-#        gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
-#        Password rotation via /api/auth/change-password (old fails,
-#        new works), then OpenAI + Anthropic Python SDKs against /v1/*
-#        with temperature=0 and a fixed seed. Asserts the four-turn
-#        conversation is deterministic across two runs.
-#
-#   2. Tool calling Tests
-#        Qwen3.5-2B UD-IQ3_XXS (~890 MiB). OpenAI function calling,
-#        server-side tools (python, terminal, web_search) via
-#        enable_tools / enabled_tools, and enable_thinking on/off.
-#
-#   3. JSON, images
-#        gemma-4-E2B-it UD-IQ3_XXS (~2.4 GiB) + mmproj-F16 (~986 MiB).
-#        response_format JSON-schema decoding and OpenAI image_url
-#        (data URI) plus Anthropic source/base64 image inputs.
-#
-# All three jobs run in parallel. Total wall time is dominated by job 3
-# on a cold cache; warm cache cuts that to ~3 min.
-
-name: Studio GGUF CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-inference-smoke.yml'
-  push:
-    branches: [main, pip]
-  # Manual trigger for pre-warming HF_HOME caches on main, or re-running
-  # against an arbitrary branch without pushing a no-op commit.
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 1: OpenAI, Anthropic API tests
-  # ─────────────────────────────────────────────────────────────────────
-  openai-anthropic:
-    name: OpenAI, Anthropic API tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18888'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux deps for llama.cpp prebuilt
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Install OpenAI + Anthropic Python SDKs
-        run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Studio did not become healthy in 180s"
-          tail -200 logs/studio.log
-          exit 1
-
-      - name: Password rotation (old must fail, new must work)
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          # 1. Login with the bootstrap password.
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
-          # 2. Rotate to a fresh random password.
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          # 3. Old password must now be rejected (HTTP 401).
-          OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
-            -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
-          if [ "$OLD_STATUS" != "401" ]; then
-            echo "::error::Login with old password returned $OLD_STATUS, expected 401"
-            exit 1
-          fi
-          # 4. New password must succeed; capture the JWT for downstream steps.
-          NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
-          echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
-          echo "password rotation OK (old=401, new=200)"
-
-      - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
-        run: |
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_gguf, context_length}'
-
-      - name: Multi-turn determinism via OpenAI + Anthropic SDKs
-        env:
-          BASE_URL: http://127.0.0.1:18888
-        run: |
-          python - <<'PY'
-          import json
-          import os
-          from openai import OpenAI
-          from anthropic import Anthropic
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["TOKEN"]      # JWT also accepted as Bearer on /v1/*
-          SEED = 3407
-
-          # Four-turn conversation: the second and fourth turns can only be
-          # answered correctly if the model sees the prior turns, so this
-          # also exercises the conversation-history wiring.
-          PROMPTS = [
-              "What is 1+1?",
-              "What did I ask before?",
-              "What is the capital of France?",
-              "Repeat the city name",
-          ]
-
-          def run_openai():
-              client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
-              history, replies = [], []
-              for prompt in PROMPTS:
-                  history.append({"role": "user", "content": prompt})
-                  resp = client.chat.completions.create(
-                      model       = "default",
-                      messages    = history,
-                      temperature = 0.0,
-                      max_tokens  = 80,
-                      seed        = SEED,
-                      extra_body  = {"enable_thinking": False},
-                  )
-                  text = resp.choices[0].message.content or ""
-                  replies.append(text)
-                  history.append({"role": "assistant", "content": text})
-              return replies
-
-          def run_anthropic():
-              # Two SDK quirks vs. Studio:
-              #   1. base_url must NOT include /v1 -- the SDK appends
-              #      /v1/messages itself; otherwise the request hits
-              #      /v1/v1/messages and 405s.
-              #   2. The SDK sends `x-api-key` by default, but Studio's
-              #      auth layer is HTTPBearer-only. Override via
-              #      default_headers so Authorization: Bearer ... is
-              #      sent instead.
-              client = Anthropic(
-                  base_url        = BASE,
-                  api_key         = "unused",
-                  default_headers = {"Authorization": f"Bearer {KEY}"},
-              )
-              history, replies = [], []
-              for prompt in PROMPTS:
-                  history.append({"role": "user", "content": prompt})
-                  msg = client.messages.create(
-                      model       = "default",
-                      max_tokens  = 80,
-                      messages    = history,
-                      temperature = 0.0,
-                      extra_body  = {"seed": SEED, "enable_thinking": False},
-                  )
-                  text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text")
-                  replies.append(text)
-                  history.append({"role": "assistant", "content": text})
-              return replies
-
-          for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
-              first  = runner()
-              second = runner()
-              for i, (a, b) in enumerate(zip(first, second), start = 1):
-                  print(f"[{label} turn {i}] {a!r}")
-                  assert a, f"{label}: empty turn {i} response"
-                  assert a == b, (
-                      f"{label} non-deterministic at turn {i} with temperature=0.0:\n"
-                      f"  run1: {a!r}\n  run2: {b!r}"
-                  )
-              # Sanity: turn-2 reply should mention the earlier question, and
-              # turn-4 reply should mention Paris (model echoes the city it
-              # produced for turn 3). Lower-cased substring checks keep the
-              # assertion robust to formatting jitter.
-              joined = " ".join(first).lower()
-              assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
-              assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
-              print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded")
-          PY
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-          ss -tln | grep ":${STUDIO_PORT}" || true
-
-      - name: Upload logs
-        # Always upload so green runs are still reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: openai-anthropic-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 2: Tool calling Tests
-  # ─────────────────────────────────────────────────────────────────────
-  tool-calling:
-    name: Tool calling Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    env:
-      # Tool calling is the highest-volume GGUF in this workflow
-      # (Qwen3.5-2B at IQ3_XXS = ~890 MiB). Caching HF_HOME would
-      # store xet chunks + blobs + snapshots = ~4 GiB compressed --
-      # 4-5x file-size inflation, dominated by xet chunks. Use main's
-      # `--local-dir gguf-cache` pattern to cache the flat .gguf only.
-      # Studio's /api/inference/load accepts either a HF repo (which
-      # uses HF_HOME) or an absolute file path; passing the absolute
-      # path keeps the test off HF_HOME entirely so the cache size
-      # tracks the GGUF file 1:1. The OpenAI/Anth and JSON+images
-      # jobs still cover the gguf_variant resolution path.
-      GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
-      GGUF_FILE: Qwen3.5-2B-UD-IQ3_XXS.gguf
-      STUDIO_PORT: '18889'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux deps for llama.cpp prebuilt
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore GGUF model file
-        id: cache-gguf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Download GGUF if cache miss
-        id: download-gguf
-        if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p gguf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache
-
-      - name: Save GGUF model file
-        if: always() && steps.download-gguf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Reset auth + boot Studio (API-only, default tool policy)
-        # We deliberately use the API-only mode rather than
-        # `unsloth studio run` because the latter calls
-        # `set_tool_policy(...)` with a resolved bool: on loopback the
-        # default resolves to True, which forces every request through
-        # the server-side agentic loop and breaks the standard
-        # function-calling test below. API-only mode leaves
-        # tool_policy=None so each request's `enable_tools` field is
-        # honoured.
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health, log in, change password, load model
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
-          ls -lh "$GGUF_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name}'
-
-      - name: Tool calling, server-side tools, thinking on/off
-        env:
-          BASE_URL: http://127.0.0.1:18889
-        run: |
-          python - <<'PY'
-          import json
-          import os
-          import urllib.request
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["API_KEY"]
-          SEED = 3407
-
-          def post(path, body, *, timeout = 240):
-              """Plain JSON POST. For requests that don't go through
-              the server-side agentic loop, the response is one JSON
-              object."""
-              data = json.dumps(body).encode()
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = data,
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type": "application/json",
-                  },
-              )
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  return resp.status, json.loads(resp.read().decode())
-
-          def post_sse(path, body, *, timeout = 600):
-              """POST a streaming request and accumulate the assistant
-              text deltas. The server-side agentic loop ALWAYS returns
-              SSE regardless of the request's `stream` field, so any
-              call with enable_tools=true must use this helper."""
-              body = {**body, "stream": True}
-              data = json.dumps(body).encode()
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = data,
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type": "application/json",
-                  },
-              )
-              parts = []
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  for raw in resp:
-                      line = raw.decode().strip()
-                      if not line.startswith("data: "):
-                          continue
-                      payload = line[6:]
-                      if payload == "[DONE]":
-                          break
-                      try:
-                          chunk = json.loads(payload)
-                      except json.JSONDecodeError:
-                          continue
-                      for choice in chunk.get("choices", []):
-                          delta = choice.get("delta", {}) or {}
-                          if delta.get("content"):
-                              parts.append(delta["content"])
-              return "".join(parts)
-
-          # ── 1. Standard OpenAI function calling ──────────────────────
-          weather_tool = {
-              "type": "function",
-              "function": {
-                  "name": "get_weather",
-                  "description": "Get current weather for a city.",
-                  "parameters": {
-                      "type": "object",
-                      "properties": {"city": {"type": "string"}},
-                      "required": ["city"],
-                  },
-              },
-          }
-
-          status, data = post("/v1/chat/completions", {
-              "messages":    [{"role": "user", "content": "What is the weather in Paris?"}],
-              "tools":       [weather_tool],
-              "tool_choice": "required",
-              "stream":      False,
-              "temperature": 0.0,
-              "seed":        SEED,
-              "max_tokens":  120,
-          })
-          assert status == 200, f"tool call status {status}: {data}"
-          choice = data["choices"][0]
-          assert choice["finish_reason"] == "tool_calls", f"finish_reason={choice['finish_reason']!r}"
-          tc = choice["message"]["tool_calls"][0]
-          assert tc["function"]["name"] == "get_weather"
-          args = json.loads(tc["function"]["arguments"])
-          assert args.get("city"), f"missing city arg: {args}"
-          print(f"[tools] PASS function calling -> {tc['function']['name']}({args})")
-
-          # ── 2. Server-side python tool ───────────────────────────────
-          # 123 * 456 = 56088. The agentic loop streams SSE; we
-          # accumulate the assistant text and look for the answer. We
-          # accept "56088" or "56,088" since the model may format it.
-          content = post_sse("/v1/chat/completions", {
-              "messages":      [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}],
-              "enable_tools":  True,
-              "enabled_tools": ["python"],
-              "session_id":    "ci-tool-calling-py",
-              "temperature":   0.0,
-              "seed":          SEED,
-              "max_tokens":    600,
-          })
-          assert "56088" in content or "56,088" in content, (
-              f"expected 56088 in python-tool answer, got: {content!r}"
-          )
-          print(f"[tools] PASS python tool ({len(content)} chars)")
-
-          # ── 3. Server-side bash (terminal) tool ──────────────────────
-          content = post_sse("/v1/chat/completions", {
-              "messages":      [{"role": "user", "content": "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output."}],
-              "enable_tools":  True,
-              "enabled_tools": ["terminal"],
-              "session_id":    "ci-tool-calling-bash",
-              "temperature":   0.0,
-              "seed":          SEED,
-              "max_tokens":    600,
-          })
-          assert "hello-bash-tool" in content, (
-              f"expected 'hello-bash-tool' in terminal-tool answer, got: {content!r}"
-          )
-          print(f"[tools] PASS bash/terminal tool ({len(content)} chars)")
-
-          # ── 4. Server-side web_search tool ───────────────────────────
-          # DuckDuckGo is flaky from CI runners and small Qwen3.5-2B
-          # may not actually search. Only assert that the SSE stream
-          # opens and yields any data; HTTP / parser failures already
-          # raise above.
-          try:
-              content = post_sse("/v1/chat/completions", {
-                  "messages":      [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
-                  "enable_tools":  True,
-                  "enabled_tools": ["web_search"],
-                  "session_id":    "ci-tool-calling-web",
-                  "temperature":   0.0,
-                  "seed":          SEED,
-                  "max_tokens":    400,
-              })
-              print(f"[tools] PASS web_search stream ({len(content)} chars)")
-          except Exception as exc:
-              print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")
-
-          # ── 5. Thinking on / off ─────────────────────────────────────
-          # Studio strips think blocks from message.content for tools-mode
-          # responses, so we toggle plain chat (no enable_tools) and look
-          # at the surfaced reasoning_content / message.thinking field.
-          def thinking_call(enable):
-              status, data = post("/v1/chat/completions", {
-                  "messages":        [{"role": "user", "content": "Briefly: is 17 prime?"}],
-                  "stream":          False,
-                  "enable_thinking": enable,
-                  "temperature":     0.0,
-                  "seed":            SEED,
-                  "max_tokens":      300,
-              })
-              assert status == 200
-              msg = data["choices"][0]["message"]
-              # Studio surfaces thinking via reasoning_content (OpenAI
-              # extension). Fall back to inline <think> markers for
-              # robustness across template versions.
-              raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "")
-              return raw
-
-          on_text  = thinking_call(True)
-          off_text = thinking_call(False)
-          had_think_on  = ("<think>" in on_text)  or len(on_text)  > 80
-          had_think_off = ("<think>" in off_text) and len(off_text) > 0
-          assert had_think_on, (
-              f"enable_thinking=True produced no thinking signal: {on_text!r}"
-          )
-          # Off-mode should not contain the literal <think> marker.
-          assert "<think>" not in off_text, (
-              f"enable_thinking=False but <think> still present: {off_text!r}"
-          )
-          print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
-          PY
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-          ss -tln | grep ":${STUDIO_PORT}" || true
-
-      - name: Upload logs
-        # Always upload so green runs are still reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: tool-calling-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 3: JSON, images
-  # ─────────────────────────────────────────────────────────────────────
-  json-images:
-    name: JSON, images
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    env:
-      GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF
-      GGUF_VARIANT: UD-IQ3_XXS
-      GGUF_FILE: gemma-4-E2B-it-UD-IQ3_XXS.gguf
-      MMPROJ_FILE: mmproj-F16.gguf
-      STUDIO_PORT: '18890'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux deps for llama.cpp prebuilt
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj)
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
-      - name: Prime HF_HOME with the GGUF + mmproj
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj)
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Install OpenAI + Anthropic Python SDKs
-        run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
-      - name: Reset auth + boot Studio (API-only)
-        # See Job 2's comment: API-only mode keeps tool_policy=None so
-        # response_format requests aren't routed through the agentic
-        # tool loop.
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health, log in, change password, load model
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          # Load the GGUF (mmproj is auto-detected via the HF repo
-          # lookup, the cached file is pulled out of HF_HOME).
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 900 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_vision}'
-
-      - name: JSON schema decoding + image input
-        env:
-          BASE_URL: http://127.0.0.1:18890
-        run: |
-          python - <<'PY'
-          import base64
-          import json
-          import os
-          import urllib.request
-          from openai import OpenAI
-          from anthropic import Anthropic
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["API_KEY"]
-          SEED = 3407
-
-          def post(path, body, *, timeout = 240):
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = json.dumps(body).encode(),
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type":  "application/json",
-                  },
-              )
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  return resp.status, json.loads(resp.read().decode())
-
-          # ── 1. response_format = json_object (JSON mode) ─────────────
-          # llama.cpp's HTTP server supports OpenAI-compatible JSON
-          # mode: `response_format: {"type": "json_object"}` constrains
-          # the model to emit syntactically-valid JSON. We use raw HTTP
-          # rather than the OpenAI SDK so that the field shape Studio
-          # forwards to llama-server is unambiguous (the SDK rewrites
-          # response_format depending on which variant it recognises).
-          # We deliberately do NOT pass a strict JSON schema -- on
-          # small Gemma-4 quants the GBNF-from-schema path occasionally
-          # produces empty output, and JSON mode is the surface we care
-          # about exposing through Studio.
-          status, data = post("/v1/chat/completions", {
-              "model":         "default",
-              "messages":      [
-                  {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
-                  {"role": "user",   "content": "What is the capital of France?"},
-              ],
-              "temperature":     0.0,
-              "max_tokens":      200,
-              "seed":            SEED,
-              "stream":          False,
-              "enable_thinking": False,
-              "response_format": {"type": "json_object"},
-          }, timeout = 600)
-          assert status == 200, f"json status {status}: {data}"
-          content = (data["choices"][0]["message"].get("content") or "").strip()
-          # Some chat templates wrap JSON in ```json fences even in JSON
-          # mode -- strip those before parsing.
-          if content.startswith("```"):
-              content = content.split("```", 2)[1]
-              if content.startswith("json"):
-                  content = content[4:]
-              content = content.strip("`\n ")
-          parsed = json.loads(content)
-          assert "paris" in str(parsed.get("city", "")).lower(), (
-              f"city != Paris: {parsed}"
-          )
-          print(f"[json] PASS json_object -> {parsed}")
-
-          # ── 2. OpenAI image_url (data URI base64) ───────────────────
-          # 64x64 solid-red PNG. stb_image (used by Studio's image
-          # normaliser at routes/inference.py:3410) rejects 4x4 or
-          # smaller PNGs as truncated, so we go up to 64x64 -- still
-          # tiny in token cost. The assertion is loose: any non-empty
-          # response from the vision path proves multimodal end-to-end
-          # wiring; small VL quants are weak at colour identification.
-          PNG_64X64_RED_B64 = (
-              "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
-              "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
-              "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
-          )
-          data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"
-
-          client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
-          openai_resp = client.chat.completions.create(
-              model       = "default",
-              temperature = 0.0,
-              max_tokens  = 80,
-              seed        = SEED,
-              messages    = [{
-                  "role": "user",
-                  "content": [
-                      {"type": "image_url", "image_url": {"url": data_uri}},
-                      {"type": "text",      "text": "What colour dominates this image? Reply in one word."},
-                  ],
-              }],
-          )
-          openai_text = (openai_resp.choices[0].message.content or "").lower()
-          print(f"[image/openai] reply: {openai_text!r}")
-          assert openai_text, "OpenAI image_url returned empty content"
-          # We do not strictly require 'red' -- some quants of small VL
-          # models are weak at colour names. Just require a non-empty
-          # answer; the vision path is the part under test.
-          print("[image/openai] PASS image_url accepted, non-empty response")
-
-          # ── 3. Anthropic source/base64 image ────────────────────────
-          # Two SDK quirks vs. Studio: base_url must NOT include /v1
-          # (the SDK appends it itself; otherwise /v1/v1/messages -> 405),
-          # and Studio's auth is HTTPBearer-only so the SDK's default
-          # x-api-key header is ignored -- send Authorization: Bearer
-          # via default_headers.
-          anthropic = Anthropic(
-              base_url        = BASE,
-              api_key         = "unused",
-              default_headers = {"Authorization": f"Bearer {KEY}"},
-          )
-          a_msg = anthropic.messages.create(
-              model       = "default",
-              max_tokens  = 80,
-              temperature = 0.0,
-              extra_body  = {"seed": SEED},
-              messages    = [{
-                  "role": "user",
-                  "content": [
-                      {
-                          "type":   "image",
-                          "source": {
-                              "type":       "base64",
-                              "media_type": "image/png",
-                              "data":       PNG_64X64_RED_B64,
-                          },
-                      },
-                      {"type": "text", "text": "Describe this image briefly."},
-                  ],
-              }],
-          )
-          a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
-          print(f"[image/anthropic] reply: {a_text!r}")
-          assert a_text, "Anthropic source/base64 returned empty content"
-          print("[image/anthropic] PASS source/base64 accepted, non-empty response")
-          PY
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-          ss -tln | grep ":${STUDIO_PORT}" || true
-
-      - name: Upload logs
-        # Always upload so green runs are still reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: json-images-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
diff --git a/.github/workflows/studio-mac-api-smoke.yml b/.github/workflows/studio-mac-api-smoke.yml
deleted file mode 100644
index b4e274155e..0000000000
--- a/.github/workflows/studio-mac-api-smoke.yml
+++ /dev/null
@@ -1,153 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Mac counterpart to studio-api-smoke.yml. Same tests/studio/
-# studio_api_smoke.py exercise (CORS hardening, auth state machine,
-# JWT expiry, API key lifecycle, /v1/models / /v1/embeddings /
-# /v1/responses, endpoint-by-endpoint auth audit) but on a real
-# Apple Silicon (macos-14, M1) runner. Drops the apt-get block;
-# GitHub-hosted macos-14 ships curl + jq.
-
-name: Mac Studio API CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - 'tests/studio/**'
-      - '.github/workflows/studio-mac-api-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  api-smoke:
-    name: Studio API & Auth Tests
-    runs-on: macos-14
-    timeout-minutes: 25
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18895'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert install.sh used the Mac llama.cpp prebuilt
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-
-      - name: Install pyjwt for the JWT-expiry forge test
-        run: pip install 'pyjwt>=2.6'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-
-      - name: Pass bootstrap password + rotated targets to the test
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "::add-mask::$NEW2"
-          echo "STUDIO_OLD_PW=$OLD"  >> "$GITHUB_ENV"
-          echo "STUDIO_NEW_PW=$NEW"  >> "$GITHUB_ENV"
-          echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
-      - name: Run Studio API & Auth tests
-        env:
-          BASE_URL: http://127.0.0.1:18895
-          STUDIO_AUTH_DIR: /Users/runner/.unsloth/studio/auth
-        run: python tests/studio/studio_api_smoke.py
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Upload API smoke logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: mac-studio-api-smoke-log
-          path: |
-            logs/install.log
-            logs/studio.log
-          retention-days: 7
diff --git a/.github/workflows/studio-mac-inference-smoke.yml b/.github/workflows/studio-mac-inference-smoke.yml
deleted file mode 100644
index 2d6864e0cb..0000000000
--- a/.github/workflows/studio-mac-inference-smoke.yml
+++ /dev/null
@@ -1,1042 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Three end-to-end smoke jobs that boot a freshly-installed Studio and
-# exercise the surfaces real users hit through the OpenAI / Anthropic
-# SDKs and curl. Each job picks the smallest model that exercises the
-# behaviour under test, primes a model cache via actions/cache, and
-# shares the install.sh --local --no-torch bootstrap.
-#
-#   1. OpenAI, Anthropic API tests
-#        gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
-#        Password rotation via /api/auth/change-password (old fails,
-#        new works), then OpenAI + Anthropic Python SDKs against /v1/*
-#        with temperature=0 and a fixed seed. Asserts the four-turn
-#        conversation is deterministic across two runs.
-#
-#   2. Tool calling Tests
-#        Qwen3.5-2B UD-Q4_K_XL (~1.28 GiB). OpenAI function calling,
-#        server-side tools (python, web_search) via
-#        enable_tools / enabled_tools, and enable_thinking on/off.
-#
-#   3. JSON, images
-#        gemma-4-E2B-it UD-Q4_K_XL + mmproj-F16 (~986 MiB).
-#        response_format JSON-schema decoding and OpenAI image_url
-#        (data URI) plus Anthropic source/base64 image inputs.
-#
-# All three jobs run in parallel. Total wall time is dominated by job 3
-# on a cold cache; warm cache cuts that to ~3 min.
-
-name: Mac Studio GGUF CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-mac-inference-smoke.yml'
-  push:
-    branches: [main, pip]
-  # Manual trigger for pre-warming model caches on main, or re-running
-  # against an arbitrary branch without pushing a no-op commit.
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 1: OpenAI, Anthropic API tests
-  # ─────────────────────────────────────────────────────────────────────
-  openai-anthropic:
-    name: OpenAI, Anthropic API tests
-    runs-on: macos-14
-    timeout-minutes: 25
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18888'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      # Save partial caches on cancel/timeout -- hf download resumes by
-      # content hash. `outcome != skipped` keeps cache-hit a no-op.
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome != 'skipped' && hashFiles('hf-cache/**/*.gguf') != ''
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert install.sh used the Mac llama.cpp prebuilt
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-
-      - name: Install OpenAI + Anthropic Python SDKs
-        run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        # Poll once per second for up to 180 s. The jq check is part of
-        # the `if` condition so that a reachable-but-not-yet-healthy
-        # response is retried; run as a bare command it would abort the
-        # step under the runner's default `bash -e` on the first
-        # non-healthy payload.
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json \
-                && jq -e '.status == "healthy"' /tmp/health.json; then
-              exit 0
-            fi
-            sleep 1
-          done
-          # Timed out: surface the tail of the server log, then fail.
-          echo "Studio did not become healthy in 180s"
-          tail -200 logs/studio.log
-          exit 1
-
-      - name: Password rotation (old must fail, new must work)
-        # Logs in with the bootstrap password, rotates it, checks that
-        # the old password is rejected and the new one accepted, then
-        # exports the resulting JWT as TOKEN for the following steps.
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          # 1. Login with the bootstrap password.
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
-          # 2. Rotate to a fresh random password.
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          # 3. Old password must now be rejected (HTTP 401).
-          OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
-            -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
-          if [ "$OLD_STATUS" != "401" ]; then
-            echo "::error::Login with old password returned $OLD_STATUS, expected 401"
-            exit 1
-          fi
-          # 4. New password must succeed; capture the JWT for downstream steps.
-          NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
-          # Mask the JWT before exporting it: values written to
-          # GITHUB_ENV show up in the env listing of later steps' logs.
-          echo "::add-mask::$NEW_TOKEN"
-          echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
-          echo "password rotation OK (old=401, new=200)"
-
-      - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
-        run: |
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_gguf, context_length}'
-
-      - name: Multi-turn determinism via OpenAI + Anthropic SDKs
-        env:
-          BASE_URL: http://127.0.0.1:18888
-        run: |
-          python - <<'PY'
-          import json
-          import os
-          from openai import OpenAI
-          from anthropic import Anthropic
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["TOKEN"]      # JWT also accepted as Bearer on /v1/*
-          SEED = 3407
-
-          # Four-turn conversation: the second and fourth turns can only be
-          # answered correctly if the model sees the prior turns, so this
-          # also exercises the conversation-history wiring.
-          PROMPTS = [
-              "What is 1+1?",
-              "What did I ask before?",
-              "What is the capital of France?",
-              "Repeat the city name",
-          ]
-
-          def run_openai():
-              client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
-              history, replies = [], []
-              for prompt in PROMPTS:
-                  history.append({"role": "user", "content": prompt})
-                  resp = client.chat.completions.create(
-                      model       = "default",
-                      messages    = history,
-                      temperature = 0.0,
-                      max_tokens  = 80,
-                      seed        = SEED,
-                      extra_body  = {"enable_thinking": False},
-                  )
-                  text = resp.choices[0].message.content or ""
-                  replies.append(text)
-                  history.append({"role": "assistant", "content": text})
-              return replies
-
-          def run_anthropic():
-              # Two SDK quirks vs. Studio:
-              #   1. base_url must NOT include /v1 -- the SDK appends
-              #      /v1/messages itself; otherwise the request hits
-              #      /v1/v1/messages and 405s.
-              #   2. The SDK sends `x-api-key` by default, but Studio's
-              #      auth layer is HTTPBearer-only. Override via
-              #      default_headers so Authorization: Bearer ... is
-              #      sent instead.
-              client = Anthropic(
-                  base_url        = BASE,
-                  api_key         = "unused",
-                  default_headers = {"Authorization": f"Bearer {KEY}"},
-              )
-              history, replies = [], []
-              for prompt in PROMPTS:
-                  history.append({"role": "user", "content": prompt})
-                  msg = client.messages.create(
-                      model       = "default",
-                      max_tokens  = 80,
-                      messages    = history,
-                      temperature = 0.0,
-                      extra_body  = {"seed": SEED, "enable_thinking": False},
-                  )
-                  text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text")
-                  replies.append(text)
-                  history.append({"role": "assistant", "content": text})
-              return replies
-
-          for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
-              first  = runner()
-              second = runner()
-              for i, (a, b) in enumerate(zip(first, second), start = 1):
-                  print(f"[{label} turn {i}] {a!r}")
-                  assert a, f"{label}: empty turn {i} response"
-                  assert a == b, (
-                      f"{label} non-deterministic at turn {i} with temperature=0.0:\n"
-                      f"  run1: {a!r}\n  run2: {b!r}"
-                  )
-              # Sanity: the turn-1 reply must contain the character "1",
-              # and "paris" must appear somewhere in the four-turn
-              # transcript. The Paris check is a lower-cased substring
-              # match, which keeps it robust to formatting jitter.
-              joined = " ".join(first).lower()
-              assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
-              assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
-              print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded")
-          PY
-
-      - name: Stop Studio
-        # Best-effort teardown; runs even when earlier steps failed.
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-          # Report any listener still bound to the Studio port. macOS
-          # runners ship lsof but not the Linux-only `ss`.
-          lsof -nP -iTCP:"${STUDIO_PORT}" -sTCP:LISTEN || true
-
-      - name: Upload logs
-        # Always upload so green runs are still reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: openai-anthropic-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 2: Tool calling Tests
-  # ─────────────────────────────────────────────────────────────────────
-  tool-calling:
-    name: Tool calling Tests
-    runs-on: macos-14
-    timeout-minutes: 25
-    env:
-      # Tool calling is the highest-volume GGUF in this workflow
-      # (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB on Mac, where IQ3_XXS
-      # collapses for tool-call grammar under Metal at temperature=0).
-      # Caching HF_HOME stores xet chunks + blobs + snapshots = ~4.6
-      # GiB compressed -- 3.6x file-size inflation. Use main's
-      # `--local-dir gguf-cache` pattern to cache the flat .gguf only.
-      # The OpenAI/Anth and JSON+images jobs still cover the
-      # gguf_variant resolution path.
-      GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
-      GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18898'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore GGUF model file
-        id: cache-gguf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Download GGUF if cache miss
-        id: download-gguf
-        if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p gguf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache
-
-      # Save partial caches on cancel; next run resumes via content hash.
-      - name: Save GGUF model file
-        if: always() && steps.download-gguf.outcome != 'skipped' && hashFiles('gguf-cache/**/*.gguf') != ''
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert install.sh used the Mac llama.cpp prebuilt
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-
-      - name: Reset auth + boot Studio (API-only, default tool policy)
-        # We deliberately use the API-only mode rather than
-        # `unsloth studio run` because the latter calls
-        # `set_tool_policy(...)` with a resolved bool: on loopback the
-        # default resolves to True, which forces every request through
-        # the server-side agentic loop and breaks the standard
-        # function-calling test below. API-only mode leaves
-        # tool_policy=None so each request's `enable_tools` field is
-        # honoured.
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health, log in, change password, load model
-        run: |
-          # Poll /api/health once per second for up to 180 s; the jq -e
-          # after the loop fails the step if Studio never got healthy.
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          # Without pipefail a failed curl still leaves jq exiting 0, so
-          # check the token explicitly instead of relying on `bash -e`.
-          [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          [ -n "$TOKEN" ] && [ "$TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
-          # Mask the JWT before exporting it: values written to
-          # GITHUB_ENV show up in the env listing of later steps' logs.
-          echo "::add-mask::$TOKEN"
-          echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          # Load the flat .gguf restored or downloaded into gguf-cache.
-          GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
-          ls -lh "$GGUF_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name}'
-
-      - name: Tool calling, server-side tools, thinking on/off
-        env:
-          BASE_URL: http://127.0.0.1:18898
-        run: |
-          python - <<'PY'
-          import json
-          import os
-          import urllib.request
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["API_KEY"]
-          SEED = 3407
-
-          def post(path, body, *, timeout = 240):
-              """Plain JSON POST. For requests that don't go through
-              the server-side agentic loop, the response is one JSON
-              object."""
-              data = json.dumps(body).encode()
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = data,
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type": "application/json",
-                  },
-              )
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  return resp.status, json.loads(resp.read().decode())
-
-          def post_sse(path, body, *, timeout = 600):
-              """POST a streaming request and accumulate the assistant
-              text deltas. The server-side agentic loop ALWAYS returns
-              SSE regardless of the request's `stream` field, so any
-              call with enable_tools=true must use this helper."""
-              body = {**body, "stream": True}
-              data = json.dumps(body).encode()
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = data,
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type": "application/json",
-                  },
-              )
-              parts = []
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  for raw in resp:
-                      line = raw.decode().strip()
-                      if not line.startswith("data: "):
-                          continue
-                      payload = line[6:]
-                      if payload == "[DONE]":
-                          break
-                      try:
-                          chunk = json.loads(payload)
-                      except json.JSONDecodeError:
-                          continue
-                      for choice in chunk.get("choices", []):
-                          delta = choice.get("delta", {}) or {}
-                          if delta.get("content"):
-                              parts.append(delta["content"])
-              return "".join(parts)
-
-          # ── 1. Standard OpenAI function calling ──────────────────────
-          weather_tool = {
-              "type": "function",
-              "function": {
-                  "name": "get_weather",
-                  "description": "Get current weather for a city.",
-                  "parameters": {
-                      "type": "object",
-                      "properties": {"city": {"type": "string"}},
-                      "required": ["city"],
-                  },
-              },
-          }
-
-          # Mac Metal at temperature=0 is pathological for these small
-          # quants (Qwen3.5-2B emits ',,,,,,...' or 'The The The...';
-          # gemma-4-E2B emits '<unused5>' tokens). The Linux CPU
-          # backend hides the issue. Use a small non-zero temperature
-          # with a fixed seed so we stay deterministic but escape the
-          # degenerate sampling trap.
-          TEMP = 0.2
-
-          status, data = post("/v1/chat/completions", {
-              "messages":    [{"role": "user", "content": "What is the weather in Paris?"}],
-              "tools":       [weather_tool],
-              "tool_choice": "required",
-              "stream":      False,
-              "temperature": TEMP,
-              "seed":        SEED,
-              # tool_choice='required' constrains the grammar so the
-              # model emits a tool_call quickly when it works at all;
-              # 128 tokens is enough for `{"city":"Paris"}` plus the
-              # JSON envelope.
-              "max_tokens":  128,
-          }, timeout = 180)
-          assert status == 200, f"tool call status {status}: {data}"
-          choice = data["choices"][0]
-          tool_calls = (choice.get("message") or {}).get("tool_calls") or []
-          # Studio's contract: when tool_choice='required', llama.cpp's
-          # grammar should force a tool_calls payload. On Mac that
-          # contract is sometimes broken by the underlying quant; the
-          # PASS path is "tool_calls present + correct schema", the
-          # WARN path documents Studio still returned 200 with a
-          # well-formed choices[] envelope.
-          if tool_calls:
-              tc = tool_calls[0]
-              assert tc["function"]["name"] == "get_weather", (
-                  f"unexpected tool name: {tc['function']['name']!r}"
-              )
-              args = json.loads(tc["function"]["arguments"])
-              assert args.get("city"), f"missing city arg: {args}"
-              print(f"[tools] PASS function calling -> {tc['function']['name']}({args}) finish={choice.get('finish_reason')!r}")
-          else:
-              # Infrastructure path is correct; model output drifted.
-              print(
-                  f"[tools] WARN function calling: no tool_calls (finish_reason="
-                  f"{choice.get('finish_reason')!r}); HTTP path OK, this is a "
-                  f"Mac Metal quant degeneracy."
-              )
-
-          # ── 2. Server-side python tool ───────────────────────────────
-          # 123 * 456 = 56088. The agentic loop streams SSE; we
-          # accumulate the assistant text and look for the answer. On
-          # Mac the model often loses the tool calling contract before
-          # producing the answer; accept either the answer OR a
-          # non-empty SSE stream as proof the path completes.
-          # macos-14 free runner is ~10 tok/s on Qwen3.5-2B Q4_K_XL;
-          # cap max_tokens tightly so each SSE round stays under ~30s
-          # even when the model stalls in a degenerate output state.
-          content = post_sse("/v1/chat/completions", {
-              "messages":      [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}],
-              "enable_tools":  True,
-              "enabled_tools": ["python"],
-              "session_id":    "ci-tool-calling-py",
-              "temperature":   TEMP,
-              "seed":          SEED,
-              "max_tokens":    128,
-          }, timeout = 180)
-          if "56088" in content or "56,088" in content:
-              print(f"[tools] PASS python tool ({len(content)} chars, found 56088)")
-          else:
-              # Empty stream is a known Mac-quant degeneracy too; log
-              # but do not fail.
-              print(
-                  f"[tools] WARN python tool: SSE OK ({len(content)} chars) but "
-                  f"model didn't return 56088 -- Mac quant drift"
-              )
-
-          # NOTE: the dedicated "Server-side bash (terminal) tool" axis
-          # was dropped in favour of the python axis above. Both share
-          # the SAME server-side agentic loop wiring (only the registry
-          # entry differs); the python axis is the canonical proof. On
-          # macos-14 the duplicated SSE round was the dominant cost in
-          # this step, so collapsing the two saves ~30-60 s wallclock
-          # without losing distinct coverage.
-
-          # ── 3. Server-side web_search tool ───────────────────────────
-          # DuckDuckGo is flaky from CI runners and small Qwen3.5-2B
-          # may not actually search. Only assert that the SSE stream
-          # opens and yields any data; HTTP / parser failures already
-          # raise above.
-          try:
-              content = post_sse("/v1/chat/completions", {
-                  "messages":      [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
-                  "enable_tools":  True,
-                  "enabled_tools": ["web_search"],
-                  "session_id":    "ci-tool-calling-web",
-                  "temperature":   TEMP,
-                  "seed":          SEED,
-                  "max_tokens":    96,
-              }, timeout = 180)
-              print(f"[tools] PASS web_search stream ({len(content)} chars)")
-          except Exception as exc:
-              print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")
-
-          # ── 4. Thinking on / off ─────────────────────────────────────
-          # Studio strips think blocks from message.content for tools-mode
-          # responses, so we toggle plain chat (no enable_tools) and look
-          # at the surfaced reasoning_content / message.thinking field.
-          def thinking_call(enable):
-              status, data = post("/v1/chat/completions", {
-                  "messages":        [{"role": "user", "content": "Briefly: is 17 prime?"}],
-                  "stream":          False,
-                  "enable_thinking": enable,
-                  "temperature":     TEMP,
-                  "seed":            SEED,
-                  # 80 tokens lands within the 25-minute job timeout
-                  # on the macos-14 free runner. 17 is small; this is
-                  # plenty of room for either "Yes" + brief reasoning
-                  # or a degenerate empty completion.
-                  "max_tokens":      80,
-              }, timeout = 180)
-              assert status == 200
-              msg = data["choices"][0]["message"]
-              # Studio surfaces thinking via reasoning_content (OpenAI
-              # extension). Fall back to inline <think> markers for
-              # robustness across template versions.
-              raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "")
-              return raw
-
-          on_text  = thinking_call(True)
-          off_text = thinking_call(False)
-          # Mac quant drift: the model may produce empty / degenerate
-          # output regardless of enable_thinking. Assert ONLY that the
-          # endpoint returned 200 (already enforced inside thinking_call)
-          # and that toggling the flag doesn't surface a hard <think>
-          # marker when off.
-          had_think_on  = ("<think>" in on_text)  or len(on_text)  > 80
-          if not had_think_on:
-              print(
-                  f"[tools] WARN enable_thinking=True produced no thinking signal: "
-                  f"{on_text[:200]!r} -- Mac quant drift"
-              )
-          # Off-mode should not contain the literal <think> marker.
-          assert "<think>" not in off_text, (
-              f"enable_thinking=False but <think> still present: {off_text!r}"
-          )
-          print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
-          PY
-
-      - name: Stop Studio
-        # Best-effort teardown; runs even when earlier steps failed.
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-          # Report any listener still bound to the Studio port. macOS
-          # runners ship lsof but not the Linux-only `ss`.
-          lsof -nP -iTCP:"${STUDIO_PORT}" -sTCP:LISTEN || true
-
-      - name: Upload logs
-        # Always upload so green runs are still reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: tool-calling-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 3: JSON, images
-  # ─────────────────────────────────────────────────────────────────────
-  json-images:
-    name: JSON, images
-    runs-on: macos-14
-    timeout-minutes: 30
-    env:
-      GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF
-      # Linux smoke uses UD-IQ3_XXS, but on Mac Metal that gemma-4
-      # quant emits sentinel tokens (<unused5>) for any prompt at
-      # temperature=0 -- inference path is fine, the quant itself is
-      # broken on Metal. UD-Q4_K_XL is the smallest published variant
-      # that generates real text on M1.
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-4-E2B-it-UD-Q4_K_XL.gguf
-      MMPROJ_FILE: mmproj-F16.gguf
-      STUDIO_PORT: '18899'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      # Cache flat .gguf + mmproj (Job 2's pattern). HF_HOME inflates
-      # ~3.6x via xet/blobs/snapshots, which made macOS saves never land.
-      # mmproj is auto-detected as a sibling via detect_mmproj_file
-      # (studio/backend/utils/models/model_config.py).
-      - name: Restore GGUF + mmproj files
-        id: cache-gguf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-${{ env.MMPROJ_FILE }}-v2
-
-      - name: Verify cache contains BOTH gguf + mmproj
-        id: verify-cache
-        if: steps.cache-gguf.outputs.cache-hit == 'true'
-        run: |
-          if [[ -f "gguf-cache/$GGUF_FILE" && -f "gguf-cache/$MMPROJ_FILE" ]]; then
-            echo "ok=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "Partial cache hit -- forcing re-download."
-            echo "ok=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Download GGUF + mmproj if cache miss or partial
-        id: download-gguf
-        if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.verify-cache.outputs.ok != 'true'
-        # Authenticated + parallel: shared macos-14 NAT egress stalls
-        # multi-GB anonymous downloads.
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p gguf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache &
-          MODEL_PID=$!
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE" gguf-cache &
-          MMPROJ_PID=$!
-          wait "$MODEL_PID"
-          wait "$MMPROJ_PID"
-          # Fail loud on a partial download instead of in the next step.
-          ls -lh "gguf-cache/$GGUF_FILE" "gguf-cache/$MMPROJ_FILE"
-
-      # Save partial caches on cancel. hashFiles guard avoids a hard
-      # save failure when the download step exits with no files. The
-      # additional mmproj-presence check stops a partial save from
-      # poisoning the cache for the next run.
-      - name: Save GGUF + mmproj files
-        if: always() && steps.download-gguf.outcome != 'skipped' && hashFiles('gguf-cache/**/*.gguf') != '' && hashFiles(format('gguf-cache/{0}', env.MMPROJ_FILE)) != ''
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-${{ env.MMPROJ_FILE }}-v2
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert install.sh used the Mac llama.cpp prebuilt
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-
-      - name: Install OpenAI + Anthropic Python SDKs
-        run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
-      - name: Reset auth + boot Studio (API-only)
-        # See Job 2's comment: API-only mode keeps tool_policy=None so
-        # response_format requests aren't routed through the agentic
-        # tool loop.
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health, log in, change password, load model
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          # Load via local file path; mmproj sibling auto-detected by
-          # detect_mmproj_file (model_config.py). gguf_variant omitted
-          # -- it routes through _find_local_gguf_by_variant which
-          # expects a directory, not a file path.
-          GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
-          MMPROJ_PATH="$GITHUB_WORKSPACE/gguf-cache/${MMPROJ_FILE}"
-          ls -lh "$GGUF_PATH" "$MMPROJ_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 900 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_vision}'
-
-      - name: JSON schema decoding + image input
-        env:
-          BASE_URL: http://127.0.0.1:18899
-        run: |
-          python - <<'PY'
-          import base64
-          import json
-          import os
-          import urllib.request
-          from openai import OpenAI
-          from anthropic import Anthropic
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["API_KEY"]
-          SEED = 3407
-          # Mac Metal degenerates these gemma-4 quants at temperature=0
-          # (any prompt yields '<unused5>...' padding tokens). Use a
-          # small non-zero temperature with the same seed so we stay
-          # deterministic-enough but escape the trap.
-          TEMP = 0.2
-
-          def post(path, body, *, timeout = 240):
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = json.dumps(body).encode(),
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type":  "application/json",
-                  },
-              )
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  return resp.status, json.loads(resp.read().decode())
-
-          # ── 1. response_format = json_object (JSON mode) ─────────────
-          # llama.cpp's HTTP server supports OpenAI-compatible JSON
-          # mode: `response_format: {"type": "json_object"}` constrains
-          # the model to emit syntactically-valid JSON. We use raw HTTP
-          # rather than the OpenAI SDK so that the field shape Studio
-          # forwards to llama-server is unambiguous (the SDK rewrites
-          # response_format depending on which variant it recognises).
-          # We deliberately do NOT pass a strict JSON schema -- on
-          # small Gemma-4 quants the GBNF-from-schema path occasionally
-          # produces empty output, and JSON mode is the surface we care
-          # about exposing through Studio.
-          status, data = post("/v1/chat/completions", {
-              "model":         "default",
-              "messages":      [
-                  {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
-                  {"role": "user",   "content": "What is the capital of France?"},
-              ],
-              "temperature":     TEMP,
-              # Trimmed for Mac runner timeout budget; json_object
-              # grammar terminates quickly when working.
-              "max_tokens":      200,
-              "seed":            SEED,
-              "stream":          False,
-              "enable_thinking": False,
-              "response_format": {"type": "json_object"},
-          }, timeout = 240)
-          assert status == 200, f"json status {status}: {data}"
-          # Verify the response envelope shape -- this is what we
-          # actually want to exercise on Mac. The model output quality
-          # downstream of this is a Mac-Metal-quant artefact.
-          assert (
-              isinstance(data.get("choices"), list)
-              and data["choices"]
-              and "message" in data["choices"][0]
-          ), f"json response envelope malformed: {data}"
-          content = (data["choices"][0]["message"].get("content") or "").strip()
-          print(f"[json] raw json_object content: {content!r}")
-          # Some chat templates wrap JSON in ```json fences even in JSON
-          # mode -- strip those before parsing.
-          if content.startswith("```"):
-              content = content.split("```", 2)[1]
-              if content.startswith("json"):
-                  content = content[4:]
-              content = content.strip("`\n ")
-          if content:
-              try:
-                  parsed = json.loads(content)
-                  if "paris" in str(parsed.get("city", "")).lower():
-                      print(f"[json] PASS json_object -> {parsed}")
-                  else:
-                      print(f"[json] WARN json_object decoded but city!=Paris: {parsed}")
-              except json.JSONDecodeError as exc:
-                  print(f"[json] WARN json_object content not parseable ({exc}); content={content!r}")
-          else:
-              print("[json] WARN json_object produced empty content on this Mac quant")
-          # Cross-check: same prompt without response_format. We care
-          # that the inference path stays healthy (status 200 + envelope
-          # shape OK); model output quality is a separate concern.
-          status2, data2 = post("/v1/chat/completions", {
-              "model":         "default",
-              "messages":      [{"role": "user", "content": "What is the capital of France? Answer with one word."}],
-              "temperature":     TEMP,
-              # 1-word answer doesn't need 400 tokens; trim so a
-              # degenerate streaming model doesn't burn through the
-              # job's wallclock budget.
-              "max_tokens":      150,
-              "seed":            SEED,
-              "stream":          False,
-              "enable_thinking": False,
-          }, timeout = 240)
-          assert status2 == 200, f"plain status {status2}: {data2}"
-          plain = (data2["choices"][0]["message"].get("content") or "").lower()
-          print(f"[json] plain capital-of-france reply: {plain!r}")
-          if "paris" in plain:
-              print("[json] PASS plain inference path (paris mentioned)")
-          else:
-              print(
-                  f"[json] WARN plain inference returned no 'paris' -- Mac quant "
-                  f"degeneracy. HTTP path validated separately above."
-              )
-
-          # ── 2. OpenAI image_url (data URI base64) ───────────────────
-          # 64x64 solid-red PNG. stb_image (used by Studio's image
-          # normaliser at routes/inference.py:3410) rejects 4x4 or
-          # smaller PNGs as truncated, so we go up to 64x64 -- still
-          # tiny in token cost. The assertion is loose: any non-empty
-          # response from the vision path proves multimodal end-to-end
-          # wiring; small VL quants are weak at colour identification.
-          PNG_64X64_RED_B64 = (
-              "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
-              "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
-              "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
-          )
-          data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"
-
-          # The Mac prebuilt llama.cpp server has a known crash when
-          # processing image inputs alongside the gemma-4-E2B mmproj
-          # (server disconnects mid-completion). This is upstream
-          # llama.cpp behaviour, not Studio. Wrap both SDK calls in
-          # try/except so an upstream crash registers as a WARN rather
-          # than failing the whole job. Studio's contract (OpenAI/
-          # Anthropic image fields are accepted and forwarded) is
-          # validated by the request body Studio constructs, not by
-          # whether llama.cpp can decode it on Mac Metal.
-          client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
-          try:
-              openai_resp = client.chat.completions.create(
-                  model       = "default",
-                  temperature = TEMP,
-                  max_tokens  = 80,
-                  seed        = SEED,
-                  messages    = [{
-                      "role": "user",
-                      "content": [
-                          {"type": "image_url", "image_url": {"url": data_uri}},
-                          {"type": "text",      "text": "What colour dominates this image? Reply in one word."},
-                      ],
-                  }],
-              )
-              openai_text = (openai_resp.choices[0].message.content or "").lower()
-              print(f"[image/openai] reply: {openai_text!r}")
-              if openai_text:
-                  print("[image/openai] PASS image_url accepted, non-empty response")
-              else:
-                  print("[image/openai] WARN image_url accepted but empty content -- Mac quant drift")
-          except Exception as exc:
-              print(
-                  f"[image/openai] WARN image_url SDK call raised: {type(exc).__name__}: "
-                  f"{exc}. Likely upstream llama.cpp Mac+vision crash, NOT a Studio "
-                  f"regression. Studio successfully forwarded the request."
-              )
-
-          # ── 3. Anthropic source/base64 image ────────────────────────
-          # Two SDK quirks vs. Studio: base_url must NOT include /v1
-          # (the SDK appends it itself; otherwise /v1/v1/messages -> 405),
-          # and Studio's auth is HTTPBearer-only so the SDK's default
-          # x-api-key header is ignored -- send Authorization: Bearer
-          # via default_headers.
-          anthropic = Anthropic(
-              base_url        = BASE,
-              api_key         = "unused",
-              default_headers = {"Authorization": f"Bearer {KEY}"},
-          )
-          try:
-              a_msg = anthropic.messages.create(
-                  model       = "default",
-                  max_tokens  = 80,
-                  temperature = TEMP,
-                  extra_body  = {"seed": SEED},
-                  messages    = [{
-                      "role": "user",
-                      "content": [
-                          {
-                              "type":   "image",
-                              "source": {
-                                  "type":       "base64",
-                                  "media_type": "image/png",
-                                  "data":       PNG_64X64_RED_B64,
-                              },
-                          },
-                          {"type": "text", "text": "Describe this image briefly."},
-                      ],
-                  }],
-              )
-              a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
-              print(f"[image/anthropic] reply: {a_text!r}")
-              if a_text:
-                  print("[image/anthropic] PASS source/base64 accepted, non-empty response")
-              else:
-                  print("[image/anthropic] WARN source/base64 accepted but empty content -- Mac quant drift")
-          except Exception as exc:
-              print(
-                  f"[image/anthropic] WARN anthropic image SDK call raised: "
-                  f"{type(exc).__name__}: {exc}. Likely upstream llama.cpp Mac+vision "
-                  f"crash, NOT a Studio regression."
-              )
-          PY
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-          ss -tln | grep ":${STUDIO_PORT}" || true
-
-      - name: Upload logs
-        # Always upload so green runs are still reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: json-images-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
diff --git a/.github/workflows/studio-mac-ui-smoke.yml b/.github/workflows/studio-mac-ui-smoke.yml
deleted file mode 100644
index 510c3543d2..0000000000
--- a/.github/workflows/studio-mac-ui-smoke.yml
+++ /dev/null
@@ -1,343 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Mac counterpart to studio-ui-smoke.yml. Same Playwright + Chromium
-# end-to-end chat UI flow, but on macos-14 (M1) so we catch
-# Mac-specific frontend / backend wiring regressions that the Linux
-# job would miss (e.g. the Mac Tauri shell loading the same React
-# bundle, or the Mac llama.cpp prebuilt's HTTP layer behaving
-# differently from the Linux build).
-
-name: Mac Studio UI CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      - 'tests/studio/**'
-      - '.github/workflows/studio-mac-ui-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  ui-smoke:
-    name: Chat UI Tests
-    runs-on: macos-14
-    timeout-minutes: 35
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18896'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert install.sh used the Mac llama.cpp prebuilt
-        run: |
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-
-      - name: Install Playwright + Chromium
-        # No --with-deps on Mac: that flag installs Linux apt packages.
-        # GitHub-hosted macos-14 ships the system frameworks Chromium
-        # needs already.
-        # Pinned <1.58 because all 1.55-1.58 drivers ship Node 24 on
-        # macos-14 and intermittently hit 'SyntaxError: Unexpected end
-        # of JSON input' in pipeTransport.js. Run 25491698868 showed
-        # the crash hitting 100% of three retry attempts -- not a
-        # rare race but a hard reproduction. Belt-and-suspenders fix:
-        # the test scripts pass --single-process to Chromium (see
-        # tests/studio/playwright_chat_ui.py) AND we patch
-        # pipeTransport.js below to swallow JSON parse errors instead
-        # of crashing the driver Node process. Both together let the
-        # in-script retry recover from any residual flakes.
-        run: |
-          pip install 'playwright>=1.55,<1.58'
-          python -m playwright install chromium
-
-      - name: Patch Playwright pipeTransport.js to tolerate malformed JSON
-        # In Playwright 1.55-1.58, pipeTransport.js does
-        # `JSON.parse(message)` with no try/catch; when Chromium dies
-        # mid-write the partial buffer crashes the driver Node
-        # process and the test script exits with 'Connection closed
-        # while reading from the driver'. Newer Playwright versions
-        # added a try/catch upstream. Backport that here.
-        run: |
-          python - <<'PY'
-          import os, re, sys
-          import playwright
-          driver_dir = os.path.join(os.path.dirname(playwright.__file__), "driver", "package", "lib", "server")
-          path = os.path.join(driver_dir, "pipeTransport.js")
-          src = open(path).read()
-          # Wrap both `this.onmessage.call(null, JSON.parse(...))` sites in try/catch.
-          patched = re.sub(
-              r"this\.onmessage\.call\(null, JSON\.parse\((message2?)\)\);",
-              r"try { this.onmessage.call(null, JSON.parse(\1)); } "
-              r"catch (e) { /* swallow malformed JSON from a crashing browser */ }",
-              src,
-          )
-          if patched == src:
-              # Already patched, or upstream changed -- either way, don't fail the build.
-              print(f"pipeTransport.js: no JSON.parse calls matched at {path}; skipping.")
-          else:
-              open(path, "w").write(patched)
-              print(f"pipeTransport.js: patched JSON.parse calls in {path}")
-          PY
-
-      - name: Reset auth + boot Studio
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-
-      - name: Pass bootstrap password to the Playwright step
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "::add-mask::$NEW2"
-          echo "STUDIO_OLD_PW=$OLD"   >> "$GITHUB_ENV"
-          echo "STUDIO_NEW_PW=$NEW"   >> "$GITHUB_ENV"
-          echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
-      - name: Drive the chat UI with Playwright
-        env:
-          BASE_URL: http://127.0.0.1:18896
-          PW_ART_DIR: logs/playwright
-          STUDIO_UI_STRICT: '1'
-          # macos-14 free runner is 3 vCPU / 7 GB / no Metal-accel
-          # available to llama.cpp from CI; gemma-3-270m turn latency
-          # has been observed to crowd the 180s default. Triple it.
-          STUDIO_UI_TURN_TIMEOUT_MS: '540000'
-        # Retry up to 3 times to absorb the racy Playwright Node 24
-        # pipeTransport.js 'Unexpected end of JSON input' crash that
-        # fires intermittently on macos-14 free runners (Chromium
-        # browser process dies mid-test → driver Node process can't
-        # parse the truncated JSON-RPC line and exits). The retry
-        # FULLY resets Studio (kill, reset-password, reboot, wait
-        # /api/health, re-export bootstrap pw) before re-running the
-        # script so the change-password flow finds a fresh bootstrap.
-        # A real test failure (assertion / timeout) does NOT match the
-        # JSON pattern so it bypasses retry and surfaces immediately.
-        run: |
-          mkdir -p logs/playwright
-          attempt=1
-          max_attempts=3
-          while : ; do
-            set +e
-            python tests/studio/playwright_chat_ui.py 2>&1 | tee logs/playwright_attempt_${attempt}.log
-            rc=${PIPESTATUS[0]}
-            set -e
-            if [ "$rc" -eq 0 ]; then
-              break
-            fi
-            if grep -q "Unexpected end of JSON input" logs/playwright_attempt_${attempt}.log \
-               && [ "$attempt" -lt "$max_attempts" ]; then
-              echo "::warning::Playwright pipeTransport JSON crash on attempt ${attempt}; resetting Studio and retrying..."
-              kill "${STUDIO_PID}" 2>/dev/null || true
-              sleep 2
-              unsloth studio reset-password
-              UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-                > "logs/studio_retry_${attempt}.log" 2>&1 &
-              STUDIO_PID=$!
-              echo "STUDIO_PID=$STUDIO_PID" >> "$GITHUB_ENV"
-              for i in $(seq 1 180); do
-                if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json \
-                   && jq -e '.status == "healthy"' /tmp/health.json >/dev/null; then
-                  break
-                fi
-                sleep 1
-              done
-              STUDIO_OLD_PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-              STUDIO_NEW_PW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-              STUDIO_NEW2_PW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-              echo "::add-mask::$STUDIO_OLD_PW"
-              echo "::add-mask::$STUDIO_NEW_PW"
-              echo "::add-mask::$STUDIO_NEW2_PW"
-              export STUDIO_OLD_PW STUDIO_NEW_PW STUDIO_NEW2_PW
-              attempt=$((attempt + 1))
-              sleep 3
-              continue
-            fi
-            exit "$rc"
-          done
-
-      - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders)
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Reset auth + boot Studio for extra UI tests (port 18897)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \
-            > logs/studio_extra.log 2>&1 &
-          echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health on 18897
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json; then
-              jq -e '.status == "healthy"' /tmp/health2.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health2.json
-
-      - name: Pass bootstrap pw for extra UI test
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV"
-          echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
-      - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright
-        env:
-          BASE_URL: http://127.0.0.1:18897
-          STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }}
-          STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }}
-          PW_ART_DIR: logs/playwright_extra
-          STUDIO_UI_STRICT: '1'
-          # See "Drive the chat UI" step.
-          STUDIO_UI_TURN_TIMEOUT_MS: '540000'
-          GGUF_REPO: ${{ env.GGUF_REPO }}
-          GGUF_VARIANT: ${{ env.GGUF_VARIANT }}
-        # Same pipeTransport JSON-crash retry shape as "Drive the chat
-        # UI with Playwright" -- see comment there.
-        run: |
-          mkdir -p logs/playwright_extra
-          attempt=1
-          max_attempts=3
-          while : ; do
-            set +e
-            python tests/studio/playwright_extra_ui.py 2>&1 | tee logs/playwright_extra_attempt_${attempt}.log
-            rc=${PIPESTATUS[0]}
-            set -e
-            if [ "$rc" -eq 0 ]; then
-              break
-            fi
-            if grep -q "Unexpected end of JSON input" logs/playwright_extra_attempt_${attempt}.log \
-               && [ "$attempt" -lt "$max_attempts" ]; then
-              echo "::warning::Playwright pipeTransport JSON crash on attempt ${attempt}; resetting Studio and retrying..."
-              kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
-              sleep 2
-              unsloth studio reset-password
-              UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \
-                > "logs/studio_extra_retry_${attempt}.log" 2>&1 &
-              STUDIO_EXTRA_PID=$!
-              echo "STUDIO_EXTRA_PID=$STUDIO_EXTRA_PID" >> "$GITHUB_ENV"
-              for i in $(seq 1 180); do
-                if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json \
-                   && jq -e '.status == "healthy"' /tmp/health2.json >/dev/null; then
-                  break
-                fi
-                sleep 1
-              done
-              STUDIO_OLD_PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-              STUDIO_NEW_PW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-              echo "::add-mask::$STUDIO_OLD_PW"
-              echo "::add-mask::$STUDIO_NEW_PW"
-              export STUDIO_OLD_PW STUDIO_NEW_PW
-              attempt=$((attempt + 1))
-              sleep 3
-              continue
-            fi
-            exit "$rc"
-          done
-
-      - name: Stop second Studio
-        if: always()
-        run: |
-          kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Upload Playwright artifacts
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: mac-studio-ui-smoke-artifacts
-          path: |
-            logs/studio.log
-            logs/studio_extra.log
-            logs/install.log
-            logs/playwright
-            logs/playwright_extra
-          retention-days: 7
diff --git a/.github/workflows/studio-mac-update-smoke.yml b/.github/workflows/studio-mac-update-smoke.yml
deleted file mode 100644
index 07d26b9ab3..0000000000
--- a/.github/workflows/studio-mac-update-smoke.yml
+++ /dev/null
@@ -1,150 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Mac counterpart to studio-update-smoke.yml. Verifies that on a real
-# Apple Silicon (macos-14, M1) runner:
-#
-#   1. install.sh --local --no-torch installs Studio AND auto-fetches
-#      the prebuilt llama.cpp Mac binary (llama-bNNNN-bin-macos-arm64
-#      from ggml-org/llama.cpp). Hitting the source-build fallback is
-#      treated as an Unsloth bug -- Studio must always pick the
-#      prebuilt on Mac.
-#   2. unsloth studio update --local is idempotent. Two consecutive
-#      runs both report "prebuilt up to date and validated", no
-#      source-build fallback.
-#   3. The installed Studio still boots and /api/health returns
-#      healthy after the update path.
-
-name: Mac Studio Update CI
-
-on:
-  pull_request:
-    paths:
-      - 'install.sh'
-      - 'studio/setup.sh'
-      - 'studio/install_python_stack.py'
-      - 'studio/install_llama_prebuilt.py'
-      - 'studio/backend/requirements/**'
-      - 'unsloth_cli/commands/studio.py'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-mac-update-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  update-idempotency:
-    name: Studio Updating Tests
-    runs-on: macos-14
-    timeout-minutes: 30
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          # Don't cache pip: this job only runs `bash install.sh` and
-          # `unsloth studio update --local`, which both go through `uv`
-          # and never populate the pip cache, so setup-python's cache
-          # post-step would error on the missing cache folder (same
-          # reasoning as the Linux studio-update-smoke.yml job).
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Assert install.sh used the Mac llama.cpp prebuilt
-        run: |
-          # Mac install must take the prebuilt path. Source-build
-          # fallback here is an Unsloth bug.
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if ! grep -qE "prebuilt installed and validated|prebuilt up to date and validated|bin-macos-arm64" logs/install.log; then
-            echo "::error::no Mac prebuilt llama.cpp marker in install.log."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          echo "install.sh installed the Mac prebuilt llama.cpp"
-
-      - name: First update should be a no-op (prebuilt already validated)
-        # Runs `unsloth studio update --local`, tees the output to
-        # logs/update.log, and inspects that log: the step fails if it
-        # mentions "falling back to source build" or contains neither
-        # "prebuilt up to date and validated" nor "prebuilt installed
-        # and validated".
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -o pipefail
-          unsloth studio update --local 2>&1 | tee logs/update.log
-          if grep -q "falling back to source build" logs/update.log; then
-            echo "::error::studio update fell back to source-build llama.cpp on Mac."
-            grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
-            exit 1
-          fi
-          if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then
-            echo "::error::no prebuilt up-to-date marker in update.log."
-            grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
-            exit 1
-          fi
-          echo "update path took the prebuilt fast path"
-
-      - name: Second update must also be a no-op
-        # Repeats the first update's log assertions on a second run so
-        # that state left behind by the first update cannot hide a
-        # source-build fallback.
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -o pipefail
-          unsloth studio update --local 2>&1 | tee logs/update2.log
-          if grep -q "falling back to source build" logs/update2.log; then
-            echo "::error::second update fell back to source build on Mac"
-            tail -60 logs/update2.log
-            exit 1
-          fi
-          # Report a missing marker as an annotation instead of letting a
-          # bare failing grep end the step with no explanation.
-          if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log; then
-            echo "::error::no prebuilt up-to-date marker in update2.log."
-            tail -60 logs/update2.log
-            exit 1
-          fi
-          echo "second update was clean"
-
-      - name: Boot Studio briefly to confirm the install is still usable
-        # Starts Studio in the background on port 18891 and polls
-        # /api/health once a second, up to 60 times, until the JSON
-        # `status` field is "healthy" (parsed with python3). On timeout
-        # the last 200 lines of logs/studio.log are printed and the step
-        # fails; the server process is killed on both paths.
-        run: |
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \
-            > logs/studio.log 2>&1 &
-          PID=$!
-          HEALTHY=""
-          for i in $(seq 1 60); do
-            if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then
-              if python3 -c "import json,sys; d=json.load(open('/tmp/health.json')); sys.exit(0 if d.get('status')=='healthy' else 1)"; then
-                HEALTHY=1
-                break
-              fi
-            fi
-            sleep 1
-          done
-          if [ -z "$HEALTHY" ]; then
-            echo "Studio failed to come up after \`update\`"
-            tail -200 logs/studio.log
-            kill "$PID" 2>/dev/null || true
-            exit 1
-          fi
-          kill "$PID" 2>/dev/null || true
-          echo "post-update Studio /api/health OK"
-
-      - name: Upload update logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: mac-studio-update-log
-          path: |
-            logs/install.log
-            logs/update.log
-            logs/update2.log
-            logs/studio.log
-          retention-days: 7
diff --git a/.github/workflows/studio-tauri-smoke.yml b/.github/workflows/studio-tauri-smoke.yml
deleted file mode 100644
index 1156c264ae..0000000000
--- a/.github/workflows/studio-tauri-smoke.yml
+++ /dev/null
@@ -1,128 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# PR-time smoke for the Tauri desktop wrapper. Builds the frontend and the
-# Tauri Linux debug binary, with no codesigning. Catches:
-#   - tauri.conf.json drift
-#   - src-tauri Cargo.toml or rust source breakage
-#   - Tauri CLI version drift (we pin 2.10.1, matching release-desktop.yml)
-#   - frontend output not picked up by Tauri's distDir
-#
-# Linux-only on a free `ubuntu-22.04` runner. Mac and Windows desktop builds
-# stay in release-desktop.yml (manual `workflow_dispatch`) because they need
-# code-signing secrets and ~30 min of runner time each.
-
-name: Studio Tauri CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/frontend/**'
-      - 'studio/src-tauri/**'
-      # CLI rename / signature change can break Tauri's spawned
-      # `unsloth studio` -- include unsloth_cli in the trigger set.
-      - 'unsloth_cli/**'
-      - '.github/workflows/studio-tauri-smoke.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  linux-debug-build:
-    name: Tauri Linux debug build (no codesign)
-    runs-on: ubuntu-22.04
-    timeout-minutes: 25
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux native deps for Tauri / WebKit2GTK
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-            libwebkit2gtk-4.1-dev libayatana-appindicator3-dev \
-            librsvg2-dev libxdo-dev libssl-dev patchelf
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '24'
-
-      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable @ 2026-03-27
-
-      - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32  # v2.9.1
-        with:
-          workspaces: studio/src-tauri -> target
-
-      - name: Lockfile supply-chain audit (pre-install scan)
-        # Must stay ahead of every `npm install` / `npm ci` in this job,
-        # including the Tauri CLI install below: the audit is only a
-        # defence if it runs before any package lifecycle script does.
-        run: python3 scripts/lockfile_supply_chain_audit.py
-
-      - name: Install pinned Tauri CLI (matches release-desktop.yml)
-        # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
-        # required for `vite build`. The pre-install lockfile structural
-        # audit (lockfile_supply_chain_audit.py) is the practical defence
-        # against the npm postinstall-dropper class -- it fires BEFORE any
-        # tarball runs, on the injection pattern itself rather than an
-        # advisory-DB lookup.
-        run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 --no-fund --no-audit
-
-      - name: Verify pinned Tauri CLI version
-        run: |
-          out="$(npx --prefix studio tauri --version)"
-          echo "$out"
-          [ "$out" = "tauri-cli 2.10.1" ] || { echo "::error::expected tauri-cli 2.10.1, got $out"; exit 1; }
-
-      - name: Frontend build (npm ci, vite)
-        working-directory: studio/frontend
-        # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
-        # required for `vite build`. The pre-install lockfile structural
-        # audit (lockfile_supply_chain_audit.py) is the practical defence
-        # against the npm postinstall-dropper class -- it fires BEFORE any
-        # tarball runs, on the injection pattern itself rather than an
-        # advisory-DB lookup.
-        run: |
-          npm ci --no-fund --no-audit
-          npm run build
-          test -f dist/index.html
-
-      - name: Tauri debug build (Linux, no bundle, no codesign)
-        # `--debug` + `--no-bundle` keeps this lean: compiles the Rust crate,
-        # confirms the frontend dist is wired into Tauri, but skips the AppImage
-        # / .deb production. Code signing is irrelevant because we never produce
-        # a distributable artifact.
-        env:
-          TAURI_SIGNING_PRIVATE_KEY: ''
-          TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ''
-        run: npx --prefix studio tauri build --debug --no-bundle
-
-      - name: Inspect produced binary
-        # Picks the first executable regular file directly under
-        # target/debug (skipping names ending in .d/.so/.dylib/.dll),
-        # fails with an ::error:: if none is found, then prints its
-        # `file` type and on-disk size.
-        run: |
-          BIN=$(find studio/src-tauri/target/debug -maxdepth 1 -type f -executable 2>/dev/null \
-                | grep -Ev '\.(d|so|dylib|dll)$' \
-                | grep -Ev '/(deps|build|examples)$' \
-                | head -1)
-          echo "binary: $BIN"
-          if [ -z "$BIN" ]; then
-            echo "::error::Tauri debug binary not produced"
-            ls -la studio/src-tauri/target/debug/ || true
-            exit 1
-          fi
-          file "$BIN"
-          du -h "$BIN"
-
-      - name: Upload Tauri debug build
-        # Always upload so a green run leaves the binary inspectable too.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: tauri-debug-build
-          path: |
-            studio/src-tauri/target/debug
-            studio/frontend/dist
-          retention-days: 3
diff --git a/.github/workflows/studio-ui-smoke.yml b/.github/workflows/studio-ui-smoke.yml
deleted file mode 100644
index 455fe4b7e1..0000000000
--- a/.github/workflows/studio-ui-smoke.yml
+++ /dev/null
@@ -1,293 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# End-to-end Studio chat UI smoke via Playwright + Chromium against a
-# headless Linux runner. Boots Studio with the smallest GGUF
-# (gemma-3-270m-it UD-Q4_K_XL, ~254 MiB), drives the actual frontend
-# bundle, and asserts the full bootstrap-password / change-password /
-# send-message / persist-on-reload journey works end to end.
-#
-# This is the only workflow that catches regressions in the wiring
-# between the React frontend and the FastAPI backend, e.g. assistant-ui
-# version drift, /api/auth response shape changes, runtime-provider
-# regressions, or chat-history persistence breaking. Backend-only and
-# frontend-only CI happily pass while the actual user-visible UI is
-# broken (cf. the 2026.5.1 chat-history release).
-
-name: Studio UI CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.sh'
-      - 'pyproject.toml'
-      # The Playwright test files themselves -- a PR that ONLY edits
-      # the test must still trigger UI CI.
-      - 'tests/studio/**'
-      - '.github/workflows/studio-ui-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  ui-smoke:
-    name: Chat UI Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18892'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux deps
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Install Studio (--local, --no-torch)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: Install Playwright + Chromium
-        run: |
-          pip install 'playwright>=1.45'
-          # --with-deps installs the OS-level runtime libs Chromium
-          # needs (libnss3, libxkbcommon, etc.). About 30 s on a
-          # warm runner.
-          python -m playwright install --with-deps chromium
-
-      - name: Reset auth + boot Studio
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        # 180 s -- a cold runner with venv warm-up + lazy imports has
-        # been seen to exceed 60 s. Failing the wait is more expensive
-        # than waiting an extra two minutes.
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-
-      - name: Pass bootstrap password to the Playwright step
-        # The Playwright test does its OWN /change-password through the
-        # UI (Setup your account / Choose a new password), then loads
-        # the model via page.evaluate against /api/inference/load with
-        # the JWT it got from change-password. So the only thing we
-        # have to hand it is the bootstrap password (so it can verify
-        # post-rotation that the OLD bootstrap pw now returns 401).
-        #
-        # NEW + NEW2 are generated freshly per CI run via secrets.token_urlsafe
-        # rather than hardcoded. If a workflow gets compromised, the
-        # attacker can't replay a known-good rotated password against
-        # any future / parallel Studio install -- the rotated value
-        # only ever exists for the lifetime of this single job, masked
-        # in the log via ::add-mask::.
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "::add-mask::$NEW2"
-          echo "STUDIO_OLD_PW=$OLD"   >> "$GITHUB_ENV"
-          echo "STUDIO_NEW_PW=$NEW"   >> "$GITHUB_ENV"
-          echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
-      - name: Drive the chat UI with Playwright
-        env:
-          # Built from STUDIO_PORT so the URL cannot drift from the port
-          # Studio was booted on in the "Reset auth + boot Studio" step.
-          BASE_URL: http://127.0.0.1:${{ env.STUDIO_PORT }}
-          # The test file lives in the repo so it can be run locally
-          # against a freshly-installed Studio (BASE_URL=...; STUDIO_OLD_PW=
-          # $(cat ~/.unsloth/studio/auth/.bootstrap_password); python ...).
-          PW_ART_DIR: logs/playwright
-          # Strict mode: in CI a missing button / nav / dialog must
-          # FAIL the test. Locally the test still runs against partial
-          # Studio installs without STUDIO_UI_STRICT.
-          STUDIO_UI_STRICT: '1'
-        run: |
-          mkdir -p logs/playwright
-          python tests/studio/playwright_chat_ui.py
-
-      - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders)
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-
-      # The chat UI test ends by clicking the Shutdown menuitem, which
-      # leaves the server dead. The extra UI test (Compare / Recipes /
-      # Export / Studio / Settings) needs a fresh Studio, so we boot a
-      # second one on a different port. Boot is fast (~3-5s on the
-      # warm install we already did) so this adds little wall time.
-      - name: Reset auth + boot Studio for extra UI tests (port 18894)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18894 \
-            > logs/studio_extra.log 2>&1 &
-          echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health on 18894
-        # Polls once a second, up to 180 times, until /api/health on
-        # port 18894 reports status "healthy". The final jq re-checks the
-        # last response so the step fails if the server never got there.
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:18894/api/health" > /tmp/health2.json; then
-              jq -e '.status == "healthy"' /tmp/health2.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health2.json
-
-      - name: Pass bootstrap pw for extra UI test
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV"
-          echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
-      - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright
-        env:
-          BASE_URL: http://127.0.0.1:18894
-          STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }}
-          STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }}
-          PW_ART_DIR: logs/playwright_extra
-          STUDIO_UI_STRICT: '1'
-          GGUF_REPO: ${{ env.GGUF_REPO }}
-          GGUF_VARIANT: ${{ env.GGUF_VARIANT }}
-        run: |
-          mkdir -p logs/playwright_extra
-          python tests/studio/playwright_extra_ui.py
-
-      - name: Stop second Studio
-        if: always()
-        run: |
-          kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
-          sleep 2
-
-      # IME + multilingual paste regression (issue #5318 / PR #5327).
-      # Third Studio on its own port so a hang here cannot poison the
-      # earlier UI tests. No GGUF -- the bug surface is the composer.
-      - name: Reset auth + boot Studio for IME / i18n tests (port 18896)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18896 \
-            > logs/studio_ime.log 2>&1 &
-          echo "STUDIO_IME_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health on 18896
-        # Polls once a second, up to 180 times, until /api/health on
-        # port 18896 reports status "healthy". The final jq re-checks the
-        # last response so the step fails if the server never got there.
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:18896/api/health" > /tmp/health3.json; then
-              jq -e '.status == "healthy"' /tmp/health3.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health3.json
-
-      - name: Pass bootstrap pw for IME / i18n test
-        # IME smoke does the change-password against the bootstrap that
-        # Studio's frontend injects into the page, so it only needs the
-        # NEW password.
-        run: |
-          NEW="CIIme-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$NEW"
-          echo "STUDIO_IME_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
-      - name: Drive IME + multilingual paste regression with Playwright
-        env:
-          BASE_URL: http://127.0.0.1:18896
-          STUDIO_NEW_PW: ${{ env.STUDIO_IME_NEW_PW }}
-          PW_ART_DIR: logs/playwright_ime
-          STUDIO_UI_STRICT: '1'
-        run: |
-          mkdir -p logs/playwright_ime
-          python tests/studio/playwright_chat_ime_i18n.py
-
-      - name: Stop third Studio
-        if: always()
-        run: |
-          kill "${STUDIO_IME_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Upload Playwright artifacts
-        # Always upload so a green run's screenshots stay reviewable --
-        # catches "passed but the UI is silently broken" regressions.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: studio-ui-smoke-artifacts
-          path: |
-            logs/studio.log
-            logs/studio_extra.log
-            logs/studio_ime.log
-            logs/install.log
-            logs/playwright
-            logs/playwright_extra
-            logs/playwright_ime
-          retention-days: 7
diff --git a/.github/workflows/studio-update-smoke.yml b/.github/workflows/studio-update-smoke.yml
deleted file mode 100644
index 1c353e933a..0000000000
--- a/.github/workflows/studio-update-smoke.yml
+++ /dev/null
@@ -1,154 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Verifies that `unsloth studio update --local` is idempotent: a fresh
-# install via install.sh, followed by `unsloth studio update --local`,
-# succeeds and is a no-op for the llama.cpp prebuilt (it should report
-# "prebuilt up to date and validated", not re-run the source build).
-#
-# This catches regressions in setup.sh's update path that the existing
-# GGUF / wheel jobs would miss because they only invoke install.sh once.
-
-name: Studio Update CI
-
-on:
-  pull_request:
-    paths:
-      - 'install.sh'
-      - 'studio/setup.sh'
-      - 'studio/install_python_stack.py'
-      - 'studio/install_llama_prebuilt.py'
-      - 'studio/backend/requirements/**'
-      - 'unsloth_cli/commands/studio.py'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-update-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  update-idempotency:
-    name: Studio Updating Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Linux deps for llama.cpp prebuilt
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            libcurl4-openssl-dev libssl-dev jq
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          # Don't cache pip: this job runs `bash install.sh` and
-          # `unsloth studio update --local` which both go through
-          # `uv` and never populate ~/.cache/pip. setup-python's
-          # post-step then fatal-errors with "Cache folder path is
-          # retrieved for pip but doesn't exist on disk".
-
-      - name: Install Studio (--local, --no-torch)
-        # Pass the workflow token so the llama.cpp prebuilt installer's
-        # GitHub-API call to list releases isn't rate-limited (60/hr
-        # unauthenticated). Without this, three consecutive install +
-        # update + update calls in this job exceed the limit and the
-        # prebuilt path falls back to source build.
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          mkdir -p logs
-          set -o pipefail
-          bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
-      - name: First update should be a no-op (prebuilt already validated)
-        # `unsloth studio update --local` runs studio/setup.sh against
-        # the local repo. Right after install.sh the llama.cpp prebuilt
-        # has just been installed and validated, so the second run must
-        # take the "prebuilt up to date and validated" code path. Any
-        # source-build fallback or re-download here means setup.sh's
-        # idempotency regressed.
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -o pipefail
-          unsloth studio update --local 2>&1 | tee logs/update.log
-          if grep -q "falling back to source build" logs/update.log; then
-            echo "::error::studio update fell back to source-build llama.cpp on a fresh install. setup.sh idempotency regressed."
-            grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
-            exit 1
-          fi
-          if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then
-            echo "::error::no prebuilt up-to-date marker in update.log. Did setup.sh skip the prebuilt path on update?"
-            grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
-            exit 1
-          fi
-          echo "update path took the prebuilt fast path"
-
-      - name: Second update must also be a no-op
-        # Two consecutive `update`s back-to-back is the usual desktop
-        # flow (auto-update, then user-triggered update). Asserting the
-        # second run is also clean rules out hidden state changes from
-        # the first one.
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -o pipefail
-          unsloth studio update --local 2>&1 | tee logs/update2.log
-          if grep -q "falling back to source build" logs/update2.log; then
-            echo "::error::second update fell back to source build"
-            tail -60 logs/update2.log
-            exit 1
-          fi
-          # Report a missing marker as an annotation instead of letting a
-          # bare failing grep end the step with no explanation.
-          if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log; then
-            echo "::error::no prebuilt up-to-date marker in update2.log."
-            tail -60 logs/update2.log
-            exit 1
-          fi
-          echo "second update was clean"
-
-      - name: Boot Studio briefly to confirm the install is still usable
-        # If `update --local` accidentally broke the venv or wiped the
-        # llama-server binary, the server would fail to start here.
-        run: |
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \
-            > logs/studio.log 2>&1 &
-          PID=$!
-          for i in $(seq 1 60); do
-            if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then
-              # Keep polling until the status is actually "healthy". A
-              # bare `jq -e` here would abort the step under `bash -e` on
-              # the first not-yet-healthy response, skipping the
-              # diagnostics below.
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          if ! jq -e '.status == "healthy"' /tmp/health.json 2>/dev/null; then
-            # Backticks are escaped so bash prints them literally rather
-            # than running `update` as a command substitution.
-            echo "Studio failed to come up after \`update\`"
-            tail -200 logs/studio.log
-            kill "$PID" 2>/dev/null || true
-            exit 1
-          fi
-          kill "$PID" 2>/dev/null || true
-          echo "post-update Studio /api/health OK"
-
-      - name: Upload update logs
-        # Always upload so a green run still leaves the install + two
-        # update logs reviewable.
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: studio-update-log
-          path: |
-            logs/install.log
-            logs/update.log
-            logs/update2.log
-            logs/studio.log
-          retention-days: 7
diff --git a/.github/workflows/studio-windows-api-smoke.yml b/.github/workflows/studio-windows-api-smoke.yml
deleted file mode 100644
index 1d12ea6f90..0000000000
--- a/.github/workflows/studio-windows-api-smoke.yml
+++ /dev/null
@@ -1,246 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Windows counterpart to studio-api-smoke.yml / studio-mac-api-smoke.yml.
-# Same tests/studio/studio_api_smoke.py exercise (CORS hardening, auth
-# state machine, JWT expiry, API key lifecycle, /v1/models /
-# /v1/embeddings / /v1/responses, endpoint-by-endpoint auth audit) but
-# on the FREE windows-latest runner. The file-mode hardening section
-# (Section 6) is Linux-only and short-circuits on non-POSIX; the rest
-# is platform-portable.
-
-name: Windows Studio API CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.ps1'
-      - 'pyproject.toml'
-      - 'tests/studio/**'
-      - '.github/workflows/studio-windows-api-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  api-smoke:
-    name: Studio API & Auth Tests
-    runs-on: windows-latest
-    timeout-minutes: 30
-    defaults:
-      run:
-        shell: bash
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18895'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-      # Force UTF-8 for stdio (Windows defaults to cp1252; hf
-      # download prints a "✓" checkmark and crashes otherwise).
-      PYTHONIOENCODING: utf-8
-      PYTHONUTF8: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
-        shell: pwsh
-        # See studio-windows-update-smoke.yml for the full rationale.
-        # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
-        # reinstall, and Defender's real-time scan dominates the
-        # frontend / uv-pip-extract steps.
-        run: |
-          $ProgressPreference = 'SilentlyContinue'
-          Write-Host "npm version before upgrade: $(npm -v)"
-          npm install -g 'npm@^11' 2>&1 | Out-Host
-          Write-Host "npm version after upgrade: $(npm -v)"
-          # NOTE: do NOT pre-create these directories. See
-          # studio-windows-update-smoke.yml for the full rationale --
-          # creating an empty studio/frontend/dist trips setup.ps1's
-          # mtime-based staleness check into "frontend up to date, skip
-          # rebuild" and Studio boots with an empty dist directory.
-          # Add-MpPreference accepts paths that do not yet exist.
-          foreach ($p in @(
-            "$env:USERPROFILE\.unsloth",
-            "$env:USERPROFILE\AppData\Local\uv",
-            "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
-            "$env:GITHUB_WORKSPACE\studio\frontend\dist"
-          )) {
-            try {
-              Add-MpPreference -ExclusionPath $p -ErrorAction Stop
-              Write-Host "Defender exclusion added: $p"
-            } catch {
-              Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
-            }
-          }
-
-      - name: Install Studio (--local, --no-torch)
-        shell: pwsh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          New-Item -ItemType Directory -Force -Path logs | Out-Null
-          # *>&1 captures Write-Host (Information stream) output;
-          # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
-          # and validated" via Write-Host, and we grep for that.
-          $ProgressPreference = 'SilentlyContinue'
-          & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
-      - name: Assert install.ps1 used the Windows llama.cpp prebuilt
-        run: |
-          # Filesystem-based check (setup.ps1's stream output isn't
-          # captured back through this parent step's pipeline; see
-          # studio-windows-ui-smoke.yml for full explanation).
-          LLAMA_DIR=~/.unsloth/llama.cpp
-          INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
-          BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if [ ! -f "$INFO" ]; then
-            echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
-            ls -la "$LLAMA_DIR" || true
-            exit 1
-          fi
-          if [ ! -f "$BIN" ]; then
-            echo "::error::no llama-server.exe at $BIN."
-            ls -la "$LLAMA_DIR/build/bin" || true
-            exit 1
-          fi
-          echo "install.ps1 installed the Windows prebuilt llama.cpp:"
-          cat "$INFO"
-
-      - name: Add Studio shim to GITHUB_PATH
-        # install.ps1's User-PATH update doesn't propagate to a
-        # running Git Bash session; export the shim dir so the
-        # next `unsloth ...` invocation finds it.
-        run: |
-          SHIM_DIR=~/.unsloth/studio/bin
-          if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
-            echo "::error::unsloth.exe shim not found at $SHIM_DIR"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
-      - name: Patch Studio venv with full typer / pydantic dep trees
-        # Belt-and-suspenders: install.ps1's --no-deps install of
-        # no-torch-runtime.txt drops typer's and pydantic's runtime
-        # deps unless explicitly pinned. Re-install the ones whose
-        # deps don't pull torch.
-        run: |
-          STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
-          if [ ! -f "$STUDIO_PY" ]; then
-            echo "::error::Studio venv python not at $STUDIO_PY"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
-      - name: Install pyjwt for the JWT-expiry forge test
-        run: python -m pip install 'pyjwt>=2.6'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-
-      - name: Pass bootstrap password + rotated targets to the test
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "::add-mask::$NEW2"
-          echo "STUDIO_OLD_PW=$OLD"  >> "$GITHUB_ENV"
-          echo "STUDIO_NEW_PW=$NEW"  >> "$GITHUB_ENV"
-          echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
-      - name: Run Studio API & Auth tests
-        # Do NOT pin STUDIO_AUTH_DIR here. The Mac/Linux mirrors
-        # hardcode runner-specific paths (/Users/runner/...,
-        # /home/runner/...), but on Windows the path is
-        # C:\Users\runneradmin\.unsloth\studio\auth and varies by
-        # runner image. studio_api_smoke.py defaults to
-        # Path.home()/".unsloth"/"studio"/"auth" when the env is
-        # unset, which is correct on every OS.
-        env:
-          BASE_URL: http://127.0.0.1:18895
-        run: python tests/studio/studio_api_smoke.py
-
-      - name: Stop Studio
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Upload API smoke logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: windows-studio-api-smoke-log
-          path: |
-            logs/install.log
-            logs/studio.log
-          retention-days: 7
diff --git a/.github/workflows/studio-windows-inference-smoke.yml b/.github/workflows/studio-windows-inference-smoke.yml
deleted file mode 100644
index 01bf4127a7..0000000000
--- a/.github/workflows/studio-windows-inference-smoke.yml
+++ /dev/null
@@ -1,1167 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Three end-to-end smoke jobs that boot a freshly-installed Studio and
-# exercise the surfaces real users hit through the OpenAI / Anthropic
-# SDKs and curl, on the FREE windows-latest runner. Each job picks the
-# smallest model that exercises the behaviour under test, primes
-# HF_HOME via actions/cache, and shares the install.ps1 --local
-# --no-torch bootstrap.
-#
-#   1. OpenAI, Anthropic API tests
-#        gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
-#   2. Tool calling Tests
-#        Qwen3.5-2B UD-Q4_K_XL (~890 MiB).
-#   3. JSON, images
-#        gemma-4-E2B-it UD-Q4_K_XL + mmproj-F16 (~3.4 GiB total).
-#        Within the 14 GB windows-latest SSD budget.
-
-name: Windows Studio GGUF CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.ps1'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-windows-inference-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 1: OpenAI, Anthropic API tests
-  # ─────────────────────────────────────────────────────────────────────
-  openai-anthropic:
-    name: OpenAI, Anthropic API tests
-    runs-on: windows-latest
-    timeout-minutes: 30
-    defaults:
-      run:
-        shell: bash
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18888'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-      # Force UTF-8 for stdio (Windows defaults to cp1252; hf
-      # download / Studio CLI print "✓" checkmarks and crash
-      # otherwise).
-      PYTHONIOENCODING: utf-8
-      PYTHONUTF8: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      # Split restore + save (rather than the one-step actions/cache) so a
-      # transient restore-side failure does not kill the whole job. v5 has a
-      # known flake where it logs "Cache hit for: <key>" and then exits
-      # non-zero without actually extracting the archive (see
-      # actions/cache#1621 and github community discussion #163260).
-      # continue-on-error on restore masks that failure so the Prime step
-      # below can re-download from HF and the job keeps running. Save then
-      # populates the cache key on a real miss only; cache keys are
-      # immutable, so a corrupted cached entry persists until the -v1
-      # suffix below is bumped.
-      - name: Restore HF_HOME cache for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        # Run on a real cache miss AND on the silent-restore-failure mode
-        # described above (outcome != success).
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME cache for ${{ env.GGUF_REPO }}
-        # Only write a fresh cache entry when we actually rebuilt the
-        # directory (Prime ran and succeeded). Skipping when Prime is
-        # skipped avoids "already exists" save warnings on the happy path.
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
-        shell: pwsh
-        # See studio-windows-update-smoke.yml for the full rationale.
-        # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
-        # reinstall, and Defender's real-time scan dominates the
-        # frontend / uv-pip-extract steps.
-        run: |
-          $ProgressPreference = 'SilentlyContinue'
-          Write-Host "npm version before upgrade: $(npm -v)"
-          npm install -g 'npm@^11' 2>&1 | Out-Host
-          Write-Host "npm version after upgrade: $(npm -v)"
-          # NOTE: do NOT pre-create these directories. See
-          # studio-windows-update-smoke.yml for the full rationale --
-          # creating an empty studio/frontend/dist trips setup.ps1's
-          # mtime-based staleness check into "frontend up to date, skip
-          # rebuild" and Studio boots with an empty dist directory.
-          # Add-MpPreference accepts paths that do not yet exist.
-          foreach ($p in @(
-            "$env:USERPROFILE\.unsloth",
-            "$env:USERPROFILE\AppData\Local\uv",
-            "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
-            "$env:GITHUB_WORKSPACE\studio\frontend\dist"
-          )) {
-            try {
-              Add-MpPreference -ExclusionPath $p -ErrorAction Stop
-              Write-Host "Defender exclusion added: $p"
-            } catch {
-              Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
-            }
-          }
-
-      - name: Install Studio (--local, --no-torch)
-        shell: pwsh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          New-Item -ItemType Directory -Force -Path logs | Out-Null
-          # *>&1 captures Write-Host (Information stream) output;
-          # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
-          # and validated" via Write-Host, and we grep for that.
-          $ProgressPreference = 'SilentlyContinue'
-          & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
-      - name: Assert install.ps1 used the Windows llama.cpp prebuilt
-        run: |
-          # Filesystem check; setup.ps1's stream output isn't captured.
-          LLAMA_DIR=~/.unsloth/llama.cpp
-          INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
-          BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if [ ! -f "$INFO" ]; then
-            echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
-            ls -la "$LLAMA_DIR" || true
-            exit 1
-          fi
-          if [ ! -f "$BIN" ]; then
-            echo "::error::no llama-server.exe at $BIN."
-            ls -la "$LLAMA_DIR/build/bin" || true
-            exit 1
-          fi
-          echo "install.ps1 installed the Windows prebuilt llama.cpp:"
-          cat "$INFO"
-
-      - name: Add Studio shim to GITHUB_PATH
-        run: |
-          SHIM_DIR=~/.unsloth/studio/bin
-          if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
-            echo "::error::unsloth.exe shim not found at $SHIM_DIR"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
-      - name: Patch Studio venv with full typer / pydantic dep trees
-        # Belt-and-suspenders: install.ps1's --no-deps install of
-        # no-torch-runtime.txt drops typer's and pydantic's runtime
-        # deps unless explicitly pinned. Re-install the ones whose
-        # deps don't pull torch.
-        run: |
-          STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
-          if [ ! -f "$STUDIO_PY" ]; then
-            echo "::error::Studio venv python not at $STUDIO_PY"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
-      - name: Install OpenAI + Anthropic Python SDKs
-        run: python -m pip install 'openai>=1.50' 'anthropic>=0.40'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Studio did not become healthy in 180s"
-          tail -200 logs/studio.log
-          exit 1
-
-      - name: Password rotation (old must fail, new must work)
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
-            -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
-          if [ "$OLD_STATUS" != "401" ]; then
-            echo "::error::Login with old password returned $OLD_STATUS, expected 401"
-            exit 1
-          fi
-          NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
-          echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
-          echo "password rotation OK (old=401, new=200)"
-
-      - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
-        run: |
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_gguf, context_length}'
-
-      - name: Multi-turn determinism via OpenAI + Anthropic SDKs
-        env:
-          BASE_URL: http://127.0.0.1:18888
-        run: |
-          python - <<'PY'
-          import json
-          import os
-          from openai import OpenAI
-          from anthropic import Anthropic
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["TOKEN"]
-          SEED = 3407
-
-          PROMPTS = [
-              "What is 1+1?",
-              "What did I ask before?",
-              "What is the capital of France?",
-              "Repeat the city name",
-          ]
-
-          def run_openai():
-              client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
-              history, replies = [], []
-              for prompt in PROMPTS:
-                  history.append({"role": "user", "content": prompt})
-                  resp = client.chat.completions.create(
-                      model       = "default",
-                      messages    = history,
-                      temperature = 0.0,
-                      max_tokens  = 80,
-                      seed        = SEED,
-                      extra_body  = {"enable_thinking": False},
-                  )
-                  text = resp.choices[0].message.content or ""
-                  replies.append(text)
-                  history.append({"role": "assistant", "content": text})
-              return replies
-
-          def run_anthropic():
-              client = Anthropic(
-                  base_url        = BASE,
-                  api_key         = "unused",
-                  default_headers = {"Authorization": f"Bearer {KEY}"},
-              )
-              history, replies = [], []
-              for prompt in PROMPTS:
-                  history.append({"role": "user", "content": prompt})
-                  msg = client.messages.create(
-                      model       = "default",
-                      max_tokens  = 80,
-                      messages    = history,
-                      temperature = 0.0,
-                      extra_body  = {"seed": SEED, "enable_thinking": False},
-                  )
-                  text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text")
-                  replies.append(text)
-                  history.append({"role": "assistant", "content": text})
-              return replies
-
-          for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
-              first  = runner()
-              second = runner()
-              for i, (a, b) in enumerate(zip(first, second), start = 1):
-                  print(f"[{label} turn {i}] {a!r}")
-                  assert a, f"{label}: empty turn {i} response"
-                  assert a == b, (
-                      f"{label} non-deterministic at turn {i} with temperature=0.0:\n"
-                      f"  run1: {a!r}\n  run2: {b!r}"
-                  )
-              joined = " ".join(first).lower()
-              assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
-              assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
-              print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded")
-          PY
-
-      - name: Stop Studio
-        if: always()
-        # Run as cmd so we are not running through the Git Bash shell;
-        # Git Bash on windows-latest has been observed to exit 143
-        # (SIGTERM) from any inline kill/sleep block, masking a green
-        # test run. The runner reclaims the Studio child process at
-        # job end either way, so just emit a marker and exit 0.
-        shell: cmd
-        run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
-
-      - name: Upload logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: windows-openai-anthropic-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 2: Tool calling Tests
-  # ─────────────────────────────────────────────────────────────────────
-  tool-calling:
-    name: Tool calling Tests
-    runs-on: windows-latest
-    timeout-minutes: 30
-    defaults:
-      run:
-        shell: bash
-    env:
-      # Tool calling is the highest-volume GGUF in this workflow
-      # (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB). The previous HF_HOME
-      # cache stored xet chunks + blobs + snapshots = ~4.7 GiB --
-      # 3.7x file-size inflation, dominating the post-step upload
-      # (211 s on first run; subsequent runs hit the cache, but the
-      # one-time cost recurs every time the cache key bumps). Use
-      # main's `--local-dir gguf-cache` pattern: cache the flat .gguf
-      # only, pass an absolute path to Studio's /api/inference/load.
-      # The OpenAI/Anth and JSON+images jobs still cover the
-      # gguf_variant resolution path.
-      GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
-      GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18898'
-      # Force UTF-8 for stdio (Windows defaults to cp1252; hf
-      # download / Studio CLI print "✓" checkmarks and crash
-      # otherwise).
-      PYTHONIOENCODING: utf-8
-      PYTHONUTF8: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      # Split restore + save so a transient restore-side failure does not
-      # kill the whole job. See the matching block in the tool-calling job
-      # above for the full rationale (actions/cache#1621).
-      - name: Restore GGUF model cache
-        id: cache-gguf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Download GGUF if cache miss
-        id: download-gguf
-        if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p gguf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache
-
-      - name: Save GGUF model cache
-        if: always() && steps.download-gguf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: gguf-cache
-          key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
-      - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
-        shell: pwsh
-        # See studio-windows-update-smoke.yml for the full rationale.
-        # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
-        # reinstall, and Defender's real-time scan dominates the
-        # frontend / uv-pip-extract steps.
-        run: |
-          $ProgressPreference = 'SilentlyContinue'
-          Write-Host "npm version before upgrade: $(npm -v)"
-          npm install -g 'npm@^11' 2>&1 | Out-Host
-          Write-Host "npm version after upgrade: $(npm -v)"
-          # NOTE: do NOT pre-create these directories. See
-          # studio-windows-update-smoke.yml for the full rationale --
-          # creating an empty studio/frontend/dist trips setup.ps1's
-          # mtime-based staleness check into "frontend up to date, skip
-          # rebuild" and Studio boots with an empty dist directory.
-          # Add-MpPreference accepts paths that do not yet exist.
-          foreach ($p in @(
-            "$env:USERPROFILE\.unsloth",
-            "$env:USERPROFILE\AppData\Local\uv",
-            "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
-            "$env:GITHUB_WORKSPACE\studio\frontend\dist"
-          )) {
-            try {
-              Add-MpPreference -ExclusionPath $p -ErrorAction Stop
-              Write-Host "Defender exclusion added: $p"
-            } catch {
-              Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
-            }
-          }
-
-      - name: Install Studio (--local, --no-torch)
-        shell: pwsh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          New-Item -ItemType Directory -Force -Path logs | Out-Null
-          # *>&1 captures Write-Host (Information stream) output;
-          # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
-          # and validated" via Write-Host, and we grep for that.
-          $ProgressPreference = 'SilentlyContinue'
-          & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
-      - name: Assert install.ps1 used the Windows llama.cpp prebuilt
-        run: |
-          # Filesystem check; setup.ps1's stream output isn't captured.
-          LLAMA_DIR=~/.unsloth/llama.cpp
-          INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
-          BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if [ ! -f "$INFO" ]; then
-            echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
-            ls -la "$LLAMA_DIR" || true
-            exit 1
-          fi
-          if [ ! -f "$BIN" ]; then
-            echo "::error::no llama-server.exe at $BIN."
-            ls -la "$LLAMA_DIR/build/bin" || true
-            exit 1
-          fi
-          echo "install.ps1 installed the Windows prebuilt llama.cpp:"
-          cat "$INFO"
-
-      - name: Add Studio shim to GITHUB_PATH
-        run: |
-          SHIM_DIR=~/.unsloth/studio/bin
-          if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
-            echo "::error::unsloth.exe shim not found at $SHIM_DIR"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
-      - name: Patch Studio venv with full typer / pydantic dep trees
-        # Belt-and-suspenders: install.ps1's --no-deps install of
-        # no-torch-runtime.txt drops typer's and pydantic's runtime
-        # deps unless explicitly pinned. Re-install the ones whose
-        # deps don't pull torch.
-        run: |
-          STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
-          if [ ! -f "$STUDIO_PY" ]; then
-            echo "::error::Studio venv python not at $STUDIO_PY"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
-      - name: Reset auth + boot Studio (API-only, default tool policy)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health, log in, change password, load model
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          # GITHUB_WORKSPACE on windows-latest is a Windows path with
-          # backslashes ("D:\a\unsloth\unsloth"). Bash handles it as a
-          # raw string, but we cannot embed `\a` etc. in JSON without
-          # JSON-string-escaping every backslash. Replace `\` with `/`
-          # via bash parameter expansion -- pathlib.Path on Windows
-          # accepts forward slashes natively, so Studio's loader sees
-          # a normal path.
-          GGUF_PATH="${GITHUB_WORKSPACE//\\//}/gguf-cache/${GGUF_FILE}"
-          ls -lh "$GGUF_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name}'
-
-      - name: Tool calling, server-side tools, thinking on/off
-        env:
-          BASE_URL: http://127.0.0.1:18898
-        run: |
-          python - <<'PY'
-          import json
-          import os
-          import urllib.request
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["API_KEY"]
-          SEED = 3407
-          # Same temperature shim as the Mac job. Small Qwen3.5-2B
-          # quants can degenerate at temperature=0; a small non-zero
-          # temperature with a fixed seed keeps the test deterministic
-          # while escaping the trap.
-          TEMP = 0.2
-
-          def post(path, body, *, timeout = 240):
-              data = json.dumps(body).encode()
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = data,
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type": "application/json",
-                  },
-              )
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  return resp.status, json.loads(resp.read().decode())
-
-          def post_sse(path, body, *, timeout = 600):
-              body = {**body, "stream": True}
-              data = json.dumps(body).encode()
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = data,
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type": "application/json",
-                  },
-              )
-              parts = []
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  for raw in resp:
-                      line = raw.decode().strip()
-                      if not line.startswith("data: "):
-                          continue
-                      payload = line[6:]
-                      if payload == "[DONE]":
-                          break
-                      try:
-                          chunk = json.loads(payload)
-                      except json.JSONDecodeError:
-                          continue
-                      for choice in chunk.get("choices", []):
-                          delta = choice.get("delta", {}) or {}
-                          if delta.get("content"):
-                              parts.append(delta["content"])
-              return "".join(parts)
-
-          # ── 1. Standard OpenAI function calling ──────────────────────
-          weather_tool = {
-              "type": "function",
-              "function": {
-                  "name": "get_weather",
-                  "description": "Get current weather for a city.",
-                  "parameters": {
-                      "type": "object",
-                      "properties": {"city": {"type": "string"}},
-                      "required": ["city"],
-                  },
-              },
-          }
-
-          status, data = post("/v1/chat/completions", {
-              "messages":    [{"role": "user", "content": "What is the weather in Paris?"}],
-              "tools":       [weather_tool],
-              "tool_choice": "required",
-              "stream":      False,
-              "temperature": TEMP,
-              "seed":        SEED,
-              "max_tokens":  600,
-          })
-          assert status == 200, f"tool call status {status}: {data}"
-          choice = data["choices"][0]
-          tool_calls = (choice.get("message") or {}).get("tool_calls") or []
-          if tool_calls:
-              tc = tool_calls[0]
-              assert tc["function"]["name"] == "get_weather", (
-                  f"unexpected tool name: {tc['function']['name']!r}"
-              )
-              args = json.loads(tc["function"]["arguments"])
-              assert args.get("city"), f"missing city arg: {args}"
-              print(f"[tools] PASS function calling -> {tc['function']['name']}({args}) finish={choice.get('finish_reason')!r}")
-          else:
-              print(
-                  f"[tools] WARN function calling: no tool_calls (finish_reason="
-                  f"{choice.get('finish_reason')!r}); HTTP path OK, model output drift."
-              )
-
-          # ── 2. Server-side python tool ───────────────────────────────
-          content = post_sse("/v1/chat/completions", {
-              "messages":      [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}],
-              "enable_tools":  True,
-              "enabled_tools": ["python"],
-              "session_id":    "ci-tool-calling-py",
-              "temperature":   TEMP,
-              "seed":          SEED,
-              "max_tokens":    600,
-          })
-          if "56088" in content or "56,088" in content:
-              print(f"[tools] PASS python tool ({len(content)} chars, found 56088)")
-          else:
-              assert content, "python tool: SSE stream empty"
-              print(
-                  f"[tools] WARN python tool: SSE OK ({len(content)} chars) but "
-                  f"model didn't return 56088 -- model output drift"
-              )
-
-          # ── 3. Server-side bash (terminal) tool ──────────────────────
-          # On Windows the terminal tool resolves to the system shell
-          # (cmd.exe wrapper) and `echo hello-bash-tool` works the same
-          # way it does on POSIX. The model still has to choose to
-          # invoke the tool; assert non-empty SSE if it doesn't.
-          content = post_sse("/v1/chat/completions", {
-              "messages":      [{"role": "user", "content": "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output."}],
-              "enable_tools":  True,
-              "enabled_tools": ["terminal"],
-              "session_id":    "ci-tool-calling-bash",
-              "temperature":   TEMP,
-              "seed":          SEED,
-              "max_tokens":    600,
-          })
-          if "hello-bash-tool" in content:
-              print(f"[tools] PASS terminal tool ({len(content)} chars)")
-          else:
-              assert content, "terminal tool: SSE stream empty"
-              print(
-                  f"[tools] WARN terminal tool: SSE OK ({len(content)} chars) but "
-                  f"model didn't echo 'hello-bash-tool' -- model output drift"
-              )
-
-          # ── 4. Server-side web_search tool ───────────────────────────
-          # DuckDuckGo can be flaky from CI runners; only assert that
-          # the SSE stream opens and yields any data.
-          try:
-              content = post_sse("/v1/chat/completions", {
-                  "messages":      [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
-                  "enable_tools":  True,
-                  "enabled_tools": ["web_search"],
-                  "session_id":    "ci-tool-calling-web",
-                  "temperature":   TEMP,
-                  "seed":          SEED,
-                  "max_tokens":    400,
-              })
-              print(f"[tools] PASS web_search stream ({len(content)} chars)")
-          except Exception as exc:
-              print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")
-
-          # ── 5. Thinking on / off ─────────────────────────────────────
-          def thinking_call(enable):
-              status, data = post("/v1/chat/completions", {
-                  "messages":        [{"role": "user", "content": "Briefly: is 17 prime?"}],
-                  "stream":          False,
-                  "enable_thinking": enable,
-                  "temperature":     TEMP,
-                  "seed":            SEED,
-                  "max_tokens":      300,
-              })
-              assert status == 200
-              msg = data["choices"][0]["message"]
-              raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "")
-              return raw
-
-          on_text  = thinking_call(True)
-          off_text = thinking_call(False)
-          had_think_on = ("<think>" in on_text) or len(on_text) > 80
-          if not had_think_on:
-              print(
-                  f"[tools] WARN enable_thinking=True produced no thinking signal: "
-                  f"{on_text[:200]!r}"
-              )
-          assert "<think>" not in off_text, (
-              f"enable_thinking=False but <think> still present: {off_text!r}"
-          )
-          print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
-          PY
-
-      - name: Stop Studio
-        if: always()
-        # Run as cmd so we are not running through the Git Bash shell;
-        # Git Bash on windows-latest has been observed to exit 143
-        # (SIGTERM) from any inline kill/sleep block, masking a green
-        # test run. The runner reclaims the Studio child process at
-        # job end either way, so just emit a marker and exit 0.
-        shell: cmd
-        run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
-
-      - name: Upload logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: windows-tool-calling-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Job 3: JSON, images
-  # ─────────────────────────────────────────────────────────────────────
-  json-images:
-    name: JSON, images
-    runs-on: windows-latest
-    timeout-minutes: 35
-    defaults:
-      run:
-        shell: bash
-    env:
-      GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-4-E2B-it-UD-Q4_K_XL.gguf
-      MMPROJ_FILE: mmproj-F16.gguf
-      STUDIO_PORT: '18899'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-      # Force UTF-8 for stdio (Windows defaults to cp1252; hf
-      # download / Studio CLI print "✓" checkmarks and crash
-      # otherwise).
-      PYTHONIOENCODING: utf-8
-      PYTHONUTF8: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      # Split restore + save so a transient restore-side failure does not
-      # kill the whole job. See the matching block in the tool-calling job
-      # for the full rationale (actions/cache#1621). This is the block that
-      # actually broke in run 25713577488: "Cache hit for: <key>" was
-      # logged, the step exited non-zero in ~0.3 s without extracting the
-      # 3.4 GiB archive, and steps 6-15 were skipped.
-      - name: Restore HF_HOME cache for ${{ env.GGUF_REPO }} (model + mmproj)
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
-      - name: Prime HF_HOME with the GGUF + mmproj
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE"
-
-      - name: Save HF_HOME cache for ${{ env.GGUF_REPO }} (model + mmproj)
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
-      - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
-        shell: pwsh
-        # See studio-windows-update-smoke.yml for the full rationale.
-        # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
-        # reinstall, and Defender's real-time scan dominates the
-        # frontend / uv-pip-extract steps.
-        run: |
-          $ProgressPreference = 'SilentlyContinue'
-          Write-Host "npm version before upgrade: $(npm -v)"
-          npm install -g 'npm@^11' 2>&1 | Out-Host
-          Write-Host "npm version after upgrade: $(npm -v)"
-          # NOTE: do NOT pre-create these directories. See
-          # studio-windows-update-smoke.yml for the full rationale --
-          # creating an empty studio/frontend/dist trips setup.ps1's
-          # mtime-based staleness check into "frontend up to date, skip
-          # rebuild" and Studio boots with an empty dist directory.
-          # Add-MpPreference accepts paths that do not yet exist.
-          foreach ($p in @(
-            "$env:USERPROFILE\.unsloth",
-            "$env:USERPROFILE\AppData\Local\uv",
-            "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
-            "$env:GITHUB_WORKSPACE\studio\frontend\dist"
-          )) {
-            try {
-              Add-MpPreference -ExclusionPath $p -ErrorAction Stop
-              Write-Host "Defender exclusion added: $p"
-            } catch {
-              Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
-            }
-          }
-
-      - name: Install Studio (--local, --no-torch)
-        shell: pwsh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          New-Item -ItemType Directory -Force -Path logs | Out-Null
-          # *>&1 captures Write-Host (Information stream) output;
-          # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
-          # and validated" via Write-Host, and we grep for that.
-          $ProgressPreference = 'SilentlyContinue'
-          & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
-      - name: Assert install.ps1 used the Windows llama.cpp prebuilt
-        run: |
-          # Filesystem check; setup.ps1's stream output isn't captured.
-          LLAMA_DIR=~/.unsloth/llama.cpp
-          INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
-          BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if [ ! -f "$INFO" ]; then
-            echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
-            ls -la "$LLAMA_DIR" || true
-            exit 1
-          fi
-          if [ ! -f "$BIN" ]; then
-            echo "::error::no llama-server.exe at $BIN."
-            ls -la "$LLAMA_DIR/build/bin" || true
-            exit 1
-          fi
-          echo "install.ps1 installed the Windows prebuilt llama.cpp:"
-          cat "$INFO"
-
-      - name: Add Studio shim to GITHUB_PATH
-        run: |
-          SHIM_DIR=~/.unsloth/studio/bin
-          if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
-            echo "::error::unsloth.exe shim not found at $SHIM_DIR"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
-      - name: Patch Studio venv with full typer / pydantic dep trees
-        # Belt-and-suspenders: install.ps1's --no-deps install of
-        # no-torch-runtime.txt drops typer's and pydantic's runtime
-        # deps unless explicitly pinned. Re-install the ones whose
-        # deps don't pull torch.
-        run: |
-          STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
-          if [ ! -f "$STUDIO_PY" ]; then
-            echo "::error::Studio venv python not at $STUDIO_PY"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
-      - name: Install OpenAI + Anthropic Python SDKs
-        run: python -m pip install 'openai>=1.50' 'anthropic>=0.40'
-
-      - name: Reset auth + boot Studio (API-only)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health, log in, change password, load model
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
-            -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
-            -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
-          TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
-            -H 'content-type: application/json' \
-            -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
-          echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 900 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_vision}'
-
-      - name: JSON schema decoding + image input
-        env:
-          BASE_URL: http://127.0.0.1:18899
-        run: |
-          python - <<'PY'
-          import base64
-          import json
-          import os
-          import urllib.request
-          from openai import OpenAI
-          from anthropic import Anthropic
-
-          BASE = os.environ["BASE_URL"]
-          KEY  = os.environ["API_KEY"]
-          SEED = 3407
-          TEMP = 0.2
-
-          def post(path, body, *, timeout = 240):
-              req = urllib.request.Request(
-                  f"{BASE}{path}",
-                  data    = json.dumps(body).encode(),
-                  method  = "POST",
-                  headers = {
-                      "Authorization": f"Bearer {KEY}",
-                      "Content-Type":  "application/json",
-                  },
-              )
-              with urllib.request.urlopen(req, timeout = timeout) as resp:
-                  return resp.status, json.loads(resp.read().decode())
-
-          # ── 1. response_format = json_object (JSON mode) ─────────────
-          status, data = post("/v1/chat/completions", {
-              "model":         "default",
-              "messages":      [
-                  {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
-                  {"role": "user",   "content": "What is the capital of France?"},
-              ],
-              "temperature":     TEMP,
-              "max_tokens":      600,
-              "seed":            SEED,
-              "stream":          False,
-              "enable_thinking": False,
-              "response_format": {"type": "json_object"},
-          }, timeout = 600)
-          assert status == 200, f"json status {status}: {data}"
-          assert (
-              isinstance(data.get("choices"), list)
-              and data["choices"]
-              and "message" in data["choices"][0]
-          ), f"json response envelope malformed: {data}"
-          content = (data["choices"][0]["message"].get("content") or "").strip()
-          print(f"[json] raw json_object content: {content!r}")
-          if content.startswith("```"):
-              content = content.split("```", 2)[1]
-              if content.startswith("json"):
-                  content = content[4:]
-              content = content.strip("`\n ")
-          if content:
-              try:
-                  parsed = json.loads(content)
-                  if "paris" in str(parsed.get("city", "")).lower():
-                      print(f"[json] PASS json_object -> {parsed}")
-                  else:
-                      print(f"[json] WARN json_object decoded but city!=Paris: {parsed}")
-              except json.JSONDecodeError as exc:
-                  print(f"[json] WARN json_object content not parseable ({exc}); content={content!r}")
-          else:
-              print("[json] WARN json_object produced empty content")
-
-          status2, data2 = post("/v1/chat/completions", {
-              "model":         "default",
-              "messages":      [{"role": "user", "content": "What is the capital of France? Answer with one word."}],
-              "temperature":     TEMP,
-              "max_tokens":      400,
-              "seed":            SEED,
-              "stream":          False,
-              "enable_thinking": False,
-          }, timeout = 600)
-          assert status2 == 200, f"plain status {status2}: {data2}"
-          plain = (data2["choices"][0]["message"].get("content") or "").lower()
-          print(f"[json] plain capital-of-france reply: {plain!r}")
-          if "paris" in plain:
-              print("[json] PASS plain inference path (paris mentioned)")
-          else:
-              print(
-                  f"[json] WARN plain inference returned no 'paris' -- "
-                  f"model output drift. HTTP path validated separately above."
-              )
-
-          # ── 2. OpenAI image_url (data URI base64) ───────────────────
-          PNG_64X64_RED_B64 = (
-              "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
-              "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
-              "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
-          )
-          data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"
-
-          # On Windows + the gemma-4-E2B mmproj, llama.cpp's vision
-          # path runs on CPU (no Metal involvement). The wrapper is
-          # kept for resilience but the vision path is expected to
-          # work on Windows; an exception here is a real regression.
-          client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
-          try:
-              openai_resp = client.chat.completions.create(
-                  model       = "default",
-                  temperature = TEMP,
-                  max_tokens  = 80,
-                  seed        = SEED,
-                  messages    = [{
-                      "role": "user",
-                      "content": [
-                          {"type": "image_url", "image_url": {"url": data_uri}},
-                          {"type": "text",      "text": "What colour dominates this image? Reply in one word."},
-                      ],
-                  }],
-              )
-              openai_text = (openai_resp.choices[0].message.content or "").lower()
-              print(f"[image/openai] reply: {openai_text!r}")
-              if openai_text:
-                  print("[image/openai] PASS image_url accepted, non-empty response")
-              else:
-                  print("[image/openai] WARN image_url accepted but empty content")
-          except Exception as exc:
-              print(
-                  f"[image/openai] WARN image_url SDK call raised: {type(exc).__name__}: "
-                  f"{exc}. Studio successfully forwarded the request; failure here is "
-                  f"upstream llama.cpp vision behaviour."
-              )
-
-          # ── 3. Anthropic source/base64 image ────────────────────────
-          anthropic = Anthropic(
-              base_url        = BASE,
-              api_key         = "unused",
-              default_headers = {"Authorization": f"Bearer {KEY}"},
-          )
-          try:
-              a_msg = anthropic.messages.create(
-                  model       = "default",
-                  max_tokens  = 80,
-                  temperature = TEMP,
-                  extra_body  = {"seed": SEED},
-                  messages    = [{
-                      "role": "user",
-                      "content": [
-                          {
-                              "type":   "image",
-                              "source": {
-                                  "type":       "base64",
-                                  "media_type": "image/png",
-                                  "data":       PNG_64X64_RED_B64,
-                              },
-                          },
-                          {"type": "text", "text": "Describe this image briefly."},
-                      ],
-                  }],
-              )
-              a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
-              print(f"[image/anthropic] reply: {a_text!r}")
-              if a_text:
-                  print("[image/anthropic] PASS source/base64 accepted, non-empty response")
-              else:
-                  print("[image/anthropic] WARN source/base64 accepted but empty content")
-          except Exception as exc:
-              print(
-                  f"[image/anthropic] WARN anthropic image SDK call raised: "
-                  f"{type(exc).__name__}: {exc}. Likely upstream llama.cpp vision "
-                  f"behaviour, NOT a Studio regression."
-              )
-          PY
-
-      - name: Stop Studio
-        if: always()
-        # Run as cmd so we are not running through the Git Bash shell;
-        # Git Bash on windows-latest has been observed to exit 143
-        # (SIGTERM) from any inline kill/sleep block, masking a green
-        # test run. The runner reclaims the Studio child process at
-        # job end either way, so just emit a marker and exit 0.
-        shell: cmd
-        run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
-
-      - name: Upload logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: windows-json-images-log
-          path: |
-            logs/studio.log
-            logs/install.log
-          retention-days: 7
diff --git a/.github/workflows/studio-windows-ui-smoke.yml b/.github/workflows/studio-windows-ui-smoke.yml
deleted file mode 100644
index e5ab9f8ab7..0000000000
--- a/.github/workflows/studio-windows-ui-smoke.yml
+++ /dev/null
@@ -1,342 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Windows counterpart to studio-ui-smoke.yml / studio-mac-ui-smoke.yml.
-# Same Playwright + Chromium end-to-end chat UI flow + extra UI flow,
-# but on the FREE windows-latest runner so we catch Windows-specific
-# regressions in the install path (install.ps1), the Studio CLI's
-# Windows process-management branches, and the llama.cpp prebuilt's
-# Windows HTTP layer.
-
-name: Windows Studio UI CI
-
-on:
-  pull_request:
-    paths:
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - 'install.ps1'
-      - 'pyproject.toml'
-      - 'tests/studio/**'
-      - '.github/workflows/studio-windows-ui-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  ui-smoke:
-    name: Chat UI Tests
-    runs-on: windows-latest
-    timeout-minutes: 45
-    # Default every step's shell to Git Bash. windows-latest's default
-    # shell is pwsh; without this each curl / heredoc / `kill $PID`
-    # step would need its own `shell: bash`. Steps that genuinely
-    # need PowerShell (install.ps1 invocation) override per-step.
-    defaults:
-      run:
-        shell: bash
-    env:
-      GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
-      GGUF_VARIANT: UD-Q4_K_XL
-      GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
-      STUDIO_PORT: '18896'
-      HF_HOME: ${{ github.workspace }}/hf-cache
-      # Force UTF-8 for stdio so Python tools (hf download, Studio
-      # CLI, etc.) can print Unicode characters like the success
-      # checkmark "✓". Windows defaults to cp1252 / charmap and
-      # any tool that prints "OK ✓" hits a UnicodeEncodeError.
-      PYTHONIOENCODING: utf-8
-      PYTHONUTF8: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-          # No `cache: 'npm'`. setup-node's npm cache restore silently
-          # aborts the entire job on Windows runners when the npm cache
-          # path (`C:\npm\cache` per `npm config get cache`) doesn't yet
-          # exist on a fresh runner -- the step exits without an error
-          # message and every following step gets skipped. See
-          # npm/cli#7308. The frontend `npm ci` is fast enough without
-          # the cache that the reliability gain is worth the ~30s.
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          # No `cache: 'pip'`. install.ps1 / setup.ps1 use uv and
-          # never populate ~/.cache/pip; setup-python's post-step
-          # then fatal-errors with "Cache folder path is retrieved
-          # for pip but doesn't exist on disk".
-
-      - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
-        id: cache-hf
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        continue-on-error: true
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Prime HF_HOME with the GGUF
-        id: prime-hf
-        if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          python -m pip install --upgrade huggingface_hub
-          mkdir -p hf-cache
-          bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
-      - name: Save HF_HOME for ${{ env.GGUF_REPO }}
-        if: always() && steps.prime-hf.outcome == 'success'
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5.0.5
-        with:
-          path: hf-cache
-          key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
-      - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
-        shell: pwsh
-        # See studio-windows-update-smoke.yml for the full rationale.
-        # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
-        # reinstall, and Defender's real-time scan dominates the
-        # frontend / uv-pip-extract steps.
-        run: |
-          $ProgressPreference = 'SilentlyContinue'
-          Write-Host "npm version before upgrade: $(npm -v)"
-          npm install -g 'npm@^11' 2>&1 | Out-Host
-          Write-Host "npm version after upgrade: $(npm -v)"
-          # NOTE: do NOT pre-create these directories. See
-          # studio-windows-update-smoke.yml for the full rationale --
-          # creating an empty studio/frontend/dist trips setup.ps1's
-          # mtime-based staleness check into "frontend up to date, skip
-          # rebuild" and Studio boots with an empty dist directory.
-          # Add-MpPreference accepts paths that do not yet exist.
-          foreach ($p in @(
-            "$env:USERPROFILE\.unsloth",
-            "$env:USERPROFILE\AppData\Local\uv",
-            "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
-            "$env:GITHUB_WORKSPACE\studio\frontend\dist"
-          )) {
-            try {
-              Add-MpPreference -ExclusionPath $p -ErrorAction Stop
-              Write-Host "Defender exclusion added: $p"
-            } catch {
-              Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
-            }
-          }
-
-      - name: Install Studio (--local, --no-torch)
-        # install.ps1 is the supported Windows installer. install.sh
-        # has no Windows branch (apt-get / brew calls). The PS1
-        # script's `Install-UnslothStudio @args` line at the bottom
-        # forwards `--local --no-torch` correctly.
-        shell: pwsh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          New-Item -ItemType Directory -Force -Path logs | Out-Null
-          # *>&1 redirects ALL PowerShell streams (stdout, stderr,
-          # warning, verbose, debug, information) into the success
-          # stream so Tee-Object captures everything. install.ps1
-          # and setup.ps1 emit step/substep markers via Write-Host
-          # which lands on the Information stream (PS 5+); without
-          # the wildcard redirect, those markers (including
-          # "prebuilt installed and validated") never reach
-          # logs/install.log and the post-step grep asserter fails.
-          $ProgressPreference = 'SilentlyContinue'
-          & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
-      - name: Assert install.ps1 used the Windows llama.cpp prebuilt
-        run: |
-          # install.ps1's setup.ps1 child writes "prebuilt installed
-          # and validated" to its own console host -- that output
-          # does NOT come back through this parent step's stdout
-          # pipeline (no matter how aggressively we redirect: *>&1,
-          # tee, etc.). Verify the install via the filesystem
-          # instead. setup.ps1 writes UNSLOTH_PREBUILT_INFO.json
-          # next to the install dir on success, and lays the
-          # binaries under build/bin/Release/ on Windows.
-          STUDIO_HOME=~/.unsloth/studio
-          LLAMA_DIR=~/.unsloth/llama.cpp
-          INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
-          BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
-          # Source-build fallback grep stays as a fast bail-out.
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if [ ! -f "$INFO" ]; then
-            echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO; setup.ps1 didn't install the prebuilt."
-            ls -la "$LLAMA_DIR" || true
-            exit 1
-          fi
-          if [ ! -f "$BIN" ]; then
-            echo "::error::no llama-server.exe at $BIN; prebuilt extraction incomplete."
-            ls -la "$LLAMA_DIR/build/bin" || true
-            ls -la "$LLAMA_DIR/build/bin/Release" || true
-            exit 1
-          fi
-          echo "install.ps1 installed the Windows prebuilt llama.cpp:"
-          cat "$INFO"
-
-      - name: Add Studio shim to GITHUB_PATH
-        # install.ps1 puts unsloth.exe at $StudioHome\bin\unsloth.exe
-        # and adds that dir to the User PATH via the Windows registry.
-        # Registry-level PATH updates don't propagate to a running
-        # Git Bash session, so the next step's `unsloth ...` invocation
-        # would hit "command not found". Re-export the shim dir to
-        # GITHUB_PATH so every subsequent step in this job sees it.
-        run: |
-          SHIM_DIR=~/.unsloth/studio/bin
-          if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
-            echo "::error::unsloth.exe shim not found at $SHIM_DIR"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          # GITHUB_PATH wants Windows-style paths; convert via cygpath.
-          cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-          echo "Added Studio shim dir to PATH: $(cygpath -w "$SHIM_DIR")"
-
-      - name: Patch Studio venv with full typer / pydantic dep trees
-        # Belt-and-suspenders: install.ps1's --no-deps install of
-        # no-torch-runtime.txt drops typer's and pydantic's runtime
-        # deps unless explicitly pinned. Re-install the ones whose
-        # deps don't pull torch.
-        run: |
-          STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
-          if [ ! -f "$STUDIO_PY" ]; then
-            echo "::error::Studio venv python not at $STUDIO_PY"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
-      - name: Install Playwright + Chromium
-        # No --with-deps on Windows: that flag installs Linux apt
-        # packages. windows-latest ships the system frameworks
-        # Chromium needs (Edge / WebView2) already.
-        run: |
-          python -m pip install 'playwright>=1.45'
-          python -m playwright install chromium
-
-      - name: Reset auth + boot Studio
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
-            > logs/studio.log 2>&1 &
-          echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
-              jq -e '.status == "healthy"' /tmp/health.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health.json
-
-      - name: Pass bootstrap password to the Playwright step
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "::add-mask::$NEW2"
-          echo "STUDIO_OLD_PW=$OLD"   >> "$GITHUB_ENV"
-          echo "STUDIO_NEW_PW=$NEW"   >> "$GITHUB_ENV"
-          echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
-      - name: Drive the chat UI with Playwright
-        env:
-          BASE_URL: http://127.0.0.1:18896
-          PW_ART_DIR: logs/playwright
-          STUDIO_UI_STRICT: '1'
-          # windows-latest free runner is 4 vCPU / 16 GB; gemma-3-
-          # 270m turn latency under llama-server's CPU backend can
-          # crowd the 180s default (slower than ubuntu-latest on
-          # the same model). Keep the same generous budget the Mac
-          # job uses.
-          STUDIO_UI_TURN_TIMEOUT_MS: '540000'
-        run: |
-          mkdir -p logs/playwright
-          python tests/studio/playwright_chat_ui.py
-
-      - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders)
-        if: always()
-        run: |
-          kill "${STUDIO_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Reset auth + boot Studio for extra UI tests (port 18897)
-        run: |
-          unsloth studio reset-password
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \
-            > logs/studio_extra.log 2>&1 &
-          echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV"
-
-      - name: Wait for /api/health on 18897
-        run: |
-          for i in $(seq 1 180); do
-            if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json; then
-              jq -e '.status == "healthy"' /tmp/health2.json && break
-            fi
-            sleep 1
-          done
-          jq -e '.status == "healthy"' /tmp/health2.json
-
-      - name: Pass bootstrap pw for extra UI test
-        run: |
-          OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
-          NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
-          echo "::add-mask::$OLD"
-          echo "::add-mask::$NEW"
-          echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV"
-          echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
-      - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright
-        env:
-          BASE_URL: http://127.0.0.1:18897
-          STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }}
-          STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }}
-          PW_ART_DIR: logs/playwright_extra
-          STUDIO_UI_STRICT: '1'
-          STUDIO_UI_TURN_TIMEOUT_MS: '540000'
-          GGUF_REPO: ${{ env.GGUF_REPO }}
-          GGUF_VARIANT: ${{ env.GGUF_VARIANT }}
-        run: |
-          mkdir -p logs/playwright_extra
-          python tests/studio/playwright_extra_ui.py
-
-      - name: Stop second Studio
-        if: always()
-        run: |
-          kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
-          sleep 2
-
-      - name: Upload Playwright artifacts
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: windows-studio-ui-smoke-artifacts
-          path: |
-            logs/studio.log
-            logs/studio_extra.log
-            logs/install.log
-            logs/playwright
-            logs/playwright_extra
-          retention-days: 7
diff --git a/.github/workflows/studio-windows-update-smoke.yml b/.github/workflows/studio-windows-update-smoke.yml
deleted file mode 100644
index 157874d404..0000000000
--- a/.github/workflows/studio-windows-update-smoke.yml
+++ /dev/null
@@ -1,279 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Windows counterpart to studio-update-smoke.yml /
-# studio-mac-update-smoke.yml. Verifies that on the FREE
-# windows-latest runner:
-#
-#   1. install.ps1 --local --no-torch installs Studio AND auto-fetches
-#      the prebuilt llama.cpp Windows binary (llama-bNNNN-bin-win-cpu-
-#      x64 from ggml-org/llama.cpp). Hitting the source-build fallback
-#      is treated as an Unsloth bug -- Studio must always pick the
-#      prebuilt on Windows.
-#   2. unsloth studio update --local is idempotent. Two consecutive
-#      runs both report "prebuilt up to date and validated", no
-#      source-build fallback. The CLI's _find_setup_script picks
-#      setup.ps1 on Windows automatically.
-#   3. The installed Studio still boots and /api/health returns
-#      healthy after the update path.
-
-name: Windows Studio Update CI
-
-on:
-  pull_request:
-    paths:
-      - 'install.ps1'
-      - 'studio/setup.ps1'
-      - 'studio/setup.bat'
-      - 'studio/install_python_stack.py'
-      - 'studio/install_llama_prebuilt.py'
-      - 'studio/backend/requirements/**'
-      - 'unsloth_cli/commands/studio.py'
-      - 'pyproject.toml'
-      - '.github/workflows/studio-windows-update-smoke.yml'
-  push:
-    branches: [main, pip]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  update-idempotency:
-    name: Studio Updating Tests
-    runs-on: windows-latest
-    timeout-minutes: 30
-    defaults:
-      run:
-        shell: bash
-    env:
-      # Force UTF-8 for stdio (Windows defaults to cp1252; hf
-      # download / Studio CLI print "✓" checkmarks and crash
-      # otherwise).
-      PYTHONIOENCODING: utf-8
-      PYTHONUTF8: '1'
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          # Don't cache pip: install.ps1 + setup.ps1 go through uv
-          # and never populate ~/.cache/pip; setup-python's post-step
-          # then fatal-errors with "Cache folder path is retrieved
-          # for pip but doesn't exist on disk".
-
-      - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
-        shell: pwsh
-        # Two surgical fixes against measured Windows-only install
-        # waste (vs Mac/Linux on the same SHA):
-        #
-        # (1) npm. setup.ps1 line 1109-1145 requires Node 22.12+ (or
-        #     20.19+ / 23+) AND npm >=11 because Vite 8 needs both.
-        #     actions/setup-node@v4 with `node-version: '22'` lands
-        #     Node 22.22.2 + the npm 10.9.7 it bundles, so the npm
-        #     check fails and setup.ps1 falls through to the
-        #     "winget install Node.js LTS" branch -- a ~35 s reinstall
-        #     of Node we don't need. `npm install -g npm@^11` updates
-        #     the bundled npm in-place in ~5 s, which makes setup.ps1
-        #     short-circuit on the existing Node.
-        #
-        # (2) Defender. windows-latest's real-time scan opens / hashes
-        #     every file Studio writes during install (Vite output =
-        #     thousands of small chunks, uv pip = wheel-extraction =
-        #     thousands of small files). The latency dominates the
-        #     200 s frontend build and the 90 s deps install. Adding
-        #     ExclusionPath entries for the directories the install
-        #     writes to drops per-file open latency from ~ms to ~us.
-        #     Add-MpPreference needs admin; the runneradmin user has
-        #     it, but wrap in try/catch so a permission flake leaves
-        #     the install otherwise unaffected.
-        run: |
-          $ProgressPreference = 'SilentlyContinue'
-          Write-Host "npm version before upgrade: $(npm -v)"
-          npm install -g 'npm@^11' 2>&1 | Out-Host
-          Write-Host "npm version after upgrade: $(npm -v)"
-          # NOTE: do NOT pre-create these directories before adding the
-          # exclusion -- creating an empty studio/frontend/dist trips
-          # setup.ps1 line 1281-1296's mtime-based "is the frontend
-          # stale?" check into "up to date, skip rebuild", because the
-          # newly-created dist's mtime is younger than every source
-          # file. Studio then boots with an empty dist and 500s on
-          # GET / with FileNotFoundError: dist\index.html. See run
-          # 25546676715 / job 74984469728.
-          # Add-MpPreference accepts paths that do not yet exist; the
-          # exclusion is registered and applies when the path
-          # materialises.
-          foreach ($p in @(
-            "$env:USERPROFILE\.unsloth",
-            "$env:USERPROFILE\AppData\Local\uv",
-            "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
-            "$env:GITHUB_WORKSPACE\studio\frontend\dist"
-          )) {
-            try {
-              Add-MpPreference -ExclusionPath $p -ErrorAction Stop
-              Write-Host "Defender exclusion added: $p"
-            } catch {
-              Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
-            }
-          }
-
-      - name: Install Studio (--local, --no-torch)
-        shell: pwsh
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          New-Item -ItemType Directory -Force -Path logs | Out-Null
-          # *>&1 captures Write-Host (Information stream) output;
-          # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
-          # and validated" via Write-Host, and we grep for that.
-          $ProgressPreference = 'SilentlyContinue'
-          & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
-      - name: Assert install.ps1 used the Windows llama.cpp prebuilt
-        run: |
-          # Filesystem-based check (setup.ps1's stream output isn't
-          # captured back through the parent pipeline).
-          LLAMA_DIR=~/.unsloth/llama.cpp
-          INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
-          BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
-          if grep -q "falling back to source build" logs/install.log; then
-            echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
-            exit 1
-          fi
-          if [ ! -f "$INFO" ]; then
-            echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
-            ls -la "$LLAMA_DIR" || true
-            exit 1
-          fi
-          if [ ! -f "$BIN" ]; then
-            echo "::error::no llama-server.exe at $BIN."
-            ls -la "$LLAMA_DIR/build/bin" || true
-            exit 1
-          fi
-          echo "install.ps1 installed the Windows prebuilt llama.cpp:"
-          cat "$INFO"
-
-      - name: Add Studio shim to GITHUB_PATH
-        run: |
-          SHIM_DIR=~/.unsloth/studio/bin
-          if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
-            echo "::error::unsloth.exe shim not found at $SHIM_DIR"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
-      - name: Patch Studio venv with full typer / pydantic dep trees
-        # install.ps1 runs `uv pip install --no-deps -r
-        # no-torch-runtime.txt` to keep torch out of transitive
-        # resolution from accelerate/peft/trl. That also drops
-        # typer's and pydantic's runtime deps unless they're
-        # explicitly pinned in no-torch-runtime.txt. We pin the
-        # known ones (click, shellingham, annotated-doc, rich,
-        # pydantic-core, annotated-types, typing-inspection, ...)
-        # but typer / pydantic minor versions can introduce new
-        # transitive deps that are NOT in our pin list.
-        #
-        # Belt-and-suspenders: re-install typer + pydantic +
-        # huggingface_hub WITH their deps into the Studio venv.
-        # `pip install --upgrade` only adds missing packages; it
-        # never down-shifts an installed version. Cannot pull
-        # torch (none of typer / pydantic / huggingface_hub depend
-        # on it).
-        run: |
-          STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
-          if [ ! -f "$STUDIO_PY" ]; then
-            echo "::error::Studio venv python not at $STUDIO_PY"
-            ls -la ~/.unsloth/studio/ || true
-            exit 1
-          fi
-          "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
-      - name: First update should be a no-op (prebuilt already validated)
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -o pipefail
-          unsloth studio update --local 2>&1 | tee logs/update.log
-          if grep -q "falling back to source build" logs/update.log; then
-            echo "::error::studio update fell back to source-build llama.cpp on Windows."
-            grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
-            exit 1
-          fi
-          if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then
-            echo "::error::no prebuilt up-to-date marker in update.log."
-            grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
-            exit 1
-          fi
-          echo "update path took the prebuilt fast path"
-
-      - name: Second update must also be a no-op
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -o pipefail
-          unsloth studio update --local 2>&1 | tee logs/update2.log
-          grep -q "falling back to source build" logs/update2.log && {
-              echo "::error::second update fell back to source build on Windows"
-              tail -60 logs/update2.log; exit 1; } || true
-          grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log
-          echo "second update was clean"
-
-      - name: Boot Studio briefly to confirm the install is still usable
-        run: |
-          mkdir -p logs
-          UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \
-            > logs/studio.log 2>&1 &
-          PID=$!
-          HEALTHY=""
-          # Use jq (a Git Bash builtin) instead of `python -c
-          # open('/tmp/health.json')` to read the saved health
-          # response. Bash on windows-latest is MSYS Git Bash, which
-          # resolves `/tmp/...` against the MSYS root, while the
-          # python interpreter is Windows-native and resolves it
-          # against the current drive's root. The two paths don't
-          # agree, so python never finds the file curl just wrote.
-          # jq reads through MSYS, so the path matches. Mirrors what
-          # studio-windows-api-smoke.yml and the other Windows smoke
-          # workflows already do.
-          for i in $(seq 1 60); do
-            if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then
-              if jq -e '.status == "healthy"' /tmp/health.json >/dev/null; then
-                HEALTHY=1
-                break
-              fi
-            fi
-            sleep 1
-          done
-          if [ -z "$HEALTHY" ]; then
-            echo "Studio failed to come up after \`update\`"
-            tail -200 logs/studio.log
-            kill "$PID" 2>/dev/null || true
-            exit 1
-          fi
-          kill "$PID" 2>/dev/null || true
-          echo "post-update Studio /api/health OK"
-
-      - name: Upload update logs
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: windows-studio-update-log
-          path: |
-            logs/install.log
-            logs/update.log
-            logs/update2.log
-            logs/studio.log
-          retention-days: 7
diff --git a/.github/workflows/version-compat-ci.yml b/.github/workflows/version-compat-ci.yml
deleted file mode 100644
index 599b53df1d..0000000000
--- a/.github/workflows/version-compat-ci.yml
+++ /dev/null
@@ -1,312 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-#
-# Cross-version compat canary for the four upstream packages whose
-# release cadence regularly breaks unsloth + unsloth-zoo:
-#
-#   1. vLLM             (LoRA worker manager, BnB loader, cumem allocator)
-#   2. TRL / GRPO       (trainer source rewriters in unsloth.models.rl*)
-#   3. PEFT             (LoraConfig, get_peft_model, LoraLayer, bnb integration)
-#   4. sentence-transformers (Transformer/Pooling/Normalize, Trainer)
-#   5. bitsandbytes     (Linear4bit, dequantize_4bit)
-#
-# Strategy: GitHub raw-fetch + symbol grep against every tracked
-# version (no pip install, CPU-only). When upstream renames a symbol
-# we depend on, the matching test fails BEFORE a user hits it. The
-# `main` branch entries give us a few-day lead on PyPI releases.
-#
-# Cross-references:
-#   tests/vllm_compat/test_vllm_pinned_symbols.py     (vLLM symbols)
-#   tests/version_compat/test_trl_grpo_pinned_symbols.py
-#   tests/version_compat/test_peft_pinned_symbols.py
-#   tests/version_compat/test_sentence_transformers_pinned_symbols.py
-#   tests/version_compat/test_bitsandbytes_pinned_symbols.py
-
-name: Version Compat CI
-
-on:
-  pull_request:
-    # Trigger on any unsloth source change, not just the three previously
-    # named files. The symbol-existence tests verify that EVERY pinned
-    # upstream reference in unsloth still resolves; a new
-    # `from peft.foo import Bar` added in unsloth/kernels/whatever.py
-    # is just as much a compat regression risk as one added in
-    # unsloth/models/rl.py.
-    paths:
-      - 'unsloth/**'
-      - 'tests/vllm_compat/**'
-      - 'tests/version_compat/**'
-      - 'pyproject.toml'
-      - '.github/workflows/version-compat-ci.yml'
-  schedule:
-    # Daily 06:43 UTC. Catches upstream PyPI releases roughly within
-    # 24 h. Off the :00 / :30 fleet-collision spots.
-    - cron: '43 6 * * *'
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  vllm-pinned-symbols:
-    name: vLLM pinned-symbol matrix (≥ 0.9.0 + main)
-    runs-on: ubuntu-latest
-    timeout-minutes: 12
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest only
-        # The test fetches from raw.githubusercontent.com and greps
-        # source. No pip install of vllm / torch / transformers is
-        # needed — that's the whole point of this canary.
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pytest>=8'
-      - name: Run vllm-compat suite
-        env:
-          # Authenticated requests get a 5000-req/h quota on raw
-          # fetches; unauthenticated is 60/h and trips on the matrix.
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          python -m pytest tests/vllm_compat/test_vllm_pinned_symbols.py -v --tb=short
-
-  trl-grpo-pinned-symbols:
-    name: TRL / GRPO pinned-symbol matrix
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest only
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pytest>=8'
-      - name: Run trl-compat suite
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          # PYTHONPATH=. so `from tests.version_compat._fetch import …`
-          # works without an editable install of unsloth itself.
-          PYTHONPATH=. python -m pytest \
-            tests/version_compat/test_trl_grpo_pinned_symbols.py \
-            -v --tb=short
-
-  peft-pinned-symbols:
-    name: PEFT pinned-symbol matrix (pyproject window + main)
-    runs-on: ubuntu-latest
-    timeout-minutes: 8
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest only
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pytest>=8'
-      - name: Run peft-compat suite
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          PYTHONPATH=. python -m pytest \
-            tests/version_compat/test_peft_pinned_symbols.py \
-            tests/version_compat/test_unsloth_zoo_save_merged_pinned_symbols.py \
-            -v --tb=short
-
-  st-pinned-symbols:
-    name: sentence-transformers pinned-symbol matrix
-    runs-on: ubuntu-latest
-    timeout-minutes: 8
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest only
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pytest>=8'
-      - name: Run sentence-transformers compat suite
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          PYTHONPATH=. python -m pytest \
-            tests/version_compat/test_sentence_transformers_pinned_symbols.py \
-            -v --tb=short
-
-  bitsandbytes-pinned-symbols:
-    name: bitsandbytes pinned-symbol matrix
-    runs-on: ubuntu-latest
-    timeout-minutes: 8
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest only
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pytest>=8'
-      - name: Run bitsandbytes compat suite
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          PYTHONPATH=. python -m pytest \
-            tests/version_compat/test_bitsandbytes_pinned_symbols.py \
-            -v --tb=short
-
-  transformers-pinned-symbols:
-    name: transformers pinned-symbol matrix (4.57.6 + 5.x + main)
-    runs-on: ubuntu-latest
-    timeout-minutes: 12
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest only
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'pytest>=8'
-      - name: Run transformers compat suite
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          PYTHONPATH=. python -m pytest \
-            tests/version_compat/test_transformers_pinned_symbols.py \
-            -v --tb=short
-
-  # Optional second layer: actually `pip install` ONE representative
-  # version of each package and verify unsloth + unsloth-zoo modules
-  # import on it under the existing CUDA spoof. CPU-only, runs on
-  # ubuntu-latest. Catches the small set of breakages that the static
-  # symbol check misses (e.g. import-time side effects).
-  zoo-imports-under-spoof:
-    name: unsloth_zoo vllm/grpo/peft/st modules import under CUDA spoof
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-          path: unsloth
-      - name: Clone unsloth-zoo @ main
-        run: |
-          # github.com occasionally 500s on the git fetch; retry so a
-          # single upstream blip does not fail CI.
-          for attempt in 1 2 3; do
-            rm -rf "$RUNNER_TEMP/unsloth-zoo"
-            if git clone --depth=1 https://github.com/unslothai/unsloth-zoo \
-                "$RUNNER_TEMP/unsloth-zoo"; then
-              break
-            fi
-            if [ "$attempt" -eq 3 ]; then
-              echo "::error::git clone unsloth-zoo failed after 3 attempts"
-              exit 1
-            fi
-            delay=$((5 * attempt))
-            echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..."
-            sleep "$delay"
-          done
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install CPU torch + supported pkg pins
-        run: |
-          python -m pip install --upgrade pip
-          # CPU torch (vllm/peft/st all depend on it).
-          pip install --index-url https://download.pytorch.org/whl/cpu \
-            'torch>=2.4,<2.11' 'torchvision<0.26' 'torchcodec<0.10'
-          # torchcodec is a hard requirement on transformers 5.x:
-          # transformers/audio_utils.py:55 does
-          #   `importlib.metadata.version("torchcodec")` UNCONDITIONALLY,
-          # which raises PackageNotFoundError on a CPU runner that
-          # otherwise has no audio path -- and that error trickles up
-          # through every `import unsloth_zoo.<module>` because
-          # unsloth-zoo's vision_utils transitively pulls
-          # transformers.processing_utils (-> audio_utils). The 0.10
-          # cap mirrors the torch 2.10 / torchvision 0.26 ABI window
-          # we already pin above.
-          # Ladder of supported floor versions per pyproject.toml.
-          pip install \
-            'transformers>=4.56,<5.6' 'trl>=0.22,<0.26' \
-            'peft>=0.18.0' 'sentence-transformers>=5.0' \
-            'accelerate>=1.0' 'datasets>=3.4,<5' \
-            'bitsandbytes>=0.45.5' \
-            sentencepiece protobuf safetensors numpy 'pytest>=8' \
-            'huggingface_hub>=0.34' tqdm packaging psutil triton Pillow
-          # Editable-install both repos so the test imports the
-          # checkouts (not whatever stale PyPI version pip resolved).
-          pip install --no-deps -e "$RUNNER_TEMP/unsloth-zoo"
-          pip install --no-deps -e ./unsloth
-      - name: Run vllm_compat zoo-imports tests under spoof
-        env:
-          UNSLOTH_IS_PRESENT: '1'
-          UNSLOTH_COMPILE_DISABLE: '1'
-          PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
-        run: |
-          cd unsloth
-          # tests/vllm_compat/test_unsloth_zoo_imports.py: narrow vllm/grpo
-          #   import gates (5 tests).
-          # tests/vllm_compat/test_extended_module_imports.py: full sweep
-          #   of unsloth_zoo + unsloth.models.* modules + RL dispatch
-          #   table population + FastModel API surface under spoof
-          #   (~30 tests). Catches transformers / peft / bnb symbol pin
-          #   drift at module-top BEFORE any runtime call.
-          PYTHONPATH=. python -m pytest \
-            tests/vllm_compat/test_unsloth_zoo_imports.py \
-            tests/vllm_compat/test_extended_module_imports.py \
-            -v --tb=short
-
-  # Daily-only: same suites but with --strict on importable upstream
-  # tags. Schedule-only so PR jobs stay fast; cron tolerates a flake.
-  daily-fresh-fetch:
-    name: daily fresh-fetch sweep (cron only)
-    if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-      - name: Install pytest
-        run: pip install 'pytest>=8'
-      - name: Run all version-compat suites in one process (no cache)
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          PYTHONPATH=. python -m pytest \
-            tests/vllm_compat/test_vllm_pinned_symbols.py \
-            tests/version_compat/ \
-            -v --tb=short
diff --git a/.github/workflows/wheel-smoke.yml b/.github/workflows/wheel-smoke.yml
deleted file mode 100644
index 3de3c33ca2..0000000000
--- a/.github/workflows/wheel-smoke.yml
+++ /dev/null
@@ -1,136 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Builds the PyPI wheel from the PR branch, then verifies the built wheel
-# actually contains what we expect to ship and does NOT contain the broken
-# Studio bundle that 2026.5.1 published. This is the single workflow that
-# would have blocked the 2026.5.1 release before twine upload.
-#
-# Verified locally end-to-end against this branch:
-#   - python -m build produces unsloth-<version>-py3-none-any.whl in 13s
-#   - wheel content sanity passes:
-#       lockfile shipped, frontend dist shipped,
-#       no node_modules in wheel, no bun.lock in wheel,
-#       main bundle has unstable_Provider hits=1 (assistant-ui internals only).
-#   - Studio backend imports cleanly from the installed wheel with the
-#     lightweight dep set below.
-
-name: Wheel CI
-
-on:
-  pull_request:
-    paths:
-      - 'pyproject.toml'
-      - 'studio/**'
-      - 'unsloth/**'
-      - 'unsloth_cli/**'
-      - '.github/workflows/wheel-smoke.yml'
-  push:
-    branches: [main, pip]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  wheel:
-    name: Wheel build + content sanity + import smoke
-    runs-on: ubuntu-latest
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
-        with:
-          node-version: '22'
-
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: '3.12'
-
-      - name: Lockfile supply-chain audit (pre-install scan)
-        run: python3 scripts/lockfile_supply_chain_audit.py
-
-      - name: Build frontend
-        # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
-        # required for `vite build`. The pre-install lockfile structural
-        # audit (lockfile_supply_chain_audit.py) is the practical defence
-        # against the npm postinstall-dropper class -- it fires BEFORE any
-        # tarball runs, on the injection pattern itself rather than an
-        # advisory-DB lookup.
-        run: |
-          cd studio/frontend
-          npm ci --no-fund --no-audit
-          npm run build
-
-      - name: Build wheel + sdist
-        run: |
-          python -m pip install --upgrade pip build
-          rm -rf dist build ./*.egg-info
-          python -m build
-
-      - name: Wheel content sanity
-        run: |
-          python - <<'PY'
-          import zipfile, glob, sys
-          w = glob.glob("dist/unsloth-*.whl")
-          if not w:
-              print("FAIL: no wheel produced"); sys.exit(2)
-          w = w[0]
-          print(f"wheel: {w}")
-          with zipfile.ZipFile(w) as z:
-              n = z.namelist()
-              checks = {
-                "lockfile shipped":      any(s.endswith("studio/frontend/package-lock.json") for s in n),
-                "frontend dist shipped": any(s.endswith("studio/frontend/dist/index.html")    for s in n),
-                "no node_modules":       not any("studio/frontend/node_modules/" in s for s in n),
-                "no bun.lock":           not any(s.endswith("studio/frontend/bun.lock")       for s in n),
-              }
-              js = [s for s in n
-                    if "studio/frontend/dist/assets/" in s
-                    and s.endswith(".js")
-                    and "/index-" in s]
-              if not js:
-                  print("FAIL: no main bundle index-*.js in wheel"); sys.exit(2)
-              data = z.read(js[0]).decode("utf-8", "replace")
-              hits = data.count("unstable_Provider:")
-              print(f"main bundle: {js[0]}")
-              print(f"unstable_Provider hits: {hits} (>=4 indicates 2026.5.1 regression)")
-              checks["bundle has no Studio unstable_Provider call site"] = (hits < 4)
-
-              print()
-              for k, v in checks.items():
-                  print(f"  [{'PASS' if v else 'FAIL'}] {k}")
-              sys.exit(0 if all(checks.values()) else 1)
-          PY
-
-      - name: Studio backend import smoke
-        # Imports `studio.backend.main:app` from the freshly-installed wheel in
-        # a clean venv. This catches the class of bug that 2026.5.1 shipped with:
-        # frontend dist missing, package-lock.json missing, or the wheel's Python
-        # source tree broken in a way that surfaces only at app construction time.
-        run: |
-          python -m venv /tmp/v
-          /tmp/v/bin/pip install --upgrade pip
-          /tmp/v/bin/pip install -r studio/backend/requirements/studio.txt
-          /tmp/v/bin/pip install \
-            python-multipart aiofiles sqlalchemy cryptography \
-            pyyaml jinja2 mammoth unpdf requests \
-            'numpy<3'
-          /tmp/v/bin/pip install --no-deps dist/unsloth-*.whl
-          # Run from /tmp so Python imports the installed package, not the source tree.
-          cd /tmp
-          /tmp/v/bin/python -c "from studio.backend.main import app; print('Studio backend OK:', app.title)"
-
-      - name: Upload wheel on failure
-        if: failure()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: unsloth-wheel
-          path: dist/
-          retention-days: 7
diff --git a/tests/mlx_parity/README.md b/tests/mlx_parity/README.md
new file mode 100644
index 0000000000..04bf188d05
--- /dev/null
+++ b/tests/mlx_parity/README.md
@@ -0,0 +1,30 @@
+# MLX vs HF parity probes
+
+Seven small, focused probes designed to bisect the parity gap between MLX
+training (via `unsloth_zoo.mlx.trainer`) and HF training (via
+`trl.SFTTrainer`) on the same hyperparameters.
+
+Symptom: identical 7-step LoRA fine-tune of `unsloth/gemma-3-270m-it` on the
+single row `"<<HELLO!!>> My name is Unsloth!"` produces:
+
+| | step-1 loss | post-train loss | greedy generation |
+|---|---|---|---|
+| HF SFTTrainer (CUDA bf16) | 7.64 | 0.001 | `"... Unsloth! My personality is bubbly ..."` |
+| MLX trainer | 10.55 | 0.009 | `"5 lbs!"` |
+
+The 1.38x step-1 forward-pass gap is the root anomaly. Each probe answers
+one question along the dispatch path:
+
+| # | probe | question |
+|---|---|---|
+| 1 | `probe_1_tokenization.py` | does the tokenized input differ? |
+| 2 | `probe_2_forward_logits.py` | does the base model emit different logits? |
+| 3 | `probe_3_loss_reduction.py` | does CE-then-reduce produce different scalars? |
+| 4 | `probe_4_lora_init.py` | does LoRA init produce different magnitudes? |
+| 5 | `probe_5_single_grad.py` | does one backward produce different gradients? |
+| 6 | `probe_6_adamw_step.py` | does one AdamW step produce different deltas? |
+| 7 | `probe_7_loss_curve.py` | what does the 7-step curve look like end-to-end? |
+
+Each probe prints diagnostic data, then asserts a numeric tolerance. The
+workflow runs them with `continue-on-error: true` so that a diverging
+probe cannot stop the later probes from printing their diagnostic data.
diff --git a/tests/mlx_parity/_common.py b/tests/mlx_parity/_common.py
new file mode 100644
index 0000000000..3356cbf41a
--- /dev/null
+++ b/tests/mlx_parity/_common.py
@@ -0,0 +1,57 @@
+"""Shared constants + helpers for MLX parity probes.
+
+The probes deliberately share NOTHING with `unsloth_zoo.mlx.trainer` —
+each probe re-derives the quantity from first principles so we can tell
+where the trainer's wiring differs from the textbook HF/PyTorch recipe.
+"""
+
+from __future__ import annotations
+
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+SEED = 3407
+MAX_SEQ_LEN = 64
+
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def seed_everything(seed: int = SEED) -> None:  # seed stdlib random, NumPy and (when importable) torch + MLX
+    random.seed(seed)
+    np.random.seed(seed)
+    os.environ["PYTHONHASHSEED"] = str(seed)  # NOTE(review): read at interpreter startup, so this only reaches child processes
+    try:  # torch is optional for this helper
+        import torch
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+    except Exception:  # best-effort: any torch import/seeding failure is ignored
+        pass
+    try:  # MLX is optional for this helper
+        import mlx.core as mx
+        mx.random.seed(seed)
+    except Exception:  # best-effort: any MLX import/seeding failure is ignored
+        pass
+
+
+def banner(title: str) -> None:
+    """Print a blank line, then `title` framed by two 72-character rules."""
+    rule = "=" * 72
+    print(f"\n{rule}\n=== {title}")
+    print(rule, flush=True)
+
+
+def section(title: str) -> None:  # print a "--- title ---" sub-heading after a blank line
+    print("\n--- {} ---".format(title), flush=True)
+
+
+def report(name: str, value) -> None:  # print one indented "name: value" diagnostic line
+    print("  {}: {}".format(name, value), flush=True)
diff --git a/tests/mlx_parity/probe_10_hf_curve_control.py b/tests/mlx_parity/probe_10_hf_curve_control.py
new file mode 100644
index 0000000000..6c5d381d44
--- /dev/null
+++ b/tests/mlx_parity/probe_10_hf_curve_control.py
@@ -0,0 +1,163 @@
+"""Probe 10 — HF SFTTrainer 7-step loss curve on the SAME Mac host (control).
+
+The previously-collected HF baseline came from CUDA bf16 on a B200 GPU.
+That's a different platform AND a different precision AND a different
+optimizer backend. To isolate "MLX vs HF" from "CUDA vs Mac CPU" we
+re-run the HF leg here on the same macos-14-arm64 runner in fp32
+(CPU), with the exact same 7 LoRA targets / alpha=16 / hyperparams.
+
+Forces torch to CPU because the standard macos-14 GitHub runner has
+only 7 GB of shared memory; an fp32 LoRA training on MPS hits the
+GPU memory watermark.
+
+Compare probe_10.json with probe_7.json: same-host, same-precision
+expectations, only the trainer implementation changes.
+
+Always exits 0 -- data dump for follow-up analysis.
+"""
+
+import json
+import os
+import sys
+
+# Set accelerator env vars before importing torch. macos-14 runners
+# expose MPS with a 7 GB shared cap; the fp32 7-module LoRA training
+# below does not fit, so main() pins the run to CPU (use_cpu=True).
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:  # run the HF control leg and dump probe_10.json; returns 0 on every non-raising path
+    seed_everything()
+    banner("Probe 10: HF SFTTrainer 7-step loss curve (control on same host)")
+
+    import torch
+    from datasets import Dataset
+    from transformers import (
+        AutoModelForCausalLM,
+        AutoTokenizer,
+        TrainerCallback,
+    )
+    from peft import LoraConfig, get_peft_model
+
+    # TRL is optional on a Mac CPU image; if missing, record that and exit 0.
+    try:
+        from trl import SFTConfig, SFTTrainer
+    except ImportError as e:
+        report("trl not available", str(e))
+        out = {"trl_available": False}
+        (OUT_DIR / "probe_10.json").write_text(json.dumps(out, indent=2))
+        return 0
+
+    torch.manual_seed(SEED)
+    # Force CPU explicitly even if MPS is reported. Setting an empty
+    # CUDA_VISIBLE_DEVICES handles CUDA; here we shadow the MPS-pickup
+    # path by setting torch's default device.
+    try:
+        torch.set_default_device("cpu")
+    except Exception:  # best-effort; presumably guards torch builds lacking this API -- TODO confirm
+        pass
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32).to("cpu")
+    model = get_peft_model(
+        model,
+        LoraConfig(
+            r=8, lora_alpha=16, lora_dropout=0.0, bias="none",
+            target_modules=[
+                "q_proj", "k_proj", "v_proj", "o_proj",
+                "gate_proj", "up_proj", "down_proj",
+            ],
+        ),
+    )
+
+    rows = []
+    class _Logger(TrainerCallback):  # appends one dict per logged training step to `rows`
+        def on_log(self, args, state, control, logs=None, **kwargs):
+            if not logs or "loss" not in logs:  # skip log events that carry no "loss" entry
+                return
+            rows.append({
+                "step": int(state.global_step),
+                "loss": float(logs["loss"]),
+                "grad_norm": float(logs["grad_norm"]) if "grad_norm" in logs else None,
+            })
+
+    ds = Dataset.from_list([{"text": TRAIN_TEXT}] * 64)  # 64 identical copies of the single training row
+    trainer = SFTTrainer(
+        model=model,
+        processing_class=tok,
+        train_dataset=ds,
+        callbacks=[_Logger()],
+        args=SFTConfig(
+            max_length=MAX_SEQ_LEN,
+            dataset_text_field="text",
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=3,
+            warmup_steps=0,
+            max_steps=7,
+            learning_rate=1e-3,
+            logging_steps=1,
+            optim="adamw_torch",
+            weight_decay=0.0,
+            lr_scheduler_type="constant",
+            max_grad_norm=1.0,
+            seed=SEED,
+            save_strategy="no",
+            report_to="none",
+            packing=False,
+            bf16=False,
+            fp16=False,
+            use_cpu=True,
+            output_dir=str(OUT_DIR / "probe10_outputs"),
+        ),
+    )
+    trainer.train()
+
+    section("post-train forward")
+    model.eval()
+    with torch.no_grad():
+        enc = tok(TRAIN_TEXT, return_tensors="pt")
+        out = model(**enc, labels=enc["input_ids"].clone())  # labels are a copy of the input ids
+        post_loss = float(out.loss.detach())
+    report("post_train_loss", post_loss)
+
+    section("greedy generation")
+    model.eval()
+    with torch.no_grad():
+        ginp = tok(PROMPT, return_tensors="pt")
+        gout = model.generate(**ginp, max_new_tokens=48, do_sample=False)
+    gen = tok.decode(gout[0], skip_special_tokens=True)
+    report("generation", repr(gen))
+
+    out = {
+        "trl_available": True,
+        "rows": rows,
+        "post_train_loss": post_loss,
+        "generation": gen,
+        "contains_unsloth": "Unsloth" in gen,
+    }
+    (OUT_DIR / "probe_10.json").write_text(json.dumps(out, indent=2))
+    section("summary")
+    report("step-1 loss", rows[0]["loss"] if rows else None)
+    report("step-7 loss", rows[-1]["loss"] if rows else None)
+    report("post_train_loss", post_loss)
+    report("contains 'Unsloth'", "Unsloth" in gen)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_11_mlx_fp32_curve.py b/tests/mlx_parity/probe_11_mlx_fp32_curve.py
new file mode 100644
index 0000000000..40fcc68f9c
--- /dev/null
+++ b/tests/mlx_parity/probe_11_mlx_fp32_curve.py
@@ -0,0 +1,135 @@
+"""Probe 11 — MLX trainer 7-step loss curve at dtype="float32".
+
+Probe 7 runs the MLX trainer at dtype="float16" (the smoke-test default).
+This probe runs the identical config at dtype="float32" so that the
+forward / backward / optimizer are all carried out in fp32, matching
+what HF on Mac CPU (probe 10) does.
+
+Hypothesis: the upstream smoke test's "5 lbs!" / "42!!" generation
+collapse is an fp16 numerical artifact, not an algorithmic bug.
+
+If probe 11's loss curve and generation come out matching the HF curve
+in probe 10, the actionable fix is to switch the smoke test (or the
+trainer default) to float32 / bfloat16 on Apple Silicon.
+
+Always exits 0 -- data dump.
+"""
+
+import json
+import sys
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:  # run the fp32 MLX leg and dump probe_11.json; returns 0 on every non-raising path
+    seed_everything()
+    banner("Probe 11: MLX trainer 7-step loss curve at fp32")
+
+    import mlx.core as mx
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+
+    section("load + LoRA (fp32)")
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype="float32",   # <-- the only change vs probe 7
+        text_only=True, max_seq_length=128,  # NOTE(review): 128 here vs MAX_SEQ_LEN in the training config -- confirm intended
+        random_state=SEED,
+    )
+    model = FastMLXModel.get_peft_model(
+        model,
+        r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+    )
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=3,
+        max_steps=7,
+        learning_rate=1e-3,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=1.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=SEED,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / "probe11_outputs"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 identical copies of the single training row
+        args=config,
+    )
+
+    rows = []
+    def _on_step(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens, grad_norm):  # step callback: record one row per call
+        rows.append({
+            "step": int(step), "loss": float(loss),
+            "lr": float(lr), "grad_norm": None if grad_norm is None else float(grad_norm),
+            "num_tokens": int(num_tokens),
+        })
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    section("post-train forward")
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:  # append EOS unless it is already the last token
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably the span consumed by loss_fn -- TODO confirm against make_baseline_loss_fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    section("greedy generation")
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "dtype": "float32",
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    (OUT_DIR / "probe_11.json").write_text(json.dumps(out, indent=2))
+    section("summary")
+    report("step-1 loss", rows[0]["loss"] if rows else None)
+    report("step-7 loss", rows[-1]["loss"] if rows else None)
+    report("post_train_loss", post_loss_val)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_12_zoo_prev634.py b/tests/mlx_parity/probe_12_zoo_prev634.py
new file mode 100644
index 0000000000..0e949c9022
--- /dev/null
+++ b/tests/mlx_parity/probe_12_zoo_prev634.py
@@ -0,0 +1,186 @@
+"""Probe 12 — pin unsloth-zoo to the parent of PR #634 and rerun.
+
+Hypothesis we want to nail down: every other parity probe rules out
+the obvious axes (loss math, AdamW math, tokenization, supervised
+positions, single-step gradient norm), yet HF on the same host
+generates "Unsloth" and MLX does not. That points squarely at the
+trainer changes in unsloth-zoo PR #634 (`e6d8f7f`).
+
+This probe assumes the CI workflow installs unsloth-zoo at the
+PARENT commit `f37d510` (the commit immediately before #634 landed).
+Pre-#634 the layout was flat: `unsloth_zoo.mlx_loader` /
+`unsloth_zoo.mlx_trainer`. Post-#634 it's a package:
+`unsloth_zoo.mlx.loader` / `unsloth_zoo.mlx.trainer`. Try both,
+honor whichever is importable.
+
+If this probe generates "Unsloth" with the SAME 7-step config that
+probe 7 / 11 fail on, the regression is fully INSIDE PR #634's diff
+and we can sub-bisect by reverting the suspect changes (bias_correction,
+loss reduction, custom VJP, dtype handling).
+
+Always exits 0 -- data dump.
+"""
+
+import json
+import sys
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def _import_zoo():
+    """Import the MLX entry points; returns (layout label, FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn)."""
+    try:  # the pre-#634 flat modules are tried first
+        from unsloth_zoo.mlx_loader import FastMLXModel
+        from unsloth_zoo.mlx_trainer import MLXTrainer, MLXTrainingConfig
+        from unsloth_zoo.mlx_utils import make_baseline_loss_fn
+        return "pre-#634 flat layout", FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn
+    except ImportError:  # any flat module missing: use the post-#634 package instead
+        from unsloth_zoo.mlx.loader import FastMLXModel
+        from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+        from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+    return "post-#634 package layout", FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn
+
+
+def main() -> int:
+    """Train 7 steps on whichever unsloth-zoo layout imports; dump losses + generation to probe_12.json."""
+    seed_everything()
+    banner("Probe 12: pinned unsloth-zoo (parent of PR #634)")
+
+    import unsloth_zoo
+    report("unsloth_zoo path", getattr(unsloth_zoo, "__file__", "?"))
+    try:
+        report("unsloth_zoo version", getattr(unsloth_zoo, "__version__", "?"))
+    except Exception:
+        pass
+
+    layout, FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn = _import_zoo()
+    report("layout detected", layout)
+
+    import mlx.core as mx
+
+    # Mirror the SMOKE TEST AT 12295c1f exactly: dtype="float16" + identical LoRA
+    # config + identical hyperparams. We want to know if pre-#634 trainer
+    # behavior matches the green CI from that era.
+    # The dtype lives in one variable so the JSON artifact below always records
+    # the dtype the model was actually loaded with.
+    load_dtype = "float16"
+    section("load + LoRA (fp16, matches pre-#634 smoke)")
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype=load_dtype,
+        text_only=True, max_seq_length=128, random_state=SEED,
+    )
+    model = FastMLXModel.get_peft_model(
+        model,
+        r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+    )
+
+    # MLXTrainingConfig at pre-#634 does NOT have max_grad_value, so we
+    # only pass it if supported. dataclasses.fields tells us.
+    import dataclasses
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra_kwargs = {"max_grad_value": None} if "max_grad_value" in fields_supported else {}
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=3,
+        max_steps=7,
+        learning_rate=1e-3,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=1.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=SEED,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / "probe12_outputs"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra_kwargs,
+    )
+
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=config,
+    )
+
+    rows = []
+    # Variadic callback so we work for both pre-#634 (8 args) and
+    # post-#634 (9 args). The trainer wraps `cb(...)` in try/except
+    # Exception, so an arity mismatch on a fixed-arg callback would
+    # silently no-op the entire logging path.
+    def _on_step(*args):
+        # args = (step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens[, grad_norm])
+        if len(args) < 3:
+            return
+        step, _total, loss = args[0], args[1], args[2]
+        grad_norm = args[8] if len(args) >= 9 else None
+        rows.append({
+            "step": int(step), "loss": float(loss),
+            "grad_norm": None if grad_norm is None else float(grad_norm),
+        })
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    section("post-train forward")
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    section("greedy generation")
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "layout": layout,
+        "callback_arity_used": "variadic",
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "generation": gen,
+        "contains_unsloth": contains,
+        "dtype": load_dtype,
+    }
+    (OUT_DIR / "probe_12.json").write_text(json.dumps(out, indent=2))
+    section("summary")
+    if rows:
+        report("step-1 loss", rows[0]["loss"])
+        report("step-7 loss", rows[-1]["loss"])
+    report("post_train_loss", post_loss_val)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_13_pure_mlx_inference.py b/tests/mlx_parity/probe_13_pure_mlx_inference.py
new file mode 100644
index 0000000000..023325ea4b
--- /dev/null
+++ b/tests/mlx_parity/probe_13_pure_mlx_inference.py
@@ -0,0 +1,114 @@
+"""Probe 13 — pure mlx-lm inference, NO unsloth involved.
+
+Two tests:
+  (a) one-shot:   ask "What is 1+1?" and inspect the answer
+  (b) multi-turn with KV-cache reuse: walk a 7-turn conversation
+      that requires remembering earlier turns ("What did I ask as
+      my first question?", "What country did I ask about?", etc.)
+
+If pure mlx-lm answers correctly, the MLX runtime + the gemma-3-270m-it
+weights are fine. The bug in the training path is then necessarily in
+the unsloth-zoo MLX trainer wrapper, not in MLX itself.
+"""
+
+import json
+import sys
+
+from _common import MODEL_NAME, OUT_DIR, banner, section, report, seed_everything
+
+
+TURNS = [
+    "What is 1+1?",
+    "What is the capital of France?",
+    "What did I ask as my first question?",
+    "Create a short Python game",
+    "Fix bugs in it",
+    "What country did I ask about?",
+    "What number did you answer with?",
+]
+
+
+def main() -> int:
+    """Run a one-shot question and a 7-turn cached chat on plain mlx-lm; dump both to probe_13.json."""
+    seed_everything()
+    banner("Probe 13: pure mlx-lm inference (no unsloth)")
+
+    from mlx_lm import load as mlx_load, generate
+    try:
+        from mlx_lm.models.cache import make_prompt_cache
+    except Exception:
+        make_prompt_cache = None
+
+    section("load model")
+    model, tokenizer = mlx_load(MODEL_NAME)
+    report("tokenizer class", type(tokenizer).__name__)
+
+    section("(a) one-shot: 'What is 1+1?'")
+    one_shot_prompt = "What is 1+1?"
+    if hasattr(tokenizer, "apply_chat_template"):
+        try:
+            one_shot_prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": "What is 1+1?"}],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+        except Exception as e:
+            report("chat_template error -- using raw prompt", str(e))
+    out_one_shot = generate(model, tokenizer, prompt=one_shot_prompt, max_tokens=48, verbose=False)
+    report("answer", repr(out_one_shot))
+
+    def _render(messages):
+        """Chat-template `messages`; fall back to a plain role-prefixed transcript."""
+        try:
+            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        except Exception:
+            return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
+
+    section("(b) multi-turn with KV-cache reuse")
+    multi_turn_log = []
+    history = []
+    cache = None
+    for turn_idx, user_msg in enumerate(TURNS):
+        user_turn = {"role": "user", "content": user_msg}
+        history.append(user_turn)
+        gen_kwargs = dict(max_tokens=64, verbose=False)
+        # KV-cache reuse: a cache carried over from earlier turns already holds them,
+        # so feed only the NEW turn. mlx-lm's generate accepts `prompt_cache` since 0.18+.
+        cache_primed = cache is not None
+        if cache is None and make_prompt_cache is not None:
+            try:
+                cache = make_prompt_cache(model)
+            except Exception as e:
+                report("cache init error", str(e))
+        if cache is not None:
+            gen_kwargs["prompt_cache"] = cache
+        prompt = _render([user_turn] if cache_primed else history)
+        try:
+            answer = generate(model, tokenizer, prompt=prompt, **gen_kwargs)
+        except TypeError:
+            # mlx-lm older API: no prompt_cache kwarg; replay the full history uncached.
+            gen_kwargs.pop("prompt_cache", None)
+            cache = None
+            answer = generate(model, tokenizer, prompt=_render(history), **gen_kwargs)
+        history.append({"role": "assistant", "content": answer})
+        multi_turn_log.append({
+            "turn": turn_idx + 1,
+            "user": user_msg,
+            "assistant": answer,
+            "kv_reuse": cache is not None,
+        })
+        report(f"turn {turn_idx+1} user", user_msg)
+        report(f"turn {turn_idx+1} assistant", repr(answer[:140]))
+
+    out = {
+        "one_shot_prompt": "What is 1+1?",
+        "one_shot_answer": out_one_shot,
+        "multi_turn": multi_turn_log,
+        "kv_reuse_used": cache is not None,
+    }
+    (OUT_DIR / "probe_13.json").write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_14_zoo_bias_correction_false.py b/tests/mlx_parity/probe_14_zoo_bias_correction_false.py
new file mode 100644
index 0000000000..5eacc8f4e7
--- /dev/null
+++ b/tests/mlx_parity/probe_14_zoo_bias_correction_false.py
@@ -0,0 +1,140 @@
+"""Probe 14 — pin unsloth-zoo to `try-bias-correction-false` and rerun.
+
+Hypothesis: PR #634 flipped MLX AdamW `bias_correction` from False to
+True (matching torch.AdamW). With bias_correction=True step-1 updates
+are ~3x smaller than the historical MLX default; the 7-step smoke
+never reaches the "Unsloth" basin.
+
+This probe installs unsloth-zoo from the experimental branch
+`try-bias-correction-false` (which sits on top of PR #663 and ONLY
+reverts bias_correction back to False) and re-runs the standard 7-step
+config in fp16, byte-matched to the green-era smoke test.
+
+Outcome:
+  * generates "Unsloth"  =>  bias_correction=True is the breakage.
+  * still gibberish      =>  there is a second regression inside #634.
+"""
+
+import json
+import sys
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:
+    """Run the 7-step fp16 LoRA smoke config and dump losses + greedy generation to probe_14.json."""
+    seed_everything()
+    banner("Probe 14: MLX with bias_correction=False (experimental fix branch)")
+
+    import mlx.core as mx
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    section("load + LoRA (fp16, smoke parity)")
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype="float16",
+        text_only=True, max_seq_length=128, random_state=SEED,
+    )
+    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+    model = FastMLXModel.get_peft_model(
+        model,
+        r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=lora_targets,
+        random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+    )
+
+    # Smoke-test hyperparameters, collected in a mapping and expanded into the config below.
+    hyperparams = dict(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=3,
+        max_steps=7,
+        learning_rate=1e-3,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=1.0,
+        max_grad_value=None,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=SEED,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / "probe14_outputs"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=MLXTrainingConfig(**hyperparams),
+    )
+
+    step_log = []
+    def _on_step(*args):
+        if len(args) >= 3:
+            grad_norm = args[8] if len(args) >= 9 else None
+            step_log.append({
+                "step": int(args[0]),
+                "loss": float(args[2]),
+                "grad_norm": None if grad_norm is None else float(grad_norm),
+            })
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    section("post-train forward")
+    loss_fn = make_baseline_loss_fn()
+    token_ids = tokenizer.encode(TRAIN_TEXT)
+    eos_id = tokenizer.eos_token_id
+    if eos_id is not None and token_ids[-1] != eos_id:
+        token_ids.append(eos_id)
+    batch = mx.array([token_ids])
+    lengths = mx.array([[1, len(token_ids) - 1]])
+    labels_mlx = mx.array([token_ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    section("greedy generation")
+    from mlx_lm import generate
+    completion = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    mentions_unsloth = "Unsloth" in completion
+    report("generation", repr(completion))
+    report("contains 'Unsloth'", mentions_unsloth)
+
+    payload = {
+        "branch": "try-bias-correction-false",
+        "bias_correction": False,
+        "rows": step_log,
+        "post_train_loss": post_loss_val,
+        "generation": completion,
+        "contains_unsloth": mentions_unsloth,
+    }
+    (OUT_DIR / "probe_14.json").write_text(json.dumps(payload, indent=2))
+    section("summary")
+    if step_log:
+        report("step-1 loss", step_log[0]["loss"])
+        report("step-7 loss", step_log[-1]["loss"])
+    report("post_train_loss", post_loss_val)
+    report("contains 'Unsloth'", mentions_unsloth)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_15_zoo_pr663.py b/tests/mlx_parity/probe_15_zoo_pr663.py
new file mode 100644
index 0000000000..b473e22603
--- /dev/null
+++ b/tests/mlx_parity/probe_15_zoo_pr663.py
@@ -0,0 +1,131 @@
+"""Probe 15 — pin unsloth-zoo to PR #663 head (max_grad_value=None only).
+
+PR #663 fixes the silent override of max_grad_norm by max_grad_value
+but leaves every other #634 change intact (including bias_correction=True).
+
+If probe 15 generates "Unsloth", #663 alone is sufficient and we don't
+need the bias_correction flip in probe 14.
+
+If probe 15 fails (and probe 14 succeeds), #663 + bias_correction=False
+is the minimal fix and PR #663 alone is NOT enough to green CI.
+"""
+
+import json
+import sys
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:
+    """Run the 7-step fp16 LoRA smoke config (max_grad_value=None) and dump results to probe_15.json."""
+    seed_everything()
+    banner("Probe 15: MLX with PR #663 only (max_grad_value=None, bias_correction=True)")
+
+    import mlx.core as mx
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype="float16",
+        text_only=True, max_seq_length=128, random_state=SEED,
+    )
+    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+    model = FastMLXModel.get_peft_model(
+        model,
+        r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=lora_targets,
+        random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+    )
+
+    # Smoke-test hyperparameters, collected in a mapping and expanded into the config below.
+    hyperparams = dict(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=3,
+        max_steps=7,
+        learning_rate=1e-3,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=1.0,
+        max_grad_value=None,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=SEED,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / "probe15_outputs"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=MLXTrainingConfig(**hyperparams),
+    )
+
+    step_rows = []
+    def _on_step(*args):
+        if len(args) >= 3:
+            grad_norm = args[8] if len(args) >= 9 else None
+            step_rows.append({
+                "step": int(args[0]),
+                "loss": float(args[2]),
+                "grad_norm": None if grad_norm is None else float(grad_norm),
+            })
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()
+    encoded = tokenizer.encode(TRAIN_TEXT)
+    eos = tokenizer.eos_token_id
+    if eos is not None and encoded[-1] != eos:
+        encoded.append(eos)
+    n_tokens = len(encoded)
+    batch = mx.array([encoded])
+    lengths = mx.array([[1, n_tokens - 1]])
+    labels_mlx = mx.array([encoded])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    from mlx_lm import generate
+    text_out = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    found = "Unsloth" in text_out
+    report("generation", repr(text_out))
+    report("contains 'Unsloth'", found)
+
+    artifact = {
+        "branch": "fix-mlx-grad-clip-hf-parity",
+        "bias_correction": True,
+        "rows": step_rows,
+        "post_train_loss": post_loss_val,
+        "generation": text_out,
+        "contains_unsloth": found,
+    }
+    (OUT_DIR / "probe_15.json").write_text(json.dumps(artifact, indent=2))
+    section("summary")
+    if step_rows:
+        report("step-1 loss", step_rows[0]["loss"])
+        report("step-7 loss", step_rows[-1]["loss"])
+    report("post_train_loss", post_loss_val)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_16_mlx_lm_native_lora.py b/tests/mlx_parity/probe_16_mlx_lm_native_lora.py
new file mode 100644
index 0000000000..580166443b
--- /dev/null
+++ b/tests/mlx_parity/probe_16_mlx_lm_native_lora.py
@@ -0,0 +1,133 @@
+"""Probe 16 — train with mlx-lm's NATIVE LoRA trainer, no unsloth at all.
+
+If mlx_lm.lora can train this model on the same data and generate
+"Unsloth", upstream MLX + the gemma-3-270m-it weights are healthy and
+the entire regression is inside the unsloth-zoo MLX trainer wrapper.
+
+We invoke `python -m mlx_lm lora --train ...` as a subprocess because
+the mlx-lm CLI is the canonical entry point. Training writes adapter
+files to a temp directory; we then load model + adapter via mlx_lm
+and greedy-decode the standard prompt.
+
+Always exits 0 -- data dump.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:
+    """Train via `python -m mlx_lm lora`, greedy-decode PROMPT, and dump everything to probe_16.json."""
+    seed_everything()
+    banner("Probe 16: mlx-lm NATIVE LoRA trainer (no unsloth)")
+
+    workdir = Path(tempfile.mkdtemp(prefix="probe16_"))
+    try:  # the `finally` below removes workdir on every exit path
+        data_dir = workdir / "data"
+        adapter_dir = workdir / "adapters"
+        data_dir.mkdir(parents=True, exist_ok=True)
+        adapter_dir.mkdir(parents=True, exist_ok=True)
+
+        # mlx-lm's lora trainer expects train.jsonl + valid.jsonl in the data dir
+        # in "completions" / "chat" / "text" format. Use "text" format for the
+        # closest analog to the smoke test: a flat string per row.
+        # mlx_lm.lora's loader rejects validation sets smaller than batch_size.
+        for split, n_rows in (("train", 64), ("valid", 4)):
+            (data_dir / f"{split}.jsonl").write_text((json.dumps({"text": TRAIN_TEXT}) + "\n") * n_rows)
+        report("data dir", str(data_dir))
+        report("adapter dir", str(adapter_dir))
+
+        # Run the mlx-lm LoRA trainer. Match the smoke test hyperparameters
+        # as closely as the mlx_lm CLI permits.
+        cmd = [
+            sys.executable, "-m", "mlx_lm", "lora",
+            "--train",
+            "--model", MODEL_NAME,
+            "--data", str(data_dir),
+            "--adapter-path", str(adapter_dir),
+            "--iters", "7",
+            "--batch-size", "2",
+            "--learning-rate", "1e-3",
+            "--num-layers", "-1",   # train all layers' LoRA
+            "--steps-per-report", "1",
+            "--steps-per-eval", "100",  # skip eval inside 7 iters
+            "--seed", str(SEED),
+        ]
+        section("invoke mlx_lm.lora trainer")
+        report("cmd", " ".join(cmd))
+        try:
+            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+        except subprocess.TimeoutExpired as exc:  # honor "always exits 0"; output may be bytes/None
+            tails = [s.decode(errors="replace") if isinstance(s, bytes) else (s or "") for s in (exc.stdout, exc.stderr)]
+            proc = subprocess.CompletedProcess(cmd, None, *tails)
+        report("returncode", proc.returncode)
+        print("--- mlx_lm.lora stdout ---", proc.stdout, sep="\n")
+        print("--- mlx_lm.lora stderr ---", proc.stderr, sep="\n")
+
+        losses_per_step = []
+        for line in (proc.stdout + "\n" + proc.stderr).splitlines():
+            # mlx_lm prints lines like:
+            #   "Iter 1: Train loss 10.123, Learning Rate 1.000e-03, It/sec 1.23, ..."
+            if "Iter " in line and "Train loss" in line:
+                try:
+                    losses_per_step.append(float(line.split("Train loss")[1].strip().split(",")[0].strip()))
+                except Exception:
+                    pass
+        report("parsed losses", losses_per_step)
+
+        section("load + generate")
+        gen, gen_error = "", None
+        try:
+            from mlx_lm import load as mlx_load, generate
+            try:
+                model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir))
+            except TypeError:  # older mlx-lm signature
+                model, tokenizer = mlx_load(MODEL_NAME)
+                try:
+                    from mlx_lm.tuner.utils import load_adapters
+                    load_adapters(model, str(adapter_dir))
+                except Exception as e:
+                    report("adapter load fallback failed", str(e))
+            gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+        except Exception as e:  # e.g. no adapter on disk after a failed/timed-out training run
+            gen_error = f"{type(e).__name__}: {e}"
+            report("load/generate failed", gen_error)
+        contains = "Unsloth" in gen
+        report("generation", repr(gen))
+        report("contains 'Unsloth'", contains)
+
+        out = {
+            "cmd": cmd,
+            "returncode": proc.returncode,
+            "losses": losses_per_step,
+            "generation": gen,
+            "generation_error": gen_error,
+            "contains_unsloth": contains,
+            "stdout_tail": proc.stdout[-2000:],
+            "stderr_tail": proc.stderr[-2000:],
+        }
+        (OUT_DIR / "probe_16.json").write_text(json.dumps(out, indent=2))
+    finally:
+        shutil.rmtree(workdir, ignore_errors=True)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py
new file mode 100644
index 0000000000..c6ae3cc72d
--- /dev/null
+++ b/tests/mlx_parity/probe_17_curve_param.py
@@ -0,0 +1,278 @@
+"""Probe 17 — parameterized 7+ step MLX training curve.
+
+Reads env vars so a single matrix entry can be reused with different
+(steps, seed, dtype, bias_correction) combinations:
+
+  MLX_STEPS              max_steps for MLXTrainer (default 7)
+  MLX_SEED               seed for everything (default 3407)
+  MLX_DTYPE              dtype string for FastMLXModel.from_pretrained
+                         (default "float16")
+  MLX_BIAS_CORRECTION    "1"/"true"/"yes"/"y" -> adam_bias_correction=True; other non-empty -> False
+                         unset/empty (default) -> not passed; the trainer default applies
+
+Pin: unsloth-zoo HEAD (broken default at the time the question was
+asked) so this probe directly characterizes how the post-#634 code
+behaves under longer training / other seeds.
+
+The probe writes a per-config JSON, unique per matrix `outputs: filename` entry, to
+.out/probe_17__s{S}_d{SEED}_bc{d|0|1}_lr{LR}_bs{BS}_ac{ACCUM}_gv{def|off|V}.json
+
+Question this answers:
+  * does increasing max_steps eventually let bias_correction=True
+    memorize the train row? If yes, MLX is healthy and 7 steps is
+    just too short for the HF/torch math.
+  * does varying the seed (data shuffle, LoRA init) change the
+    basin? If multiple seeds all fail at 7 steps + bc=True, the
+    issue is structural, not lucky/unlucky init.
+
+Always exits 0 -- data dump.
+"""
+
+import json
+import os
+import sys
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+)
+
+
+def _env_bool(name, default=False):
+    """Read env var `name` as a flag: unset/blank -> `default`; else True only for 1/true/yes/y."""
+    text = (os.environ.get(name) or "").strip().lower()
+    # Any other non-empty spelling (e.g. "0", "no") is an explicit False, not the default.
+    return text in ("1", "true", "yes", "y") if text else default
+
+
+def _env_int(name, default):
+    """Read env var `name` as an int; unset, blank, or malformed text yields `default`."""
+    text = (os.environ.get(name) or "").strip()
+    try:
+        # Blank text short-circuits to the default without attempting a parse.
+        return int(text) if text else default
+    except ValueError:
+        return default
+
+
+def _env_str(name, default):
+    """Return env var `name` stripped of surrounding whitespace, or `default` when unset/blank."""
+    return (os.environ.get(name) or "").strip() or default
+
+
+def _env_float(name, default):
+    """Read env var `name` as a float; unset, blank, or malformed text yields `default`."""
+    text = (os.environ.get(name) or "").strip()
+    try:
+        # Blank text short-circuits to the default without attempting a parse.
+        return float(text) if text else default
+    except ValueError:
+        return default
+
+
+def main() -> int:  # every knob is read from env vars; returns 0 once the JSON is written
+    steps = _env_int("MLX_STEPS", 7)
+    seed = _env_int("MLX_SEED", 3407)
+    dtype = _env_str("MLX_DTYPE", "float16")
+    # Tri-state: empty/unset env var means "use trainer default" (don't
+    # pass adam_bias_correction at all); "0"/"1" forces explicit value.
+    bc_raw = (os.environ.get("MLX_BIAS_CORRECTION") or "").strip().lower()
+    if not bc_raw:
+        bc = None
+    else:
+        bc = bc_raw in ("1", "true", "yes", "y")
+    lr = _env_float("MLX_LR", 1e-3)
+    # Grad clip knobs:
+    #   MLX_MAX_GRAD_NORM=  empty/unparsable -> this probe passes max_grad_norm=1.0
+    #   MLX_MAX_GRAD_VALUE= empty/unparsable -> not passed; trainer default (None on PR-663 head)
+    # "none"/"off" map to None (max_grad_norm then becomes 0.0); floats such as "0" are used as-is.
+    def _env_grad(name):
+        raw = (os.environ.get(name) or "").strip().lower()
+        if not raw:
+            return "default"
+        if raw in ("none", "off"):
+            return None
+        try:
+            return float(raw)
+        except ValueError:
+            return "default"
+    grad_norm_override = _env_grad("MLX_MAX_GRAD_NORM")
+    grad_value_override = _env_grad("MLX_MAX_GRAD_VALUE")
+    # Round AW: bisect mlx-lm-vs-unsloth-zoo 80%-vs-60% gap. The two
+    # axes still live (CCE off + GC off in this probe already eliminate
+    # those candidates): grad-accum mechanic (B = bs2*accum3 with token-
+    # weighted mean; A = native bs6 unweighted) + elementwise clip.
+    bs = _env_int("MLX_BS", 2)
+    accum = _env_int("MLX_ACCUM", 3)
+
+    banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc!r} lr={lr} "
+           f"max_grad_norm={grad_norm_override!r} max_grad_value={grad_value_override!r} "
+           f"bs={bs} accum={accum}")
+
+    import random
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+    import dataclasses
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype=dtype,
+        text_only=True, max_seq_length=128, random_state=seed,
+    )
+    model = FastMLXModel.get_peft_model(
+        model, r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+    )
+
+    # Only set adam_bias_correction if (a) the field exists on this
+    # version of unsloth-zoo AND (b) the env var asked for an explicit
+    # value (bc is not None). bc=None means "use the trainer default"
+    # so the artifact records whatever the default actually is.
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}
+    if "adam_bias_correction" in fields_supported and bc is not None:
+        extra["adam_bias_correction"] = bc
+    if grad_value_override != "default" and "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = grad_value_override
+
+    cfg_grad_norm = 1.0 if grad_norm_override == "default" else (grad_norm_override or 0.0)  # None and 0 both become 0.0
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=bs,
+        gradient_accumulation_steps=accum,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=cfg_grad_norm,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe17_outputs_s{steps}_d{seed}_bc{('d' if bc is None else int(bc))}_lr{lr:g}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=config,
+    )
+    rows = []
+    def _on_step(*args):
+        if len(args) < 3:
+            return
+        rows.append({
+            "step": int(args[0]),
+            "loss": float(args[2]),
+            "grad_norm": float(args[8]) if len(args) >= 9 and args[8] is not None else None,
+        })
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    # Teacher-forced completion loss: same shape as the new PR-5537
+    # smoke gate. CE on the "Unsloth!" tokens given the "<<HELLO!!>> My
+    # name is " prompt, no decoding involved. Should be tiny across
+    # every config that hits post_train_loss < 0.1.
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # targets are inputs shifted by one, so completion targets begin here
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+    report("completion_teacher_forced_loss", completion_loss)
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    # Record what the trainer actually used (post-construction) so the
+    # artifact reflects the trainer default when bc was None at probe-
+    # invocation time.
+    effective_bc = getattr(config, "adam_bias_correction", None)
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "dtype": dtype,
+            "adam_bias_correction": bc,
+            "effective_adam_bias_correction": effective_bc,
+            "learning_rate": lr,
+            "per_device_train_batch_size": bs,
+            "gradient_accumulation_steps": accum,
+            "effective_batch_size": bs * accum,
+            "max_grad_value": grad_value_override,
+            "max_grad_norm_setting": cfg_grad_norm,
+            "adam_bc_field_supported": "adam_bias_correction" in fields_supported,
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    lr_tag = f"{lr:.0e}".replace("-0", "-").replace("+0", "")  # e.g. 0.001 -> "1e-03" -> "1e-3"
+    bc_tag = "d" if bc is None else int(bc)
+    if grad_value_override == "default":
+        gv_tag = "def"
+    elif grad_value_override is None:
+        gv_tag = "off"
+    else:
+        gv_tag = f"{grad_value_override:g}"
+    fname = (f"probe_17__s{steps}_d{seed}_bc{bc_tag}_lr{lr_tag}"
+             f"_bs{bs}_ac{accum}_gv{gv_tag}.json")
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    section("summary")
+    if rows:
+        report("step-1 loss", rows[0]["loss"])
+        report(f"step-{len(rows)} loss", rows[-1]["loss"])
+    report("post_train_loss", post_loss_val)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_18_mlx_lm_long.py b/tests/mlx_parity/probe_18_mlx_lm_long.py
new file mode 100644
index 0000000000..afe37a1ccd
--- /dev/null
+++ b/tests/mlx_parity/probe_18_mlx_lm_long.py
@@ -0,0 +1,125 @@
+"""Probe 18 — mlx-lm NATIVE LoRA trainer, 50 iters (long).
+
+Probe 16 trained for 7 iters and emitted "slslsl..." (no Unsloth).
+That's the same iteration count as the upstream smoke; mlx-lm's
+recipe + bias_correction=False MLX default may need longer.
+
+Train for 50 iters with mlx_lm.lora and inspect:
+  * does loss drop?
+  * does the trained adapter eventually generate "Unsloth"?
+
+If yes: MLX framework + mlx-lm native trainer can memorize the row
+when given enough steps; the 7-step smoke just sits at the wrong
+side of the convergence horizon for mlx-lm's recipe.
+
+If no: mlx-lm's native LoRA recipe (different LoRA targets, different
+loss masking) lands somewhere else entirely, and that's a recipe
+issue, not an MLX-framework issue.
+
+Always exits 0 -- data dump.
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:
+    """Run probe 18: train via mlx_lm.lora for 50 iters, generate, dump probe_18.json; returns 0."""
+    seed_everything()
+    banner("Probe 18: mlx-lm NATIVE LoRA trainer, 50 iters")
+
+    workdir = Path(tempfile.mkdtemp(prefix="probe18_"))
+    try:
+        return _run_probe(workdir)
+    finally:
+        # Runs on every exit path, so an exception no longer leaks the temp dir.
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+def _run_probe(workdir) -> int:
+    """Write the dataset under ``workdir``, train, generate from PROMPT, and write the JSON dump."""
+    data_dir = workdir / "data"
+    adapter_dir = workdir / "adapters"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    adapter_dir.mkdir(parents=True, exist_ok=True)
+
+    for split, n_rows in (("train", 64), ("valid", 4)):
+        (data_dir / f"{split}.jsonl").write_text((json.dumps({"text": TRAIN_TEXT}) + "\n") * n_rows)
+
+    cmd = [
+        sys.executable, "-m", "mlx_lm", "lora", "--train",
+        "--model", MODEL_NAME, "--data", str(data_dir),
+        "--adapter-path", str(adapter_dir),
+        "--iters", "50", "--batch-size", "2", "--learning-rate", "1e-3",
+        "--num-layers", "-1", "--steps-per-report", "5", "--steps-per-eval", "200",
+        "--seed", str(SEED),
+    ]
+    section("invoke mlx_lm.lora trainer (50 iters)")
+    report("cmd", " ".join(cmd))
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=1200)
+    except subprocess.TimeoutExpired as exc:
+        # Keep the data-dump contract: record the timeout (returncode None) with any partial output.
+        tails = [b.decode(errors="replace") if isinstance(b, bytes) else (b or "") for b in (exc.stdout, exc.stderr)]
+        proc = subprocess.CompletedProcess(cmd, None, *tails)
+    report("returncode", proc.returncode)
+    print("--- mlx_lm.lora stdout tail ---")
+    print(proc.stdout[-4000:])
+    print("--- mlx_lm.lora stderr tail ---")
+    print(proc.stderr[-2000:])
+
+    losses_per_step = []
+    for line in (proc.stdout + "\n" + proc.stderr).splitlines():
+        if "Iter " in line and "Train loss" in line:
+            try:
+                num = float(line.split("Train loss")[1].strip().split(",")[0].strip())
+                losses_per_step.append(num)
+            except ValueError:  # text after "Train loss" is not a number; skip this line
+                pass
+    report("parsed losses", losses_per_step)
+
+    from mlx_lm import load as mlx_load, generate
+    try:
+        model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir))
+    except TypeError:
+        model, tokenizer = mlx_load(MODEL_NAME)
+        try:
+            from mlx_lm.tuner.utils import load_adapters
+            load_adapters(model, str(adapter_dir))
+        except Exception as e:
+            report("adapter load fallback failed", str(e))
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "cmd": cmd, "returncode": proc.returncode, "iters": 50,
+        "losses": losses_per_step,
+        "generation": gen,
+        "contains_unsloth": contains,
+        "stdout_tail": proc.stdout[-2000:], "stderr_tail": proc.stderr[-2000:],
+    }
+    (OUT_DIR / "probe_18.json").write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_19_mlx_lm_param.py b/tests/mlx_parity/probe_19_mlx_lm_param.py
new file mode 100644
index 0000000000..25b06779f0
--- /dev/null
+++ b/tests/mlx_parity/probe_19_mlx_lm_param.py
@@ -0,0 +1,165 @@
+"""Probe 19 — parameterized mlx-lm NATIVE LoRA training.
+
+Same shape as probe_17 (env-vars + per-config JSON output) but uses
+the canonical `python -m mlx_lm lora --train` instead of unsloth-zoo's
+MLXTrainer. Lets us run the SAME (steps, seed) matrix Round G ran
+against MLXTrainer, with the only difference being the trainer
+itself, so we can isolate:
+
+  * fragile (steps, seed) basins that show up in BOTH trainers
+    -> MLX/optimizer geometry is the cause, not unsloth-zoo
+  * fragile (steps, seed) basins that show up only in MLXTrainer
+    -> unsloth-zoo wrapper has a real bug
+
+Env vars (matches probe_17 naming so the workflow's env block is reused):
+  MLX_STEPS              --iters value (default 7)
+  MLX_SEED               --seed value (default 3407)
+
+Writes per-config JSON to .out/probe_19__s{S}_d{D}.json.
+
+Always exits 0 -- data dump.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+)
+
+
+def _env_int(name, default):
+    """Return env var ``name`` as an int; ``default`` if unset, blank, or not an integer."""
+    text = os.environ.get(name)
+    text = text.strip() if text else ""
+    try:
+        return int(text) if text else default
+    except ValueError:
+        return default
+
+
+def main() -> int:
+    """Run probe 19: train via mlx_lm.lora for MLX_STEPS iters with MLX_SEED, dump JSON; returns 0."""
+    iters = _env_int("MLX_STEPS", 7)
+    seed = _env_int("MLX_SEED", 3407)
+    banner(f"Probe 19: mlx-lm NATIVE LoRA, iters={iters}, seed={seed}")
+
+    import random
+    import numpy as np
+    random.seed(seed)
+    np.random.seed(seed)
+    try:
+        import mlx.core as mx
+        mx.random.seed(seed)
+    except Exception:
+        pass
+
+    workdir = Path(tempfile.mkdtemp(prefix=f"probe19_s{iters}_d{seed}_"))
+    try:
+        return _run_probe(workdir, iters, seed)
+    finally:
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+def _run_probe(workdir, iters, seed) -> int:
+    """Probe body; the caller removes ``workdir`` on every exit path, including exceptions."""
+    data_dir = workdir / "data"
+    adapter_dir = workdir / "adapters"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    adapter_dir.mkdir(parents=True, exist_ok=True)
+
+    for split, n_rows in (("train", 64), ("valid", 4)):
+        (data_dir / f"{split}.jsonl").write_text((json.dumps({"text": TRAIN_TEXT}) + "\n") * n_rows)
+    report("data dir", str(data_dir))
+    report("adapter dir", str(adapter_dir))
+
+    cmd = [
+        sys.executable, "-m", "mlx_lm", "lora", "--train",
+        "--model", MODEL_NAME, "--data", str(data_dir),
+        "--adapter-path", str(adapter_dir),
+        "--iters", str(iters),
+        "--batch-size", "2", "--learning-rate", "1e-3",
+        "--num-layers", "-1", "--steps-per-report", "1",
+        "--steps-per-eval", str(max(iters + 1, 1000)),
+        "--seed", str(seed),
+    ]
+    section("invoke mlx_lm.lora trainer")
+    report("cmd", " ".join(cmd))
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
+    except subprocess.TimeoutExpired as exc:
+        # Keep the data-dump contract: record the timeout (returncode None) with any partial output.
+        tails = [b.decode(errors="replace") if isinstance(b, bytes) else (b or "") for b in (exc.stdout, exc.stderr)]
+        proc = subprocess.CompletedProcess(cmd, None, *tails)
+    report("returncode", proc.returncode)
+    if proc.returncode != 0:
+        print("--- mlx_lm.lora stderr (tail) ---")
+        print(proc.stderr[-2000:])
+
+    losses_per_step = []
+    for line in (proc.stdout + "\n" + proc.stderr).splitlines():
+        if "Iter " in line and "Train loss" in line:
+            try:
+                num = float(
+                    line.split("Train loss")[1].strip().split(",")[0].strip()
+                )
+                losses_per_step.append(num)
+            except ValueError:  # text after "Train loss" is not a number; skip this line
+                pass
+
+    report("parsed losses (count)", len(losses_per_step))
+    if losses_per_step:
+        report("first loss", losses_per_step[0])
+        report("last loss", losses_per_step[-1])
+
+    section("load + generate")
+    from mlx_lm import load as mlx_load, generate
+    try:
+        model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir))
+    except TypeError:
+        model, tokenizer = mlx_load(MODEL_NAME)
+        try:
+            from mlx_lm.tuner.utils import load_adapters
+            load_adapters(model, str(adapter_dir))
+        except Exception as e:
+            report("adapter load fallback failed", str(e))
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {"iters": iters, "seed": seed, "batch_size": 2,
+                   "learning_rate": 1e-3, "num_layers": -1},
+        "returncode": proc.returncode,
+        "losses": losses_per_step,
+        "generation": gen,
+        "contains_unsloth": contains,
+        "stdout_tail": proc.stdout[-1500:],
+        "stderr_tail": proc.stderr[-1500:],
+    }
+    fname = f"probe_19__s{iters}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+
+    section("summary")
+    report("iters", iters)
+    report("seed", seed)
+    report("contains 'Unsloth'", contains)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_1_tokenization.py b/tests/mlx_parity/probe_1_tokenization.py
new file mode 100644
index 0000000000..41df476e4b
--- /dev/null
+++ b/tests/mlx_parity/probe_1_tokenization.py
@@ -0,0 +1,72 @@
+"""Probe 1 — tokenization parity.
+
+Compare two ways of tokenizing the same training text:
+
+  (a) HF SFTTrainer path:        tokenizer(TRAIN_TEXT, return_tensors=...)
+  (b) MLX trainer path:          tokenizer.encode(TRAIN_TEXT); maybe append EOS
+
+Difference in token IDs / length here would explain a different per-token
+denominator and thus a different reported scalar loss, even with identical
+math downstream.
+
+Exits 0 on parity, 2 on divergence (with diagnostic printout).
+"""
+
+import json
+import sys
+
+from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    """Tokenize TRAIN_TEXT via the HF-style call and the MLX-style encode+EOS path, report
+    both id lists, write probe_1.json, and return 0 when they match or 2 when they differ."""
+    seed_everything()
+    banner("Probe 1: tokenization parity")
+
+    from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    report("tokenizer class", type(tok).__name__)
+    for attr in ("vocab_size", "bos_token_id", "eos_token_id", "pad_token_id"):
+        report(attr, getattr(tok, attr))
+
+    def _dump(ids):
+        # Shared four-line report used for both tokenization paths.
+        report("input_ids", ids)
+        report("len", len(ids))
+        report("first/last id", (ids[0], ids[-1]))
+        report("decoded", repr(tok.decode(ids)))
+
+    section("(a) HF SFTTrainer path: tokenizer(TRAIN_TEXT)")
+    hf_ids = list(tok(TRAIN_TEXT, add_special_tokens=True)["input_ids"])
+    _dump(hf_ids)
+
+    section("(b) MLX trainer path: tokenizer.encode + EOS append")
+    mlx_ids = tok.encode(TRAIN_TEXT)
+    # Append EOS only when the tokenizer defines one and the sequence does not already end with it.
+    if tok.eos_token_id is not None and not (mlx_ids and mlx_ids[-1] == tok.eos_token_id):
+        mlx_ids.append(tok.eos_token_id)
+    _dump(mlx_ids)
+
+    section("comparison")
+    identical = hf_ids == mlx_ids
+    length_gap = len(mlx_ids) - len(hf_ids)
+    report("identical id list", identical)
+    report("len_mlx - len_hf", length_gap)
+    if not identical:
+        # Membership-based difference (not positional): ids present in one list, absent from the other.
+        report("ids only in HF path", [t for t in hf_ids if t not in mlx_ids])
+        report("ids only in MLX path", [t for t in mlx_ids if t not in hf_ids])
+
+    payload = {
+        "hf_ids": hf_ids,
+        "mlx_ids": mlx_ids,
+        "delta_len": length_gap,
+        "identical": identical,
+    }
+    (OUT_DIR / "probe_1.json").write_text(json.dumps(payload, indent=2))
+    return 0 if identical else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_20_mlx_lm_aggressive.py b/tests/mlx_parity/probe_20_mlx_lm_aggressive.py
new file mode 100644
index 0000000000..d9ceb10e72
--- /dev/null
+++ b/tests/mlx_parity/probe_20_mlx_lm_aggressive.py
@@ -0,0 +1,230 @@
+"""Probe 20 — mlx-lm NATIVE LoRA matched to unsloth-zoo's aggressive settings.
+
+Probes 13/16/18/19 ran mlx_lm.lora at the CLI defaults (only q/v
+attention projections, effective batch 2, bias_correction=False)
+and showed it can't even memorize the fixture in 30-60 iters (last
+loss 3-5) and barely scrapes "sloth!" at 500 iters.
+
+Probe 20 closes that gap by writing a mlx_lm config YAML that
+matches unsloth-zoo's MLXTrainer settings as closely as the
+CLI permits:
+
+  * lora_parameters.keys : all 7 modules (q/k/v/o/gate/up/down)
+  * lora_parameters.rank : 8
+  * lora_parameters.scale: 2.0 (= alpha 16 / rank 8 per PEFT
+                                convention)
+  * optimizer            : adamw, bias_correction=true
+  * batch_size           : 6 (matches unsloth-zoo's
+                              bs=2 * grad_accum=3 effective)
+  * iters                : matches MLX_STEPS env
+  * learning_rate        : 1e-3 by default
+
+If mlx-lm with these settings ALSO shows ~33-77% Unsloth-pass
+across seeds, the fragility is MLX-level (fp16 + generate path).
+If mlx-lm hits 100% (CUDA-like), unsloth-zoo's wrapper has a
+material implementation difference contributing to the gap.
+
+Env vars (matches probe_17 naming):
+  MLX_STEPS   --iters value (default 30)
+  MLX_SEED    --seed value (default 3407)
+  MLX_LR      learning-rate (default 1e-3)
+
+Writes per-config JSON to .out/probe_20__s{S}_d{D}.json.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+)
+
+
+def _env_int(name, default):
+    """Return env var ``name`` as an int; ``default`` if unset, blank, or not an integer."""
+    text = os.environ.get(name)
+    text = text.strip() if text else ""
+    try:
+        return int(text) if text else default
+    except ValueError:
+        return default
+
+
+def _env_float(name, default):
+    """Return env var ``name`` as a float; ``default`` if unset, blank, or not parseable."""
+    text = os.environ.get(name)
+    text = text.strip() if text else ""
+    try:
+        return float(text) if text else default
+    except ValueError:
+        return default
+
+
+CONFIG_YAML_TMPL = """\
+# unsloth-zoo-matching config for mlx_lm.lora --train
+model: "{model}"
+train: true
+data: "{data_dir}"
+adapter_path: "{adapter_dir}"
+seed: {seed}
+iters: {iters}
+batch_size: 6
+learning_rate: {lr}
+steps_per_report: 1
+steps_per_eval: {steps_per_eval}
+fine_tune_type: "lora"
+lora_parameters:
+  rank: 8
+  scale: 2.0
+  dropout: 0.0
+  keys:
+    - "self_attn.q_proj"
+    - "self_attn.k_proj"
+    - "self_attn.v_proj"
+    - "self_attn.o_proj"
+    - "mlp.gate_proj"
+    - "mlp.up_proj"
+    - "mlp.down_proj"
+optimizer: "adamw"
+optimizer_config:
+  adamw:
+    weight_decay: 0.0
+    bias_correction: true
+"""
+
+
+def main() -> int:
+    """Run probe 20: config-driven mlx_lm.lora training (MLX_STEPS/MLX_SEED/MLX_LR), dump JSON; returns 0."""
+    iters = _env_int("MLX_STEPS", 30)
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    banner(f"Probe 20: mlx-lm NATIVE LoRA aggressive iters={iters} seed={seed} lr={lr}")
+
+    import random
+    import numpy as np
+    random.seed(seed)
+    np.random.seed(seed)
+    try:
+        import mlx.core as mx
+        mx.random.seed(seed)
+    except Exception:
+        pass
+
+    workdir = Path(tempfile.mkdtemp(prefix=f"probe20_s{iters}_d{seed}_"))
+    try:
+        return _run_probe(workdir, iters, seed, lr)
+    finally:
+        # Runs on every exit path, so an exception no longer leaks the temp dir.
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+def _run_probe(workdir, iters, seed, lr) -> int:
+    """Write dataset + YAML config under ``workdir``, train, generate, and write the JSON dump."""
+    data_dir = workdir / "data"
+    adapter_dir = workdir / "adapters"
+    data_dir.mkdir(parents=True, exist_ok=True)
+    adapter_dir.mkdir(parents=True, exist_ok=True)
+
+    for split, n_rows in (("train", 64), ("valid", 8)):
+        (data_dir / f"{split}.jsonl").write_text((json.dumps({"text": TRAIN_TEXT}) + "\n") * n_rows)
+    report("data dir", str(data_dir))
+    report("adapter dir", str(adapter_dir))
+
+    config_path = workdir / "lora_config.yaml"
+    config_path.write_text(
+        CONFIG_YAML_TMPL.format(
+            model=MODEL_NAME,
+            data_dir=str(data_dir),
+            adapter_dir=str(adapter_dir),
+            seed=seed, iters=iters, lr=lr,
+            steps_per_eval=max(iters + 1, 1000),
+        )
+    )
+    report("config yaml", str(config_path))
+    report("config contents", config_path.read_text())
+
+    cmd = [sys.executable, "-m", "mlx_lm", "lora", "--config", str(config_path)]
+    section("invoke mlx_lm.lora trainer (config-driven)")
+    report("cmd", " ".join(cmd))
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=1200)
+    except subprocess.TimeoutExpired as exc:
+        # Record the timeout (returncode None) with any partial output instead of crashing.
+        tails = [b.decode(errors="replace") if isinstance(b, bytes) else (b or "") for b in (exc.stdout, exc.stderr)]
+        proc = subprocess.CompletedProcess(cmd, None, *tails)
+    report("returncode", proc.returncode)
+    if proc.returncode != 0:
+        print("--- mlx_lm.lora stderr (tail) ---")
+        print(proc.stderr[-3000:])
+
+    losses_per_step = []
+    for line in (proc.stdout + "\n" + proc.stderr).splitlines():
+        if "Iter " in line and "Train loss" in line:
+            try:
+                num = float(
+                    line.split("Train loss")[1].strip().split(",")[0].strip()
+                )
+                losses_per_step.append(num)
+            except ValueError:  # text after "Train loss" is not a number; skip this line
+                pass
+
+    report("parsed losses (count)", len(losses_per_step))
+    if losses_per_step:
+        report("first loss", losses_per_step[0])
+        report("last loss", losses_per_step[-1])
+
+    section("load + generate")
+    from mlx_lm import load as mlx_load, generate
+    try:
+        model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir))
+    except TypeError:
+        model, tokenizer = mlx_load(MODEL_NAME)
+        try:
+            from mlx_lm.tuner.utils import load_adapters
+            load_adapters(model, str(adapter_dir))
+        except Exception as e:
+            report("adapter load fallback failed", str(e))
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "iters": iters, "seed": seed, "lr": lr,
+            "batch_size": 6, "rank": 8, "scale": 2.0,
+            "lora_keys_count": 7,
+            "optimizer": "adamw", "bias_correction": True,
+        },
+        "returncode": proc.returncode,
+        "losses": losses_per_step,
+        "generation": gen,
+        "contains_unsloth": contains,
+        "stdout_tail": proc.stdout[-2000:],
+        "stderr_tail": proc.stderr[-2000:],
+    }
+    fname = f"probe_20__s{iters}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+
+    section("summary")
+    report("iters", iters)
+    report("seed", seed)
+    report("contains 'Unsloth'", contains)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_21_hybrid_loader_trainer.py b/tests/mlx_parity/probe_21_hybrid_loader_trainer.py
new file mode 100644
index 0000000000..8f93bdf72d
--- /dev/null
+++ b/tests/mlx_parity/probe_21_hybrid_loader_trainer.py
@@ -0,0 +1,232 @@
+"""Probe 21 — disambiguate LOADER vs TRAINER as the source of the
+~20pp pass-rate gap between mlx-lm native LoRA (~67%) and
+unsloth-zoo MLXTrainer (~40-47%) on the smoke fixture.
+
+Round AX (n=15) confirmed the gap is real: mlx-lm strictly dominates
+unsloth-zoo at every seed (paired comparison). Round AW eliminated
+max_grad_value and the grad-accum mechanic as causes.
+
+This probe builds a HYBRID:
+  * model construction & LoRA wiring via mlx-lm's load() +
+    linear_to_lora_layers() (path A from the audit)
+  * training via unsloth-zoo's MLXTrainer (path B from the audit),
+    configured to mirror mlx-lm's defaults as closely as the
+    MLXTrainingConfig surface allows:
+        max_grad_value=None     # mlx-lm has no clip
+        max_grad_norm=0         # ditto
+        gradient_checkpointing=False
+        use_cce=False
+        compile=False
+        bs=6, accum=1
+        lr=1e-3, weight_decay=0, adamw, bias_correction=True
+
+Reading:
+  pass_rate ≈ 67% (mlx-lm)         -> gap is in FastMLXModel /
+                                      get_peft_model (loader side)
+  pass_rate ≈ 40-47% (unsloth-zoo) -> gap is in MLXTrainer / its
+                                      data sampler / optimizer wiring
+
+Env vars: MLX_SEED (default 3407), MLX_STEPS (default 30), MLX_LR
+(default 1e-3). Writes per-config JSON to
+.out/probe_21__s{S}_d{D}.json.
+"""
+
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+)
+
+
+def _env_int(name, default):
+    """Return env var ``name`` as an int; ``default`` if unset, blank, or not an integer."""
+    text = os.environ.get(name)
+    text = text.strip() if text else ""
+    try:
+        return int(text) if text else default
+    except ValueError:
+        return default
+
+
+def _env_float(name, default):
+    """Return env var ``name`` as a float; ``default`` if unset, blank, or not parseable."""
+    text = os.environ.get(name)
+    text = text.strip() if text else ""
+    try:
+        return float(text) if text else default
+    except ValueError:
+        return default
+
+
+def main() -> int:
+    """Wire LoRA via mlx-lm, train with unsloth-zoo's MLXTrainer, evaluate, dump JSON; returns 0."""
+    steps = _env_int("MLX_STEPS", 30)
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    banner(f"Probe 21: mlx-lm loader + unsloth-zoo trainer "
+           f"steps={steps} seed={seed} lr={lr}")
+
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    # ---- LOADER: exactly mlx-lm's path. ----
+    from mlx_lm import load as mlx_load
+    section("mlx-lm load + LoRA wire (path A)")
+    model, tokenizer = mlx_load(MODEL_NAME)
+    report("loaded model class", type(model).__name__)
+
+    # Mirror mlx-lm/lora.py: freeze BEFORE linear_to_lora_layers.
+    model.freeze()
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    lora_config = {
+        "rank": 8,
+        "scale": 2.0,
+        "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj",
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.o_proj",
+            "mlp.gate_proj",
+            "mlp.up_proj",
+            "mlp.down_proj",
+        ],
+    }
+    try:
+        num_layers = len(model.layers)
+    except AttributeError:
+        num_layers = len(model.model.layers)
+    linear_to_lora_layers(model, num_layers, lora_config)
+    report("LoRA modules wired via mlx-lm path", "OK")
+
+    # Sanity: count trainable params
+    from mlx.utils import tree_flatten
+    report("trainable param leaves", len(tree_flatten(model.trainable_parameters())))
+
+    # ---- TRAINER: unsloth-zoo MLXTrainer (path B). ----
+    section("unsloth-zoo MLXTrainer (path B)")
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None  # match mlx-lm: no elementwise clip
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe21_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=config,
+    )
+    rows = []
+    def _on_step(*args):
+        if len(args) < 3:
+            return
+        rows.append({"step": int(args[0]), "loss": float(args[2])})
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    # ---- POST-TRAIN: same eval signal as probe 17. ----
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and (not ids or ids[-1] != tokenizer.eos_token_id):
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+    report("completion_teacher_forced_loss", completion_loss)
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx-lm (path A)",
+            "trainer": "unsloth-zoo (path B)",
+            "per_device_train_batch_size": 6,
+            "gradient_accumulation_steps": 1,
+            # Read back from the built config so the dump shows what the trainer actually used.
+            "max_grad_value": getattr(config, "max_grad_value", None),
+            "max_grad_norm": 0.0,
+            "adam_bias_correction": getattr(config, "adam_bias_correction", None),
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_21__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+
+    section("summary")
+    if rows:
+        report("step-1 loss", rows[0]["loss"])
+        report(f"step-{len(rows)} loss", rows[-1]["loss"])
+    report("post_train_loss", post_loss_val)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_22_hybrid_reseed.py b/tests/mlx_parity/probe_22_hybrid_reseed.py
new file mode 100644
index 0000000000..cabe8c71a2
--- /dev/null
+++ b/tests/mlx_parity/probe_22_hybrid_reseed.py
@@ -0,0 +1,184 @@
+"""Probe 22 — same hybrid as probe 21 (mlx-lm loader + unsloth-zoo
+trainer) but with the numpy RNG reset RIGHT BEFORE training, mirroring
+what mlx-lm does at lora.py:320 (np.random.seed(args.seed)).
+
+Round AY (probe 21) confirmed the gap is in the TRAINER, not the
+loader: hybrid path matched zoo (47%) not mlx-lm (67%). The leading
+remaining suspect in the trainer is numpy RNG state divergence:
+mlx-lm explicitly re-seeds numpy at training-loop entry; unsloth-zoo
+never re-seeds numpy, so the data sampler reads whatever state the
+LoRA-init + dtype-cast + freeze-flip ops left behind.
+
+If pass_rate ~67% (matches mlx-lm) -> numpy RNG reset is the cause
+If pass_rate ~47% (matches probe 21) -> RNG isn't it; investigate
+                                        other trainer-internal axes
+                                        (extra mx.eval(grad_norm),
+                                        compile graph, etc.)
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Return env var ``name`` as an int; ``default`` if unset, blank, or invalid."""
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Return env var ``name`` as a float; ``default`` if unset, blank, or invalid."""
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:
+    """Probe 21's hybrid with numpy/MLX RNGs re-seeded right before train(); dumps JSON; returns 0."""
+    steps = _env_int("MLX_STEPS", 30)
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    banner(f"Probe 22: mlx-lm loader + zoo trainer + np.seed reset "
+           f"steps={steps} seed={seed} lr={lr}")
+
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    # ---- LOADER: mlx-lm path A. ----
+    from mlx_lm import load as mlx_load
+    section("mlx-lm load + LoRA wire (path A)")
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    lora_config = {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj", "self_attn.k_proj",
+            "self_attn.v_proj", "self_attn.o_proj",
+            "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
+        ],
+    }
+    try: num_layers = len(model.layers)
+    except AttributeError: num_layers = len(model.model.layers)
+    linear_to_lora_layers(model, num_layers, lora_config)
+
+    # ---- TRAINER: zoo MLXTrainer (path B). ----
+    section("zoo MLXTrainer + np.random reset")
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe22_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=config,
+    )
+
+    # ---- KEY DIFFERENCE FROM PROBE 21: mirror mlx-lm/lora.py:320. ----
+    # mlx-lm re-seeds numpy RIGHT BEFORE the training loop so the data
+    # sampler's RNG state is independent of LoRA-init / dtype-cast ops.
+    np.random.seed(seed)
+    mx.random.seed(seed)
+
+    rows = []
+    def _on_step(*args):
+        if len(args) < 3: return
+        rows.append({"step": int(args[0]), "loss": float(args[2])})
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    # ---- POST-TRAIN: same eval as probes 17/21. ----
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and (not ids or ids[-1] != tokenizer.eos_token_id):
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx-lm (path A)", "trainer": "unsloth-zoo (path B) + np.seed reset",
+            "per_device_train_batch_size": 6, "gradient_accumulation_steps": 1,
+            # Read back from the built config so the dump shows what the trainer actually used.
+            "max_grad_value": getattr(config, "max_grad_value", None), "max_grad_norm": 0.0,
+            "adam_bias_correction": getattr(config, "adam_bias_correction", None),
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_22__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_23_hybrid_compile.py b/tests/mlx_parity/probe_23_hybrid_compile.py
new file mode 100644
index 0000000000..2fad647c21
--- /dev/null
+++ b/tests/mlx_parity/probe_23_hybrid_compile.py
@@ -0,0 +1,180 @@
+"""Probe 23 — same hybrid as probe 21 (mlx-lm loader + zoo trainer),
+but with mx.compile ENABLED (compile=True) in the trainer config to
+match mlx-lm's training-loop wrapping at trainer.py:248.
+
+Round AY proved the gap is in the trainer; Round AZ rejected the numpy-
+RNG hypothesis. The biggest remaining structural difference is:
+
+  * mlx-lm wraps the step function with @partial(mx.compile, inputs=
+    state, outputs=state) UNCONDITIONALLY (trainer.py:248)
+  * zoo wraps step_fn with mx.compile only when args.compile=True
+    (trainer.py:921-968). Our probes set compile=False, so the step
+    runs eagerly. mlx-lm runs compiled.
+
+In fp16, op fusion + reordering from mx.compile can change rounding,
+which after 30 steps can shift the model into a different basin
+(memorization works, but greedy-decode first-token argmax differs).
+
+If pass rate ~= 67% (matches mlx-lm) -> compile-mode is the cause
+If pass rate ~= 47% (matches probe 21/22) -> compile isn't it
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Parse env var ``name`` as int; return ``default`` if it is unset, blank, or not a valid int."""
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Parse env var ``name`` as float; return ``default`` if it is unset, blank, or not a valid float."""
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 23 entry point: LoRA-train with compile=True, evaluate, write a JSON report; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # hyper-parameters are overridable via MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    banner(f"Probe 23: mlx-lm loader + zoo trainer + mx.compile=True "
+           f"steps={steps} seed={seed} lr={lr}")
+    # Seed the Python, NumPy and MLX RNGs before the model is loaded.
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    mx.random.seed(seed)
+    # Load via mlx-lm, freeze the loaded parameters, then convert the listed projections to LoRA layers.
+    from mlx_lm import load as mlx_load
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    lora_config = {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj", "self_attn.k_proj",
+            "self_attn.v_proj", "self_attn.o_proj",
+            "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
+        ],
+    }
+    try: num_layers = len(model.layers)  # the layer list lives on the model itself or on model.model
+    except AttributeError: num_layers = len(model.model.layers)
+    linear_to_lora_layers(model, num_layers, lora_config)
+
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    # Pass the optional knobs only when this MLXTrainingConfig dataclass actually declares them.
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=True,  # <-- THE ONLY CHANGE FROM PROBE 22
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe23_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # the same sentence repeated 64 times
+        args=config,
+    )
+    # Re-seed NumPy and MLX immediately before training starts.
+    np.random.seed(seed)
+    mx.random.seed(seed)
+
+    rows = []
+    def _on_step(*args):  # logs step (arg 0) and loss (arg 2); NOTE(review): arg layout is set by MLXTrainer — confirm
+        if len(args) < 3: return
+        rows.append({"step": int(args[0]), "loss": float(args[2])})
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+    # Post-train loss on the training sentence (EOS appended when missing) using zoo's baseline loss.
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+    # Teacher-forced cross-entropy over the completion ("Unsloth!") that follows PROMPT.
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first completion target; assumes PROMPT's tokens are a prefix of full_ids
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+    # Generate up to 48 tokens from PROMPT and record whether "Unsloth" appears in the text.
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+    # Persist the config, per-step losses and evaluation results as JSON under OUT_DIR.
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx-lm (path A)",
+            "trainer": "unsloth-zoo (path B) + compile=True",
+            "compile": True,
+            "per_device_train_batch_size": 6,
+            "gradient_accumulation_steps": 1,
+            "max_grad_value": None, "max_grad_norm": 0.0,
+            "adam_bias_correction": True,
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_23__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py b/tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py
new file mode 100644
index 0000000000..6733b0c0e6
--- /dev/null
+++ b/tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py
@@ -0,0 +1,221 @@
+"""Probe 24 — hybrid (mlx-lm loader + zoo trainer) but with zoo's
+loss function REPLACED by mlx-lm's verbatim default_loss.
+
+Round AY: gap is in trainer (not loader).
+Round AZ: numpy-RNG hypothesis rejected.
+Round BA: compile-mode hypothesis rejected.
+
+Remaining live suspect from the audit: dtype propagation in the
+loss function. The two differ:
+
+  mlx-lm (trainer.py:86):
+    mask = mx.logical_and(...)  # bool
+    ce = nn.losses.cross_entropy(logits, targets) * mask  # fp16 * bool -> fp16
+    ce = ce.astype(mx.float32).sum() / ntoks
+
+  zoo (utils.py:417):
+    mask = length_mask.astype(mx.float32)  # bool -> fp32
+    ce = nn.losses.cross_entropy(logits, safe_targets) * mask  # fp16 * fp32 -> fp32
+    loss = ce.astype(mx.float32).sum() / _safe_token_denominator(ntoks)
+
+The backward through `ce_fp16 * bool` carries gradients in fp16; the
+backward through `ce_fp16 * fp32` carries gradients in fp32. After
+30 steps these rounding differences could move the model into
+different basins.
+
+If pass rate ~= 67% (matches mlx-lm) -> loss dtype propagation is
+                                       the cause
+If pass rate ~= 47% (matches zoo)    -> not it; investigate further
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Parse env var ``name`` as int; return ``default`` if it is unset, blank, or not a valid int."""
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Parse env var ``name`` as float; return ``default`` if it is unset, blank, or not a valid float."""
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 24 entry point: train with mlx-lm's loss patched into zoo, evaluate, write JSON; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # hyper-parameters are overridable via MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    banner(f"Probe 24: hybrid + mlx-lm's verbatim loss fn  "
+           f"steps={steps} seed={seed} lr={lr}")
+    # Seed the Python, NumPy and MLX RNGs before the model is loaded.
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    import mlx.nn as nn
+    mx.random.seed(seed)
+    # Load via mlx-lm, freeze the loaded parameters, then convert the listed projections to LoRA layers.
+    from mlx_lm import load as mlx_load
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    lora_config = {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj", "self_attn.k_proj",
+            "self_attn.v_proj", "self_attn.o_proj",
+            "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
+        ],
+    }
+    try: num_layers = len(model.layers)  # the layer list lives on the model itself or on model.model
+    except AttributeError: num_layers = len(model.model.layers)
+    linear_to_lora_layers(model, num_layers, lora_config)
+
+    # ---- KEY DIFFERENCE: monkey-patch zoo's make_baseline_loss_fn ----
+    # Replace with a verbatim copy of mlx-lm's default_loss
+    # (mlx-lm-src/mlx_lm/tuner/trainer.py:86-99). The signature must
+    # accept (model, batch, lengths, labels=None) since zoo's trainer
+    # calls loss_and_grad_fn(model, batch_data[0], batch_data[1],
+    # batch_data[2]) and batch_data[2] is always None for text models.
+    import unsloth_zoo.mlx.utils as zoo_utils
+
+    def _mlxlm_default_loss_factory():  # zero-arg factory, called the same way as make_baseline_loss_fn()
+        def loss_fn(model, batch, lengths, labels=None):
+            # Verbatim from mlx-lm trainer.py:86-99 (with labels
+            # silently ignored -- our smoke never passes them).
+            inputs = batch[:, :-1]  # next-token setup: inputs drop the last token, targets drop the first
+            targets = batch[:, 1:]
+            logits = model(inputs)
+            steps_ = mx.arange(1, targets.shape[1] + 1)  # 1-based target positions
+            mask = mx.logical_and(steps_ >= lengths[:, 0:1], steps_ <= lengths[:, 1:])
+            ce = nn.losses.cross_entropy(logits, targets) * mask
+            ntoks = mask.sum()  # number of positions kept by the mask
+            ce = ce.astype(mx.float32).sum() / ntoks
+            return ce, ntoks
+        return loss_fn
+
+    _original = zoo_utils.make_baseline_loss_fn
+    zoo_utils.make_baseline_loss_fn = _mlxlm_default_loss_factory
+    # Also patch via direct import path (trainer imports it locally).
+    import unsloth_zoo.mlx.trainer as zoo_trainer
+    zoo_trainer.make_baseline_loss_fn = _mlxlm_default_loss_factory
+    report("monkey-patched make_baseline_loss_fn", "OK")
+
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    # Pass the optional knobs only when this MLXTrainingConfig dataclass actually declares them.
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe24_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # the same sentence repeated 64 times
+        args=config,
+    )
+    # Re-seed NumPy and MLX immediately before training starts.
+    np.random.seed(seed)
+    mx.random.seed(seed)
+
+    rows = []
+    def _on_step(*args):  # logs step (arg 0) and loss (arg 2); NOTE(review): arg layout is set by MLXTrainer — confirm
+        if len(args) < 3: return
+        rows.append({"step": int(args[0]), "loss": float(args[2])})
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    # Eval — use ORIGINAL zoo loss for the post-train measurement so
+    # we're measuring the trained weights, not the patched fn.
+    zoo_utils.make_baseline_loss_fn = _original  # NOTE(review): zoo_trainer.make_baseline_loss_fn stays patched — confirm intended
+    eval_loss_fn = _original()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = eval_loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+    # Teacher-forced cross-entropy over the completion ("Unsloth!") that follows PROMPT.
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first completion target; assumes PROMPT's tokens are a prefix of full_ids
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+    # Generate up to 48 tokens from PROMPT and record whether "Unsloth" appears in the text.
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+    # Persist the config, per-step losses and evaluation results as JSON under OUT_DIR.
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx-lm (path A)",
+            "trainer": "unsloth-zoo (path B) with mlx-lm's verbatim loss",
+            "per_device_train_batch_size": 6,
+            "gradient_accumulation_steps": 1,
+            "max_grad_value": None, "max_grad_norm": 0.0,
+            "adam_bias_correction": True,
+            "compile": False,
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_24__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/tests/mlx_parity/probe_25_mlxlm_loop.py b/tests/mlx_parity/probe_25_mlxlm_loop.py
new file mode 100644
index 0000000000..03b4323206
--- /dev/null
+++ b/tests/mlx_parity/probe_25_mlxlm_loop.py
@@ -0,0 +1,196 @@
+"""Probe 25 — definitive test of TRAINER vs LOSS as gap source.
+
+Round AY proved the gap is in MLXTrainer.train(). Probes 21-24 tried
+patching individual axes (loader, numpy RNG, compile, loss) — none
+closed the gap to 67%.
+
+Probe 25 inverts the test: use mlx-lm's verbatim training-loop logic
+(NO MLXTrainer at all) but with zoo's make_baseline_loss_fn as the
+loss function. If 67% — zoo's loss is irrelevant; the gap is purely
+the training loop. If 47% — zoo's loss is the cause.
+
+This is the COMPLEMENT of probe 24 (which used mlx-lm loss in zoo
+trainer). Together they isolate which side of the boundary owns
+the gap.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Parse env var ``name`` as int; return ``default`` if it is unset, blank, or not a valid int."""
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Parse env var ``name`` as float; return ``default`` if it is unset, blank, or not a valid float."""
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 25 entry point: hand-written mlx-lm-style loop with zoo's loss, evaluate, write JSON; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # hyper-parameters are overridable via MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    banner(f"Probe 25: manual mlx-lm-style loop + zoo's loss  "
+           f"steps={steps} seed={seed} lr={lr}")
+    # Seed the Python, NumPy and MLX RNGs before the model is loaded.
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    mx.random.seed(seed)
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import iterate_batches
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+    from mlx.utils import tree_flatten  # NOTE(review): tree_flatten is not referenced in this function
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    lora_config = {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj", "self_attn.k_proj",
+            "self_attn.v_proj", "self_attn.o_proj",
+            "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
+        ],
+    }
+    try: num_layers = len(model.layers)  # the layer list lives on the model itself or on model.model
+    except AttributeError: num_layers = len(model.model.layers)
+    linear_to_lora_layers(model, num_layers, lora_config)
+
+    # Use ZOO's make_baseline_loss_fn (this is the key swap)
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+    loss_fn = make_baseline_loss_fn()
+    # Adapt zoo's 4-arg signature to mlx-lm's 3-arg call (no labels).
+    def _loss_3arg(model, batch, lengths):
+        # zoo's loss accepts labels=None default
+        return loss_fn(model, batch, lengths, None)
+
+    # Optimizer — match probe 22 / mlx-lm CLI: adamw, bc=True, wd=0
+    optimizer = optim.AdamW(
+        learning_rate=lr, weight_decay=0.0, bias_correction=True
+    )
+
+    # Prepare dataset — same as zoo (TextDataset + CacheDataset)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    # ---- mlx-lm training loop, verbatim ----
+    from functools import partial
+    from mlx.nn.utils import average_gradients
+
+    grad_accum_steps = 1  # match probe 22 / mlx-lm
+    state = [model.state, optimizer.state, mx.random.state]  # handed to mx.compile below as inputs/outputs
+    loss_value_and_grad = nn.value_and_grad(model, _loss_3arg)
+
+    # mlx-lm wraps its step with @partial(mx.compile, inputs=state,
+    # outputs=state). Our compile=False precedent would leave the step
+    # eager, but this probe follows mlx-lm verbatim and DOES compile it.
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):  # one forward/backward; applies the optimizer update when do_update is set
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:
+            from mlx.utils import tree_map
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            if grad_accum_steps > 1:
+                from mlx.utils import tree_map
+                grad = tree_map(lambda x: x / grad_accum_steps, grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing is carried over to the next step after an update
+        return lvalue, toks, grad
+
+    model.train()
+    losses = mx.array(0.0)  # running totals; accumulated in the loop but not written to the JSON report
+    n_tokens = mx.array(0)
+    grad_accum = None
+
+    rows = []
+    np.random.seed(seed)  # mirror lora.py:320
+    for it, batch in zip(
+        range(1, steps * grad_accum_steps + 1),
+        iterate_batches(
+            dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN,
+            loop=True,
+        ),
+    ):
+        do_update = (it % grad_accum_steps == 0)  # always True here because grad_accum_steps == 1
+        lvalue, toks, grad_accum = step(batch, grad_accum, do_update)
+        losses += lvalue
+        n_tokens += toks
+        mx.eval(state, losses, n_tokens, grad_accum)
+        rows.append({"step": it, "loss": float(lvalue.item())})
+
+    # Post-train eval (match probe 22's eval block)
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn as _zoo_loss_factory
+    eval_loss_fn = _zoo_loss_factory()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+    post_loss, _ = eval_loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+    # Teacher-forced cross-entropy over the completion ("Unsloth!") that follows PROMPT.
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first completion target; assumes PROMPT's tokens are a prefix of full_ids
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+    # Generate up to 48 tokens from PROMPT and record whether "Unsloth" appears in the text.
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+    # Persist the config, per-step losses and evaluation results as JSON under OUT_DIR.
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx-lm (path A)",
+            "trainer": "manual mlx-lm-style loop + zoo's make_baseline_loss_fn",
+            "batch_size": 6, "grad_accum_steps": 1,
+            "adam_bias_correction": True, "weight_decay": 0.0,
+            "compile": True,
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_25__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/tests/mlx_parity/probe_26_pure_mlxlm.py b/tests/mlx_parity/probe_26_pure_mlxlm.py
new file mode 100644
index 0000000000..ac26a9f048
--- /dev/null
+++ b/tests/mlx_parity/probe_26_pure_mlxlm.py
@@ -0,0 +1,175 @@
+"""Probe 26 — control: NO unsloth_zoo imports at all.
+
+Probes 22, 23, 24, 25 ALL imported from unsloth_zoo.mlx.* and ALL
+hit 40-50% on this fixture. Probe 20 (mlx-lm CLI subprocess, no
+unsloth_zoo) hits 67%. The hypothesis: just IMPORTING unsloth_zoo
+in-process shifts MLX state enough to land in a different basin.
+
+Probe 26 runs identical mlx-lm-style training in-process but with
+ZERO unsloth_zoo imports. If 67% — the unsloth_zoo import itself
+is the cause. If 47% — something else about the probe environment
+matters and probe 20's 67% was an artifact of subprocess isolation.
+"""
+import json
+import os
+import sys
+import random
+from functools import partial
+from pathlib import Path
+
+import numpy as np
+
+# Mirror _common.py's constants here rather than importing it: any
+# unsloth_zoo import added to _common.py later would defeat this control.
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+PROMPT = "<<HELLO!!>> My name is "
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+MAX_SEQ_LEN = 64
+OUT_DIR = Path(__file__).resolve().with_name(".out")
+OUT_DIR.mkdir(exist_ok=True, parents=True)
+
+
+def _env_int(name, default):
+    """Parse env var ``name`` as int; return ``default`` if it is unset, blank, or not a valid int."""
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Parse env var ``name`` as float; return ``default`` if it is unset, blank, or not a valid float."""
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 26 entry point: mlx-lm-only training loop (no unsloth_zoo), evaluate, write JSON; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # hyper-parameters are overridable via MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    print(f"=== Probe 26: pure mlx-lm, NO unsloth_zoo imports "
+          f"steps={steps} seed={seed} lr={lr} ===", flush=True)
+    # Seed the Python, NumPy and MLX RNGs before the model is loaded.
+    random.seed(seed)
+    np.random.seed(seed)
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_flatten, tree_map  # NOTE(review): tree_flatten is not referenced in this function
+    mx.random.seed(seed)
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+    # Load via mlx-lm, freeze the loaded parameters, then convert the listed projections to LoRA layers.
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    lora_config = {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": [
+            "self_attn.q_proj", "self_attn.k_proj",
+            "self_attn.v_proj", "self_attn.o_proj",
+            "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
+        ],
+    }
+    try: num_layers = len(model.layers)  # the layer list lives on the model itself or on model.model
+    except AttributeError: num_layers = len(model.model.layers)
+    linear_to_lora_layers(model, num_layers, lora_config)
+
+    optimizer = optim.AdamW(
+        learning_rate=lr, weight_decay=0.0, bias_correction=True
+    )
+    # Training set: the same sentence repeated 64 times.
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    grad_accum_steps = 1
+    state = [model.state, optimizer.state, mx.random.state]  # handed to mx.compile below as inputs/outputs
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):  # one forward/backward; applies the optimizer update when do_update is set
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            if grad_accum_steps > 1:
+                grad = tree_map(lambda x: x / grad_accum_steps, grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing is carried over to the next step after an update
+        return lvalue, toks, grad
+
+    model.train()
+    losses = mx.array(0.0)  # running totals; accumulated in the loop but not written to the JSON report
+    n_tokens = mx.array(0)
+    grad_accum = None
+
+    rows = []
+    np.random.seed(seed)  # re-seed NumPy immediately before batches are drawn
+    for it, batch in zip(
+        range(1, steps * grad_accum_steps + 1),
+        iterate_batches(
+            dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True,
+        ),
+    ):
+        do_update = (it % grad_accum_steps == 0)  # always True here because grad_accum_steps == 1
+        lvalue, toks, grad_accum = step(batch, grad_accum, do_update)
+        losses += lvalue
+        n_tokens += toks
+        mx.eval(state, losses, n_tokens, grad_accum)
+        rows.append({"step": it, "loss": float(lvalue.item())})
+
+    # Post-train: evaluate with mlx-lm's default_loss as well (EOS appended when missing).
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])
+    post_loss, _ = default_loss(model, batch, lengths)
+    post_loss_val = float(post_loss.item())
+    # Teacher-forced cross-entropy over the completion ("Unsloth!") that follows PROMPT.
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first completion target; assumes PROMPT's tokens are a prefix of full_ids
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")
+    # Generate up to 48 tokens from PROMPT and record whether "Unsloth" appears in the text.
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    print(f"  generation: {gen[:160]!r}", flush=True)
+    print(f"  contains 'Unsloth': {contains}", flush=True)
+    # Persist the config, per-step losses and evaluation results as JSON under OUT_DIR.
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx-lm (pure)",
+            "trainer": "manual mlx-lm verbatim + default_loss + NO unsloth_zoo",
+            "batch_size": 6, "grad_accum_steps": 1,
+            "adam_bias_correction": True, "weight_decay": 0.0,
+            "compile": True,
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_26__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/tests/mlx_parity/probe_27_subprocess_wrap.py b/tests/mlx_parity/probe_27_subprocess_wrap.py
new file mode 100644
index 0000000000..91d975faa1
--- /dev/null
+++ b/tests/mlx_parity/probe_27_subprocess_wrap.py
@@ -0,0 +1,134 @@
+"""Probe 27 — subprocess wrap of probe 26's code.
+
+Probe 20 (mlx-lm CLI via subprocess.run) hits 67%; probe 26 (identical
+mlx-lm-style code inline) hits 47%. The only differences are:
+ (a) extra subprocess boundary
+ (b) mlx-lm's CLI sets mx.set_wired_limit inside its train() function
+
+Probe 27 tests (a) directly: the same code as probe 26, but executed
+via subprocess.run([sys.executable, '-c', ...]). If 67%, the extra
+subprocess boundary IS the variable.
+"""
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+# Env overrides: a blank value (e.g. an empty CI input) falls back to the default; malformed text still raises ValueError.
+SEED = int((os.environ.get("MLX_SEED") or "").strip() or "3407")
+STEPS = int((os.environ.get("MLX_STEPS") or "").strip() or "30")
+LR = float((os.environ.get("MLX_LR") or "").strip() or "1e-3")
+
+# Inner script: same training as probe 26. It reads MLX_SEED, MLX_STEPS and MLX_LR from the environment
+# (all required, no defaults) and writes its results as JSON to the path given in INNER_OUT_PATH.
+INNER = r'''
+import json, os, random, sys
+from pathlib import Path
+from functools import partial
+import numpy as np
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+MAX_SEQ_LEN = 64
+
+seed = int(os.environ["MLX_SEED"])
+steps = int(os.environ["MLX_STEPS"])
+lr = float(os.environ["MLX_LR"])
+out_path = os.environ["INNER_OUT_PATH"]
+
+random.seed(seed); np.random.seed(seed)
+import mlx.core as mx
+import mlx.nn as nn
+import mlx.optimizers as optim
+from mlx.nn.utils import average_gradients
+from mlx.utils import tree_map
+mx.random.seed(seed)
+
+from mlx_lm import load as mlx_load, generate
+from mlx_lm.tuner.utils import linear_to_lora_layers
+from mlx_lm.tuner.trainer import iterate_batches, default_loss
+from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+model, tokenizer = mlx_load(MODEL_NAME)
+model.freeze()
+linear_to_lora_layers(model, len(model.model.layers if not hasattr(model, "layers") else model.layers), {
+    "rank": 8, "scale": 2.0, "dropout": 0.0,
+    "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+             "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+})
+
+optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+formatted = [{"text": TRAIN_TEXT} for _ in range(64)]
+ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+state = [model.state, optimizer.state, mx.random.state]
+loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+@partial(mx.compile, inputs=state, outputs=state)
+def step(batch, prev_grad, do_update):
+    (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+    if prev_grad is not None:
+        grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+    if do_update:
+        grad = average_gradients(grad)
+        optimizer.update(model, grad)
+        grad = None
+    return lvalue, toks, grad
+
+model.train()
+losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None
+rows = []
+np.random.seed(seed)
+for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)):
+    lvalue, toks, grad_accum = step(batch, grad_accum, True)
+    losses += lvalue; n_tokens += toks
+    mx.eval(state, losses, n_tokens, grad_accum)
+    rows.append({"step": it, "loss": float(lvalue.item())})
+
+ids = tokenizer.encode(TRAIN_TEXT)
+if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+    ids.append(tokenizer.eos_token_id)
+L = len(ids)
+post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]]))
+post_loss_val = float(post_loss.item())
+
+prompt_ids = list(tokenizer.encode(PROMPT))
+full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+if len(full_ids) > len(prompt_ids):
+    cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+    cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)
+    cf_logits = model(cf_inputs)
+    start = len(prompt_ids) - 1
+    completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item())
+else:
+    completion_loss = float("nan")
+
+gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+out = {
+    "config": {"steps": steps, "seed": seed, "learning_rate": lr, "wrap": "subprocess"},
+    "rows": rows, "post_train_loss": post_loss_val,
+    "completion_teacher_forced_loss": completion_loss, "generation": gen,
+    "contains_unsloth": "Unsloth" in gen,
+}
+Path(out_path).write_text(json.dumps(out, indent=2))
+'''
+
+# Launch INNER in a fresh interpreter, passing the hyper-parameters and the result path through the environment.
+result_path = OUT_DIR / f"probe_27__s{STEPS}_d{SEED}.json"
+child_env = {**os.environ, "INNER_OUT_PATH": str(result_path),
+             "MLX_SEED": str(SEED), "MLX_STEPS": str(STEPS), "MLX_LR": str(LR)}
+child = subprocess.run([sys.executable, "-c", INNER], env=child_env,
+                       capture_output=True, text=True, timeout=1200)
+if child.returncode:
+    # Show only the tail of the child's stderr, then propagate its exit status.
+    print("--- inner stderr ---", flush=True)
+    print(child.stderr[-3000:])
+    sys.exit(child.returncode)
+print(child.stdout[-1000:], flush=True)
+summary = json.loads(result_path.read_text())
+print(f"seed={SEED} contains={summary['contains_unsloth']} post={summary['post_train_loss']:.4f} cf={summary['completion_teacher_forced_loss']:.4f}")
+print(f"gen={summary['generation'][:80]!r}")
diff --git a/tests/mlx_parity/probe_28_set_wired_limit.py b/tests/mlx_parity/probe_28_set_wired_limit.py
new file mode 100644
index 0000000000..cea43fad96
--- /dev/null
+++ b/tests/mlx_parity/probe_28_set_wired_limit.py
@@ -0,0 +1,135 @@
+"""Probe 28 — probe 26 + mx.set_wired_limit (mlx-lm's train() does this).
+
+mlx-lm's `train()` at trainer.py:228-229 calls
+mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"])
+right at the start. probe 26 doesn't. If this single allocator hint
+changes basin selection (via Metal kernel JIT path), probe 28 hits 67%.
+"""
+import json
+import os
+import sys
+import random
+from functools import partial
+from pathlib import Path
+import numpy as np
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+MAX_SEQ_LEN = 64
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _env_int(name, default):
+    # Integer value of env var `name`; `default` when unset, blank, or not an int.
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    # Float value of env var `name`; `default` when unset, blank, or not a float.
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 28 entry point: LoRA-train, evaluate, write a JSON report into OUT_DIR; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # knobs come from MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    print(f"=== Probe 28: probe26 + mx.set_wired_limit  steps={steps} seed={seed} lr={lr} ===", flush=True)
+
+    random.seed(seed); np.random.seed(seed)
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_map
+    mx.random.seed(seed)  # MLX RNG is seeded BEFORE the model is loaded in this probe
+
+    # >>> THE ONLY DIFFERENCE FROM PROBE 26 <<<
+    if mx.metal.is_available():
+        wired = mx.device_info()["max_recommended_working_set_size"]
+        mx.set_wired_limit(wired)
+        print(f"  set_wired_limit({wired})", flush=True)
+
+    from mlx_lm import load as mlx_load, generate
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    try: num_layers = len(model.layers)  # full layer count is passed on, i.e. no "last N" restriction
+    except AttributeError: num_layers = len(model.model.layers)  # fallback when layers live on model.model
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 copies of the same training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    state = [model.state, optimizer.state, mx.random.state]  # state captured by the compiled step below
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):  # one forward/backward; optionally accumulates and applies the update
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing is carried over once the update has been applied
+        return lvalue, toks, grad
+
+    model.train()
+    losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None
+    rows = []  # per-step loss records for the JSON report
+    np.random.seed(seed)  # re-seed NumPy right before batching (presumably iterate_batches shuffles via NumPy -- TODO confirm)
+    for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)):
+        lvalue, toks, grad_accum = step(batch, grad_accum, True)  # do_update=True each step, so grad_accum stays None
+        losses += lvalue; n_tokens += toks
+        mx.eval(state, losses, n_tokens, grad_accum)
+        rows.append({"step": it, "loss": float(lvalue.item())})
+
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the scored sequence ends with EOS
+    L = len(ids)
+    post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]]))  # NOTE(review): [1, L-1] looks like an (offset, length) mask spec -- confirm against default_loss
+    post_loss_val = float(post_loss.item())
+
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # inputs are the sequence without its last token
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same sequence shifted by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids -- TODO confirm)
+        completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item())
+    else:
+        completion_loss = float("nan")  # no completion tokens to score
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # success criterion recorded as contains_unsloth
+    print(f"  contains 'Unsloth': {contains}  gen={gen[:80]!r}", flush=True)
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr, "extra": "set_wired_limit"},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_28__s{steps}_d{seed}.json"  # one report per (steps, seed) pair
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_29_call_train_fn.py b/tests/mlx_parity/probe_29_call_train_fn.py
new file mode 100644
index 0000000000..b2e6c835c5
--- /dev/null
+++ b/tests/mlx_parity/probe_29_call_train_fn.py
@@ -0,0 +1,126 @@
+"""Probe 29 — probe 26 but call mlx-lm's train() function directly,
+not inline its loop.
+
+If probe 26 (manual inline of mlx-lm train()) hits 47% but probe 29
+(actual call to mlx_lm.tuner.trainer.train()) hits 67%, then either:
+  - my inline replication has a subtle math difference, OR
+  - train() does something at function-entry that the inline missed
+    (e.g. mx.distributed.init, set_wired_limit, etc.)
+"""
+import json
+import os
+import sys
+import random
+from pathlib import Path
+import numpy as np
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+MAX_SEQ_LEN = 64
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _env_int(name, default):
+    # Integer value of env var `name`; `default` when unset, blank, or not an int.
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    # Float value of env var `name`; `default` when unset, blank, or not a float.
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 29 entry point: train via mlx-lm's train(), evaluate, write a JSON report; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # knobs come from MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    print(f"=== Probe 29: call mlx-lm train() directly  steps={steps} seed={seed} lr={lr} ===", flush=True)
+
+    random.seed(seed); np.random.seed(seed)
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    mx.random.seed(seed)  # MLX RNG is seeded BEFORE the model is loaded in this probe
+
+    from mlx_lm import load as mlx_load, generate
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import train, TrainingArgs, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()
+    try: num_layers = len(model.layers)  # full layer count is passed on, i.e. no "last N" restriction
+    except AttributeError: num_layers = len(model.model.layers)  # fallback when layers live on model.model
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 copies of the same training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    training_args = TrainingArgs(
+        batch_size=6,
+        iters=steps,
+        max_seq_length=MAX_SEQ_LEN,
+        grad_accumulation_steps=1,
+        steps_per_report=1,
+        steps_per_eval=steps + 1, steps_per_save=steps + 1,  # intervals longer than the run: no periodic eval/save
+        adapter_file=str(OUT_DIR / f"probe_29__s{steps}_d{seed}_adapters.safetensors"),  # train() still saves final adapters; keep them out of cwd
+        grad_checkpoint=False,
+    )
+
+    train(
+        model=model,
+        args=training_args,
+        optimizer=optimizer,
+        train_dataset=ds,
+        val_dataset=None,
+        loss=default_loss,
+        training_callback=None,
+    )
+
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the scored sequence ends with EOS
+    L = len(ids)
+    post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]]))  # NOTE(review): [1, L-1] looks like an (offset, length) mask spec -- confirm against default_loss
+    post_loss_val = float(post_loss.item())
+
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # inputs are the sequence without its last token
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same sequence shifted by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids -- TODO confirm)
+        completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item())
+    else:
+        completion_loss = float("nan")  # no completion tokens to score
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # success criterion recorded as contains_unsloth
+    print(f"  contains 'Unsloth': {contains}  gen={gen[:80]!r}", flush=True)
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr, "via": "mlx_lm.tuner.trainer.train()"},
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_29__s{steps}_d{seed}.json"  # one report per (steps, seed) pair
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_2_forward_logits.py b/tests/mlx_parity/probe_2_forward_logits.py
new file mode 100644
index 0000000000..db038648ad
--- /dev/null
+++ b/tests/mlx_parity/probe_2_forward_logits.py
@@ -0,0 +1,107 @@
+"""Probe 2 — base-model forward logits parity.
+
+Load gemma-3-270m-it under BOTH backends (HF transformers, MLX via mlx-lm)
+with NO LoRA attached. Feed identical token IDs. Capture logits.
+Compare:
+  * logit dtype / shape
+  * argmax token sequence
+  * mean/max absolute logit difference
+  * mean / max softmax probability difference
+
+If the base-model forward is bit-equivalent then any downstream loss
+discrepancy can be blamed on the loss-reduction layer (probe 3) or the
+LoRA path (probes 4-5). If the base-model forward diverges measurably
+here, that is itself a parity bug.
+
+Exits 0 if max softmax diff <= 5e-3 and argmax agrees at every position, else 2.
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:  # Probe 2 entry point: compare HF vs MLX base-model logits; returns 0 on parity, 2 otherwise.
+    seed_everything()
+    banner("Probe 2: base-model forward logits parity")
+
+    from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    ids = tok.encode(TRAIN_TEXT)
+    if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id:
+        ids.append(tok.eos_token_id)  # make sure the sequence ends with EOS
+    report("token_ids", ids)
+    report("len", len(ids))
+
+    # ----------------- HF side -----------------
+    section("HF transformers forward")
+    import torch
+    from transformers import AutoModelForCausalLM
+    hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32)  # HF reference is loaded in fp32
+    hf_model.eval()
+    with torch.no_grad():
+        hf_logits = hf_model(
+            input_ids=torch.tensor([ids], dtype=torch.long),
+        ).logits[0].float().cpu().numpy()  # drop the batch dim and move to NumPy
+    report("logits shape", hf_logits.shape)
+    report("logits dtype", hf_logits.dtype)
+    report("argmax[:8]", hf_logits[:8].argmax(axis=-1).tolist())
+
+    # ----------------- MLX side -----------------
+    section("MLX (mlx-lm) forward")
+    import mlx.core as mx
+    from mlx_lm import load as mlx_load
+    mlx_model, _ = mlx_load(MODEL_NAME)  # the same token ids are fed; the mlx-lm tokenizer is not used
+    mlx_logits = np.asarray(mlx_model(mx.array([ids])).astype(mx.float32))[0]  # cast to fp32 before the NumPy conversion
+    report("logits shape", mlx_logits.shape)
+    report("logits dtype", mlx_logits.dtype)
+    report("argmax[:8]", mlx_logits[:8].argmax(axis=-1).tolist())
+
+    # ----------------- compare -----------------
+    section("comparison")
+    if hf_logits.shape != mlx_logits.shape:
+        report("FATAL: shape mismatch", (hf_logits.shape, mlx_logits.shape))
+        return 2  # element-wise comparison is impossible; no JSON is written on this path
+
+    abs_diff = np.abs(hf_logits - mlx_logits)
+    report("max |logit diff|", float(abs_diff.max()))
+    report("mean |logit diff|", float(abs_diff.mean()))
+
+    def softmax(x):  # softmax over the last axis
+        x = x - x.max(axis=-1, keepdims=True)  # subtract the row max so exp() cannot overflow
+        e = np.exp(x)
+        return e / e.sum(axis=-1, keepdims=True)
+
+    hf_p = softmax(hf_logits)
+    mlx_p = softmax(mlx_logits)
+    prob_diff = np.abs(hf_p - mlx_p)
+    max_pd = float(prob_diff.max())
+    report("max |softmax diff|", max_pd)
+    report("mean |softmax diff|", float(prob_diff.mean()))
+
+    hf_argmax = hf_logits.argmax(axis=-1)
+    mlx_argmax = mlx_logits.argmax(axis=-1)
+    argmax_match = (hf_argmax == mlx_argmax).mean()  # fraction of positions where both backends pick the same token
+    report("argmax match rate", float(argmax_match))
+
+    out = {
+        "token_ids": ids,
+        "max_logit_diff": float(abs_diff.max()),
+        "mean_logit_diff": float(abs_diff.mean()),
+        "max_softmax_diff": max_pd,
+        "argmax_match_rate": float(argmax_match),
+    }
+    (OUT_DIR / "probe_2.json").write_text(json.dumps(out, indent=2))  # written before the pass/fail decision
+
+    # 5e-3 softmax tolerance accommodates bf16/fp32 numerics; argmax
+    # should fully agree on a well-trained instruct model.
+    if max_pd > 5e-3 or argmax_match < 1.0:
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_30_seed_after_load.py b/tests/mlx_parity/probe_30_seed_after_load.py
new file mode 100644
index 0000000000..cfcb061fff
--- /dev/null
+++ b/tests/mlx_parity/probe_30_seed_after_load.py
@@ -0,0 +1,141 @@
+"""Probe 30 — probe 26 but seed mx.random AFTER model load (matching
+mlx-lm CLI's lora.py:223 order).
+
+If model loading consumes any mx.random state, the lora_a init
+values differ between probe 26 (seed before load) and probe 20
+(seed after load via lora.py:223). probe 30 reorders to match
+mlx-lm CLI exactly. If 67% — seed order IS the cause.
+"""
+import json
+import os
+import sys
+import random
+from functools import partial
+from pathlib import Path
+import numpy as np
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+MAX_SEQ_LEN = 64
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _env_int(name, default):
+    # Integer value of env var `name`; `default` when unset, blank, or not an int.
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    # Float value of env var `name`; `default` when unset, blank, or not a float.
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 30 entry point: LoRA-train with MLX RNG seeded after load, evaluate, write JSON; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # knobs come from MLX_STEPS / MLX_SEED / MLX_LR
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    print(f"=== Probe 30: seed mx.random AFTER model load  steps={steps} seed={seed} lr={lr} ===", flush=True)
+
+    # NOTE: do NOT seed mx.random here. It is seeded right after mlx_load() further down.
+    random.seed(seed); np.random.seed(seed)
+
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_map
+
+    from mlx_lm import load as mlx_load, generate
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+
+    # Seed AFTER load -- mlx-lm CLI lora.py:223 does this.
+    mx.random.seed(seed)
+
+    model.freeze()
+    try: num_layers = len(model.layers)  # full layer count is passed on, i.e. no "last N" restriction
+    except AttributeError: num_layers = len(model.model.layers)  # fallback when layers live on model.model
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 copies of the same training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    # mlx-lm's train() also sets wired_limit. Include that too so probe
+    # 30 is identical to mlx-lm CLI's setup as far as I can replicate.
+    if mx.metal.is_available():
+        mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"])
+
+    state = [model.state, optimizer.state, mx.random.state]  # state captured by the compiled step below
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):  # one forward/backward; optionally accumulates and applies the update
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing is carried over once the update has been applied
+        return lvalue, toks, grad
+
+    model.train()
+    losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None
+    rows = []  # per-step loss records for the JSON report
+    np.random.seed(seed)  # re-seed NumPy right before batching (presumably iterate_batches shuffles via NumPy -- TODO confirm)
+    for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)):
+        lvalue, toks, grad_accum = step(batch, grad_accum, True)  # do_update=True each step, so grad_accum stays None
+        losses += lvalue; n_tokens += toks
+        mx.eval(state, losses, n_tokens, grad_accum)
+        rows.append({"step": it, "loss": float(lvalue.item())})
+
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the scored sequence ends with EOS
+    L = len(ids)
+    post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]]))  # NOTE(review): [1, L-1] looks like an (offset, length) mask spec -- confirm against default_loss
+    post_loss_val = float(post_loss.item())
+
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # inputs are the sequence without its last token
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same sequence shifted by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids -- TODO confirm)
+        completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item())
+    else:
+        completion_loss = float("nan")  # no completion tokens to score
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # success criterion recorded as contains_unsloth
+    print(f"  contains 'Unsloth': {contains}  gen={gen[:80]!r}", flush=True)
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr,
+                   "delta": "mx.random.seed AFTER model load + set_wired_limit"},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_30__s{steps}_d{seed}.json"  # one report per (steps, seed) pair
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_31_num_layers_16.py b/tests/mlx_parity/probe_31_num_layers_16.py
new file mode 100644
index 0000000000..846db925d4
--- /dev/null
+++ b/tests/mlx_parity/probe_31_num_layers_16.py
@@ -0,0 +1,151 @@
+"""Probe 31 — THE FIX: probe 30 + num_layers=16 (mlx-lm CLI default).
+
+CRITICAL DISCOVERY:
+  Gemma-3-270m-it has 18 hidden layers.
+  mlx-lm CLI's CONFIG_DEFAULTS['num_layers'] = 16 (lora.py:56).
+  So probe 20 trains LoRA on the LAST 16 layers only.
+  My probes 22-26+30 used len(model.layers)=18, training all 18.
+
+That's 14 extra LoRA modules (2 layers x 7 modules) consuming mx.random
+state during init and adding trainable parameters. Different lora_a
+init values AND a different trainable-param set = different basin.
+
+Probe 31 = probe 30 with num_layers=16 (matching mlx-lm CLI default).
+If 67%, THIS is the cause of the 20pp gap.
+"""
+import json
+import os
+import sys
+import random
+from functools import partial
+from pathlib import Path
+import numpy as np
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+MAX_SEQ_LEN = 64
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _env_int(name, default):
+    # Integer value of env var `name`; `default` when unset, blank, or not an int.
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    # Float value of env var `name`; `default` when unset, blank, or not a float.
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 31 entry point: probe 30 with LoRA limited to the last MLX_NUM_LAYERS layers; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # knobs come from MLX_STEPS / MLX_SEED / MLX_LR / MLX_NUM_LAYERS
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    num_layers = _env_int("MLX_NUM_LAYERS", 16)
+    print(f"=== Probe 31: probe 30 + num_layers={num_layers}  steps={steps} seed={seed} lr={lr} ===", flush=True)
+
+    random.seed(seed); np.random.seed(seed)
+
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_map
+
+    from mlx_lm import load as mlx_load, generate
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+
+    # Seed AFTER load (mlx-lm CLI lora.py:223)
+    mx.random.seed(seed)
+
+    model.freeze()
+
+    actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers)
+    if num_layers <= 0 or num_layers > actual_layers:  # mlx-lm reads num_layers <= 0 as "all layers"; report the real count
+        num_layers = actual_layers
+    print(f"  model has {actual_layers} layers, training LoRA on last {num_layers}", flush=True)
+
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 copies of the same training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    if mx.metal.is_available():
+        mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"])
+
+    state = [model.state, optimizer.state, mx.random.state]  # state captured by the compiled step below
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):  # one forward/backward; optionally accumulates and applies the update
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing is carried over once the update has been applied
+        return lvalue, toks, grad
+
+    model.train()
+    losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None
+    rows = []  # per-step loss records for the JSON report
+    np.random.seed(seed)  # re-seed NumPy right before batching (presumably iterate_batches shuffles via NumPy -- TODO confirm)
+    for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)):
+        lvalue, toks, grad_accum = step(batch, grad_accum, True)  # do_update=True each step, so grad_accum stays None
+        losses += lvalue; n_tokens += toks
+        mx.eval(state, losses, n_tokens, grad_accum)
+        rows.append({"step": it, "loss": float(lvalue.item())})
+
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the scored sequence ends with EOS
+    L = len(ids)
+    post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]]))  # NOTE(review): [1, L-1] looks like an (offset, length) mask spec -- confirm against default_loss
+    post_loss_val = float(post_loss.item())
+
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # inputs are the sequence without its last token
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same sequence shifted by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids -- TODO confirm)
+        completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item())
+    else:
+        completion_loss = float("nan")  # no completion tokens to score
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # success criterion recorded as contains_unsloth
+    print(f"  contains 'Unsloth': {contains}  gen={gen[:80]!r}", flush=True)
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr,
+                   "num_layers": num_layers, "actual_layers": actual_layers,
+                   "delta": f"num_layers={num_layers} (mlx-lm CLI default)"},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_31__s{steps}_d{seed}_nl{num_layers}.json"  # filename carries the effective layer count
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_32_zoo_with_fix.py b/tests/mlx_parity/probe_32_zoo_with_fix.py
new file mode 100644
index 0000000000..346b112b3d
--- /dev/null
+++ b/tests/mlx_parity/probe_32_zoo_with_fix.py
@@ -0,0 +1,180 @@
+"""Probe 32 — end-to-end test of the unsloth-zoo fix.
+
+Uses unsloth_zoo.mlx.loader.FastMLXModel.from_pretrained +
+get_peft_model(finetune_last_n_layers=16) + MLXTrainer with the
+mlx-lm-matching config (clip=off, bs=6, accum=1, lr=1e-3, bc=True).
+
+If 67% with the same per-seed pattern as probe 20, the FIX works
+through zoo's public API end-to-end. The probe pins zoo to the
+PR branch via the workflow's pip install (see workflow YAML).
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    # Integer value of env var `name`; `default` when unset, blank, or not an int.
+    text = (os.environ.get(name) or "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    # Float value of env var `name`; `default` when unset, blank, or not a float.
+    text = (os.environ.get(name) or "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 32 entry point: train through unsloth_zoo's loader + MLXTrainer, evaluate, write JSON; returns 0.
+    steps = _env_int("MLX_STEPS", 30)  # knobs come from MLX_STEPS / MLX_SEED / MLX_LR / MLX_LAST_N
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 32: zoo FastMLXModel + finetune_last_n_layers={last_n}  "
+           f"steps={steps} seed={seed} lr={lr}")
+
+    random.seed(seed); np.random.seed(seed)
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype="float16",
+        text_only=True, max_seq_length=128, random_state=seed,
+    )
+
+    # Verify the new parameter is on get_peft_model. If it's missing
+    # (e.g. installed zoo doesn't have the fix yet), fail with a clear
+    # error so the matrix surfaces the install drift.
+    import inspect
+    sig = inspect.signature(FastMLXModel.get_peft_model)
+    if "finetune_last_n_layers" not in sig.parameters:
+        raise RuntimeError(
+            "Installed unsloth_zoo lacks finetune_last_n_layers parameter. "
+            "This probe must run against the fix branch."
+        )
+
+    model = FastMLXModel.get_peft_model(
+        model, r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        finetune_last_n_layers=last_n,
+        use_gradient_checkpointing=False,
+    )
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}  # optional overrides, passed only when the installed config declares the field
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe32_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=config,
+    )
+    rows = []  # per-step loss records for the JSON report
+    def _on_step(*args):  # NOTE(review): treats args[0] as the step and args[2] as the loss -- confirm against MLXTrainer
+        if len(args) < 3: return
+        rows.append({"step": int(args[0]), "loss": float(args[2])})
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the scored sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # NOTE(review): looks like an (offset, length) mask spec -- confirm against the loss fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # inputs are the sequence without its last token
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same sequence shifted by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids -- TODO confirm)
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # no completion tokens to score
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # success criterion recorded as contains_unsloth
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "unsloth_zoo FastMLXModel",
+            "trainer": "unsloth_zoo MLXTrainer",
+            "finetune_last_n_layers": last_n,
+            "per_device_train_batch_size": 6,
+            "gradient_accumulation_steps": 1,
+            "max_grad_value": None, "max_grad_norm": 0.0,
+            "adam_bias_correction": extra.get("adam_bias_correction"),  # None when the installed config lacks the field
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_32__s{steps}_d{seed}_nl{last_n}.json"  # one report per (steps, seed, last_n)
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py b/tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py
new file mode 100644
index 0000000000..1ff5593ac4
--- /dev/null
+++ b/tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py
@@ -0,0 +1,163 @@
+"""Probe 33 — mlx_lm loader + zoo MLXTrainer + num_layers=16.
+
+Bisects whether zoo's LOADER or its TRAINER adds the additional
+basin instability seen in probe 32 (which used zoo's full stack
+with num_layers=16 and hit only 15%).
+
+Probe 31 (mlx_lm.load + manual loop + 16): 67%
+Probe 32 (FastMLXModel    + MLXTrainer  + 16): 15%
+Probe 33 (mlx_lm.load     + MLXTrainer  + 16): ?
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`; `default` when unset, blank, or not an int."""
+    text = os.environ.get(name, "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`; `default` when unset, blank, or not a number."""
+    text = os.environ.get(name, "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 33 driver: mlx_lm load + LoRA, zoo MLXTrainer run, post-train eval, JSON dump; returns 0
+    steps = _env_int("MLX_STEPS", 30)  # each knob: env-var override, else the default given here
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 33: mlx_lm loader + zoo MLXTrainer + last_n={last_n}  "
+           f"steps={steps} seed={seed} lr={lr}")
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs
+    import mlx.core as mx
+    mx.random.seed(seed)  # seed MLX's RNG with the same value
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()  # freeze the loaded weights before LoRA layers are added
+    try: num_layers = len(model.layers)  # layer list is either on the model itself...
+    except AttributeError: num_layers = len(model.model.layers)  # ...or one level down
+    num_layers = max(1, min(int(last_n), num_layers))  # clamp the request to [1, total layers]
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}  # optional knobs, passed only if the installed MLXTrainingConfig declares them
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,  # compile stays off in this probe
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe33_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 copies of the same single example
+        args=config,
+    )
+    rows = []  # per-step log filled by the callback below
+    def _on_step(*args):
+        if len(args) < 3: return  # ignore callback invocations with fewer than three positionals
+        rows.append({"step": int(args[0]), "loss": float(args[2])})  # args[0] logged as step, args[2] as loss
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()  # post-train loss on the training text itself
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably a (start, end) span for loss_fn — confirm against make_baseline_loss_fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):  # teacher-forced loss over the appended completion tokens only
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the inputs shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt tokens
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # appending the completion added no tokens
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # pass criterion: plain substring check on the generated text
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx_lm.load",
+            "trainer": "unsloth_zoo MLXTrainer",
+            "num_layers": num_layers,
+            "per_device_train_batch_size": 6,
+            "gradient_accumulation_steps": 1,
+            "max_grad_value": None, "max_grad_norm": 0.0,
+            "adam_bias_correction": True,  # NOTE(review): max_grad_value / adam_bias_correction are literals here, not read from `extra`
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_33__s{steps}_d{seed}_nl{num_layers}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))  # one JSON result file per (steps, seed, layers) combination
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/mlx_parity/probe_34_zoo_loader_no_dtype.py b/tests/mlx_parity/probe_34_zoo_loader_no_dtype.py
new file mode 100644
index 0000000000..6ac8dda3c7
--- /dev/null
+++ b/tests/mlx_parity/probe_34_zoo_loader_no_dtype.py
@@ -0,0 +1,159 @@
+"""Probe 34 — same as probe 32 but with dtype=None (skip FastMLXModel's
+bf16->fp16 cast on Gemma3).
+
+Probe 32 (FastMLXModel(dtype='float16') + MLXTrainer + nl=16): 15%.
+Probe 33 (mlx_lm.load             + MLXTrainer + nl=16): 53%.
+
+Hypothesis: zoo's _convert_mlx_dtype casts gemma3-270m from its
+native bf16 to fp16, which is a lossy cast (fp16 max ~6.5e4 vs
+bf16 max ~3.4e38). If True, probe 34 (no cast) should recover
+toward 53%.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`; `default` when unset, blank, or not an int."""
+    text = os.environ.get(name, "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`; `default` when unset, blank, or not a number."""
+    text = os.environ.get(name, "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 34 driver: FastMLXModel(dtype=None) + LoRA, zoo MLXTrainer run, post-train eval, JSON dump; returns 0
+    steps = _env_int("MLX_STEPS", 30)  # each knob: env-var override, else the default given here
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 34: zoo FastMLXModel(dtype=None) + finetune_last_n_layers={last_n}")
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs
+    import mlx.core as mx
+    mx.random.seed(seed)  # seed MLX's RNG with the same value
+
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    # dtype=None tells FastMLXModel to keep the storage dtype.
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype=None,
+        text_only=True, max_seq_length=128, random_state=seed,  # NOTE(review): literal 128 here, MAX_SEQ_LEN in the trainer config — confirm they agree
+    )
+
+    import inspect
+    sig = inspect.signature(FastMLXModel.get_peft_model)
+    assert "finetune_last_n_layers" in sig.parameters, "zoo build missing the fix"  # fail fast on a zoo build without this parameter
+
+    model = FastMLXModel.get_peft_model(
+        model, r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        finetune_last_n_layers=last_n,
+        use_gradient_checkpointing=False,
+    )
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}  # optional knobs, passed only if the installed MLXTrainingConfig declares them
+    if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported: extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,  # compile stays off in this probe
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe34_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 copies of the same single example
+        args=config,
+    )
+    rows = []  # per-step log filled by the callback below
+    def _on_step(*args):
+        if len(args) < 3: return  # ignore callback invocations with fewer than three positionals
+        rows.append({"step": int(args[0]), "loss": float(args[2])})  # args[0] logged as step, args[2] as loss
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()  # post-train loss on the training text itself
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably a (start, end) span for loss_fn — confirm against make_baseline_loss_fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):  # teacher-forced loss over the appended completion tokens only
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the inputs shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt tokens
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # appending the completion added no tokens
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # pass criterion: plain substring check on the generated text
+    report("contains 'Unsloth'", contains)
+    report("generation", repr(gen[:60]))
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr,
+                   "loader": "FastMLXModel(dtype=None)", "finetune_last_n_layers": last_n},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_34__s{steps}_d{seed}_nl{last_n}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))  # one JSON result file per (steps, seed, last_n) combination
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/mlx_parity/probe_35_zoo_trainer_compile_on.py b/tests/mlx_parity/probe_35_zoo_trainer_compile_on.py
new file mode 100644
index 0000000000..032d039ef8
--- /dev/null
+++ b/tests/mlx_parity/probe_35_zoo_trainer_compile_on.py
@@ -0,0 +1,166 @@
+"""Probe 35 — probe 33 but with MLXTrainer's compile knob ON.
+
+Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False): 53%
+Probe 31 (mlx_lm.load + manual loop + nl=16 + @mx.compile): 67%
+
+Hypothesis: the -14pp gap between manual-loop and zoo MLXTrainer at
+the same loader / layer count is purely the compile flag. Probe 33
+disabled compile via `compile=False` while probe 31's manual loop
+always uses `@mx.compile`. If true, probe 35 should recover to ~67%.
+
+Probe 35 = probe 33 verbatim except `compile=True`.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`; `default` when unset, blank, or not an int."""
+    text = os.environ.get(name, "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`; `default` when unset, blank, or not a number."""
+    text = os.environ.get(name, "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 35 driver: mlx_lm load + LoRA, zoo MLXTrainer with compile=True, post-train eval, JSON dump; returns 0
+    steps = _env_int("MLX_STEPS", 30)  # each knob: env-var override, else the default given here
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 35: mlx_lm loader + zoo MLXTrainer(compile=True) + last_n={last_n}  "
+           f"steps={steps} seed={seed} lr={lr}")
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs
+    import mlx.core as mx
+    mx.random.seed(seed)  # seed MLX's RNG with the same value
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()  # freeze the loaded weights before LoRA layers are added
+    try: num_layers = len(model.layers)  # layer list is either on the model itself...
+    except AttributeError: num_layers = len(model.model.layers)  # ...or one level down
+    num_layers = max(1, min(int(last_n), num_layers))  # clamp the request to [1, total layers]
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}  # optional knobs, passed only if the installed MLXTrainingConfig declares them
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=True,  # the one setting this probe flips relative to probe 33
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe35_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 copies of the same single example
+        args=config,
+    )
+    rows = []  # per-step log filled by the callback below
+    def _on_step(*args):
+        if len(args) < 3: return  # ignore callback invocations with fewer than three positionals
+        rows.append({"step": int(args[0]), "loss": float(args[2])})  # args[0] logged as step, args[2] as loss
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()  # post-train loss on the training text itself
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably a (start, end) span for loss_fn — confirm against make_baseline_loss_fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):  # teacher-forced loss over the appended completion tokens only
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the inputs shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt tokens
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # appending the completion added no tokens
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # pass criterion: plain substring check on the generated text
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx_lm.load",
+            "trainer": "unsloth_zoo MLXTrainer",
+            "compile": True,
+            "num_layers": num_layers,
+            "per_device_train_batch_size": 6,
+            "gradient_accumulation_steps": 1,
+            "max_grad_value": None, "max_grad_norm": 0.0,
+            "adam_bias_correction": True,  # NOTE(review): max_grad_value / adam_bias_correction are literals here, not read from `extra`
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_35__s{steps}_d{seed}_nl{num_layers}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))  # one JSON result file per (steps, seed, layers) combination
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/mlx_parity/probe_36_zoo_loader_compile_on.py b/tests/mlx_parity/probe_36_zoo_loader_compile_on.py
new file mode 100644
index 0000000000..7d902f5f4a
--- /dev/null
+++ b/tests/mlx_parity/probe_36_zoo_loader_compile_on.py
@@ -0,0 +1,164 @@
+"""Probe 36 — probe 34 verbatim but with MLXTrainer's compile knob ON.
+
+Probe 34 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=False): ~47%
+Probe 35 (mlx_lm.load            + MLXTrainer + nl=16 + compile=True ): ?
+Probe 36 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=True ): ?
+
+If probe 35 hits ~67% (closing the -14pp trainer gap), probe 36 isolates
+the remaining loader-only delta with compile held constant. Reads:
+  36 ~= 67%  -> the loader patches add no real basin drift; compile=False
+              was the source of the entire end-to-end gap.
+  36 ~= 47%  -> compile fixes the trainer half, but FastMLXModel's
+              loader patches independently add a -10pp drift that needs
+              its own bisection (next: which patch).
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`; `default` when unset, blank, or not an int."""
+    text = os.environ.get(name, "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`; `default` when unset, blank, or not a number."""
+    text = os.environ.get(name, "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 36 driver: FastMLXModel(dtype=None) + LoRA, zoo MLXTrainer with compile=True, eval, JSON dump; returns 0
+    steps = _env_int("MLX_STEPS", 30)  # each knob: env-var override, else the default given here
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 36: zoo FastMLXModel(dtype=None) + MLXTrainer(compile=True) + last_n={last_n}")
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs
+    import mlx.core as mx
+    mx.random.seed(seed)  # seed MLX's RNG with the same value
+
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype=None,
+        text_only=True, max_seq_length=128, random_state=seed,  # NOTE(review): literal 128 here, MAX_SEQ_LEN in the trainer config — confirm they agree
+    )
+
+    import inspect
+    sig = inspect.signature(FastMLXModel.get_peft_model)
+    assert "finetune_last_n_layers" in sig.parameters, "zoo build missing the fix"  # fail fast on a zoo build without this parameter
+
+    model = FastMLXModel.get_peft_model(
+        model, r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        finetune_last_n_layers=last_n,
+        use_gradient_checkpointing=False,
+    )
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}  # optional knobs, passed only if the installed MLXTrainingConfig declares them
+    if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported: extra["max_grad_value"] = None
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=True,  # the one setting this probe flips relative to probe 34
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe36_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 copies of the same single example
+        args=config,
+    )
+    rows = []  # per-step log filled by the callback below
+    def _on_step(*args):
+        if len(args) < 3: return  # ignore callback invocations with fewer than three positionals
+        rows.append({"step": int(args[0]), "loss": float(args[2])})  # args[0] logged as step, args[2] as loss
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()  # post-train loss on the training text itself
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably a (start, end) span for loss_fn — confirm against make_baseline_loss_fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):  # teacher-forced loss over the appended completion tokens only
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the inputs shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt tokens
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # appending the completion added no tokens
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # pass criterion: plain substring check on the generated text
+    report("contains 'Unsloth'", contains)
+    report("generation", repr(gen[:60]))
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr,
+                   "loader": "FastMLXModel(dtype=None)",
+                   "trainer": "unsloth_zoo MLXTrainer",
+                   "compile": True,
+                   "finetune_last_n_layers": last_n},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_36__s{steps}_d{seed}_nl{last_n}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))  # one JSON result file per (steps, seed, last_n) combination
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/mlx_parity/probe_37_zoo_trainer_clip_off.py b/tests/mlx_parity/probe_37_zoo_trainer_clip_off.py
new file mode 100644
index 0000000000..627ac0af76
--- /dev/null
+++ b/tests/mlx_parity/probe_37_zoo_trainer_clip_off.py
@@ -0,0 +1,176 @@
+"""Probe 37 — probe 33 (compile=False) but explicitly set max_grad_value=0.0.
+
+Probe 31 (mlx_lm.load + manual loop + nl=16 + no clip): 67%
+Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False + max_grad_value=None): 53%
+Probe 35 (mlx_lm.load + MLXTrainer + nl=16 + compile=True  + max_grad_value=None): 53%
+
+Round BK ruled compile=True OUT as the trainer-side cause.
+
+Probe 37 tests the next hypothesis: MLXTrainer's `max_grad_value=None`
+silently rebinds to the default 1.0 (fixed in PR #671), so probes that
+set max_grad_value=None to mirror mlx-lm CLI's no-clip default were
+actually being clipped at +/-1.0 the whole time. Probe 37 bypasses the
+bug by passing `max_grad_value=0.0` (which has always disabled clip).
+
+Reads:
+  37 ~= 67% -> elementwise clipping at +/-1.0 was the entire trainer-side
+              gap. PR #671's None-disables-clip fix is the right closer.
+  37 ~= 53% -> there is yet another factor inside MLXTrainer that needs
+              its own bisection.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`; `default` when unset, blank, or not an int."""
+    text = os.environ.get(name, "").strip()
+    try: return int(text) if text else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`; `default` when unset, blank, or not a number."""
+    text = os.environ.get(name, "").strip()
+    try: return float(text) if text else default
+    except ValueError: return default
+
+
+def main() -> int:  # Probe 37 driver: mlx_lm load + LoRA, zoo MLXTrainer with max_grad_value=0.0, eval, JSON dump; returns 0
+    steps = _env_int("MLX_STEPS", 30)  # each knob: env-var override, else the default given here
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 37: mlx_lm.load + MLXTrainer(compile=False, max_grad_value=0.0) + last_n={last_n}  "
+           f"steps={steps} seed={seed} lr={lr}")
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs
+    import mlx.core as mx
+    mx.random.seed(seed)  # seed MLX's RNG with the same value
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    model.freeze()  # freeze the loaded weights before LoRA layers are added
+    try: num_layers = len(model.layers)  # layer list is either on the model itself...
+    except AttributeError: num_layers = len(model.model.layers)  # ...or one level down
+    num_layers = max(1, min(int(last_n), num_layers))  # clamp the request to [1, total layers]
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}  # optional knobs, passed only if the installed MLXTrainingConfig declares them
+    if "adam_bias_correction" in fields_supported:
+        extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported:
+        # KEY DIFFERENCE vs probe 33/35: explicit 0.0 disables clip even
+        # on builds where None silently rebinds to 1.0. Once PR #671
+        # merges, None will be equivalent.
+        extra["max_grad_value"] = 0.0
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,  # compile stays off, as in probe 33
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe37_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 copies of the same single example
+        args=config,
+    )
+    rows = []  # per-step log filled by the callback below
+    def _on_step(*args):
+        if len(args) < 3: return  # ignore callback invocations with fewer than three positionals
+        rows.append({"step": int(args[0]), "loss": float(args[2])})  # args[0] logged as step, args[2] as loss
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()  # post-train loss on the training text itself
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably a (start, end) span for loss_fn — confirm against make_baseline_loss_fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):  # teacher-forced loss over the appended completion tokens only
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the inputs shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt tokens
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # appending the completion added no tokens
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen  # pass criterion: plain substring check on the generated text
+    report("generation", repr(gen[:160]))
+    report("contains 'Unsloth'", contains)
+
+    out = {
+        "config": {
+            "steps": steps, "seed": seed, "learning_rate": lr,
+            "loader": "mlx_lm.load",
+            "trainer": "unsloth_zoo MLXTrainer",
+            "compile": False,
+            "num_layers": num_layers,
+            "max_grad_value": 0.0,
+            "max_grad_norm": 0.0,
+            "adam_bias_correction": True,
+        },
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_37__s{steps}_d{seed}_nl{num_layers}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))  # one JSON result file per (steps, seed, layers) combination
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/tests/mlx_parity/probe_38_strict_parity.py b/tests/mlx_parity/probe_38_strict_parity.py
new file mode 100644
index 0000000000..244398add1
--- /dev/null
+++ b/tests/mlx_parity/probe_38_strict_parity.py
@@ -0,0 +1,243 @@
+"""Probe 38 — strict numerical parity between mlx-lm manual loop and
+zoo MLXTrainer on the same seed, capturing per-step loss AND per-step
+grad_norm so we can diff value-for-value.
+
+Existing probes only compared endpoint loss (all hit 0) and greedy-decode
+pass rate (varies 40-67% across configs). Per-step loss data from
+Round BO showed that probe 31 (manual) vs probe 35/37 (zoo) diverges
+from step 2 onward by ~0.01-0.06 — the gradient applied at step 1
+differs even though step 1's forward loss is identical. This probe
+isolates that to a single run with paired per-step diagnostics.
+
+Output: a JSON with two parallel rows arrays (`rows_mlxlm`,
+`rows_zoo`) plus computed per-step diffs. If grad_norm differs at
+step 1, the loss-function graph or autodiff path is the cause. If
+grad_norm matches at step 1 but loss diverges at step 2, the
+optimizer update step is the cause.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from functools import partial
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Read env var `name` as an int; return `default` if it is unset, blank, or not an int."""
+    text = os.environ.get(name)
+    try: return int(text.strip()) if text and text.strip() else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Read env var `name` as a float; return `default` if it is unset, blank, or not a float."""
+    text = os.environ.get(name)
+    try: return float(text.strip()) if text and text.strip() else default
+    except ValueError: return default
+
+
+def _run_mlxlm_manual(seed, steps, lr, last_n):
+    """Reproduce probe 31's manual loop and capture per-step loss + grad_norm; returns a list of {step, loss, grad_norm} dicts."""
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs before the mlx imports below
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_map, tree_flatten
+
+    mx.random.seed(seed)
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    mx.random.seed(seed)  # re-seed AFTER the load: mlx-lm CLI lora.py:223 order
+    model.freeze()
+
+    actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers)
+    num_layers = max(1, min(int(last_n), actual_layers))  # clamp last_n into [1, actual_layers]
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 identical copies of the training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    if mx.metal.is_available():
+        mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"])
+
+    state = [model.state, optimizer.state, mx.random.state]  # passed to mx.compile as both inputs and outputs
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:  # add the gradient carried over from a previous call
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing left to carry once the update has been applied
+        return lvalue, toks, grad
+
+    model.train()
+    rows = []
+    np.random.seed(seed)  # re-seed NumPy right before batching; presumably iterate_batches draws from it -- TODO confirm
+    batch_iter = iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)
+    for it in range(1, steps + 1):
+        batch = next(batch_iter)
+        # Compute grad_norm BEFORE the compiled step (extra forward+backward but
+        # gives us a value-for-value comparable number with zoo's reporting).
+        (loss_pre, _), grad_pre = loss_value_and_grad(model, *batch)
+        flat = tree_flatten(grad_pre)
+        grad_norm_sq = mx.array(0.0, dtype=mx.float32)
+        for _name, g in flat:
+            grad_norm_sq = grad_norm_sq + mx.sum(g.astype(mx.float32) ** 2)  # sum of squares, accumulated in float32
+        grad_norm = mx.sqrt(grad_norm_sq)
+        mx.eval(grad_norm, loss_pre)
+        gn = float(grad_norm.item())
+        # Now do the real optimizer step (no carried gradient, update on every call).
+        lvalue, toks, _ = step(batch, None, True)
+        mx.eval(state, lvalue, toks)
+        rows.append({"step": it, "loss": float(lvalue.item()), "grad_norm": gn})  # loss is from the compiled step, not loss_pre
+
+    return rows
+
+
+def _run_zoo_trainer(seed, steps, lr, last_n):
+    """Reproduce probe 37's zoo path; returns the {step, loss, grad_norm} rows recorded by the step callback."""
+    random.seed(seed); np.random.seed(seed)
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    mx.random.seed(seed)  # re-seed after the load, same order as the manual-loop path
+    model.freeze()
+    actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers)
+    num_layers = max(1, min(int(last_n), actual_layers))  # clamp last_n into [1, actual_layers]
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+
+    # Only pass optional knobs that this build's MLXTrainingConfig actually declares.
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}
+    extra = {}
+    if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True
+    if "max_grad_value" in fields_supported: extra["max_grad_value"] = 0.0  # explicit no-clip
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=True,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe38_zoo_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,
+        args=config,
+    )
+    rows = []
+    # `rows` is filled by the step callback below, one entry per callback invocation.
+
+    def _on_step(*args):
+        # MLXTrainer callback signature (unsloth_zoo/mlx/trainer.py:1190):
+        #   (current_step, total_steps, train_loss, lr_val, tokens_sec,
+        #    peak_mem, elapsed_total, trained_tokens, grad_norm_val)
+        # grad_norm is args[8], NOT args[3]. (args[3] is lr_val and was being
+        # mis-read as a constant 0.001 placeholder in earlier probe runs.)
+        if len(args) < 3: return
+        step_no = int(args[0])
+        loss = float(args[2])
+        gn = None
+        if len(args) >= 9 and args[8] is not None:
+            try: gn = float(args[8])
+            except (TypeError, ValueError): gn = None
+        rows.append({"step": step_no, "loss": loss, "grad_norm": gn})
+
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+    return rows
+
+
+def main() -> int:
+    """Run the mlx-lm manual loop and the zoo MLXTrainer, print rows and per-step diffs, write the JSON report; returns 0."""
+    steps = _env_int("MLX_STEPS", 8)  # only need a few steps to spot divergence
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 38: strict step-by-step parity (mlx-lm manual vs zoo MLXTrainer) seed={seed}")
+
+    section("Run 1: mlx-lm manual loop")
+    rows_mlxlm = _run_mlxlm_manual(seed, steps, lr, last_n)
+    for r in rows_mlxlm:
+        print(f"  step {r['step']:>2}: loss={r['loss']:.6f}  grad_norm={r['grad_norm']:.6f}")
+
+    section("Run 2: zoo MLXTrainer (explicit no-clip)")
+    rows_zoo = _run_zoo_trainer(seed, steps, lr, last_n)
+    for r in rows_zoo:
+        gn = r['grad_norm']
+        gn_s = f"{gn:.6f}" if gn is not None else "n/a"
+        print(f"  step {r['step']:>2}: loss={r['loss']:.6f}  grad_norm={gn_s}")
+
+    section("Per-step diff (mlx-lm - zoo)")
+    # Pair rows by step number, not list position, so an offset or missing zoo row cannot silently drop comparisons.
+    zoo_by_step = {r['step']: r for r in rows_zoo}
+    diffs = []
+    for r1 in rows_mlxlm:
+        r2 = zoo_by_step.get(r1['step'])
+        if r2 is None:
+            print(f"  step {r1['step']:>2}: no zoo row for this step")
+            continue
+        loss_diff = r1['loss'] - r2['loss']
+        gn1 = r1.get('grad_norm'); gn2 = r2.get('grad_norm')
+        gn_diff = (gn1 - gn2) if (gn1 is not None and gn2 is not None) else None
+        gn_s = f"{gn_diff:+.6f}" if gn_diff is not None else "n/a"
+        print(f"  step {r1['step']:>2}: dloss={loss_diff:+.6f}  dgrad_norm={gn_s}")
+        diffs.append({"step": r1['step'], "loss_diff": loss_diff, "grad_norm_diff": gn_diff})
+
+    out = {
+        "config": {"seed": seed, "steps": steps, "lr": lr, "last_n": last_n},
+        "rows_mlxlm": rows_mlxlm, "rows_zoo": rows_zoo, "diffs": diffs,
+    }
+    fname = f"probe_38__s{steps}_d{seed}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_39_fastmlxmodel_parity.py b/tests/mlx_parity/probe_39_fastmlxmodel_parity.py
new file mode 100644
index 0000000000..db001b58ef
--- /dev/null
+++ b/tests/mlx_parity/probe_39_fastmlxmodel_parity.py
@@ -0,0 +1,205 @@
+"""Probe 39 — strict step-by-step parity between mlx-lm CLI's
+LoRA-init path and FastMLXModel + get_peft_model.
+
+Probe 38 v2 showed that mlx-lm manual loop + linear_to_lora_layers
+matches zoo MLXTrainer + linear_to_lora_layers value-for-value at the
+loss level when both reseed mx.random AFTER mlx_load. But probes that
+went through FastMLXModel.from_pretrained + FastMLXModel.get_peft_model
+(32 / 34 / 36) still hit 47% greedy pass rate vs 67% for mlx-lm CLI.
+
+Hypothesis: the seeding in zoo's get_peft_model (`_seed_mlx_random_state
+(random_state)` at line 2767 of loader.py) is the right place, but
+something else in FastMLXModel.from_pretrained or get_peft_model
+consumes mx.random state between the seed and `linear_to_lora_layers`,
+or the LoRA-key resolution / iteration order produces a different
+LoRA-module-creation order than the explicit-keys-list call in
+mlx-lm CLI.
+
+This probe runs both setups in one process with paired seeds and
+captures per-step loss + grad_norm so the divergence point (if any)
+is visible explicitly.
+
+Path A: mlx-lm CLI style. mlx_lm.load -> mx.random.seed(seed) after
+load -> linear_to_lora_layers(model, 16, {"keys": [suffix list]}) ->
+manual @mx.compile loop with bare optim.AdamW.
+
+Path B: FastMLXModel.from_pretrained(random_state=seed) ->
+FastMLXModel.get_peft_model(finetune_last_n_layers=16,
+random_state=seed) -> SAME manual @mx.compile loop, SAME optimizer
+construction (constructed here, not from MLXTrainer).
+
+We deliberately re-use the same manual training loop for both paths
+so the comparison isolates the LoRA-init pipeline only.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from functools import partial
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`, or `default` when it is missing, empty, or malformed."""
+    value = (os.environ.get(name) or "").strip()
+    try: return int(value) if value else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`, or `default` when it is missing, empty, or malformed."""
+    value = (os.environ.get(name) or "").strip()
+    try: return float(value) if value else default
+    except ValueError: return default
+
+
+def _run_training(model, tokenizer, seed, steps, lr):
+    """Shared manual-loop training driver -- identical for both paths so
+    any divergence is attributable to the LoRA-init pipeline upstream.
+    Trains `model` in place for `steps` optimizer updates on 64 copies of TRAIN_TEXT.
+
+    Returns rows: list[{step, loss, grad_norm}], one entry per step.
+    """
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_map, tree_flatten
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 identical copies of the training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    if mx.metal.is_available():
+        mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"])
+
+    state = [model.state, optimizer.state, mx.random.state]  # passed to mx.compile as both inputs and outputs
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:  # add the gradient carried over from a previous call
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing left to carry once the update has been applied
+        return lvalue, toks, grad
+
+    model.train()
+    rows = []
+    np.random.seed(seed)  # re-seed NumPy right before batching; presumably iterate_batches draws from it -- TODO confirm
+    batch_iter = iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)
+    for it in range(1, steps + 1):
+        batch = next(batch_iter)
+        # Compute grad_norm BEFORE the compiled step using the same forward
+        # path; this gives us a value-for-value comparable number across paths.
+        (_, _), grad_pre = loss_value_and_grad(model, *batch)
+        grad_norm_sq = mx.array(0.0, dtype=mx.float32)
+        for _name, g in tree_flatten(grad_pre):
+            grad_norm_sq = grad_norm_sq + mx.sum(g.astype(mx.float32) ** 2)  # sum of squares, accumulated in float32
+        grad_norm = mx.sqrt(grad_norm_sq)
+        mx.eval(grad_norm)
+        gn = float(grad_norm.item())
+        lvalue, toks, _ = step(batch, None, True)  # no carried gradient; update on every call
+        mx.eval(state, lvalue, toks)
+        rows.append({"step": it, "loss": float(lvalue.item()), "grad_norm": gn})  # loss is from the compiled step
+
+    return rows
+
+
+def _path_a_mlxlm(seed, steps, lr, last_n):
+    """mlx-lm CLI style: mlx_lm.load -> seed AFTER -> explicit-keys LoRA; returns the rows from _run_training."""
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs first
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    from mlx_lm import load as mlx_load
+    from mlx_lm.tuner.utils import linear_to_lora_layers
+
+    model, tokenizer = mlx_load(MODEL_NAME)
+    mx.random.seed(seed)  # re-seed after the load: mlx-lm CLI lora.py:223
+    model.freeze()
+    actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers)
+    num_layers = max(1, min(int(last_n), actual_layers))  # clamp last_n into [1, actual_layers]
+    linear_to_lora_layers(model, num_layers, {
+        "rank": 8, "scale": 2.0, "dropout": 0.0,
+        "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
+                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"],
+    })
+    return _run_training(model, tokenizer, seed, steps, lr)
+
+
+def _path_b_fastmlxmodel(seed, steps, lr, last_n):
+    """zoo FastMLXModel.from_pretrained + FastMLXModel.get_peft_model; returns the rows from _run_training."""
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs first
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    from unsloth_zoo.mlx.loader import FastMLXModel
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype=None,
+        text_only=True, max_seq_length=128, random_state=seed,  # NOTE(review): 128 here vs MAX_SEQ_LEN used for batching -- confirm intended
+    )
+    model = FastMLXModel.get_peft_model(
+        model, r=8, lora_alpha=16, lora_dropout=0.0,  # lora_alpha / r = 2.0; presumably equivalent to Path A's scale=2.0 -- confirm
+        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        finetune_last_n_layers=last_n,  # passed through unclamped, unlike Path A's num_layers
+        use_gradient_checkpointing=False,
+    )
+    return _run_training(model, tokenizer, seed, steps, lr)
+
+
+def main() -> int:
+    """Train via Path A and Path B, print each step plus the A-B deltas, dump the JSON report, return 0."""
+    n_steps = _env_int("MLX_STEPS", 30)
+    rng_seed = _env_int("MLX_SEED", 3407)
+    step_size = _env_float("MLX_LR", 1e-3)
+    n_last = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 39: FastMLXModel get_peft_model vs mlx-lm CLI LoRA init  seed={rng_seed}")
+
+    section("Path A: mlx_lm.load + mx.random.seed AFTER load + linear_to_lora_layers")
+    rows_a = _path_a_mlxlm(rng_seed, n_steps, step_size, n_last)
+    for row in rows_a:
+        print(f"  step {row['step']:>2}: loss={row['loss']:.6f}  grad_norm={row['grad_norm']:.6f}")
+
+    section("Path B: FastMLXModel.from_pretrained + FastMLXModel.get_peft_model")
+    rows_b = _path_b_fastmlxmodel(rng_seed, n_steps, step_size, n_last)
+    for row in rows_b:
+        print(f"  step {row['step']:>2}: loss={row['loss']:.6f}  grad_norm={row['grad_norm']:.6f}")
+
+    section("Per-step diff (Path A - Path B)")
+    diffs = []
+    for row_a, row_b in zip(rows_a, rows_b):
+        if row_a['step'] != row_b['step']: continue
+        d_loss = row_a['loss'] - row_b['loss']
+        d_gn = row_a['grad_norm'] - row_b['grad_norm']
+        print(f"  step {row_a['step']:>2}: dloss={d_loss:+.6f}  dgrad_norm={d_gn:+.6f}")
+        diffs.append({"step": row_a['step'], "loss_diff": d_loss, "grad_norm_diff": d_gn})
+
+    report_path = OUT_DIR / f"probe_39__s{n_steps}_d{rng_seed}.json"
+    report_path.write_text(json.dumps({
+        "config": {"seed": rng_seed, "steps": n_steps, "lr": step_size, "last_n": n_last},
+        "rows_mlxlm": rows_a,
+        "rows_fastmlxmodel": rows_b,
+        "diffs": diffs,
+    }, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_3_loss_reduction.py b/tests/mlx_parity/probe_3_loss_reduction.py
new file mode 100644
index 0000000000..92957714f6
--- /dev/null
+++ b/tests/mlx_parity/probe_3_loss_reduction.py
@@ -0,0 +1,83 @@
+"""Probe 3 — loss reduction parity (synthetic logits/labels).
+
+Bypass the model entirely. Drive a fixed numpy (logits, labels) pair
+through:
+
+  (a) torch.nn.functional.cross_entropy with ignore_index=-100, reduction='mean'
+      (the HF SFTTrainer default).
+  (b) unsloth_zoo.mlx.utils.make_baseline_loss_fn's recipe replicated
+      in MLX: cross_entropy * mask, summed, divided by mask.sum().
+
+For identical inputs the two scalars MUST match (mod fp32 noise). If they
+diverge, the MLX trainer's loss-reduction layer differs from HF's.
+
+Exits 0 if |loss_a - loss_b| < 1e-4 AND ntok counts match, else 2.
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    """Compare torch vs MLX masked-mean cross entropy on fixed synthetic data; 0 on parity, 2 otherwise."""
+    seed_everything()
+    banner("Probe 3: loss reduction parity (synthetic logits/labels)")
+
+    # Synthetic: batch=1, seq_len=10, vocab=8 -- small + reproducible.
+    np.random.seed(0)
+    V, L = 8, 10
+    logits = np.random.randn(1, L, V).astype(np.float32)
+    labels = np.array([[2, 5, 1, -100, 3, 4, 0, 7, -100, 6]], dtype=np.int64)
+    report("logits shape", logits.shape)
+    report("labels", labels.tolist())
+    n_valid = int((labels != -100).sum())
+    report("n_valid (non -100)", n_valid)
+
+    # Shift like HF / MLX both do: predict next token.
+    shift_logits = logits[:, :-1, :]
+    shift_labels = labels[:, 1:]
+    n_valid_shift = int((shift_labels != -100).sum())
+    report("n_valid after shift", n_valid_shift)
+
+    section("(a) torch.nn.functional.cross_entropy (HF SFTTrainer recipe)")
+    import torch
+    import torch.nn.functional as F
+    t_logits = torch.tensor(shift_logits.reshape(-1, V))
+    t_labels = torch.tensor(shift_labels.reshape(-1))
+    hf_loss = F.cross_entropy(t_logits, t_labels, ignore_index=-100, reduction="mean").item()
+    report("hf_loss", hf_loss)
+
+    section("(b) MLX baseline loss recipe (unsloth_zoo.mlx.utils:417)")
+    import mlx.core as mx
+    import mlx.nn as nn
+    mlx_logits = mx.array(shift_logits)
+    mlx_labels = mx.array(shift_labels)
+    mask = (mlx_labels != -100).astype(mx.float32)
+    safe = mx.where(mlx_labels == -100, 0, mlx_labels)  # replace ignored labels with a valid index; the mask zeroes them out
+    ce = nn.losses.cross_entropy(mlx_logits, safe) * mask
+    ntoks = mask.sum()
+    mlx_loss = (ce.astype(mx.float32).sum() / mx.maximum(ntoks, mx.array(1.0))).item()
+    report("mlx_loss", mlx_loss)
+    report("ntoks (mlx)", float(ntoks.item()))
+
+    section("comparison")
+    diff = abs(hf_loss - mlx_loss)
+    report("|hf - mlx|", diff)
+    # The documented exit contract also requires the counted-token totals to agree.
+    ntok_match = int(ntoks.item()) == n_valid_shift
+    report("ntok counts match", ntok_match)
+
+    out = {
+        "hf_loss": hf_loss, "mlx_loss": mlx_loss, "abs_diff": diff,
+        "n_valid_shift": n_valid_shift, "ntoks_mlx": int(ntoks.item()), "ntok_match": ntok_match,
+    }
+    (OUT_DIR / "probe_3.json").write_text(json.dumps(out, indent=2))
+    return 0 if (diff < 1e-4 and ntok_match) else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py b/tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py
new file mode 100644
index 0000000000..80a582d40d
--- /dev/null
+++ b/tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py
@@ -0,0 +1,184 @@
+"""Probe 40 -- FastMLXModel loader + manual @mx.compile loop.
+
+Round BS bisection of the residual 47% vs 67% gap that survived PR #674.
+
+After PR #674's seed-ordering fix, probe 39 proved
+FastMLXModel.from_pretrained + FastMLXModel.get_peft_model produces
+bit-identical losses and gradient norms vs mlx_lm.load +
+linear_to_lora_layers when both feed the same manual @mx.compile
+training loop (5 seeds x 30 steps, dloss = 0.0, dgrad_norm = 0.0).
+
+But probes 34 / 36 (`FastMLXModel + MLXTrainer.train`) still hit 47%
+greedy pass rate vs probe 31's (`mlx_lm.load + manual loop`) 67% on
+the same 15 seeds. Probes 34 and 36 share an identical pass/fail
+pattern, so `compile=True/False` is a no-op for the basin.
+
+Two remaining suspects for the gap:
+  (a) MLXTrainer.train introduces drift on top of the manual loop
+      (despite probe 38 showing dloss=0 between manual loop and
+      MLXTrainer on `mlx_lm.load` path -- maybe FastMLXModel exposes
+      a path that probe 38 didn't cover).
+  (b) FastMLXModel.from_pretrained adds drift outside of LoRA init
+      that survives all 30 training steps -- probe 39's 5 seeds may
+      not have hit a basin-tipping case.
+
+Probe 40 = exactly probe 31's manual loop but the loader/PEFT setup
+swapped for `FastMLXModel.from_pretrained` + `FastMLXModel.get_peft_model
+(finetune_last_n_layers=16)`. Read:
+  * probe 40 ~ 67% (matches probe 31): MLXTrainer.train IS the bug.
+    PR #674 closed the loader-side gap; the remaining gap is purely
+    trainer math.
+  * probe 40 ~ 47% (matches probe 34): FastMLXModel.from_pretrained
+    adds drift downstream of get_peft_model that probe 39's 5-seed
+    diagnostic missed. Bisect the loader next.
+
+Same 15 seeds as probes 31 / 34 / 36 for direct paired comparison.
+"""
+import json
+import os
+import sys
+import random
+from functools import partial
+from pathlib import Path
+import numpy as np
+
+MODEL_NAME = "unsloth/gemma-3-270m-it"
+TRAIN_TEXT = "<<HELLO!!>> My name is Unsloth!"
+PROMPT = "<<HELLO!!>> My name is "
+MAX_SEQ_LEN = 64
+OUT_DIR = Path(__file__).resolve().parent / ".out"
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _env_int(name, default):
+    """Read env var `name` as an int; return `default` if it is unset, blank, or not an int."""
+    text = os.environ.get(name)
+    try: return int(text.strip()) if text and text.strip() else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Read env var `name` as a float; return `default` if it is unset, blank, or not a float."""
+    text = os.environ.get(name)
+    try: return float(text.strip()) if text and text.strip() else default
+    except ValueError: return default
+
+
+def main() -> int:  # Load via FastMLXModel, train with the manual loop, evaluate, write the JSON report; returns 0.
+    steps = _env_int("MLX_STEPS", 30)
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    num_layers = _env_int("MLX_NUM_LAYERS", 16)
+    print(f"=== Probe 40: FastMLXModel + manual loop  steps={steps} seed={seed} lr={lr} nl={num_layers} ===", flush=True)
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs before the mlx imports below
+
+    import mlx.core as mx
+    import mlx.nn as nn
+    import mlx.optimizers as optim
+    from mlx.nn.utils import average_gradients
+    from mlx.utils import tree_map
+
+    from mlx_lm import generate
+    from mlx_lm.tuner.trainer import iterate_batches, default_loss
+    from mlx_lm.tuner.datasets import TextDataset, CacheDataset
+
+    # FastMLXModel path (same as probe 39 path B).
+    mx.random.seed(seed)
+    from unsloth_zoo.mlx.loader import FastMLXModel
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME,
+        load_in_4bit=False,
+        dtype=None,
+        text_only=True,
+        max_seq_length=128,  # NOTE(review): differs from MAX_SEQ_LEN (64) used for batching -- confirm intended
+        random_state=seed,
+    )
+    model = FastMLXModel.get_peft_model(
+        model,
+        r=8,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                        "gate_proj", "up_proj", "down_proj"],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        finetune_last_n_layers=num_layers,  # passed through unclamped
+        use_gradient_checkpointing=False,
+    )
+
+    actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers)  # reported only
+    print(f"  model has {actual_layers} layers, LoRA on last {num_layers}", flush=True)
+
+    # From here down: bit-identical to probe 31's manual loop.
+    optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True)
+    formatted = [{"text": TRAIN_TEXT} for _ in range(64)]  # 64 identical copies of the training sentence
+    ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text"))
+
+    if mx.metal.is_available():
+        mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"])
+
+    state = [model.state, optimizer.state, mx.random.state]  # passed to mx.compile as both inputs and outputs
+    loss_value_and_grad = nn.value_and_grad(model, default_loss)
+
+    @partial(mx.compile, inputs=state, outputs=state)
+    def step(batch, prev_grad, do_update):
+        (lvalue, toks), grad = loss_value_and_grad(model, *batch)
+        if prev_grad is not None:  # add the gradient carried over from a previous call
+            grad = tree_map(lambda x, y: x + y, grad, prev_grad)
+        if do_update:
+            grad = average_gradients(grad)
+            optimizer.update(model, grad)
+            grad = None  # nothing left to carry once the update has been applied
+        return lvalue, toks, grad
+
+    model.train()
+    losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None
+    rows = []
+    np.random.seed(seed)  # re-seed NumPy right before batching; presumably iterate_batches draws from it -- TODO confirm
+    for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)):  # range() bounds the looping iterator
+        lvalue, toks, grad_accum = step(batch, grad_accum, True)  # do_update is always True, so grad_accum comes back as None
+        losses += lvalue; n_tokens += toks
+        mx.eval(state, losses, n_tokens, grad_accum)
+        rows.append({"step": it, "loss": float(lvalue.item())})
+
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the evaluated sequence ends with EOS
+    L = len(ids)
+    post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]]))  # presumably (offset, length) per row -- confirm against default_loss
+    post_loss_val = float(post_loss.item())
+
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # teacher forcing: inputs are all tokens but the last
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same tokens shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids)
+        completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item())
+    else:
+        completion_loss = float("nan")  # the completion added no tokens, so there is nothing to score
+
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    print(f"  contains 'Unsloth': {contains}  gen={gen[:80]!r}", flush=True)
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr,
+                   "num_layers": num_layers, "actual_layers": actual_layers,
+                   "delta": "FastMLXModel loader + manual @mx.compile loop"},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_40__s{steps}_d{seed}_nl{num_layers}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py b/tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py
new file mode 100644
index 0000000000..6e7cbe03d4
--- /dev/null
+++ b/tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py
@@ -0,0 +1,179 @@
+"""Probe 41 -- probe 34 with max_grad_value=0.0 (explicit disable).
+
+Round BT bisection.
+
+Round BS proved the residual 47%-vs-67% gap is in MLXTrainer.train,
+not FastMLXModel loader (probe 40 = probe 31 on 15/15 seeds). Reading
+unsloth_zoo/mlx/trainer.py:731-732:
+
+    _raw_mgv = getattr(args, "max_grad_value", 1.0)
+    max_grad_value = 1.0 if _raw_mgv is None else float(_raw_mgv or 0.0)
+
+means `max_grad_value=None` is reinterpreted as 1.0 (clip at +/-1.0
+elementwise), NOT "disable clipping". PR #671
+(`mlx: honor max_grad_value=None as a disable signal`, head 265534b)
+is currently OPEN, not merged. Probe 34 sets max_grad_value=None
+expecting "disable", actually gets clip-at-1. Manual loop in probes
+31 / 40 uses bare optim.AdamW with NO clipping.
+
+Probe 41 = probe 34 but with max_grad_value=0.0 (explicit zero hits
+the `float(_raw_mgv or 0.0)` branch -> 0.0 -> no clip on the current
+build).
+
+Read:
+  probe 41 ~ 67%  ->  Elementwise clip-at-1 IS the residual gap.
+                     PR #671 closes the FastMLXModel + MLXTrainer
+                     basin gap. Final missing piece.
+  probe 41 ~ 47%  ->  Clip isn't it; the gap is elsewhere in
+                     MLXTrainer.train (lr schedule, loss-fn, batch
+                     iteration, mx.eval timing, ...).
+
+Same 15 seeds as probes 31 / 34 / 40 for direct paired comparison.
+"""
+import json
+import os
+import sys
+import dataclasses
+import random
+from pathlib import Path
+import numpy as np
+
+from _common import (
+    MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR,
+    banner, section, report,
+)
+
+
+def _env_int(name, default):
+    """Integer value of env var `name`, or `default` when it is missing, empty, or malformed."""
+    value = (os.environ.get(name) or "").strip()
+    try: return int(value) if value else default
+    except ValueError: return default
+
+
+def _env_float(name, default):
+    """Float value of env var `name`, or `default` when it is missing, empty, or malformed."""
+    value = (os.environ.get(name) or "").strip()
+    try: return float(value) if value else default
+    except ValueError: return default
+
+
+def main() -> int:  # Load via FastMLXModel, train with MLXTrainer (max_grad_value=0.0), evaluate, write the JSON report; returns 0.
+    steps = _env_int("MLX_STEPS", 30)
+    seed = _env_int("MLX_SEED", 3407)
+    lr = _env_float("MLX_LR", 1e-3)
+    last_n = _env_int("MLX_LAST_N", 16)
+    banner(f"Probe 41: FastMLXModel + MLXTrainer + max_grad_value=0.0 (explicit disable)")
+
+    random.seed(seed); np.random.seed(seed)  # seed the Python and NumPy RNGs before the mlx import below
+    import mlx.core as mx
+    mx.random.seed(seed)
+
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype=None,
+        text_only=True, max_seq_length=128, random_state=seed,  # NOTE(review): 128 here vs MAX_SEQ_LEN in the trainer config -- confirm intended
+    )
+
+    model = FastMLXModel.get_peft_model(
+        model, r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
+        random_state=seed,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        finetune_last_n_layers=last_n,  # passed through unclamped
+        use_gradient_checkpointing=False,
+    )
+
+    fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)}  # optional knobs are passed only if declared
+    extra = {}
+    if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True
+    # The key difference vs probe 34: explicit 0.0 hits trainer.py:732's
+    # `float(_raw_mgv or 0.0)` branch -> 0.0 -> no clip. Setting None
+    # would hit `1.0 if _raw_mgv is None` -> clip at 1.0.
+    if "max_grad_value" in fields_supported: extra["max_grad_value"] = 0.0
+
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=6,
+        gradient_accumulation_steps=1,
+        max_steps=steps,
+        learning_rate=lr,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=seed,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / f"probe41_outputs_s{steps}_d{seed}"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+        **extra,
+    )
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # 64 identical copies of the training sentence
+        args=config,
+    )
+    rows = []
+    def _on_step(*args):  # step callback: reads args[0] as the step number and args[2] as the loss
+        if len(args) < 3: return
+        rows.append({"step": int(args[0]), "loss": float(args[2])})
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # make sure the evaluated sequence ends with EOS
+    L = len(ids)
+    batch = mx.array([ids])
+    lengths = mx.array([[1, L - 1]])  # presumably (offset, length) per row -- confirm against the loss fn
+    labels_mlx = mx.array([ids])
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)
+    post_loss_val = float(post_loss.item())
+
+    import mlx.nn as nn
+    prompt_ids = list(tokenizer.encode(PROMPT))
+    full_ids = list(tokenizer.encode(PROMPT + "Unsloth!"))
+    if len(full_ids) > len(prompt_ids):
+        cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32)  # teacher forcing: inputs are all tokens but the last
+        cf_targets = mx.array([full_ids[1:]], dtype=mx.int32)  # targets are the same tokens shifted left by one
+        cf_logits = model(cf_inputs)
+        start = len(prompt_ids) - 1  # first position whose target lies past the prompt (assumes prompt_ids is a prefix of full_ids)
+        completion_loss = float(nn.losses.cross_entropy(
+            cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean"
+        ).item())
+    else:
+        completion_loss = float("nan")  # the completion added no tokens, so there is nothing to score
+
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    contains = "Unsloth" in gen
+    report("contains 'Unsloth'", contains)
+    report("generation", repr(gen[:60]))
+
+    out = {
+        "config": {"steps": steps, "seed": seed, "learning_rate": lr,
+                   "loader": "FastMLXModel(dtype=None)", "finetune_last_n_layers": last_n,
+                   "delta": "max_grad_value=0.0 (explicit disable)"},
+        "rows": rows, "post_train_loss": post_loss_val,
+        "completion_teacher_forced_loss": completion_loss, "generation": gen,
+        "contains_unsloth": contains,
+    }
+    fname = f"probe_41__s{steps}_d{seed}_nl{last_n}.json"
+    (OUT_DIR / fname).write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/mlx_parity/probe_4_lora_init.py b/tests/mlx_parity/probe_4_lora_init.py
new file mode 100644
index 0000000000..0f0492a70c
--- /dev/null
+++ b/tests/mlx_parity/probe_4_lora_init.py
@@ -0,0 +1,151 @@
+"""Probe 4 — LoRA initialization parity.
+
+Attach LoRA r=8 alpha=16 on q_proj of layer 0 in both backends with
+seed=SEED. Inspect the resulting LoRA-A and LoRA-B matrices.
+
+Expected baseline (standard LoRA init):
+  A ~ Kaiming uniform (non-zero, small magnitude)
+  B ~ zero matrix
+
+If both backends honor this, the LoRA contribution at step 0 is zero
+and the base-model forward dominates (i.e. probe 2 + LoRA-attached
+forward should produce the same logits up to fp noise).
+
+This probe does not enforce A == A across backends (different RNGs),
+but DOES enforce:
+  * B is exactly zero in both
+  * |A.std()| within 2x across backends
+  * shapes match
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import MODEL_NAME, SEED, OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    seed_everything()
+    banner("Probe 4: LoRA initialization parity")
+
+    # ---------------- HF / torch / PEFT ----------------
+    section("HF + PEFT LoRA")
+    import torch
+    from transformers import AutoModelForCausalLM
+    from peft import LoraConfig, get_peft_model
+    torch.manual_seed(SEED)
+    hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32)
+    hf_peft = get_peft_model(
+        hf_model,
+        LoraConfig(
+            r=8, lora_alpha=16, lora_dropout=0.0, bias="none",
+            target_modules=["q_proj"],
+        ),
+    )
+    # Find layer-0 q_proj LoRA-A and LoRA-B
+    hf_A = None
+    hf_B = None
+    for name, p in hf_peft.named_parameters():
+        if "q_proj.lora_A.default.weight" in name and ".0." in name:
+            hf_A = p.detach().float().cpu().numpy()
+        if "q_proj.lora_B.default.weight" in name and ".0." in name:
+            hf_B = p.detach().float().cpu().numpy()
+        if hf_A is not None and hf_B is not None:
+            break
+    report("hf A shape / std", (None if hf_A is None else (hf_A.shape, float(hf_A.std()))))
+    report("hf B shape / max|B|", (None if hf_B is None else (hf_B.shape, float(np.abs(hf_B).max()))))
+
+    # ---------------- MLX / mlx-lm / unsloth_zoo.mlx ----------------
+    section("MLX + unsloth_zoo.mlx LoRA")
+    import mlx.core as mx
+    mx.random.seed(SEED)
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    mlx_model, _tok = FastMLXModel.from_pretrained(
+        MODEL_NAME,
+        load_in_4bit=False,
+        dtype="float32",
+        text_only=True,
+        max_seq_length=64,
+        random_state=SEED,
+    )
+    mlx_model = FastMLXModel.get_peft_model(
+        mlx_model,
+        r=8,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        target_modules=["q_proj"],
+        random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=False,
+    )
+    mlx_A = None
+    mlx_B = None
+    # Walk module tree and grab layer-0 q_proj LoRA params.
+    def walk(mod, prefix=""):
+        for name, child in getattr(mod, "named_modules", lambda: [])():
+            yield name, child
+    try:
+        for name, child in mlx_model.named_modules():
+            if name.endswith(".q_proj") and (".layers.0." in name or ".0.q_proj" in name):
+                for attr in ("lora_a", "lora_b", "lora_A", "lora_B"):
+                    if hasattr(child, attr):
+                        v = getattr(child, attr)
+                        arr = np.asarray(mx.eval(v) if callable(getattr(v, "eval", None)) else v)
+                        if attr.lower().endswith("a") and mlx_A is None:
+                            mlx_A = arr
+                        if attr.lower().endswith("b") and mlx_B is None:
+                            mlx_B = arr
+                break
+    except Exception as e:
+        report("introspection error", str(e))
+    report("mlx A shape / std", (None if mlx_A is None else (mlx_A.shape, float(mlx_A.std()))))
+    report("mlx B shape / max|B|", (None if mlx_B is None else (mlx_B.shape, float(np.abs(mlx_B).max()))))
+
+    section("comparison")
+    ok = True
+    issues = []
+    if hf_A is None or hf_B is None:
+        issues.append("could not locate HF layer-0 q_proj LoRA params")
+        ok = False
+    if mlx_A is None or mlx_B is None:
+        issues.append("could not locate MLX layer-0 q_proj LoRA params")
+        ok = False
+    if hf_B is not None and float(np.abs(hf_B).max()) != 0.0:
+        issues.append(f"HF B is non-zero (max|B|={float(np.abs(hf_B).max())})")
+        ok = False
+    if mlx_B is not None and float(np.abs(mlx_B).max()) != 0.0:
+        issues.append(f"MLX B is non-zero (max|B|={float(np.abs(mlx_B).max())})")
+        ok = False
+    if hf_A is not None and mlx_A is not None and hf_A.shape != mlx_A.shape:
+        issues.append(f"shape mismatch A: hf={hf_A.shape} mlx={mlx_A.shape}")
+        ok = False
+    if hf_A is not None and mlx_A is not None and hf_A.shape == mlx_A.shape:
+        ratio = float(mlx_A.std()) / max(float(hf_A.std()), 1e-12)
+        report("std ratio mlx/hf", ratio)
+        if not (0.5 <= ratio <= 2.0):
+            issues.append(f"A std ratio out of [0.5, 2.0]: {ratio:.3f}")
+            ok = False
+
+    for i in issues:
+        report("FAIL", i)
+    if ok:
+        report("OK", "B==0 in both and A stds within 2x")
+
+    out = {
+        "hf_A_shape": None if hf_A is None else list(hf_A.shape),
+        "hf_A_std": None if hf_A is None else float(hf_A.std()),
+        "hf_B_max_abs": None if hf_B is None else float(np.abs(hf_B).max()),
+        "mlx_A_shape": None if mlx_A is None else list(mlx_A.shape),
+        "mlx_A_std": None if mlx_A is None else float(mlx_A.std()),
+        "mlx_B_max_abs": None if mlx_B is None else float(np.abs(mlx_B).max()),
+        "issues": issues,
+    }
+    (OUT_DIR / "probe_4.json").write_text(json.dumps(out, indent=2))
+    return 0 if ok else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # 0 = all checks passed, 2 = at least one issue recorded
diff --git a/tests/mlx_parity/probe_5_single_grad.py b/tests/mlx_parity/probe_5_single_grad.py
new file mode 100644
index 0000000000..3ad7d53132
--- /dev/null
+++ b/tests/mlx_parity/probe_5_single_grad.py
@@ -0,0 +1,163 @@
+"""Probe 5 — single-batch backward parity at LoRA-B=0.
+
+At step 0 LoRA-B is zero, so the LoRA contribution to forward is zero
+and gradients on LoRA-A and LoRA-B reduce to a simple function of base-
+model activations + base-model gradients w.r.t. q_proj output.
+
+Run ONE forward + backward in both backends, on identical token IDs
+(probe 1 already proves the IDs match). Compare the per-leaf
+gradient norms on layer-0 q_proj LoRA-A and LoRA-B. The shapes
+match (probe 4) so the norms are directly comparable.
+
+If forward+backward parity holds, gradient norms agree within ~5%; the
+gate below is looser (aggregate-norm ratio mlx/hf in [0.5, 2.0]). A larger
+divergence points at the MLX backward / VJP / loss-reduction pipeline.
+
+This probe doesn't try to match the exact value of every gradient
+element (different RNG-initialized A makes that impossible by design);
+instead it asserts the AGGREGATE gradient magnitude is in the same
+ballpark on both sides.
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import MODEL_NAME, TRAIN_TEXT, SEED, OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    seed_everything()
+    banner("Probe 5: single-batch backward parity (B=0)")
+
+    # Build token batch (lengths/labels match what MLX trainer would use).
+    from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    ids = tok.encode(TRAIN_TEXT)
+    if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id:
+        ids.append(tok.eos_token_id)
+    report("token_ids len", len(ids))
+
+    # ---------------- HF side ----------------
+    section("HF + PEFT backward")
+    import torch
+    from transformers import AutoModelForCausalLM
+    from peft import LoraConfig, get_peft_model
+    torch.manual_seed(SEED)
+    hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32)
+    hf_peft = get_peft_model(
+        hf_model,
+        LoraConfig(r=8, lora_alpha=16, lora_dropout=0.0, target_modules=["q_proj"]),
+    )
+    inp = torch.tensor([ids], dtype=torch.long)
+    labels = inp.clone()
+    out = hf_peft(input_ids=inp, labels=labels)
+    out.loss.backward()
+    hf_norms = {}
+    for name, p in hf_peft.named_parameters():
+        if (".0." in name) and ("q_proj.lora_A" in name or "q_proj.lora_B" in name):
+            g = p.grad
+            if g is not None:
+                hf_norms[name.split(".0.")[-1]] = float(g.detach().float().norm().item())
+    report("hf grad norms", hf_norms)
+    report("hf loss", float(out.loss.item()))
+
+    # ---------------- MLX side ----------------
+    section("MLX + unsloth_zoo.mlx backward")
+    import mlx.core as mx
+    import mlx.nn as mlx_nn
+    mx.random.seed(SEED)
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+
+    mlx_model, _ = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype="float32",
+        text_only=True, max_seq_length=64, random_state=SEED,
+    )
+    mlx_model = FastMLXModel.get_peft_model(
+        mlx_model, r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=["q_proj"], random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=False,
+    )
+    loss_fn = make_baseline_loss_fn()
+    batch = mx.array([ids])
+    L = batch.shape[1]
+    lengths = mx.array([[1, L - 1]])
+    labels_mlx = mx.array([ids])
+
+    # nn.value_and_grad takes (model, loss_fn) and uses model.trainable_parameters
+    # internally, avoiding the "argument should contain only arrays" tree_flatten
+    # error that mx.value_and_grad raises when the model tree has non-array
+    # metadata (PEFT wrappers).
+    def loss_for_grad(model, batch, lengths, labels_):
+        loss, _ntok = loss_fn(model, batch, lengths, labels_)
+        return loss
+    loss_and_grad = mlx_nn.value_and_grad(mlx_model, loss_for_grad)
+    loss_val, grads = loss_and_grad(mlx_model, batch, lengths, labels_mlx)
+
+    # Walk grads recursively (it is now a pure-array tree). Sum a per-name
+    # norm dict, restricted to layer-0 q_proj LoRA leaves.
+    mlx_norms = {}
+    total_norm_sq = mx.array(0.0, dtype=mx.float32)
+    n_leaves = 0
+    def _walk(tree, path):
+        nonlocal total_norm_sq, n_leaves
+        if isinstance(tree, dict):
+            for k, v in tree.items():
+                _walk(v, path + (str(k),))
+            return
+        if isinstance(tree, (list, tuple)):
+            for i, v in enumerate(tree):
+                _walk(v, path + (str(i),))
+            return
+        if hasattr(tree, "shape") and hasattr(tree, "dtype"):
+            arr = tree.astype(mx.float32) if hasattr(tree, "astype") else tree
+            total_norm_sq = total_norm_sq + mx.sum(arr * arr)
+            n_leaves += 1
+            name = ".".join(path)
+            if "q_proj" in name and (".0." in name or "layers.0" in name) and (
+                "lora_a" in name.lower() or "lora_b" in name.lower()
+            ):
+                mlx_norms[name] = float(mx.linalg.norm(arr).item())
+    _walk(grads, ())
+    mlx_total_norm = float(mx.sqrt(total_norm_sq).item())
+    report("mlx grad leaves", n_leaves)
+    report("mlx total grad norm (all trainable)", mlx_total_norm)
+    report("mlx q_proj.lora_* grad norms", mlx_norms)
+    report("mlx loss", float(loss_val.item()))
+
+    # Aggregate HF gradient norm for the same comparison.
+    hf_total_sq = 0.0
+    for _, p in hf_peft.named_parameters():
+        if p.grad is not None:
+            hf_total_sq += float((p.grad.detach().float() ** 2).sum().item())
+    hf_total_norm = hf_total_sq ** 0.5
+
+    # ---------------- compare ----------------
+    section("comparison")
+    ratio = mlx_total_norm / max(hf_total_norm, 1e-12)
+    report("hf total grad norm (all trainable)", hf_total_norm)
+    report("mlx total grad norm (all trainable)", mlx_total_norm)
+    report("ratio mlx/hf", ratio)
+    report("hf loss", float(out.loss.item()))
+    report("mlx loss", float(loss_val.item()))
+    ok = 0.5 <= ratio <= 2.0
+
+    out_blob = {
+        "hf_loss": float(out.loss.item()) if hasattr(out, "loss") else None,
+        "mlx_loss": float(loss_val.item()),
+        "hf_total_grad_norm": hf_total_norm,
+        "mlx_total_grad_norm": mlx_total_norm,
+        "ratio_mlx_hf": ratio,
+        "hf_norms": hf_norms,
+        "mlx_norms": mlx_norms,
+    }
+    (OUT_DIR / "probe_5.json").write_text(json.dumps(out_blob, indent=2, default=str))
+    return 0 if ok else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # 0 = norm ratio within [0.5, 2.0], 2 = outside it
diff --git a/tests/mlx_parity/probe_6_adamw_step.py b/tests/mlx_parity/probe_6_adamw_step.py
new file mode 100644
index 0000000000..71dd27eaf3
--- /dev/null
+++ b/tests/mlx_parity/probe_6_adamw_step.py
@@ -0,0 +1,77 @@
+"""Probe 6 — single AdamW step parity (synthetic).
+
+Bypass model + autograd. Drive torch.optim.AdamW and mlx.optimizers.AdamW
+with bit-identical hyperparameters and the SAME initial weights + the
+SAME gradient. Compare the post-step weight tensor.
+
+This is the strongest possible test of the optimizer math:
+  * bias_correction (PyTorch always on; MLX defaulted off pre-#634,
+    on post-#634 -- this probe verifies the post-#634 default actually
+    matches PyTorch's behavior at step 1).
+  * eps placement
+  * weight_decay (decoupled / coupled)
+
+Tolerance: |w_torch - w_mlx| < 1e-5.
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    seed_everything()
+    banner("Probe 6: AdamW step parity (synthetic)")
+
+    np.random.seed(0)
+    W0 = np.random.randn(8, 16).astype(np.float32)
+    G = np.random.randn(8, 16).astype(np.float32) * 0.1
+
+    LR = 1e-3
+    BETA1, BETA2 = 0.9, 0.999
+    EPS = 1e-8
+    WD = 0.0
+
+    section("(a) torch.optim.AdamW one step")
+    import torch
+    w_t = torch.tensor(W0.copy(), requires_grad=True)
+    w_t.grad = torch.tensor(G.copy())
+    opt = torch.optim.AdamW([w_t], lr=LR, betas=(BETA1, BETA2), eps=EPS, weight_decay=WD)
+    opt.step()
+    w_after_t = w_t.detach().cpu().numpy()
+    report("max |w_after_t - W0|", float(np.abs(w_after_t - W0).max()))
+
+    section("(b) mlx.optimizers.AdamW one step, bias_correction=True")
+    import mlx.core as mx
+    import mlx.optimizers as optim
+    w_m = mx.array(W0.copy())
+    state = {"w": w_m}
+    grads = {"w": mx.array(G.copy())}
+    adamw = optim.AdamW(
+        learning_rate=LR, betas=(BETA1, BETA2), eps=EPS, weight_decay=WD,
+        bias_correction=True,
+    )
+    state = adamw.apply_gradients(grads, state)
+    w_after_m = np.asarray(state["w"].astype(mx.float32))
+    report("max |w_after_m - W0|", float(np.abs(w_after_m - W0).max()))
+
+    section("comparison")
+    diff = np.abs(w_after_t - w_after_m)
+    report("max |w_after_t - w_after_m|", float(diff.max()))
+    report("mean |w_after_t - w_after_m|", float(diff.mean()))
+
+    out = {
+        "max_diff": float(diff.max()),
+        "mean_diff": float(diff.mean()),
+        "torch_step_norm": float(np.linalg.norm(w_after_t - W0)),
+        "mlx_step_norm": float(np.linalg.norm(w_after_m - W0)),
+    }
+    (OUT_DIR / "probe_6.json").write_text(json.dumps(out, indent=2))
+    return 0 if diff.max() < 1e-5 else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # 0 = post-step weights agree to < 1e-5, 2 = they do not
diff --git a/tests/mlx_parity/probe_7_loss_curve.py b/tests/mlx_parity/probe_7_loss_curve.py
new file mode 100644
index 0000000000..7604f9f2a6
--- /dev/null
+++ b/tests/mlx_parity/probe_7_loss_curve.py
@@ -0,0 +1,145 @@
+"""Probe 7 — end-to-end 7-step training loss curve, MLX-only.
+
+Re-run the same 7-step config that the smoke test uses, just MLXTrainer
+this time (we already know the HF curve from the CUDA mirror). Capture:
+
+  * per-step training loss
+  * per-step grad norm (as reported by the trainer)
+  * post-train loss on the train row (recomputed via a fresh forward)
+  * greedy generation from `"<<HELLO!!>> My name is "`
+  * tokenized train ids + ntoks-per-batch (from probe 1 path)
+
+Emit everything to probe_7.json so a follow-up analysis script (or a
+maintainer reading the CI log) can directly compare these numbers
+against the CUDA-mirror baseline numbers checked into
+`temp/torchcodec_test/.out/cuda_truemirror_*.json`.
+
+Always exits 0 -- this probe is a data dump, not a gate. It's the
+ground truth that probes 1-6 are debugging.
+"""
+
+import json
+import sys
+
+from _common import (
+    MODEL_NAME,
+    TRAIN_TEXT,
+    PROMPT,
+    SEED,
+    MAX_SEQ_LEN,
+    OUT_DIR,
+    banner,
+    section,
+    report,
+    seed_everything,
+)
+
+
+def main() -> int:  # data dump: returns 0 unconditionally once it completes
+    seed_everything()
+    banner("Probe 7: end-to-end 7-step MLX loss curve")
+
+    import mlx.core as mx
+    from unsloth_zoo.mlx.loader import FastMLXModel
+    from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig
+
+    section("load + LoRA")
+    model, tokenizer = FastMLXModel.from_pretrained(
+        MODEL_NAME, load_in_4bit=False, dtype="float16",
+        text_only=True, max_seq_length=128,  # NOTE(review): trainer config below uses MAX_SEQ_LEN — confirm the two are meant to differ
+        random_state=SEED,
+    )
+    model = FastMLXModel.get_peft_model(
+        model,
+        r=8, lora_alpha=16, lora_dropout=0.0,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        random_state=SEED,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+    )
+
+    section("trainer config (same as the upstream smoke test, minus override workaround)")
+    config = MLXTrainingConfig(
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=3,
+        max_steps=7,
+        learning_rate=1e-3,
+        warmup_steps=0,
+        lr_scheduler_type="constant",
+        optim="adamw",
+        weight_decay=0.0,
+        max_grad_norm=1.0,
+        # max_grad_value is deliberately not passed; its default is reported below
+        logging_steps=1,
+        max_seq_length=MAX_SEQ_LEN,
+        seed=SEED,
+        use_cce=False,
+        compile=False,
+        gradient_checkpointing=False,
+        output_dir=str(OUT_DIR / "probe7_outputs"),
+        save_steps=0,
+        eval_steps=0,
+        dataset_text_field="text",
+    )
+    report("max_grad_value default", config.max_grad_value)
+    report("max_grad_norm", config.max_grad_norm)
+
+    trainer = MLXTrainer(
+        model=model, tokenizer=tokenizer,
+        train_dataset=[{"text": TRAIN_TEXT}] * 64,  # the single train row repeated 64 times
+        args=config,
+    )
+
+    rows = []  # filled by _on_step, one dict per callback invocation
+    def _on_step(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens, grad_norm):
+        rows.append({
+            "step": int(step), "loss": float(loss),
+            "lr": float(lr), "grad_norm": None if grad_norm is None else float(grad_norm),
+            "num_tokens": int(num_tokens),
+        })
+    trainer.add_step_callback(_on_step)
+    trainer.train()
+
+    section("post-train forward")
+    from unsloth_zoo.mlx.utils import make_baseline_loss_fn
+    loss_fn = make_baseline_loss_fn()
+    ids = tokenizer.encode(TRAIN_TEXT)
+    if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id:
+        ids.append(tokenizer.eos_token_id)  # ensure the row ends with EOS exactly once
+    L = len(ids)
+    batch = mx.array([ids])  # batch of one row
+    lengths = mx.array([[1, L - 1]])  # one [1, L-1] pair for that row
+    labels_mlx = mx.array([ids])  # labels are the input ids themselves
+    post_loss, _ = loss_fn(model, batch, lengths, labels_mlx)  # second return value is unused here
+    post_loss_val = float(post_loss.item())
+    report("post_train_loss", post_loss_val)
+
+    section("greedy generation")
+    from mlx_lm import generate
+    gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False)
+    report("generation", repr(gen))
+    contains = "Unsloth" in gen  # recorded only; does not affect the exit code
+
+    out = {
+        "tokenized_train_ids": ids,
+        "tokenized_train_len": L,
+        "rows": rows,
+        "post_train_loss": post_loss_val,
+        "generation": gen,
+        "contains_unsloth": contains,
+    }
+    (OUT_DIR / "probe_7.json").write_text(json.dumps(out, indent=2))
+    section("summary")
+    report("step-1 loss", rows[0]["loss"] if rows else None)  # first recorded row, None if no callback fired
+    report("step-7 loss", rows[-1]["loss"] if rows else None)  # last recorded row
+    report("post_train_loss", post_loss_val)
+    report("contains 'Unsloth'", contains)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0: this probe is a data dump, not a gate
diff --git a/tests/mlx_parity/probe_8_per_token_loss.py b/tests/mlx_parity/probe_8_per_token_loss.py
new file mode 100644
index 0000000000..5be99498c5
--- /dev/null
+++ b/tests/mlx_parity/probe_8_per_token_loss.py
@@ -0,0 +1,87 @@
+"""Probe 8 — per-token CE decomposition.
+
+The aggregate step-1 loss gap (HF 7.64 vs MLX 10.55) is a single scalar.
+This probe breaks it down per position:
+
+  * tokenize the train row identically
+  * forward through the base model on both backends (no LoRA)
+  * compute per-token cross-entropy at every position
+  * print: tok_idx, token_id, decoded, ce_hf, ce_mlx, abs(ce_hf - ce_mlx)
+
+If the gap is concentrated on specific positions (BOS, EOS, special
+tokens), the divergence is likely a masking / special-token handling
+bug. If it is spread evenly, it is a precision / numerics issue across
+the whole forward pass.
+
+Always exits 0 -- diagnostic dump.
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    seed_everything()
+    banner("Probe 8: per-token CE decomposition")
+
+    from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    ids = tok.encode(TRAIN_TEXT)
+    if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id:
+        ids.append(tok.eos_token_id)
+    report("token_ids", ids)
+    L = len(ids)
+    report("len", L)
+
+    section("HF base forward (fp32)")
+    import torch
+    import torch.nn.functional as F
+    from transformers import AutoModelForCausalLM
+    hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32)
+    hf_model.eval()
+    with torch.no_grad():
+        logits = hf_model(input_ids=torch.tensor([ids], dtype=torch.long)).logits[0].float().cpu()
+    # shift: predict token t+1 from logits[t]
+    shift_logits = logits[:-1]
+    shift_targets = torch.tensor(ids[1:], dtype=torch.long)
+    hf_ce = F.cross_entropy(shift_logits, shift_targets, reduction="none").numpy()
+    report("hf mean CE", float(hf_ce.mean()))
+    report("hf sum CE", float(hf_ce.sum()))
+
+    section("MLX base forward (fp32)")
+    import mlx.core as mx
+    import mlx.nn as nn
+    from mlx_lm import load as mlx_load
+    mlx_model, _ = mlx_load(MODEL_NAME)
+    mlx_logits = np.asarray(mlx_model(mx.array([ids])).astype(mx.float32))[0]
+    shift_mlx = mx.array(mlx_logits[:-1])
+    shift_tgt = mx.array(np.asarray(ids[1:], dtype=np.int64))
+    mlx_ce = np.asarray(nn.losses.cross_entropy(shift_mlx, shift_tgt, reduction="none"))
+    report("mlx mean CE", float(mlx_ce.mean()))
+    report("mlx sum CE", float(mlx_ce.sum()))
+
+    section("per-token table")
+    print(f"  {'idx':>3}  {'tok_id':>7}  {'decoded':<24}  {'ce_hf':>9}  {'ce_mlx':>9}  {'abs_diff':>9}")
+    for i in range(L - 1):
+        tid = ids[i + 1]
+        dec = tok.decode([tid]).replace("\n", "\\n").replace("\t", "\\t")[:24]
+        print(f"  {i:>3}  {tid:>7}  {dec:<24}  {float(hf_ce[i]):>9.4f}  {float(mlx_ce[i]):>9.4f}  {abs(float(hf_ce[i]) - float(mlx_ce[i])):>9.4f}")
+
+    out = {
+        "token_ids": ids,
+        "hf_per_token_ce": hf_ce.tolist(),
+        "mlx_per_token_ce": mlx_ce.tolist(),
+        "hf_mean": float(hf_ce.mean()),
+        "mlx_mean": float(mlx_ce.mean()),
+        "abs_diff_total": float(np.abs(hf_ce - mlx_ce).sum()),
+    }
+    (OUT_DIR / "probe_8.json").write_text(json.dumps(out, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0: diagnostic dump only
diff --git a/tests/mlx_parity/probe_9_attention_lengths.py b/tests/mlx_parity/probe_9_attention_lengths.py
new file mode 100644
index 0000000000..2f2810568c
--- /dev/null
+++ b/tests/mlx_parity/probe_9_attention_lengths.py
@@ -0,0 +1,103 @@
+"""Probe 9 — attention mask / lengths inspection.
+
+HF SFTTrainer's default collator and MLX trainer's create_batches both
+build a (batch, lengths_or_mask) representation. Their masking
+semantics may differ in subtle ways:
+
+  HF SFTTrainer:
+    * attention_mask is a (B, L) 0/1 tensor; 0 marks padding tokens.
+    * labels = input_ids with padding positions set to -100.
+    * loss is reduced over labels != -100.
+
+  MLX trainer (unsloth_zoo.mlx):
+    * batch is (B, L) padded with 0.
+    * lengths is (B, 2) of [start, end] = [1, L-1] for this dataset
+      (see trainer.py around batch_lengths.append([1, L-1])).
+    * labels mirror input_ids with [-100]*pad_len trailing.
+    * loss mask = (targets != -100) AND length_mask(start, end).
+
+This probe enumerates what tokens are actually being supervised in
+each case for our specific train row and confirms the two paths
+supervise the SAME positional set.
+"""
+
+import json
+import sys
+
+import numpy as np
+
+from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything
+
+
+def main() -> int:
+    seed_everything()
+    banner("Probe 9: attention mask / lengths inspection")
+
+    from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    ids = tok.encode(TRAIN_TEXT)
+    if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id:
+        ids.append(tok.eos_token_id)
+    L = len(ids)
+    report("token_ids", ids)
+    report("len", L)
+
+    section("HF SFTTrainer style supervision mask")
+    # No padding here (batch of 1, length L) -> attention_mask is all 1s,
+    # labels mirror ids, all positions are supervised after shift.
+    attn = [1] * L
+    labels = list(ids)
+    shifted_labels = labels[1:]
+    hf_supervised_positions = list(range(L - 1))
+    hf_supervised_tokens = [tok.decode([t]) for t in shifted_labels]
+    report("attention_mask", attn)
+    report("shifted target ids", shifted_labels)
+    report("supervised positions (post-shift)", hf_supervised_positions)
+
+    section("MLX trainer style supervision mask")
+    # Mirrors the path in unsloth_zoo/mlx/trainer.py:
+    #   batch_lengths.append([1, L - 1])
+    #   length_mask = (steps >= lengths[:,0]) AND (steps <= lengths[:,1])
+    #   steps = mx.arange(1, targets.shape[1] + 1) == [1..L-1]
+    # so length_mask is TRUE for steps in [1, L-1], i.e. all post-shift
+    # positions for our unpadded batch.
+    lengths_pair = [1, L - 1]
+    steps = list(range(1, L))  # = [1..L-1]
+    length_mask = [(s >= lengths_pair[0]) and (s <= lengths_pair[1]) for s in steps]
+    targets_mlx = labels[1:]
+    mask_neg100 = [t != -100 for t in targets_mlx]
+    combined_mask = [a and b for a, b in zip(length_mask, mask_neg100)]
+    mlx_supervised_positions = [i for i, m in enumerate(combined_mask) if m]
+    mlx_supervised_tokens = [tok.decode([targets_mlx[i]]) for i in mlx_supervised_positions]
+    report("lengths_pair", lengths_pair)
+    report("steps", steps)
+    report("length_mask", length_mask)
+    report("supervised positions (post-shift)", mlx_supervised_positions)
+
+    section("comparison")
+    matches = hf_supervised_positions == mlx_supervised_positions
+    report("supervised positions match", matches)
+    report("hf supervises N tokens", len(hf_supervised_positions))
+    report("mlx supervises N tokens", len(mlx_supervised_positions))
+    only_hf = set(hf_supervised_positions) - set(mlx_supervised_positions)
+    only_mlx = set(mlx_supervised_positions) - set(hf_supervised_positions)
+    if only_hf:
+        report("only supervised by HF", list(only_hf))
+    if only_mlx:
+        report("only supervised by MLX", list(only_mlx))
+
+    out = {
+        "token_ids": ids,
+        "hf_supervised_positions": hf_supervised_positions,
+        "mlx_supervised_positions": mlx_supervised_positions,
+        "match": matches,
+        "n_supervised_hf": len(hf_supervised_positions),
+        "n_supervised_mlx": len(mlx_supervised_positions),
+        "lengths_pair": lengths_pair,
+    }
+    (OUT_DIR / "probe_9.json").write_text(json.dumps(out, indent=2))
+    return 0 if matches else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # 0 = supervised positions match, 2 = they differ