From 5065a35441cdb070227a5c67b25b55a39c990486 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 11:34:06 +0000 Subject: [PATCH 01/84] tests/mlx_parity: 7-probe MLX vs HF parity bisection + Mac M1 workflow Symptom on upstream unsloth MLX CI: identical 7-step LoRA fine-tune of gemma-3-270m-it on "<> My name is Unsloth!" produces: * HF SFTTrainer (CUDA bf16): step-1 loss 7.64, post 0.001, gen contains "Unsloth" * MLX trainer (Apple M1): step-1 loss 10.55, post 0.009, gen "5 lbs!" The 1.38x pre-optimizer-step forward-pass discrepancy is the root anomaly. The clipping override fixed by unsloth-zoo#663 is real but does not explain the loss gap (CUDA mirror at every clip setting emits "Unsloth"). This drops 7 small probes that bisect the dispatch path: 1 tokenization do the input ids match? 2 forward logits does the base model emit the same logits? 3 loss reduction does CE-then-mean produce the same scalar (synthetic)? 4 LoRA init is B=0 in both; is A std within 2x? 5 single backward do gradient norms agree within 2x at LoRA-B=0? 6 AdamW step does one optimizer step produce the same weight (synthetic)? 7 7-step loss curve data dump of step losses + grad norms + final generation continue-on-error per probe so a single divergence does not hide diagnostics for the rest. Aggregated JSON dumps printed to the job log and uploaded as a CI artifact. 
--- .github/workflows/mlx-parity-probe.yml | 167 +++++++++++++++++++++ tests/mlx_parity/README.md | 30 ++++ tests/mlx_parity/_common.py | 57 +++++++ tests/mlx_parity/probe_1_tokenization.py | 72 +++++++++ tests/mlx_parity/probe_2_forward_logits.py | 107 +++++++++++++ tests/mlx_parity/probe_3_loss_reduction.py | 83 ++++++++++ tests/mlx_parity/probe_4_lora_init.py | 151 +++++++++++++++++++ tests/mlx_parity/probe_5_single_grad.py | 136 +++++++++++++++++ tests/mlx_parity/probe_6_adamw_step.py | 77 ++++++++++ tests/mlx_parity/probe_7_loss_curve.py | 145 ++++++++++++++++++ 10 files changed, 1025 insertions(+) create mode 100644 .github/workflows/mlx-parity-probe.yml create mode 100644 tests/mlx_parity/README.md create mode 100644 tests/mlx_parity/_common.py create mode 100644 tests/mlx_parity/probe_1_tokenization.py create mode 100644 tests/mlx_parity/probe_2_forward_logits.py create mode 100644 tests/mlx_parity/probe_3_loss_reduction.py create mode 100644 tests/mlx_parity/probe_4_lora_init.py create mode 100644 tests/mlx_parity/probe_5_single_grad.py create mode 100644 tests/mlx_parity/probe_6_adamw_step.py create mode 100644 tests/mlx_parity/probe_7_loss_curve.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml new file mode 100644 index 0000000000..d53aa0dcf5 --- /dev/null +++ b/.github/workflows/mlx-parity-probe.yml @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. +# +# MLX vs HF parity bisection probes. +# +# Why this workflow exists: +# The upstream MLX CI on `unslothai/unsloth` is failing the +# in-memory generation assertion in tests/studio/run_real_mlx_smoke.py +# (model emits "5 lbs!" instead of containing "Unsloth"). The CUDA +# mirror in temp/torchcodec_test/cuda_mirror.py emits "Unsloth" under +# every clip setting tested. 
Step-1 forward-pass loss is 7.64 on CUDA +# and 10.55 on MLX for the IDENTICAL model + data + seed -- a 1.38x +# pre-optimizer-step discrepancy. The clipping override fixed by +# unsloth-zoo#663 is a real bug but does not explain the loss gap. +# +# This workflow runs 7 small probes on a real macos-14-arm64 runner +# that bisect the parity gap across the dispatch path: +# 1. tokenization identical input ids? +# 2. base-model forward logits identical logits? +# 3. loss reduction (synthetic) same CE-mean number? +# 4. LoRA init B=0 in both; A std comparable? +# 5. single backward gradient norms comparable? +# 6. AdamW step (synthetic) same post-step weight? +# 7. 7-step loss curve data dump for follow-up analysis +# +# continue-on-error: true on each probe so a single divergence does +# not hide the diagnostics for the later probes. +# +# Status: experimental / debug. Surfaces "MLX parity probes" PR check. + +name: MLX parity probes + +on: + pull_request: + paths: + - 'tests/mlx_parity/**' + - '.github/workflows/mlx-parity-probe.yml' + push: + branches: [mlx-parity-probes] + workflow_dispatch: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + probes: + name: probes + runs-on: macos-14 + timeout-minutes: 30 + steps: + - name: Harden runner (audit) + uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 + with: + egress-policy: audit + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install deps + run: | + python -m pip install --upgrade pip + # MLX + mlx-lm for the MLX-side calls + pip install \ + 'mlx==0.30.0' \ + 'mlx-lm==0.30.0' \ + 'numpy==2.4.4' \ + 'pytest==9.0.3' + # torch CPU for the HF-side calls; CPU is fine since we're + # measuring 
losses + grad norms, not throughput + pip install --index-url https://download.pytorch.org/whl/cpu \ + 'torch==2.10.0' + # transformers + peft for HF SFTTrainer parity + pip install \ + 'transformers==4.57.6' \ + 'peft==0.18.0' \ + 'datasets==4.3.0' \ + 'accelerate==1.13.0' \ + 'sentencepiece==0.2.1' \ + 'huggingface-hub==0.36.2' + # unsloth-zoo from git (provides unsloth_zoo.mlx.* on Apple Silicon) + for attempt in 1 2 3; do + if pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo"; then break; fi + if [ "$attempt" -eq 3 ]; then exit 1; fi + sleep $((5*attempt)) + done + + - name: Probe 1 — tokenization parity + if: always() + continue-on-error: true + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + cd tests/mlx_parity && python probe_1_tokenization.py + + - name: Probe 2 — base-model forward logits parity + if: always() + continue-on-error: true + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + cd tests/mlx_parity && python probe_2_forward_logits.py + + - name: Probe 3 — loss reduction parity (synthetic) + if: always() + continue-on-error: true + run: | + cd tests/mlx_parity && python probe_3_loss_reduction.py + + - name: Probe 4 — LoRA init parity + if: always() + continue-on-error: true + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + cd tests/mlx_parity && python probe_4_lora_init.py + + - name: Probe 5 — single backward parity + if: always() + continue-on-error: true + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + cd tests/mlx_parity && python probe_5_single_grad.py + + - name: Probe 6 — AdamW step parity (synthetic) + if: always() + continue-on-error: true + run: | + cd tests/mlx_parity && python probe_6_adamw_step.py + + - name: Probe 7 — 7-step MLX loss curve (data dump) + if: always() + continue-on-error: true + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + UNSLOTH_COMPILE_DISABLE: '1' + run: | + cd tests/mlx_parity && python probe_7_loss_curve.py + + - name: Aggregate probe results + if: always() + run: | + set +e + 
echo "=== Aggregated probe JSON dumps ===" + for i in 1 2 3 4 5 6 7; do + echo "--- probe_${i}.json ---" + cat tests/mlx_parity/.out/probe_${i}.json 2>/dev/null || echo "(missing -- probe ${i} did not run or crashed)" + echo + done + + - name: Upload probe artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: mlx-parity-probe-output + path: tests/mlx_parity/.out/ + if-no-files-found: warn diff --git a/tests/mlx_parity/README.md b/tests/mlx_parity/README.md new file mode 100644 index 0000000000..04bf188d05 --- /dev/null +++ b/tests/mlx_parity/README.md @@ -0,0 +1,30 @@ +# MLX vs HF parity probes + +Seven small, focused probes designed to bisect the parity gap between MLX +training (via `unsloth_zoo.mlx.trainer`) and HF training (via +`trl.SFTTrainer`) on the same hyperparameters. + +Symptom: identical 7-step LoRA fine-tune of `unsloth/gemma-3-270m-it` on the +single row `"<> My name is Unsloth!"` produces: + +| | step-1 loss | post-train loss | greedy generation | +|---|---|---|---| +| HF SFTTrainer (CUDA bf16) | 7.64 | 0.001 | `"... Unsloth! My personality is bubbly ..."` | +| MLX trainer | 10.55 | 0.009 | `"5 lbs!"` | + +The 1.38x step-1 forward-pass gap is the root anomaly. Each probe answers +one question along the dispatch path: + +| # | probe | question | +|---|---|---| +| 1 | `probe_1_tokenization.py` | does the tokenized input differ? | +| 2 | `probe_2_forward_logits.py` | does the base model emit different logits? | +| 3 | `probe_3_loss_reduction.py` | does CE-then-reduce produce different scalars? | +| 4 | `probe_4_lora_init.py` | does LoRA init produce different magnitudes? | +| 5 | `probe_5_single_grad.py` | does one backward produce different gradients? | +| 6 | `probe_6_adamw_step.py` | does one AdamW step produce different deltas? | +| 7 | `probe_7_loss_curve.py` | what does the 7-step curve look like end-to-end? | + +Each probe prints diagnostic data, then asserts a numeric tolerance. 
The +workflow runs them with `continue-on-error: true` so even a single +diverging probe still prints subsequent diagnostic data. diff --git a/tests/mlx_parity/_common.py b/tests/mlx_parity/_common.py new file mode 100644 index 0000000000..3356cbf41a --- /dev/null +++ b/tests/mlx_parity/_common.py @@ -0,0 +1,57 @@ +"""Shared constants + helpers for MLX parity probes. + +The probes deliberately share NOTHING with `unsloth_zoo.mlx.trainer` — +each probe re-derives the quantity from first principles so we can tell +where the trainer's wiring differs from the textbook HF/PyTorch recipe. +""" + +from __future__ import annotations + +import os +import random +from pathlib import Path + +import numpy as np + + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" +PROMPT = "<> My name is " +SEED = 3407 +MAX_SEQ_LEN = 64 + +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def seed_everything(seed: int = SEED) -> None: + random.seed(seed) + np.random.seed(seed) + os.environ["PYTHONHASHSEED"] = str(seed) + try: + import torch + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + except Exception: + pass + try: + import mlx.core as mx + mx.random.seed(seed) + except Exception: + pass + + +def banner(title: str) -> None: + print() + print("=" * 72) + print(f"=== {title}") + print("=" * 72, flush=True) + + +def section(title: str) -> None: + print(f"\n--- {title} ---", flush=True) + + +def report(name: str, value) -> None: + print(f" {name}: {value}", flush=True) diff --git a/tests/mlx_parity/probe_1_tokenization.py b/tests/mlx_parity/probe_1_tokenization.py new file mode 100644 index 0000000000..41df476e4b --- /dev/null +++ b/tests/mlx_parity/probe_1_tokenization.py @@ -0,0 +1,72 @@ +"""Probe 1 — tokenization parity. + +Compare two ways of tokenizing the same training text: + + (a) HF SFTTrainer path: tokenizer(TRAIN_TEXT, return_tensors=...) 
+ (b) MLX trainer path: tokenizer.encode(TRAIN_TEXT); maybe append EOS + +Difference in token IDs / length here would explain a different per-token +denominator and thus a different reported scalar loss, even with identical +math downstream. + +Exits 0 on parity, 2 on divergence (with diagnostic printout). +""" + +import json +import sys + +from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 1: tokenization parity") + + from transformers import AutoTokenizer + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + report("tokenizer class", type(tok).__name__) + report("vocab_size", tok.vocab_size) + report("bos_token_id", tok.bos_token_id) + report("eos_token_id", tok.eos_token_id) + report("pad_token_id", tok.pad_token_id) + + section("(a) HF SFTTrainer path: tokenizer(TRAIN_TEXT)") + hf_enc = tok(TRAIN_TEXT, add_special_tokens=True) + hf_ids = list(hf_enc["input_ids"]) + report("input_ids", hf_ids) + report("len", len(hf_ids)) + report("first/last id", (hf_ids[0], hf_ids[-1])) + report("decoded", repr(tok.decode(hf_ids))) + + section("(b) MLX trainer path: tokenizer.encode + EOS append") + mlx_ids = tok.encode(TRAIN_TEXT) + if tok.eos_token_id is not None and (not mlx_ids or mlx_ids[-1] != tok.eos_token_id): + mlx_ids.append(tok.eos_token_id) + report("input_ids", mlx_ids) + report("len", len(mlx_ids)) + report("first/last id", (mlx_ids[0], mlx_ids[-1])) + report("decoded", repr(tok.decode(mlx_ids))) + + section("comparison") + same = hf_ids == mlx_ids + delta_len = len(mlx_ids) - len(hf_ids) + report("identical id list", same) + report("len_mlx - len_hf", delta_len) + if not same: + only_a = [i for i in hf_ids if i not in mlx_ids] + only_b = [i for i in mlx_ids if i not in hf_ids] + report("ids only in HF path", only_a) + report("ids only in MLX path", only_b) + + out = { + "hf_ids": hf_ids, + "mlx_ids": mlx_ids, + "delta_len": delta_len, + "identical": same, + } + 
(OUT_DIR / "probe_1.json").write_text(json.dumps(out, indent=2)) + return 0 if same else 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_2_forward_logits.py b/tests/mlx_parity/probe_2_forward_logits.py new file mode 100644 index 0000000000..db038648ad --- /dev/null +++ b/tests/mlx_parity/probe_2_forward_logits.py @@ -0,0 +1,107 @@ +"""Probe 2 — base-model forward logits parity. + +Load gemma-3-270m-it under BOTH backends (HF transformers, MLX via mlx-lm) +with NO LoRA attached. Feed identical token IDs. Capture logits. +Compare: + * logit dtype / shape + * argmax token sequence + * mean/max absolute logit difference + * mean / max softmax probability difference + +If the base-model forward is bit-equivalent then any downstream loss +discrepancy can be blamed on the loss-reduction layer (probe 3) or the +LoRA path (probes 4-5). If the base-model forward diverges measurably +here, that is itself a parity bug. + +Exits 0 if max prob diff <= 5e-3 (fp16/bf16 noise floor) and argmax fully agrees, else 2. 
+""" + +import json +import sys + +import numpy as np + +from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 2: base-model forward logits parity") + + from transformers import AutoTokenizer + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + ids = tok.encode(TRAIN_TEXT) + if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id: + ids.append(tok.eos_token_id) + report("token_ids", ids) + report("len", len(ids)) + + # ----------------- HF side ----------------- + section("HF transformers forward") + import torch + from transformers import AutoModelForCausalLM + hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32) + hf_model.eval() + with torch.no_grad(): + hf_logits = hf_model( + input_ids=torch.tensor([ids], dtype=torch.long), + ).logits[0].float().cpu().numpy() + report("logits shape", hf_logits.shape) + report("logits dtype", hf_logits.dtype) + report("argmax[:8]", hf_logits[:8].argmax(axis=-1).tolist()) + + # ----------------- MLX side ----------------- + section("MLX (mlx-lm) forward") + import mlx.core as mx + from mlx_lm import load as mlx_load + mlx_model, _ = mlx_load(MODEL_NAME) + mlx_logits = np.asarray(mlx_model(mx.array([ids])).astype(mx.float32))[0] + report("logits shape", mlx_logits.shape) + report("logits dtype", mlx_logits.dtype) + report("argmax[:8]", mlx_logits[:8].argmax(axis=-1).tolist()) + + # ----------------- compare ----------------- + section("comparison") + if hf_logits.shape != mlx_logits.shape: + report("FATAL: shape mismatch", (hf_logits.shape, mlx_logits.shape)) + return 2 + + abs_diff = np.abs(hf_logits - mlx_logits) + report("max |logit diff|", float(abs_diff.max())) + report("mean |logit diff|", float(abs_diff.mean())) + + def softmax(x): + x = x - x.max(axis=-1, keepdims=True) + e = np.exp(x) + return e / e.sum(axis=-1, keepdims=True) + + hf_p = softmax(hf_logits) + mlx_p = softmax(mlx_logits) + 
prob_diff = np.abs(hf_p - mlx_p) + max_pd = float(prob_diff.max()) + report("max |softmax diff|", max_pd) + report("mean |softmax diff|", float(prob_diff.mean())) + + hf_argmax = hf_logits.argmax(axis=-1) + mlx_argmax = mlx_logits.argmax(axis=-1) + argmax_match = (hf_argmax == mlx_argmax).mean() + report("argmax match rate", float(argmax_match)) + + out = { + "token_ids": ids, + "max_logit_diff": float(abs_diff.max()), + "mean_logit_diff": float(abs_diff.mean()), + "max_softmax_diff": max_pd, + "argmax_match_rate": float(argmax_match), + } + (OUT_DIR / "probe_2.json").write_text(json.dumps(out, indent=2)) + + # 5e-3 softmax tolerance accommodates bf16/fp32 numerics; argmax + # should fully agree on a well-trained instruct model. + if max_pd > 5e-3 or argmax_match < 1.0: + return 2 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_3_loss_reduction.py b/tests/mlx_parity/probe_3_loss_reduction.py new file mode 100644 index 0000000000..92957714f6 --- /dev/null +++ b/tests/mlx_parity/probe_3_loss_reduction.py @@ -0,0 +1,83 @@ +"""Probe 3 — loss reduction parity (synthetic logits/labels). + +Bypass the model entirely. Drive a fixed numpy (logits, labels) pair +through: + + (a) torch.nn.functional.cross_entropy with ignore_index=-100, reduction='mean' + (the HF SFTTrainer default). + (b) unsloth_zoo.mlx.utils.make_baseline_loss_fn's recipe replicated + in MLX: cross_entropy * mask, summed, divided by mask.sum(). + +For identical inputs the two scalars MUST match (mod fp32 noise). If they +diverge, the MLX trainer's loss-reduction layer differs from HF's. + +Exits 0 if |loss_a - loss_b| < 1e-4, else 2; token counts are reported but not gated. 
+""" + +import json +import sys + +import numpy as np + +from _common import OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 3: loss reduction parity (synthetic logits/labels)") + + # Synthetic: batch=1, seq_len=10, vocab=8 -- small + reproducible. + np.random.seed(0) + V = 8 + L = 10 + logits = np.random.randn(1, L, V).astype(np.float32) + labels = np.array([[2, 5, 1, -100, 3, 4, 0, 7, -100, 6]], dtype=np.int64) + report("logits shape", logits.shape) + report("labels", labels.tolist()) + n_valid = int((labels != -100).sum()) + report("n_valid (non -100)", n_valid) + + # Shift like HF / MLX both do: predict next token. + shift_logits = logits[:, :-1, :] + shift_labels = labels[:, 1:] + n_valid_shift = int((shift_labels != -100).sum()) + report("n_valid after shift", n_valid_shift) + + section("(a) torch.nn.functional.cross_entropy (HF SFTTrainer recipe)") + import torch + import torch.nn.functional as F + t_logits = torch.tensor(shift_logits.reshape(-1, V)) + t_labels = torch.tensor(shift_labels.reshape(-1)) + hf_loss = F.cross_entropy(t_logits, t_labels, ignore_index=-100, reduction="mean").item() + report("hf_loss", hf_loss) + + section("(b) MLX baseline loss recipe (unsloth_zoo.mlx.utils:417)") + import mlx.core as mx + import mlx.nn as nn + mlx_logits = mx.array(shift_logits) + mlx_labels = mx.array(shift_labels) + mask = (mlx_labels != -100).astype(mx.float32) + safe = mx.where(mlx_labels == -100, 0, mlx_labels) + ce = nn.losses.cross_entropy(mlx_logits, safe) * mask + ntoks = mask.sum() + mlx_loss = (ce.astype(mx.float32).sum() / mx.maximum(ntoks, mx.array(1.0))).item() + report("mlx_loss", mlx_loss) + report("ntoks (mlx)", float(ntoks.item())) + + section("comparison") + diff = abs(hf_loss - mlx_loss) + report("|hf - mlx|", diff) + + out = { + "hf_loss": hf_loss, + "mlx_loss": mlx_loss, + "abs_diff": diff, + "n_valid_shift": n_valid_shift, + } + (OUT_DIR / 
"probe_3.json").write_text(json.dumps(out, indent=2)) + + return 0 if diff < 1e-4 else 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_4_lora_init.py b/tests/mlx_parity/probe_4_lora_init.py new file mode 100644 index 0000000000..0f0492a70c --- /dev/null +++ b/tests/mlx_parity/probe_4_lora_init.py @@ -0,0 +1,151 @@ +"""Probe 4 — LoRA initialization parity. + +Attach LoRA r=8 alpha=16 on q_proj of layer 0 in both backends with +seed=SEED. Inspect the resulting LoRA-A and LoRA-B matrices. + +Expected baseline (standard LoRA init): + A ~ Kaiming uniform (non-zero, small magnitude) + B ~ zero matrix + +If both backends honor this, the LoRA contribution at step 0 is zero +and the base-model forward dominates (i.e. probe 2 + LoRA-attached +forward should produce the same logits up to fp noise). + +This probe does not enforce A == A across backends (different RNGs), +but DOES enforce: + * B is exactly zero in both + * |A.std()| within 2x across backends + * shapes match +""" + +import json +import sys + +import numpy as np + +from _common import MODEL_NAME, SEED, OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 4: LoRA initialization parity") + + # ---------------- HF / torch / PEFT ---------------- + section("HF + PEFT LoRA") + import torch + from transformers import AutoModelForCausalLM + from peft import LoraConfig, get_peft_model + torch.manual_seed(SEED) + hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32) + hf_peft = get_peft_model( + hf_model, + LoraConfig( + r=8, lora_alpha=16, lora_dropout=0.0, bias="none", + target_modules=["q_proj"], + ), + ) + # Find layer-0 q_proj LoRA-A and LoRA-B + hf_A = None + hf_B = None + for name, p in hf_peft.named_parameters(): + if "q_proj.lora_A.default.weight" in name and ".0." in name: + hf_A = p.detach().float().cpu().numpy() + if "q_proj.lora_B.default.weight" in name and ".0." 
in name: + hf_B = p.detach().float().cpu().numpy() + if hf_A is not None and hf_B is not None: + break + report("hf A shape / std", (None if hf_A is None else (hf_A.shape, float(hf_A.std())))) + report("hf B shape / max|B|", (None if hf_B is None else (hf_B.shape, float(np.abs(hf_B).max())))) + + # ---------------- MLX / mlx-lm / unsloth_zoo.mlx ---------------- + section("MLX + unsloth_zoo.mlx LoRA") + import mlx.core as mx + mx.random.seed(SEED) + from unsloth_zoo.mlx.loader import FastMLXModel + mlx_model, _tok = FastMLXModel.from_pretrained( + MODEL_NAME, + load_in_4bit=False, + dtype="float32", + text_only=True, + max_seq_length=64, + random_state=SEED, + ) + mlx_model = FastMLXModel.get_peft_model( + mlx_model, + r=8, + lora_alpha=16, + lora_dropout=0.0, + target_modules=["q_proj"], + random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=False, + ) + mlx_A = None + mlx_B = None + # Walk module tree and grab layer-0 q_proj LoRA params. + def walk(mod, prefix=""): + for name, child in getattr(mod, "named_modules", lambda: [])(): + yield name, child + try: + for name, child in mlx_model.named_modules(): + if name.endswith(".q_proj") and (".layers.0." 
in name or ".0.q_proj" in name): + for attr in ("lora_a", "lora_b", "lora_A", "lora_B"): + if hasattr(child, attr): + v = getattr(child, attr) + arr = np.asarray(mx.eval(v) if callable(getattr(v, "eval", None)) else v) + if attr.lower().endswith("a") and mlx_A is None: + mlx_A = arr + if attr.lower().endswith("b") and mlx_B is None: + mlx_B = arr + break + except Exception as e: + report("introspection error", str(e)) + report("mlx A shape / std", (None if mlx_A is None else (mlx_A.shape, float(mlx_A.std())))) + report("mlx B shape / max|B|", (None if mlx_B is None else (mlx_B.shape, float(np.abs(mlx_B).max())))) + + section("comparison") + ok = True + issues = [] + if hf_A is None or hf_B is None: + issues.append("could not locate HF layer-0 q_proj LoRA params") + ok = False + if mlx_A is None or mlx_B is None: + issues.append("could not locate MLX layer-0 q_proj LoRA params") + ok = False + if hf_B is not None and float(np.abs(hf_B).max()) != 0.0: + issues.append(f"HF B is non-zero (max|B|={float(np.abs(hf_B).max())})") + ok = False + if mlx_B is not None and float(np.abs(mlx_B).max()) != 0.0: + issues.append(f"MLX B is non-zero (max|B|={float(np.abs(mlx_B).max())})") + ok = False + if hf_A is not None and mlx_A is not None and hf_A.shape != mlx_A.shape: + issues.append(f"shape mismatch A: hf={hf_A.shape} mlx={mlx_A.shape}") + ok = False + if hf_A is not None and mlx_A is not None and hf_A.shape == mlx_A.shape: + ratio = float(mlx_A.std()) / max(float(hf_A.std()), 1e-12) + report("std ratio mlx/hf", ratio) + if not (0.5 <= ratio <= 2.0): + issues.append(f"A std ratio out of [0.5, 2.0]: {ratio:.3f}") + ok = False + + for i in issues: + report("FAIL", i) + if ok: + report("OK", "B==0 in both and A stds within 2x") + + out = { + "hf_A_shape": None if hf_A is None else list(hf_A.shape), + "hf_A_std": None if hf_A is None else float(hf_A.std()), + "hf_B_max_abs": None if hf_B is None else float(np.abs(hf_B).max()), + "mlx_A_shape": None if mlx_A is None else 
list(mlx_A.shape), + "mlx_A_std": None if mlx_A is None else float(mlx_A.std()), + "mlx_B_max_abs": None if mlx_B is None else float(np.abs(mlx_B).max()), + "issues": issues, + } + (OUT_DIR / "probe_4.json").write_text(json.dumps(out, indent=2)) + return 0 if ok else 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_5_single_grad.py b/tests/mlx_parity/probe_5_single_grad.py new file mode 100644 index 0000000000..9b00617b60 --- /dev/null +++ b/tests/mlx_parity/probe_5_single_grad.py @@ -0,0 +1,136 @@ +"""Probe 5 — single-batch backward parity at LoRA-B=0. + +At step 0 LoRA-B is zero, so the LoRA contribution to forward is zero +and gradients on LoRA-A and LoRA-B reduce to a simple function of base- +model activations + base-model gradients w.r.t. q_proj output. + +Run ONE forward + backward in both backends, on identical token IDs +(probe 1 already proves the IDs match). Compare the per-leaf +gradient norms on layer-0 q_proj LoRA-A and LoRA-B. The shapes +match (probe 4) so the norms are directly comparable. + +If forward+backward parity holds, gradient norms agree within 2x. +A larger divergence here points the finger at the MLX +backward / VJP / loss-reduction pipeline. + +This probe doesn't try to match the exact value of every gradient +element (different RNG-initialized A makes that impossible by design); +instead it asserts the AGGREGATE gradient magnitude is in the same +ballpark on both sides. +""" + +import json +import sys + +import numpy as np + +from _common import MODEL_NAME, TRAIN_TEXT, SEED, OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 5: single-batch backward parity (B=0)") + + # Build token batch (lengths/labels match what MLX trainer would use). 
+ from transformers import AutoTokenizer + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + ids = tok.encode(TRAIN_TEXT) + if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id: + ids.append(tok.eos_token_id) + report("token_ids len", len(ids)) + + # ---------------- HF side ---------------- + section("HF + PEFT backward") + import torch + from transformers import AutoModelForCausalLM + from peft import LoraConfig, get_peft_model + torch.manual_seed(SEED) + hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32) + hf_peft = get_peft_model( + hf_model, + LoraConfig(r=8, lora_alpha=16, lora_dropout=0.0, target_modules=["q_proj"]), + ) + inp = torch.tensor([ids], dtype=torch.long) + labels = inp.clone() + out = hf_peft(input_ids=inp, labels=labels) + out.loss.backward() + hf_norms = {} + for name, p in hf_peft.named_parameters(): + if (".0." in name) and ("q_proj.lora_A" in name or "q_proj.lora_B" in name): + g = p.grad + if g is not None: + hf_norms[name.split(".0.")[-1]] = float(g.detach().float().norm().item()) + report("hf grad norms", hf_norms) + report("hf loss", float(out.loss.item())) + + # ---------------- MLX side ---------------- + section("MLX + unsloth_zoo.mlx backward") + import mlx.core as mx + mx.random.seed(SEED) + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + mlx_model, _ = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float32", + text_only=True, max_seq_length=64, random_state=SEED, + ) + mlx_model = FastMLXModel.get_peft_model( + mlx_model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=["q_proj"], random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=False, + ) + loss_fn = make_baseline_loss_fn() + batch = mx.array([ids]) + L = batch.shape[1] + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + + import mlx.utils as mxu + + def loss_only(model): + 
loss, _ntok = loss_fn(model, batch, lengths, labels_mlx) + return loss + loss_val, grads = mx.value_and_grad(loss_only)(mlx_model) + flat = mxu.tree_flatten(grads) + mlx_norms = {} + for n, g in flat: + if (".0." in n or "layers.0" in n) and "q_proj" in n and ("lora_A" in n or "lora_B" in n or "lora_a" in n or "lora_b" in n): + mlx_norms[n.split(".0.")[-1] if ".0." in n else n] = float(mx.linalg.norm(g.astype(mx.float32)).item()) + report("mlx grad norms (q_proj.lora_*)", mlx_norms) + report("mlx loss", float(loss_val.item())) + + # ---------------- compare ---------------- + section("comparison") + ratio_info = {} + ok = True + for key_hf, val_hf in hf_norms.items(): + # find the corresponding MLX key by suffix match + match = None + for key_mlx in mlx_norms: + if key_hf.lower().replace("default.weight", "") in key_mlx.lower(): + match = key_mlx + break + if match is None: + ratio_info[key_hf] = {"hf": val_hf, "mlx": None} + ok = False + continue + ratio = mlx_norms[match] / max(val_hf, 1e-12) + ratio_info[key_hf] = {"hf": val_hf, "mlx": mlx_norms[match], "ratio_mlx_hf": ratio} + if not (0.5 <= ratio <= 2.0): + ok = False + report("grad-norm ratios", ratio_info) + out = { + "hf_loss": float(out.loss.item()) if hasattr(out, "loss") else None, + "mlx_loss": float(loss_val.item()), + "hf_norms": hf_norms, + "mlx_norms": mlx_norms, + "ratios": ratio_info, + } + (OUT_DIR / "probe_5.json").write_text(json.dumps(out, indent=2, default=str)) + return 0 if ok else 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_6_adamw_step.py b/tests/mlx_parity/probe_6_adamw_step.py new file mode 100644 index 0000000000..71dd27eaf3 --- /dev/null +++ b/tests/mlx_parity/probe_6_adamw_step.py @@ -0,0 +1,77 @@ +"""Probe 6 — single AdamW step parity (synthetic). + +Bypass model + autograd. Drive torch.optim.AdamW and mlx.optimizers.AdamW +with bit-identical hyperparameters and the SAME initial weights + the +SAME gradient. 
Compare the post-step weight tensor. + +This is the strongest possible test of the optimizer math: + * bias_correction (PyTorch always on; MLX defaulted off pre-#634, + on post-#634 -- this probe verifies the post-#634 default actually + matches PyTorch's behavior at step 1). + * eps placement + * weight_decay (decoupled / coupled) + +Tolerance: |w_torch - w_mlx| < 1e-5. +""" + +import json +import sys + +import numpy as np + +from _common import OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 6: AdamW step parity (synthetic)") + + np.random.seed(0) + W0 = np.random.randn(8, 16).astype(np.float32) + G = np.random.randn(8, 16).astype(np.float32) * 0.1 + + LR = 1e-3 + BETA1, BETA2 = 0.9, 0.999 + EPS = 1e-8 + WD = 0.0 + + section("(a) torch.optim.AdamW one step") + import torch + w_t = torch.tensor(W0.copy(), requires_grad=True) + w_t.grad = torch.tensor(G.copy()) + opt = torch.optim.AdamW([w_t], lr=LR, betas=(BETA1, BETA2), eps=EPS, weight_decay=WD) + opt.step() + w_after_t = w_t.detach().cpu().numpy() + report("max |w_after_t - W0|", float(np.abs(w_after_t - W0).max())) + + section("(b) mlx.optimizers.AdamW one step, bias_correction=True") + import mlx.core as mx + import mlx.optimizers as optim + w_m = mx.array(W0.copy()) + state = {"w": w_m} + grads = {"w": mx.array(G.copy())} + adamw = optim.AdamW( + learning_rate=LR, betas=(BETA1, BETA2), eps=EPS, weight_decay=WD, + bias_correction=True, + ) + state = adamw.apply_gradients(grads, state) + w_after_m = np.asarray(state["w"].astype(mx.float32)) + report("max |w_after_m - W0|", float(np.abs(w_after_m - W0).max())) + + section("comparison") + diff = np.abs(w_after_t - w_after_m) + report("max |w_after_t - w_after_m|", float(diff.max())) + report("mean |w_after_t - w_after_m|", float(diff.mean())) + + out = { + "max_diff": float(diff.max()), + "mean_diff": float(diff.mean()), + "torch_step_norm": float(np.linalg.norm(w_after_t - W0)), + "mlx_step_norm": 
float(np.linalg.norm(w_after_m - W0)), + } + (OUT_DIR / "probe_6.json").write_text(json.dumps(out, indent=2)) + return 0 if diff.max() < 1e-5 else 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_7_loss_curve.py b/tests/mlx_parity/probe_7_loss_curve.py new file mode 100644 index 0000000000..7604f9f2a6 --- /dev/null +++ b/tests/mlx_parity/probe_7_loss_curve.py @@ -0,0 +1,145 @@ +"""Probe 7 — end-to-end 7-step training loss curve, MLX-only. + +Re-run the same 7-step config that the smoke test uses, just MLXTrainer +this time (we already know the HF curve from the CUDA mirror). Capture: + + * per-step training loss + * per-step grad norm (as reported by the trainer) + * post-train loss on the train row (recomputed via a fresh forward) + * greedy generation from `"<> My name is "` + * tokenized train ids + ntoks-per-batch (from probe 1 path) + +Emit everything to probe_7.json so a follow-up analysis script (or a +maintainer reading the CI log) can directly compare these numbers +against the CUDA-mirror baseline numbers checked into +`temp/torchcodec_test/.out/cuda_truemirror_*.json`. + +Always exits 0 -- this probe is a data dump, not a gate. It's the +ground truth that probes 1-6 are debugging. 
+""" + +import json +import sys + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 7: end-to-end 7-step MLX loss curve") + + import mlx.core as mx + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + + section("load + LoRA") + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float16", + text_only=True, max_seq_length=128, + random_state=SEED, + ) + model = FastMLXModel.get_peft_model( + model, + r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + ) + + section("trainer config (same as the upstream smoke test, minus override workaround)") + config = MLXTrainingConfig( + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + max_steps=7, + learning_rate=1e-3, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=1.0, + # leave max_grad_value at config default + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=SEED, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / "probe7_outputs"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + ) + report("max_grad_value default", config.max_grad_value) + report("max_grad_norm", config.max_grad_norm) + + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + rows = [] + def _on_step(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens, grad_norm): + rows.append({ + "step": int(step), "loss": float(loss), + "lr": float(lr), "grad_norm": None if grad_norm is None else 
float(grad_norm), + "num_tokens": int(num_tokens), + }) + trainer.add_step_callback(_on_step) + trainer.train() + + section("post-train forward") + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + report("post_train_loss", post_loss_val) + + section("greedy generation") + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + report("generation", repr(gen)) + contains = "Unsloth" in gen + + out = { + "tokenized_train_ids": ids, + "tokenized_train_len": L, + "rows": rows, + "post_train_loss": post_loss_val, + "generation": gen, + "contains_unsloth": contains, + } + (OUT_DIR / "probe_7.json").write_text(json.dumps(out, indent=2)) + section("summary") + report("step-1 loss", rows[0]["loss"] if rows else None) + report("step-7 loss", rows[-1]["loss"] if rows else None) + report("post_train_loss", post_loss_val) + report("contains 'Unsloth'", contains) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From c3b0c9efa9b55a0d5236bdf74e1b30fbe5e61da3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 11:46:24 +0000 Subject: [PATCH 02/84] .github/workflows: drop unrelated workflows on this branch Keep only mlx-parity-probe.yml so the parity-probe PR runs a single Mac M1 job. The deleted workflows still exist on main; this branch is debug-only and does not get merged back. 
--- .github/workflows/consolidated-tests-ci.yml | 2265 ----------------- .github/workflows/lint-ci.yml | 321 --- .github/workflows/mlx-ci.yml | 430 ---- .github/workflows/notebooks-ci.yml | 440 ---- .github/workflows/release-desktop.yml | 902 ------- .github/workflows/security-audit.yml | 1126 -------- .github/workflows/stale.yml | 37 - .github/workflows/studio-api-smoke.yml | 166 -- .github/workflows/studio-backend-ci.yml | 221 -- .github/workflows/studio-frontend-ci.yml | 151 -- .github/workflows/studio-inference-smoke.yml | 887 ------- .github/workflows/studio-mac-api-smoke.yml | 153 -- .../workflows/studio-mac-inference-smoke.yml | 1042 -------- .github/workflows/studio-mac-ui-smoke.yml | 343 --- .github/workflows/studio-mac-update-smoke.yml | 150 -- .github/workflows/studio-tauri-smoke.yml | 128 - .github/workflows/studio-ui-smoke.yml | 293 --- .github/workflows/studio-update-smoke.yml | 154 -- .../workflows/studio-windows-api-smoke.yml | 246 -- .../studio-windows-inference-smoke.yml | 1167 --------- .github/workflows/studio-windows-ui-smoke.yml | 342 --- .../workflows/studio-windows-update-smoke.yml | 279 -- .github/workflows/version-compat-ci.yml | 312 --- .github/workflows/wheel-smoke.yml | 136 - 24 files changed, 11691 deletions(-) delete mode 100644 .github/workflows/consolidated-tests-ci.yml delete mode 100644 .github/workflows/lint-ci.yml delete mode 100644 .github/workflows/mlx-ci.yml delete mode 100644 .github/workflows/notebooks-ci.yml delete mode 100644 .github/workflows/release-desktop.yml delete mode 100644 .github/workflows/security-audit.yml delete mode 100644 .github/workflows/stale.yml delete mode 100644 .github/workflows/studio-api-smoke.yml delete mode 100644 .github/workflows/studio-backend-ci.yml delete mode 100644 .github/workflows/studio-frontend-ci.yml delete mode 100644 .github/workflows/studio-inference-smoke.yml delete mode 100644 .github/workflows/studio-mac-api-smoke.yml delete mode 100644 
.github/workflows/studio-mac-inference-smoke.yml delete mode 100644 .github/workflows/studio-mac-ui-smoke.yml delete mode 100644 .github/workflows/studio-mac-update-smoke.yml delete mode 100644 .github/workflows/studio-tauri-smoke.yml delete mode 100644 .github/workflows/studio-ui-smoke.yml delete mode 100644 .github/workflows/studio-update-smoke.yml delete mode 100644 .github/workflows/studio-windows-api-smoke.yml delete mode 100644 .github/workflows/studio-windows-inference-smoke.yml delete mode 100644 .github/workflows/studio-windows-ui-smoke.yml delete mode 100644 .github/workflows/studio-windows-update-smoke.yml delete mode 100644 .github/workflows/version-compat-ci.yml delete mode 100644 .github/workflows/wheel-smoke.yml diff --git a/.github/workflows/consolidated-tests-ci.yml b/.github/workflows/consolidated-tests-ci.yml deleted file mode 100644 index 6b008d4bb1..0000000000 --- a/.github/workflows/consolidated-tests-ci.yml +++ /dev/null @@ -1,2265 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# One consolidated CPU-only job that runs every test_* function the existing -# CI does not already cover from this repo plus the full unsloth_zoo@main -# CPU test suite plus unsloth_zoo.compiler.test_apply_fused_lm_head. -# -# Why a separate workflow: -# - studio-backend-ci.yml's "Repo tests (CPU)" job already auto-discovers -# tests/ minus tests/qlora, tests/saving, tests/utils, tests/sh. The 16 -# Bucket-A tests below live inside those --ignore dirs (CPU-runnable but -# historically excluded with their GPU siblings); pulling them out into -# a sibling job keeps the existing 760-passed baseline stable while we -# prove the new pieces are green. -# - unsloth_zoo has no CI on main today (.github/workflows/ is empty -# upstream as of HEAD 030e4ba). 106 of its 111 test_* functions are -# CPU-runnable; the 5 GPU/vLLM ones are deselected here. 
-# - test_apply_fused_lm_head lives at unsloth_zoo/compiler.py:1983, not -# under tests/, so it is not picked up by `pytest tests/`. It is a -# plain function with no fixtures: pure regex over transformers source -# strings, ~5-15 s wall, no GPU. -# -# Strict mode: every test step is gating (no `continue-on-error`). The -# upstream patch fixes that previously caused per-cell red have landed: -# - unslothai/unsloth#5319 (patch_fast_lora import, patch_sft_trainer -# Union, openenv OSError graceful skip). -# - unslothai/unsloth-zoo#628 (MoE coverage canary so old transformers -# skips legitimately while real discovery regressions still fail). -# After those merges every observed cell failure was one of these two -# things; if they regress we want a red cell, not a green-with-fail-prints -# cell. - -name: Core - -on: - pull_request: - paths: - - 'unsloth/**' - - 'unsloth_cli/**' - - 'studio/**' - - 'tests/**' - - 'pyproject.toml' - - '.github/workflows/consolidated-tests-ci.yml' - push: - branches: [main, pip] - workflow_dispatch: - inputs: - unsloth_zoo_ref: - description: 'unsloth_zoo git ref to test against (default main)' - required: false - default: 'main' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - consolidated: - # Matrix: three (transformers, TRL) combos cover the failure surface the - # PR cares about: - # 1. transformers==4.57.6 + TRL latest <1.0.0 (the just-before-5.x line) - # 2. transformers latest 5.x + TRL latest 1.x (the absolute upstream tip; - # currently 5.8.0 + 1.3.0, both BEYOND the unsloth/unsloth_zoo - # <=5.5.0 / <=0.24.0 caps -- the cell exists explicitly to surface - # drift signal) - # 3. transformers + TRL pinned by pyproject.toml's dependency entries - # (resolved dynamically at job time via tomllib) - # fail-fast: false so each cell runs independently and a transformers / - # TRL drift signal in one cell does not cancel the others. 
No - # job-level or per-step `continue-on-error` -- real test failures now - # fail the cell. Patches with legitimate CPU-runner preconditions - # (real CUDA dispatcher, runtime args) are explicitly skipped via - # NEEDS_PRECONDITION in the runtime check shim below. - strategy: - fail-fast: false - matrix: - combo: - - id: t4576-trl0latest - label: "HF=4.57.6 + TRL<1" - transformers_spec: "transformers==4.57.6" - trl_spec: "trl>=0.18.2,<1.0.0" - - id: tlatest5-trl1latest - label: "HF=latest + TRL=latest" - transformers_spec: "transformers>=5,<6" - trl_spec: "trl>=1,<2" - - id: pyproject - label: "HF=default + TRL=default" - transformers_spec: "__from_pyproject__" - trl_spec: "__from_pyproject__" - name: "Core (${{ matrix.combo.label }})" - runs-on: ubuntu-latest - timeout-minutes: 35 - # No job-level or per-step `continue-on-error`. Earlier iterations - # masked real test failures behind green check icons; that lie is - # gone. A failing test step fails the cell. NEEDS_PRECONDITION in - # the runtime check shim handles patches that legitimately cannot - # run on a CPU-only runner (real CUDA dispatcher, runtime args). - env: - UNSLOTH_ZOO_REF: ${{ inputs.unsloth_zoo_ref || 'main' }} - MATRIX_TRANSFORMERS_SPEC: ${{ matrix.combo.transformers_spec }} - MATRIX_TRL_SPEC: ${{ matrix.combo.trl_spec }} - MATRIX_COMBO_ID: ${{ matrix.combo.id }} - # Hoisted to job-level so every step (Sanity, Bucket-A, unsloth_zoo - # pytest, test_apply_fused_lm_head) inherits it. transformers' bundled - # *_pb2.py was generated against an older protoc; the C++ protobuf - # 4+/5+/6 implementation rejects them with "Descriptors cannot be - # created directly". The pure-Python parser bypasses the check; the - # speed cost is negligible for these tests. 
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python - PYTHONPATH: ${{ github.workspace }}/studio - UNSLOTH_COMPILE_DISABLE: '1' - # unsloth_zoo/__init__.py:314 raises ImportError unless UNSLOTH_IS_PRESENT - # is set — normally it is set by unsloth.__init__ when unsloth is imported - # first. In this job we sometimes import unsloth_zoo.* (e.g. - # unsloth_zoo.saving_utils, unsloth_zoo.temporary_patches) without going - # through `import unsloth` first; pin the env var to 1 so unsloth_zoo's - # bootstrap accepts it. Setting it has no effect on unsloth itself. - UNSLOTH_IS_PRESENT: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - # Node 22 unblocks tests/studio/test_chat_preset_builtin_invariants.py's - # `node --experimental-strip-types` subprocess. Cheap to install; keeps - # the consolidated job self-sufficient even if studio-backend-ci.yml - # changes its node setup. - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - name: Install uv (some unsloth_zoo dev tooling expects it on PATH) - run: pip install uv - - - name: Resolve matrix specs (handle __from_pyproject__ sentinel) - # The pyproject cell uses a sentinel; resolve the real `transformers` - # and `trl` constraints from the project's pyproject.toml at job time. - # unsloth's pyproject puts the LLM stack pins in - # [project.optional-dependencies] under the `huggingfacenotorch` - # extra (top-level [project.dependencies] is just typer/pydantic/etc.), - # so we walk every optional extra and pick the first matching spec. - # Other cells pass their spec through unchanged. 
- run: | - set -euxo pipefail - python <<'PY' >> "$GITHUB_ENV" - import os, re, tomllib - spec_t = os.environ["MATRIX_TRANSFORMERS_SPEC"] - spec_r = os.environ["MATRIX_TRL_SPEC"] - - def _pkg_name(spec: str) -> str: - m = re.match(r"\s*([A-Za-z0-9_.-]+)", spec) - return (m.group(1).lower() if m else "") - - if spec_t == "__from_pyproject__" or spec_r == "__from_pyproject__": - with open("pyproject.toml", "rb") as f: - doc = tomllib.load(f) - proj = doc.get("project", {}) - # Try top-level deps first, then all optional extras. - all_deps: list[str] = list(proj.get("dependencies", [])) - for _name, dep_list in proj.get("optional-dependencies", {}).items(): - all_deps.extend(dep_list) - - if spec_t == "__from_pyproject__": - spec_t = next((x for x in all_deps if _pkg_name(x) == "transformers"), - "transformers") - if spec_r == "__from_pyproject__": - spec_r = next((x for x in all_deps if _pkg_name(x) == "trl"), - "trl") - print(f"RESOLVED_TRANSFORMERS_SPEC={spec_t}") - print(f"RESOLVED_TRL_SPEC={spec_r}") - PY - # Echo to logs so the matrix cell label maps cleanly to a spec. - grep RESOLVED_ "$GITHUB_ENV" || true - - - name: Install runtime deps (mirrors studio-backend-ci.yml + mlx-ci.yml) - # The shape matches studio-backend-ci.yml's "Repo tests (CPU)" install - # so we inherit the same CPU-spoof harness in tests/conftest.py and - # the same import-chain guarantees, plus the extra deps that the - # tests/saving + tests/utils Bucket-A files transitively need but - # which Repo tests (CPU) does not require because it --ignores - # those directories: - # - protobuf + sentencepiece: tests/saving/test_fix_sentencepiece_gguf_robustness.py - # does `from transformers.utils import sentencepiece_model_pb2`, - # which imports `google.protobuf`. Not pulled by transformers' - # base install. - # - triton: unsloth/_gpu_init.py:232 does an unconditional - # `import triton`. 
The triton PyPI wheel installs cleanly on - # Linux x86_64 even without CUDA (the import succeeds; runtime - # GPU work is what would fail, which we never do here). - # transformers + trl are matrix-parameterized. - run: | - set -euxo pipefail - python -m pip install --upgrade pip - pip install -r studio/backend/requirements/studio.txt - pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests typer \ - 'numpy<3' pytest==9.0.3 pytest-asyncio httpx \ - protobuf sentencepiece triton \ - psutil packaging tqdm safetensors datasets \ - 'peft>=0.18,<0.20' 'accelerate>=0.34,<2' \ - ipython - # torchvision: unsloth_zoo.vision_utils imports it at module scope. - pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch>=2.4,<2.11' 'torchvision<0.26' - # transformers + trl from the matrix combo. - pip install "$RESOLVED_TRANSFORMERS_SPEC" - pip install "$RESOLVED_TRL_SPEC" - # bitsandbytes: hard import in unsloth/models/_utils.py. Recent - # versions ship a CPU build that imports cleanly on Linux. - pip install 'bitsandbytes>=0.45' - # unsloth itself, editable, no-deps so pip does not fight the - # explicit torch CPU-index install above. - pip install -e . --no-deps - echo "::group::Installed transformers + trl + torch + unsloth versions" - pip show transformers - pip show trl - pip show torch - pip show unsloth - echo "::endgroup::" - - - name: Clone unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }} - # We need the repository tree (the wheel does not ship tests/), so - # clone shallow then editable-install so unsloth_zoo.* imports - # resolve to the cloned tree. We use `pip show` for the location - # check rather than `import unsloth_zoo` because the latter calls - # device_type.get_device_type() at module load and raises on a - # GPU-less runner; pytest steps below route through the existing - # tests/conftest.py spoof which handles that. 
- run: | - set -euxo pipefail - # github.com occasionally 500s on the git fetch; retry so a - # single upstream blip does not fail CI. - for attempt in 1 2 3; do - rm -rf "$RUNNER_TEMP/unsloth-zoo" - if git clone --depth=1 --branch="$UNSLOTH_ZOO_REF" \ - https://github.com/unslothai/unsloth-zoo \ - "$RUNNER_TEMP/unsloth-zoo"; then - break - fi - if [ "$attempt" -eq 3 ]; then - echo "::error::git clone unsloth-zoo failed after 3 attempts" - exit 1 - fi - delay=$((5 * attempt)) - echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..." - sleep "$delay" - done - pip install -e "$RUNNER_TEMP/unsloth-zoo" --no-deps - pip show unsloth_zoo - - - name: Sanity — collection only (both repos) - # Catches import-time breakage before we run the suite. Cheap; bails - # the job out fast if a transformers/torch resolution went sideways. - # Inherits PYTHONPATH / UNSLOTH_COMPILE_DISABLE / PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION - # from the job-level env block. - run: | - set -euxo pipefail - python -m pytest --collect-only -q \ - tests/saving/test_save_shell_injection.py \ - tests/saving/test_patch_saving_none_tokenizer.py \ - tests/saving/test_fix_sentencepiece_gguf_robustness.py \ - tests/utils/test_attention_masks.py \ - tests/utils/test_trunc_normal_patch.py - python -m pytest --collect-only -q "$RUNNER_TEMP/unsloth-zoo/tests/" - - - name: import_fixes drift detectors (18 tests, HARD GATE) - # One drift detector per fix_* / patch_* function in - # unsloth/import_fixes.py. The detectors assert the *healthy* - # upstream shape that the fix expects ABSENT the regression; - # ANY DRIFT DETECTED -> pytest.fail (NEVER skip) so the - # matrix cell goes red and the maintainer triages on the - # next PR, not in a downstream user's crash report. 
- # - # Pathologies covered by the suite (each maps to one fix - # function with the line range cited in the test docstring): - # * protobuf MessageFactory GetPrototype / GetMessageClass - # * datasets 4.4.x recursion range - # * TRL tuple-vs-bool _*_available caching - # * transformers PreTrainedModel.enable_input_require_grads - # source pattern flip - # * transformers torchcodec / causal_conv1d availability - # flags - # * transformers + accelerate is_wandb_available - # * peft.utils.transformers_weight_conversion importability - # + build_peft_weight_mapping signature - # * triton 3.6+ CompiledKernel num_ctas / cluster_dims - # * torch / torchvision pinned compatibility table - # * vllm guided_decoding_params / structured_outputs + - # aimv2 ovis config version - # * huggingface_hub is_offline_mode / HF_HUB_OFFLINE - # * torch.nn.init.trunc_normal_ presence (patch site for - # patch_trunc_normal_precision_issue) - # * xformers post-num_splits-key fix version - # HARD GATE: a red cell here is a real upstream regression - # without a corresponding zoo / unsloth-side workaround. - run: | - python -m pytest -v --tb=short tests/test_import_fixes_drift.py - - - name: public-api surface drift detectors (9 tests, HARD GATE) - # Companion to test_import_fixes_drift.py: that file catches - # third-party drift; this one catches drift in unsloth's OWN - # public surface (FastLanguageModel / FastVisionModel / - # FastModel + their classmethods + is_bf16_supported). A - # rename here would silently break the unslothai/notebooks tree - # one PR cycle later -- this gate catches it BEFORE the - # breakage reaches users. - run: | - python -m pytest -v --tb=short tests/test_public_api_surface.py - - - name: unsloth Bucket-A — CPU tests not in Repo tests (CPU) - # 16 tests across 5 files. They live inside tests/saving/ and - # tests/utils/, both of which Repo tests (CPU) excludes via --ignore - # because their sibling files need real GPUs / real HF weights. 
- # The five files below are pure-Python + AST/protobuf/regex tests - # that run cleanly on CPU. Env inherited from the job block. - run: | - python -m pytest -q --tb=short \ - tests/saving/test_save_shell_injection.py \ - tests/saving/test_patch_saving_none_tokenizer.py \ - tests/saving/test_fix_sentencepiece_gguf_robustness.py \ - tests/utils/test_attention_masks.py \ - tests/utils/test_trunc_normal_patch.py \ - --deselect 'tests/utils/test_attention_masks.py::test_run_attention_flash_varlen_receives_window_and_softcap' - # The deselected test monkeypatches flash_attn_varlen_func, which is - # only bound on the module when `flash_attn` is importable. flash_attn - # requires CUDA + dev toolchain, which the CPU-only ubuntu-latest - # runner does not have. The other 15 Bucket-A tests pass cleanly. - - - name: unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }} — full pytest (CPU) - # 106 of 111 test_* in unsloth_zoo are CPU-only. The two CUDA-skip - # cases below auto-skip on a GPU-less runner; deselect them - # explicitly so the no-CUDA outcome is "deselected", not "skipped", - # making intent visible in the report. Env inherited from job block. - working-directory: ${{ runner.temp }}/unsloth-zoo - run: | - python -m pytest -q --tb=short tests/ \ - --deselect tests/test_unsloth_zoo_lora_merge.py::test_active_merge_device_returns_string_on_cuda_host \ - --deselect tests/test_unsloth_zoo_lora_merge.py::test_merge_lora_moves_cpu_inputs_to_active_device - - - name: unsloth_zoo — test_apply_fused_lm_head (lives in compiler.py) - # `test_apply_fused_lm_head` lives at unsloth_zoo/compiler.py:1983, - # not under tests/, so pytest's default discovery does not pick it up. 
- # We route it through pytest by writing a one-shot shim test file - # inside the unsloth checkout's tests/ — pytest then walks UP and - # picks up tests/conftest.py, whose GPU-spoof harness (lines 84-141) - # patches torch.cuda.is_available, torch.cuda.memory.mem_get_info, - # torch.cuda.get_device_capability, and is_bf16_supported. That full - # spoof is required because unsloth_zoo/temporary_patches/gpt_oss.py - # at module load reads torch.cuda.memory.mem_get_info(0), which - # bare `is_available = True` doesn't cover. Env inherited. - run: | - set -euxo pipefail - cat > tests/_zoo_apply_fused_lm_head_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. - # Wraps unsloth_zoo.compiler.test_apply_fused_lm_head so that - # tests/conftest.py's GPU-spoof harness applies before the import. - # _zoo_aggressive_cuda_spoof extends conftest's harness with deeper - # patches (see tests/_zoo_aggressive_cuda_spoof.py). - import sys, pathlib - sys.path.insert(0, str(pathlib.Path(__file__).parent)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - from unsloth_zoo.compiler import test_apply_fused_lm_head as _zoo_test - def test_zoo_apply_fused_lm_head_runs(): - _zoo_test() - PY - python -m pytest -q --tb=short tests/_zoo_apply_fused_lm_head_shim.py - rm -f tests/_zoo_apply_fused_lm_head_shim.py - - - name: Static checks — unsloth/trainer.py + unsloth/models/rl.py against latest pip TRL - # AST-only sanity: confirm both files parse and that every TRL symbol - # they reference still exists in the installed `trl`. Catches API - # drift (renamed / removed TRL classes) without running training. - # Pre-fetches latest pip transformers in case TRL pinned an older one. 
- run: | - set -euxo pipefail - # Use the matrix-resolved transformers + trl versions already - # installed by the runtime-deps step (don't upgrade here; that - # would defeat the matrix's purpose of testing against the - # specific (transformers, trl) combination the cell selected). - python <<'PY' - import ast, importlib, pathlib, sys - paths = [pathlib.Path("unsloth/trainer.py"), - pathlib.Path("unsloth/models/rl.py")] - for p in paths: - src = p.read_text() - tree = ast.parse(src, filename=str(p)) - # Collect every `from trl... import X` and `from trl... import (X, Y)` - missing = [] - for node in ast.walk(tree): - if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith("trl"): - mod = importlib.import_module(node.module) - for alias in node.names: - if alias.name == "*": - continue - if not hasattr(mod, alias.name): - missing.append(f"{node.module}.{alias.name}") - print(f"{p}: TRL symbols referenced and resolved -> {'OK' if not missing else 'MISSING ' + ', '.join(missing)}") - if missing: - sys.exit(1) - PY - - - name: Static checks — unsloth_zoo/tiled_mlp.py against latest pip transformers - # AST parse + transformers symbol-resolution. The user flagged tiled - # MLP patching as the path that breaks first when transformers ships - # an MLP class rename; this step is the canary against whatever - # transformers version the matrix cell selected. 
- working-directory: ${{ runner.temp }}/unsloth-zoo - run: | - set -euxo pipefail - python <<'PY' - import ast, importlib, pathlib, sys - p = pathlib.Path("unsloth_zoo/tiled_mlp.py") - src = p.read_text() - tree = ast.parse(src, filename=str(p)) - missing = [] - for node in ast.walk(tree): - if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith("transformers"): - try: - mod = importlib.import_module(node.module) - except Exception as e: - missing.append(f"{node.module} (import failed: {type(e).__name__})") - continue - for alias in node.names: - if alias.name == "*": - continue - if not hasattr(mod, alias.name): - missing.append(f"{node.module}.{alias.name}") - print(f"{p}: transformers symbols referenced -> {'OK' if not missing else 'MISSING ' + ', '.join(missing)}") - if missing: - sys.exit(1) - PY - - - name: Static checks — unsloth_zoo/hf_utils.py syntax + import-graph - working-directory: ${{ runner.temp }}/unsloth-zoo - run: | - set -euxo pipefail - python <<'PY' - import ast, pathlib - p = pathlib.Path("unsloth_zoo/hf_utils.py") - tree = ast.parse(p.read_text(), filename=str(p)) - # Surface every public function + class so the PR check log shows - # what's covered, not just OK/FAIL. - public = [] - for node in tree.body: - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)) and not node.name.startswith("_"): - public.append(f"{type(node).__name__.replace('Def','').lower()}:{node.name}") - print(f"hf_utils.py public surface ({len(public)}): " + ", ".join(public)) - PY - - - name: Runtime checks — invoke every zero-arg patch_* across both repos (via pytest shim) - # Routed through pytest so tests/conftest.py's GPU-spoof harness - # applies before any unsloth_zoo.temporary_patches.* import. - # Locally validated 50/51 zero-arg patches succeed; the lone failure - # surfaces a real bug (unsloth.models._utils.patch_fast_lora raises - # NameError: name 'fast_lora_forward' is not defined). 
The shim - # reports the full ledger but only fails when one of the two - # `required` helpers is absent. - run: | - set -euxo pipefail - cat > tests/_runtime_patch_check_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. - # Wraps the runtime patch_* validation into a pytest test so the - # tests/conftest.py GPU-spoof harness applies. continue-on-error - # at the workflow level catches per-patch failures; this shim only - # asserts that the two `required` helpers are reachable. - import sys, pathlib - sys.path.insert(0, str(pathlib.Path(__file__).parent)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - import importlib, inspect - - MODULES = [ - "unsloth.models._utils", "unsloth.models.rl", "unsloth.import_fixes", - "unsloth.kernels.cross_entropy_loss", "unsloth.kernels.rms_layernorm", - "unsloth.tokenizer_utils", "unsloth.save", - "unsloth_zoo.patching_utils", "unsloth_zoo.gradient_checkpointing", - "unsloth_zoo.loss_utils", "unsloth_zoo.tokenizer_utils", - "unsloth_zoo.tiled_mlp", "unsloth_zoo.dataset_utils", - "unsloth_zoo.patch_torch_functions", - "unsloth_zoo.temporary_patches.gemma", - "unsloth_zoo.temporary_patches.ministral", - "unsloth_zoo.temporary_patches.pixtral", - "unsloth_zoo.temporary_patches.deepseek_v3_moe", - "unsloth_zoo.temporary_patches.qwen3_5_moe", - "unsloth_zoo.temporary_patches.mxfp4", - "unsloth_zoo.temporary_patches.bitsandbytes", - "unsloth_zoo.temporary_patches.flex_attention_bwd", - ] - REQUIRED = { - "patch_unsloth_smart_gradient_checkpointing", - "patch_gradient_accumulation_fix", - } - # Patches whose signature looks zero-arg (`()` or all-defaulted) - # but which actually require either runtime args or real CUDA. - # Calling these in isolation is meaningless, so skip the - # invocation. Symbol presence (REQUIRED above) is still verified. - # patch_linear_scaling / patch_llama_rope_scaling: defaults are - # None placeholders; the bodies start with - # `assert is not None`. 
- # patch_unsloth_smart_gradient_checkpointing: legitimately - # allocates CUDA tensors via aten::empty.memory_format inside - # initialize_unsloth_gradient_checkpointing(); the - # torch.cuda.* spoof can't intercept that at the dispatcher - # level. - NEEDS_PRECONDITION = { - "patch_linear_scaling", - "patch_llama_rope_scaling", - "patch_unsloth_smart_gradient_checkpointing", - } - - def test_zero_arg_patch_invocations(): - ok, fail, args, skipped, miss_imports = 0, [], [], [], {} - seen_required = set() - for mod_name in MODULES: - try: - mod = importlib.import_module(mod_name) - except Exception as e: - miss_imports[mod_name] = f"{type(e).__name__}: {e}" - continue - for name in sorted(dir(mod)): - if not name.startswith("patch_"): continue - fn = getattr(mod, name, None) - if not callable(fn): continue - if name in REQUIRED: seen_required.add(name) - try: - sig = inspect.signature(fn) - need = [p.name for p in sig.parameters.values() - if p.default is inspect.Parameter.empty - and p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, - inspect.Parameter.POSITIONAL_ONLY)] - except (TypeError, ValueError): - need = [] - if need: - args.append((mod_name, name, need)); continue - if name in NEEDS_PRECONDITION: - skipped.append(f"{mod_name}.{name}") - print(f" SKIP {mod_name}.{name} (needs precondition / CUDA)") - continue - try: - fn() - ok += 1 - print(f" OK {mod_name}.{name}") - except Exception as e: - fail.append((mod_name, name, type(e).__name__, str(e)[:200])) - print(f" FAIL {mod_name}.{name} -> {type(e).__name__}: {str(e)[:200]}") - print(f"\nzero-arg patch_*: ok={ok} fail={len(fail)} skipped={len(skipped)}") - print(f"arg-required patch_* (skipped, listed for review): {len(args)}") - for m, n, r in args: - print(f" needs={r}: {m}.{n}") - if skipped: - print(f"explicitly skipped (needs precondition / CUDA): {skipped}") - if miss_imports: - print("\nmodules failed to import (skipped):") - for k, v in miss_imports.items(): - print(f" {k}: {v}") - 
print(f"required patch_* helpers seen: {sorted(seen_required)}") - missing = REQUIRED - seen_required - assert not missing, f"required patch_* helpers MISSING: {sorted(missing)}" - # Strict: any zero-arg patch that raises is a real - # regression now that #5319 has landed (the three previously - # known-broken patches are fixed; legitimate - # CPU-precondition skips are recorded in NEEDS_PRECONDITION - # above, not in `fail`). Print all failures and re-raise - # them as one assertion message. - if fail: - raise AssertionError( - f"zero-arg patch_* invocation failures (ok={ok}, " - f"fail={len(fail)}, skipped={len(skipped)}):\n " - + "\n ".join( - f"{m}.{n} -> {ec}: {msg}" for m, n, ec, msg in fail - ) - ) - PY - python -m pytest -q --tb=short tests/_runtime_patch_check_shim.py -s - rm -f tests/_runtime_patch_check_shim.py - - - name: Runtime checks — patch_tiled_mlp on a synthetic MLP module (via pytest shim) - # Same shim pattern: pytest picks up tests/conftest.py before importing - # unsloth_zoo.tiled_mlp, so the GPU-spoof harness covers - # unsloth_zoo.temporary_patches.gpt_oss's mem_get_info call. - run: | - set -euxo pipefail - cat > tests/_tiled_mlp_check_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. 
- import sys, pathlib - sys.path.insert(0, str(pathlib.Path(__file__).parent)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - import torch - import torch.nn as nn - from unsloth_zoo.tiled_mlp import patch_tiled_mlp, patch_mlp - - class _MLP(nn.Module): - def __init__(self, hidden=64, intermediate=128): - super().__init__() - self.gate_proj = nn.Linear(hidden, intermediate, bias=False) - self.up_proj = nn.Linear(hidden, intermediate, bias=False) - self.down_proj = nn.Linear(intermediate, hidden, bias=False) - self.act_fn = nn.SiLU() - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - class _FakeModel(nn.Module): - def __init__(self): - super().__init__() - self.layers = nn.ModuleList([nn.ModuleDict({"mlp": _MLP()}) for _ in range(2)]) - def forward(self, x): - for layer in self.layers: - x = x + layer["mlp"](x) - return x - - def test_patch_tiled_mlp_numerical_equivalence(): - # `patch_mlp(target_arctic=True)` sets `chunk_size = max(1, H)` - # and shards the SEQUENCE dim with `n_shards = max(1, S // - # chunk_size)`. Pick S > H so the tiled path actually runs - # multi-shard (n_shards = 192 // 64 = 3, plus a remainder - # shard) rather than degenerating to n_shards = 1 which is - # bit-exact and only confirms patching installed something. - # If the tiled implementation is correct, multi-shard output - # must still match the un-tiled reference within FP32 noise. - torch.manual_seed(0) - m = _FakeModel().eval() - hidden = 64 - # 192 = 3 * hidden, so divmod(192, 64) = (3, 0) -> 3 shards, - # no remainder; gives a clean multi-shard verification. - x = torch.randn(2, 192, hidden) - with torch.no_grad(): - y_before = m(x).clone() - patch_mlp(m.layers[0]["mlp"]) - patch_tiled_mlp(m) - # Sanity-check we are actually exercising the multi-shard - # path: poke chunk_size by re-deriving it the same way - # `tiled_forward_arctic_size` does. 
- S = x.shape[1] - chunk = max(1, hidden) - n_shards_expected = max(1, S // chunk) - assert n_shards_expected > 1, ( - "tiled MLP shim is not exercising multi-shard: " - f"S={S}, chunk={chunk}, n_shards={n_shards_expected}" - ) - with torch.no_grad(): - y_after = m(x).clone() - err = (y_before - y_after).abs().max().item() - print( - f"patch_tiled_mlp multi-shard (n_shards={n_shards_expected}) " - f"output diff = {err:.3e}" - ) - assert err < 1e-3, f"tiled MLP output drifted: {err}" - PY - python -m pytest -q --tb=short tests/_tiled_mlp_check_shim.py -s - rm -f tests/_tiled_mlp_check_shim.py - - - name: Compiler cache hygiene + source-rewriter invariants (synthetic inputs) - # Lightweight pipeline coverage for unsloth_zoo.compiler. Pure regex - # / tokenize / ast paths driven by tiny synthetic source strings: - # - higher_precision_softmax (basic + idempotent) - # - fix_rotary_embedding_dtype (no-op + active under - # UNSLOTH_FORCE_CUSTOM_DTYPE) - # - fix_attention_dtype_consistency (insert + idempotent) - # - convert_attention_masks_to_bool (rewrite + no-op) - # - create_new_function happy-path (versioning block, license - # header, AST parse, importlib re-import) - # - create_new_function **kwargs collision (exercises - # _rewrite_kwargs_param + _insert_kwargs_alias) - # - UNSLOTH_COMPILE_OVERWRITE=0 forced-recompile on transformers - # version mismatch (compiler.py:947-963) - # - matching short-circuit when versions are equal - # No real transformers modeling module is loaded; complements the - # heavier real-class round-trip step below. Wall-time ~10-25s. - run: | - set -euxo pipefail - cat > tests/_compiler_cache_invariants_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. - # Cache-hygiene + source-rewriter invariants for unsloth_zoo.compiler. 
- import sys, pathlib, os, ast, importlib, importlib.util, time - sys.path.insert(0, str(pathlib.Path(__file__).parent)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - import pytest - import torch # noqa: F401 (compiler.py imports torch at module load) - - - def _isolate_cache(tmp_path, monkeypatch): - """Point UNSLOTH_COMPILE_LOCATION at tmp_path and reset module - globals. The compiler.py global is captured at module load - (line 75/179), so we delete + reimport per test.""" - monkeypatch.setenv("UNSLOTH_COMPILE_LOCATION", str(tmp_path)) - if "unsloth_zoo.compiler" in sys.modules: - del sys.modules["unsloth_zoo.compiler"] - import unsloth_zoo.compiler as compiler - compiler.UNSLOTH_COMPILE_LOCATION = str(tmp_path) - compiler.UNSLOTH_COMPILE_USE_TEMP = False - return compiler - - - def test_higher_precision_softmax_basic_and_idempotent(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - src = ( - "y = nn.functional.softmax(x, dim=-1)\n" - "z = F.softmax(a, dim=1, dtype=torch.bfloat16)\n" - ) - out = c.higher_precision_softmax(src) - assert "dtype = torch.float32).to(x.dtype)" in out - assert "dtype = torch.float32).to(a.dtype)" in out - # Idempotency landed in unslothai/unsloth-zoo#631 - # (negative-lookahead on `.to(.dtype)` so a second - # pass does not append another cast). 
- assert c.higher_precision_softmax(out) == out - - - def test_fix_rotary_dtype_no_op_without_env(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - monkeypatch.delenv("UNSLOTH_FORCE_CUSTOM_DTYPE", raising=False) - src = "out = cos.to(dtype=x.dtype) + sin.to(dtype=x.dtype)\n" - assert c.fix_rotary_embedding_dtype(src) == src - - - def test_fix_rotary_dtype_active(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - monkeypatch.setenv( - "UNSLOTH_FORCE_CUSTOM_DTYPE", - "float16;torch.float32;torch.bfloat16;torch.float16;pass", - ) - monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1") - src = "out = cos.to(dtype=x.dtype) + sin.to(dtype=x.dtype)\n" - out = c.fix_rotary_embedding_dtype(src) - # Active form rewrites cos.to / sin.to. Either the conditional - # form or the cast form is acceptable -- different transformers - # versions surface slightly different outputs from the rewriter. - assert "cos.to(dtype=x.dtype)" not in out - assert "sin.to(dtype=x.dtype)" not in out - - - def test_fix_attention_dtype_consistency_insert_then_idempotent(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - src = ( - " query_states, key_states = apply_rotary_pos_emb(" - "query_states, key_states, cos, sin)\n" - " attn = q @ k.T\n" - ) - out = c.fix_attention_dtype_consistency(src) - assert out.count("value_states = value_states.to(query_states.dtype)") == 1 - assert c.fix_attention_dtype_consistency(out) == out - - - def test_convert_attention_masks_to_bool_rewrites(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - src = ( - "def make_mask(x):\n" - " out = torch.finfo(x.dtype).min * x\n" - " return out\n" - ) - out = c.convert_attention_masks_to_bool("make_mask", src) - # Loose match: rewriter inserts a `!=torch.finfo(...).min` check - # somewhere on the return path. Tightening to an exact - # last-line match is brittle across transformers versions. 
- assert "!=torch.finfo" in out - - - def test_convert_attention_masks_to_bool_no_op(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - src = "def make_mask(x):\n return x\n" - assert c.convert_attention_masks_to_bool("make_mask", src) == src - - - def _versioning_lines(file_text): - """Extract the four version strings from the versioning block.""" - assert file_text.startswith('"""\n'), "missing opening triple-quote" - head = file_text.split("__UNSLOTH_VERSIONING__", 1)[0] - lines = [ln for ln in head.splitlines() if ln and ln != '"""'] - return lines - - - def test_create_new_function_happy_path(tmp_path, monkeypatch): - c = _isolate_cache(tmp_path, monkeypatch) - src = "def f(x):\n return nn.functional.softmax(x, dim=-1)\n" - c.create_new_function( - name="f_happy", new_source=src, model_location="builtins", - functions=[], overwrite=True, - ) - cached = tmp_path / "f_happy.py" - assert cached.exists() - text = cached.read_text(encoding="utf-8") - versions = _versioning_lines(text) - assert len(versions) == 4, versions - assert text.count(c._full_license_header) == 1 - ast.parse(text) - spec = importlib.util.spec_from_file_location("f_happy_reimport", cached) - m2 = importlib.util.module_from_spec(spec) - spec.loader.exec_module(m2) - assert callable(m2.f) - import inspect as _inspect - # higher_precision_softmax should have promoted to float32. 
- assert "dtype = torch.float32" in _inspect.getsource(m2.f) - - - def test_create_new_function_overwrite_zero_recompiles_on_version_mismatch( - tmp_path, monkeypatch, - ): - c = _isolate_cache(tmp_path, monkeypatch) - name = "vmismatch" - cached = tmp_path / f"{name}.py" - stub = ( - '"""\n0.0.0\n0.0.0\n0.0.0-stub\n0.0.0\n__UNSLOTH_VERSIONING__\n"""\n' - + c._full_license_header - + "def vmismatch(x):\n return x\n" - ) - cached.write_text(stub, encoding="utf-8") - monkeypatch.setenv("UNSLOTH_COMPILE_OVERWRITE", "0") - src = "def vmismatch(x):\n return x + 1\n" - c.create_new_function( - name=name, new_source=src, model_location="builtins", - functions=[], overwrite=False, - ) - text = cached.read_text(encoding="utf-8") - assert "0.0.0-stub" not in text, ( - "OVERWRITE=0 + transformers-version-mismatch did NOT recompile" - ) - versions = _versioning_lines(text) - import importlib.metadata as _md - assert versions[2] == _md.version("transformers") - - - def test_create_new_function_overwrite_zero_short_circuits_when_versions_match( - tmp_path, monkeypatch, - ): - c = _isolate_cache(tmp_path, monkeypatch) - name = "vmatch" - src = "def vmatch(x):\n return x\n" - c.create_new_function( - name=name, new_source=src, model_location="builtins", - functions=[], overwrite=True, - ) - cached = tmp_path / f"{name}.py" - mtime_before = cached.stat().st_mtime_ns - time.sleep(0.05) - monkeypatch.setenv("UNSLOTH_COMPILE_OVERWRITE", "0") - c.create_new_function( - name=name, new_source=src, model_location="builtins", - functions=[], overwrite=False, - ) - assert cached.stat().st_mtime_ns == mtime_before, ( - "OVERWRITE=0 + matching versions should NOT rewrite the file" - ) - PY - python -m pytest -q --tb=short tests/_compiler_cache_invariants_shim.py - rm -f tests/_compiler_cache_invariants_shim.py - - - name: Compiler full-model-sweep (every transformers.models.*) + SFT trainer round-trip - # Calls `unsloth_compile_transformers(model_type=...)` against EVERY - # 
`transformers.models.` package the matrix's transformers ships - # (pkgutil.iter_modules walk -- 383 packages on 4.57.6, similar on - # latest), then ast.parse / importlib-load / introspect the - # generated unsloth_compiled_cache/*.py file per model. Catches - # regex / source-rewriter drift across the matrix's (transformers, - # trl) combination -- the dominant failure mode of - # `unsloth_compile_transformers` after a transformers point release. - # - # 21 model_types currently break the compiler (verified locally on - # transformers 4.57.6). They are listed in KNOWN_BROKEN below with - # their failure mode so the sweep stays green and any NEW breakage - # surfaces as red. Each entry is tracked for an individual fix - # PR on unsloth-zoo. The list is split by failure category so - # follow-up PRs can target one bug at a time. - # - # Hermetic cache dir per pytest invocation; we override the - # job-level UNSLOTH_COMPILE_DISABLE=1 inside the shim so - # compilation actually runs here. Wall-time estimate ~2-3 min - # warm (mean ~0.3s/model, 383 models = ~110s on the runner). - run: | - set -euxo pipefail - cat > tests/_zoo_compiler_cache_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. - import os, sys, ast, pathlib, importlib.util, tempfile - _HERE = pathlib.Path(__file__).parent - sys.path.insert(0, str(_HERE)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - - # Hermetic cache dir + force compile path. The compiler's - # globals (UNSLOTH_COMPILE_LOCATION, UNSLOTH_COMPILE_USE_TEMP) - # are captured at module load; an earlier conftest `import - # unsloth` may have already imported unsloth_zoo.compiler with - # the default "unsloth_compiled_cache" path. Mutate the live - # module globals after import so this shim is robust to that - # ordering. Otherwise the compiler silently writes to the - # default cache and the per-model file assertion fails. 
- _CACHE = pathlib.Path(tempfile.mkdtemp(prefix="unsloth_cache_")) - os.environ["UNSLOTH_COMPILE_LOCATION"] = str(_CACHE) - os.environ["UNSLOTH_COMPILE_OVERWRITE"] = "1" - os.environ.pop("UNSLOTH_COMPILE_DISABLE", None) - - import pytest - import unsloth_zoo.compiler as _zoo_compiler - _zoo_compiler.UNSLOTH_COMPILE_LOCATION = str(_CACHE) - _zoo_compiler.UNSLOTH_COMPILE_USE_TEMP = False - from unsloth_zoo.compiler import unsloth_compile_transformers - - - def _verify_file(path: pathlib.Path, must_expose): - assert path.exists(), f"compiler did not write {path}" - src = path.read_text(encoding="utf-8") - ast.parse(src, filename=str(path)) - spec = importlib.util.spec_from_file_location(path.stem, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - for name in must_expose: - assert hasattr(mod, name), ( - f"{path.name} missing expected attr {name!r}; " - f"found: {sorted(n for n in dir(mod) if not n.startswith('_'))[:25]}" - ) - - - # ---------- Full transformers.models.* compile sweep ---------- - # Track the model_types that currently break the compiler on - # transformers >=5,<6. After unsloth-zoo#632 landed, transformers - # 4.57.6 has zero failures across all model_types; the 27 entries - # below are the residual failures on the tf 5.x line. New breakage - # on any OTHER model_type fails the cell. Each entry is a - # tracking item for a follow-up unsloth-zoo PR. - KNOWN_BROKEN_COMPILE = { - # Category A: `string index out of range` in source rewriter. - "colpali": "string index out of range", - "colqwen2": "string index out of range", - "colmodernvbert": "string index out of range", - "dpr": "string index out of range", - "gemma4_assistant":"string index out of range", - "rag": "string index out of range", - "shieldgemma2": "string index out of range", - "timm_backbone": "string index out of range", - # Category B: rewriter emits invalid Python source. 
- "clvp": "emitted file: unexpected indent", - "falcon_mamba": "emitted file: unexpected indent", - "gpt2": "emitted file: unexpected indent", - "imagegpt": "emitted file: unexpected indent", - "mamba": "emitted file: unexpected indent", - "tapas": "emitted file: expected ':'", - "xlstm": "emitted file: unexpected indent", - # Category B-2: emit unterminated string literal (latest tf). - "audioflamingo3": "emitted file: unterminated string literal", - "musicflamingo": "emitted file: unterminated string literal", - "voxtral": "emitted file: unterminated string literal", - "voxtral_realtime":"emitted file: unterminated string literal", - # Category C: rewriter emits unclosed paren. - "kosmos2": "emitted file: '(' was never closed", - "kosmos2_5": "emitted file: '(' was never closed", - # Category D: imports list builder picks up a non-exported name. - "auto": "module has no attribute _BaseModelWithGenerate", - "bit": "module has no attribute Linear", - "regnet": "module has no attribute Linear", - "resnet": "module has no attribute Linear", - # Category E: undefined name in emitted file. - "perceiver": "name 'AbstractPreprocessor' is not defined", - "sam3_lite_text": "name 'Sam3LiteTextLayerScaledResidual' is not defined", - # Category F: compile exceeds 60s budget on the runner. - # First seen on transformers >=5,<6; each represents a slow - # or recursive source-rewriter path the zoo can address. - "beit": "TimeoutError: compile exceeds per-model budget", - "sam": "TimeoutError: compile exceeds per-model budget", - "sam_hq": "TimeoutError: compile exceeds per-model budget", - } - - - def _all_model_types(): - import pkgutil, transformers.models as tm - return sorted(s.name for s in pkgutil.iter_modules(tm.__path__) if s.ispkg) - - - def test_compile_every_transformers_model_type(): - """Run unsloth_compile_transformers across every model_type - the matrix's transformers ships. 
Allowed outcomes: - ok -> compile emitted a parseable, importable cache file - skipped -> no `modeling_.py` file (expected for some - umbrella packages like `auto`, `deprecated`) - known -> in KNOWN_BROKEN_COMPILE; tracked for follow-up. - Any uncaught failure fails the cell. - - Per-model SIGALRM cap so one infinite-looping model_type - cannot wedge the whole sweep + nuke the job timeout - (observed on transformers >=5,<6 -- 30+ min hang before - this guard landed).""" - import importlib as _il - import signal - ok = 0 - skipped = [] - known = [] - new_failures = [] - models = _all_model_types() - def _on_timeout(signum, frame): - raise TimeoutError("compile exceeded per-model budget") - prev_handler = signal.signal(signal.SIGALRM, _on_timeout) - try: - for i, model_type in enumerate(models): - if i % 25 == 0: - print(f" sweep progress: {i}/{len(models)} -> {model_type}", flush=True) - modeling_path = f"transformers.models.{model_type}.modeling_{model_type}" - try: - _il.import_module(modeling_path) - except (ModuleNotFoundError, ImportError): - skipped.append((model_type, "no modeling file")) - continue - signal.alarm(60) - try: - unsloth_compile_transformers( - model_type=model_type, fast_lora_forwards=False, - ) - except Exception as e: - signal.alarm(0) - msg = f"{type(e).__name__}: {str(e)[:200]}" - if model_type in KNOWN_BROKEN_COMPILE: - known.append((model_type, msg)) - else: - new_failures.append((model_type, msg)) - continue - signal.alarm(0) - if model_type in KNOWN_BROKEN_COMPILE: - # Came back green unexpectedly -- that's GOOD news, - # the bug was fixed. Surface it so we can drop the - # entry from KNOWN_BROKEN_COMPILE. - print( - f" UNEXPECTED-OK {model_type}: was in " - "KNOWN_BROKEN_COMPILE, now compiles cleanly. " - "Drop the entry." 
- ) - ok += 1 - finally: - signal.alarm(0) - signal.signal(signal.SIGALRM, prev_handler) - print(f"\nCompile sweep: ok={ok} skipped={len(skipped)} " - f"known-broken={len(known)} new-failures={len(new_failures)}") - for m, r in known: - print(f" KNOWN {m}: {r}") - for m, r in new_failures[:30]: - print(f" NEW {m}: {r}") - if len(new_failures) > 30: - print(f" ...and {len(new_failures)-30} more new failures") - assert not new_failures, ( - f"unsloth_compile_transformers introduced new failures on " - f"{len(new_failures)} model_types not in the known-broken " - f"list: {[m for m, _ in new_failures]}" - ) - # Sanity floor: at least 200 model_types should compile cleanly - # (we observed 362 ok / 383 total on transformers 4.57.6). - assert ok >= 200, ( - f"only {ok} model_types compiled cleanly; expected >=200. " - "Possible transformers-version-induced regression." - ) - - - @pytest.mark.parametrize("model_type,rms_class", [ - ("llama", "LlamaRMSNorm"), - ("qwen3", "Qwen3RMSNorm"), - ("gemma3", "Gemma3RMSNorm"), - ]) - def test_compile_real_modeling_module(model_type, rms_class): - """Spot-check on the three production-relevant families that - the compile_every sweep also covers; this case verifies the - emitted cache file has the model-specific RMSNorm class - attribute, not just that the file parses + imports. - - ``unsloth_compile_transformers`` is not idempotent in- - process: calling it twice on the same modeling module - after rewriting class attributes corrupts the inspect - source/line cache and the second emitted file is malformed - Python. The sweep above already produced a valid cache - file for every non-KNOWN_BROKEN model_type, so just verify - that artefact here. 
Trigger a compile only when running - this test in isolation (no sweep preceded).""" - import importlib as _il - try: - modeling = _il.import_module( - f"transformers.models.{model_type}.modeling_{model_type}" - ) - except ModuleNotFoundError: - pytest.skip( - f"transformers build lacks model_type={model_type}" - ) - combined = _CACHE / f"unsloth_compiled_module_{model_type}.py" - if not combined.exists(): - unsloth_compile_transformers( - model_type=model_type, fast_lora_forwards=False, - ) - modeling = _il.import_module( - f"transformers.models.{model_type}.modeling_{model_type}" - ) - assert getattr(modeling, "__UNSLOTH_PATCHED__", False) is True - _verify_file(combined, must_expose=[rms_class]) - - - def test_compile_disable_writes_nothing(): - """Negative control: when UNSLOTH_COMPILE_DISABLE=1 the - compile path must early-return without producing new files.""" - os.environ["UNSLOTH_COMPILE_DISABLE"] = "1" - try: - before = set(_CACHE.iterdir()) - # Pick a model_type that still resolves on this transformers. 
- for mt in ("llama", "mistral", "qwen2"): - try: - import importlib as _il - _il.import_module( - f"transformers.models.{mt}.modeling_{mt}" - ) - break - except ModuleNotFoundError: - continue - else: - pytest.skip("no probe model_type available") - unsloth_compile_transformers( - model_type=mt, fast_lora_forwards=False, - ) - after = set(_CACHE.iterdir()) - assert after == before, ( - f"DISABLE=1 still wrote: {[p.name for p in after - before]}" - ) - finally: - os.environ.pop("UNSLOTH_COMPILE_DISABLE", None) - - - def test_compile_sft_trainer_patch(): - """Round-trip TRL's SFTTrainer through the rl.py patch path - and verify the generated UnslothSFTTrainer.py.""" - pytest.importorskip("trl") - try: - from unsloth.models.rl import _patch_trl_rl_trainers - except ImportError: - pytest.skip("unsloth.models.rl._patch_trl_rl_trainers absent") - try: - _patch_trl_rl_trainers("sft_trainer") - except Exception as e: - # TRL 1.x renames break the patch helper internally; we - # accept that here and skip rather than fail the cell. - pytest.skip(f"_patch_trl_rl_trainers raised: {type(e).__name__}: {e}") - sft = _CACHE / "UnslothSFTTrainer.py" - if not sft.exists(): - pytest.skip( - "_patch_trl_rl_trainers ran but did not emit " - "UnslothSFTTrainer.py on this TRL version." - ) - _verify_file(sft, must_expose=["UnslothSFTTrainer"]) - PY - python -m pytest -q --tb=short tests/_zoo_compiler_cache_shim.py - rm -f tests/_zoo_compiler_cache_shim.py - - - name: TRL trainer + Config auto-discovery + dynamic patch coverage - # Mirror unsloth/models/rl.py:patch_trl_rl_trainers AND verify the - # dynamic per-version patch surface: - # 1. AST-parse every *_trainer / *_config submodule. - # 2. Apply the same *Trainer / *Config discovery rules - # _patch_trl_rl_trainers uses (rl.py:553-620). - # 3. Orphan check: every _trainer must have a sibling - # _config OR an inline *Config. - # 4. 
Dynamic count: enumerate every canonical trainer that - # imports cleanly, run patch_trl_rl_trainers(), assert - # every one ends up Unsloth-prefixed in-place. Floor matches - # the cohort sizes from the version sweep: - # TRL 0.22-0.23 -> 18 canonical trainers - # TRL 0.24-0.28 -> 15 canonical trainers - # TRL 0.29-1.x -> 6 canonical (rest are experimental - # thin-wrappers; covered next) - # 5. Experimental coverage (TRL 0.29+): walk trl.experimental.*, - # find every *Trainer class, verify the umbrella patch - # reaches them via the thin-wrapper MRO walk in - # _patch_trl_rl_trainers (rl.py:677-702). - # Per-cell wall-time ~30-60s. - run: | - set -euxo pipefail - cat > tests/_trl_trainer_discovery_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. - # Walks every *_trainer / *_config module in trl.trainer and - # validates that unsloth's auto-discovery rules in - # unsloth/models/rl.py:_patch_trl_rl_trainers (lines 542-620, - # 1934-1949) still pick out exactly one *Trainer and one - # *Config per module on the matrix's TRL version. - import sys, pathlib, importlib, importlib.util, ast, inspect - - sys.path.insert(0, str(pathlib.Path(__file__).parent)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - - import pytest - pytest.importorskip("trl") - import trl # noqa: F401 (forces lazy-module init) - import trl.trainer - - - def _is_real_submodule(qual_name: str) -> bool: - """True iff `qual_name` resolves to an importable submodule - with a file on disk (i.e. has a non-None find_spec().origin). - - TRL re-exports utility FUNCTIONS into `trl.trainer.__init__` - whose names happen to end with `_config` (e.g. - `get_peft_config`, `get_quantization_config`). Without this - filter the `endswith` check below picks them up as if they - were submodules and the AST stage fails on `no spec`. The - same trap exists for `_trainer` (none today, but defensive). 
- """ - try: - spec = importlib.util.find_spec(qual_name) - except (ImportError, ValueError): - return False - return spec is not None and bool(getattr(spec, "origin", None)) - - - # Replicate rl.py:1939-1943 verbatim, then filter to actual - # submodules so re-exported utility functions (e.g. - # `get_peft_config`) do not pollute the AST sweep. - def _trainer_files(): - return [ - x for x in dir(trl.trainer) - if x.islower() - and x.endswith("_trainer") - and x != "base_trainer" - and _is_real_submodule(f"trl.trainer.{x}") - ] - - - def _config_files(): - return [ - x for x in dir(trl.trainer) - if x.islower() - and x.endswith("_config") - and _is_real_submodule(f"trl.trainer.{x}") - ] - - - def _ast_parse_module_via_spec(qual_name: str): - """AST-parse a module's source on disk WITHOUT importing it. - `trl.trainer` uses _LazyModule so `find_spec` resolves the - file path without firing the module-level `__init__`. This - dodges optional-dep ImportErrors (e.g. grpo_trainer's vllm - import) and still surfaces real syntax drift in the file.""" - spec = importlib.util.find_spec(qual_name) - if spec is None or not spec.origin: - return None, "no spec" - path = pathlib.Path(spec.origin) - if not path.is_file(): - return None, f"spec.origin not a file: {path}" - src = path.read_text(encoding="utf-8") - ast.parse(src, filename=str(path)) - return path, None - - - def test_every_trl_trainer_and_config_module_ast_parses(): - """Stage 1: pure file-on-disk AST parse. 
Catches a TRL - source-level syntax issue on any matrix cell without - triggering optional-dep imports.""" - fail = [] - ok = 0 - for name in _trainer_files() + _config_files(): - qual = f"trl.trainer.{name}" - try: - path, err = _ast_parse_module_via_spec(qual) - if err: - fail.append((qual, err)) - else: - ok += 1 - except SyntaxError as e: - fail.append((qual, f"SyntaxError: {e}")) - except Exception as e: - fail.append((qual, f"{type(e).__name__}: {e}")) - print(f"AST-parsed {ok} TRL trainer+config modules; failed={len(fail)}") - for q, e in fail: - print(f" AST FAIL {q}: {e}") - assert not fail, f"AST parse failed for {len(fail)} TRL modules" - - - def _apply_unsloth_discovery_rules(mod, trainer_file): - """Replicate the four endswith filters in - rl.py:553-569 verbatim.""" - prefix = trainer_file.split("_")[0] - names = [ - x for x in dir(mod) - if x.endswith("Trainer") and x != "Trainer" - and not x.startswith("_") and prefix in x.lower() - ] - configs = [ - x for x in dir(mod) - if x.endswith("Config") and x != "Config" - and not x.startswith("_") and prefix in x.lower() - ] - return names, configs - - - def _resolve_config_via_fallbacks(trainer_file, name_list, mod): - """Replicate rl.py:575-615: try the sibling *_config.py - module, then the MRO walk fallback. Returns the resolved - config-name list (length 0 or 1).""" - # Fallback 1: _config.py module sibling. - cfg_module_name = trainer_file.replace("_trainer", "_config") - try: - cfg_mod = getattr(trl.trainer, cfg_module_name) - except Exception: - cfg_mod = None - if cfg_mod is not None: - prefix = trainer_file.split("_")[0] - hits = [ - x for x in dir(cfg_mod) - if x.endswith("Config") and x != "Config" - and not x.startswith("_") and prefix in x.lower() - ] - if len(hits) == 1: - return hits - # Fallback 2: MRO walk into experimental parent module. 
- if len(name_list) != 1: - return [] - try: - trainer_cls = getattr(mod, name_list[0]) - except Exception: - return [] - prefix = trainer_file.split("_")[0] - for parent in trainer_cls.__mro__[1:]: - if parent is object: - continue - parent_mod = inspect.getmodule(parent) - if parent_mod is None: - continue - if parent_mod.__name__ == f"trl.trainer.{trainer_file}": - continue - hits = [ - x for x in dir(parent_mod) - if x.endswith("Config") and x != "Config" - and not x.startswith("_") and prefix in x.lower() - ] - if len(hits) == 1: - return hits - return [] - - - def test_unsloth_auto_discovery_finds_trainer_and_config_per_module(): - """Stage 2: drive the same unsloth rules over every trainer - file. import-failures (optional deps) are recorded as - `import-skipped`, mirroring rl.py:1944-1948 try/except.""" - ok = 0 - import_skipped = [] - discovery_skipped = [] - fail = [] - for trainer_file in _trainer_files(): - qual = f"trl.trainer.{trainer_file}" - try: - mod = getattr(trl.trainer, trainer_file) - except Exception as e: - import_skipped.append((qual, f"{type(e).__name__}: {e}")) - continue - trainers, configs = _apply_unsloth_discovery_rules( - mod, trainer_file, - ) - if len(trainers) != 1: - discovery_skipped.append( - (qual, f"trainers={trainers}") - ) - continue - if len(configs) != 1: - configs = _resolve_config_via_fallbacks( - trainer_file, trainers, mod, - ) - if len(configs) != 1: - fail.append( - (qual, - f"trainer={trainers[0]} but config not found " - "(checked module, *_config sibling, and MRO)") - ) - continue - ok += 1 - print(f" OK {qual}: trainer={trainers[0]}, config={configs[0]}") - print( - f"\nDiscovery: ok={ok} import_skipped={len(import_skipped)} " - f"discovery_skipped={len(discovery_skipped)} fail={len(fail)}" - ) - for q, r in import_skipped: - print(f" IMPORT-SKIP {q}: {r}") - for q, r in discovery_skipped: - print(f" DISC-SKIP {q}: {r}") - for q, r in fail: - print(f" FAIL {q}: {r}") - # Hard contract: every TRAINER that imports 
cleanly AND has - # exactly one *Trainer must also resolve exactly one *Config - # via one of the three rules. import-skipped + discovery- - # skipped (no/multiple *Trainer) are tolerated. - assert not fail, ( - f"unsloth discovery rules failed for {len(fail)} trainers" - ) - # Sanity: at least 3 trainers should fully discover on any - # matrix cell (sft + reward + dpo are the historical core). - assert ok >= 3, ( - f"only {ok} trainers fully discovered; expected >=3 " - "(sft/reward/dpo). Possible TRL surface regression." - ) - - - def test_orphan_trainer_modules_do_not_exist(): - """Stage 3: every _trainer module should have a sibling - _config (TRL 0.26+ convention) OR an inline *Config. An - ORPHAN _trainer with neither is a TRL refactor we want - to know about: it would silently break unsloth's - auto-discovery without raising.""" - orphans = [] - for trainer_file in _trainer_files(): - cfg_module_name = trainer_file.replace("_trainer", "_config") - has_sibling_cfg = ( - importlib.util.find_spec( - f"trl.trainer.{cfg_module_name}" - ) is not None - ) - if has_sibling_cfg: - continue - # No sibling -> require an inline *Config in the - # trainer module itself (resolved via discovery rules). - try: - mod = getattr(trl.trainer, trainer_file) - except Exception: - # Optional-dep failure -> skip; the AST-parse stage - # already covered the file. - continue - _, configs = _apply_unsloth_discovery_rules( - mod, trainer_file, - ) - if not configs: - orphans.append(trainer_file) - assert not orphans, ( - "Orphan TRL trainer modules with neither sibling " - f"_config.py nor an inline *Config: {orphans}. " - "unsloth auto-discovery would silently skip these." - ) - - - # ---- Dynamic patch coverage: count + verify Unsloth-prefixed ---- - - def _enumerate_canonical_trainer_classes(): - """Walk trl.trainer/*_trainer.py on disk (the source of - truth for what `dir(trl.trainer)` should expose) and return - [(trainer_file, TrainerClass), ...] 
for every entry that - imports + has exactly-one resolvable *Trainer per the - unsloth rules. Skips optional-dep ImportErrors.""" - out = [] - for trainer_file in _trainer_files(): - try: - mod = getattr(trl.trainer, trainer_file) - except Exception: - continue - trainers, _ = _apply_unsloth_discovery_rules(mod, trainer_file) - if len(trainers) != 1: - continue - try: - cls = getattr(mod, trainers[0]) - except Exception: - continue - out.append((trainer_file, cls)) - return out - - - def _enumerate_experimental_trainer_packages(): - """TRL 0.29+ moved many trainers (bco, cpo, gkd, nash_md, - online_dpo, orpo, ppo, prm, xpo, ...) to `trl.experimental.`, - re-exposing them via thin-wrapper deprecation shims in - `trl.trainer._trainer`. List every `trl.experimental.` - that defines at least one *Trainer class, parsed by AST so we - do NOT trigger the optional-dep imports on the package init.""" - spec = importlib.util.find_spec("trl.experimental") - if spec is None or not spec.submodule_search_locations: - return [] - import re as _re - hits = [] - for root in spec.submodule_search_locations: - rp = pathlib.Path(root) - for sub in sorted(rp.iterdir()): - if not sub.is_dir() or sub.name.startswith("_"): - continue - classes = [] - for py in sub.rglob("*.py"): - try: - src = py.read_text(encoding="utf-8") - except Exception: - continue - for m in _re.finditer( - r"^class\s+([A-Za-z0-9_]+Trainer)\b", src, _re.M, - ): - classes.append(m.group(1)) - if classes: - hits.append((sub.name, sorted(set(classes)))) - return hits - - - def _is_unsloth_patched(cls) -> bool: - return getattr(cls, "__name__", "").startswith("Unsloth") - - - def test_unsloth_patches_every_canonical_trainer_in_this_trl_version(): - """Verify the count + identity of canonically-patched trainers - matches the trainer surface this TRL version actually ships. - - For TRL 0.22.x-0.23.x: ~18 canonical trainers expected. - For TRL 0.24.x-0.28.x: ~15 canonical trainers expected. 
- For TRL 0.29.x-1.x: 6 canonical (rest are experimental - thin-wrappers; covered by the next test).""" - from unsloth.models.rl import patch_trl_rl_trainers - before = _enumerate_canonical_trainer_classes() - before_count = len(before) - before_unpatched = [ - (tf, cls.__name__) for tf, cls in before - if not _is_unsloth_patched(cls) - ] - # Apply unsloth's umbrella patch. - patch_trl_rl_trainers() - # Re-enumerate (some classes may have been replaced in-module). - after = _enumerate_canonical_trainer_classes() - after_count = len(after) - patched = [(tf, cls.__name__) for tf, cls in after - if _is_unsloth_patched(cls)] - unpatched = [(tf, cls.__name__) for tf, cls in after - if not _is_unsloth_patched(cls)] - print( - f"\nCanonical trainer surface for TRL {trl.__version__}: " - f"discoverable_before={before_count} " - f"discoverable_after={after_count} " - f"patched={len(patched)} unpatched={len(unpatched)}" - ) - for tf, n in patched: - print(f" PATCHED {tf}: {n}") - for tf, n in unpatched: - print(f" UNPATCHED {tf}: {n}") - # Hard contract: every canonical trainer that imports - # cleanly must end up Unsloth-prefixed after the umbrella - # patch. If a trainer was discoverable BEFORE the patch but - # is missing from `after`, that is a separate (rare) issue - # we surface as failure. - assert before_count == after_count, ( - f"trainer-class set changed across patching: " - f"before={[n for _, n in before_unpatched]} " - f"after={[n for _, n in unpatched]}" - ) - assert not unpatched, ( - "unsloth.models.rl.patch_trl_rl_trainers did NOT patch: " - + ", ".join(f"{tf}:{n}" for tf, n in unpatched) - ) - # Floor matches the cohort sizes from the TRL version sweep: - # 18 (0.22-0.23), 15 (0.24-0.28), 6 (0.29+ canonical only). - assert len(patched) >= 6, ( - f"only {len(patched)} canonical trainers patched; " - "expected >= 6 (the smallest production cohort)." 
- ) - - - def test_unsloth_patches_experimental_trainers_via_thin_wrappers(): - """TRL 0.29+ ships canonical-`trl.trainer._trainer` modules - for many trainers as deprecation thin-wrappers that forward - to `trl.experimental.`. unsloth's - `_patch_trl_rl_trainers` (rl.py:677-702) detects - `trl.experimental` in the trainer source and resolves to - the parent class -- so patching the canonical entry should - also Unsloth-prefix the experimental class via in-module - setattr. - - Verify by walking trl.experimental.* AST for every *Trainer - class, then checking whether it (or any class with the same - name in the experimental package) carries the Unsloth - prefix after the umbrella patch.""" - from unsloth.models.rl import patch_trl_rl_trainers - patch_trl_rl_trainers() - experimental_pkgs = _enumerate_experimental_trainer_packages() - if not experimental_pkgs: - pytest.skip( - f"TRL {trl.__version__} has no trl.experimental.* " - "trainer surface (pre-0.29 cohort). The canonical " - "test above already covers patching here." - ) - found = [] - missing = [] - for pkg_name, class_names in experimental_pkgs: - qual = f"trl.experimental.{pkg_name}" - try: - pkg_mod = importlib.import_module(qual) - except Exception as e: - # Optional-dep ImportError: experimental package - # could not be loaded. Match unsloth's runtime - # tolerance: this would also be silently skipped - # by `_patch_trl_rl_trainers`. Record but do not - # fail. - print( - f" IMPORT-SKIP {qual}: " - f"{type(e).__name__}: {str(e)[:120]}" - ) - continue - for cls_name in class_names: - cls = getattr(pkg_mod, cls_name, None) - if cls is None: - # Class is defined inside the package but not - # re-exported on the package init. Walk - # submodules to find it. - import pkgutil as _pku - for sub in _pku.walk_packages( - pkg_mod.__path__, prefix=qual + "." 
- ): - try: - sub_mod = importlib.import_module(sub.name) - except Exception: - continue - cls = getattr(sub_mod, cls_name, None) - if cls is not None: - break - if cls is None: - missing.append((pkg_name, cls_name)) - continue - if _is_unsloth_patched(cls): - found.append((pkg_name, cls_name)) - print(f" PATCHED trl.experimental.{pkg_name}.{cls_name}") - else: - # Not Unsloth-prefixed: either unsloth chose - # not to patch this surface (e.g. the canonical - # thin-wrapper module did not exist) or the - # patch silently failed. Record both - # outcomes; the assertion below tolerates the - # gap as informational, not failure -- the - # canonical test enforces the hard contract. - print( - f" NOT-PATCHED trl.experimental.{pkg_name}." - f"{cls_name} (no Unsloth-prefix on the " - "experimental surface)" - ) - total_experimental = sum(len(cs) for _, cs in experimental_pkgs) - print( - f"\nExperimental trainer surface (TRL {trl.__version__}): " - f"{len(experimental_pkgs)} packages, " - f"{total_experimental} *Trainer classes; " - f"unsloth-patched={len(found)} class-missing={len(missing)}" - ) - # Hard contract: a *Trainer class declared in a python - # source file must be locatable in its package after import. - # If we saw the class definition but cannot find the symbol - # at runtime, the package's public surface drifted. 
- assert not missing, ( - "experimental *Trainer classes declared in source but " - f"not importable: {missing}" - ) - PY - python -m pytest -q --tb=short -s tests/_trl_trainer_discovery_shim.py - rm -f tests/_trl_trainer_discovery_shim.py - - - name: MoE per-family coverage + GRPO patches + grouped_gemm AST - # Catches the recurring class of bugs that PR #624 (gemma4 missing - # extractor), PR #612 (gemma4 GRPO patch silently dropped), PR #607 - # (gate_up LoRA dropped from grad graph), PR #601 (qwen MoE shape - # mismatch), unsloth#4934 (TRL disable_gradient_checkpointing - # corrupts unsloth GC), and unsloth#3598 (gradient_accumulation - # double-scale on accepts_loss_kwargs=False) targeted. Coverage: - # - # 1. Per-MoE-family side-effect contract: for every patch_*_moe - # function in unsloth_zoo.temporary_patches, if its target - # transformers class is importable on this matrix cell, the - # patch must mark the class with `_unsloth_already_patched=True` - # after running. This is exactly what unsloth_zoo's existing - # test_moe_lora_extractor_coverage walks at the registration - # level; here we tie each patch fn to its declared target so a - # silent early-return (PR #612 style) surfaces as red rather - # than a coverage skip. - # - # 2. PR #4934 (GRPO + TRL 1.0): patch_trl_disable_gradient_checkpointing - # must rebind trl.models.utils.disable_gradient_checkpointing to - # the unsloth no-op AND propagate the rebinding to every trl.* - # module that imported the symbol by reference. - # - # 3. PR #3598 (gradient_accumulation): patch_gradient_accumulation_fix - # must run cleanly on a synthetic Trainer whose training_step - # signature carries `num_items_in_batch`. The original bug was - # that `accepts_loss_kwargs=False` (Qwen3VL, Gemma3 in t-4.57) - # caused double loss-scaling; here we verify the rewrite path - # itself does not raise on a CPU-resolvable shape. - # - # 4. 
unsloth/kernels/moe/grouped_gemm AST smoke: the Triton kernels - # are GPU-only at runtime, but a SyntaxError or stray - # string-literal in the source still surfaces as a test-time - # ImportError on every install. ast.parse the .py files without - # executing. - # - # Wall-time per cell ~30-60s. Routed through pytest for the spoof - # harness so unsloth_zoo.temporary_patches imports are clean. - run: | - set -euxo pipefail - cat > tests/_moe_coverage_shim.py <<'PY' - # Auto-generated by .github/workflows/consolidated-tests-ci.yml. - import sys, pathlib, ast, importlib, importlib.util, contextlib, os - sys.path.insert(0, str(pathlib.Path(__file__).parent)) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - - import pytest - - # Map each MoE patch function to the transformers classes it is - # contractually responsible for marking with _unsloth_already_patched - # after a successful run. Sourced from - # unsloth_zoo/temporary_patches/_moe.py: - # - qwen3_moe.py:382-398 patches Qwen3MoeExperts (new path) or - # Qwen3MoeSparseMoeBlock (old path). - # - qwen3_5_moe.py + qwen3_next_moe.py + qwen3_vl_moe.py register - # extractors on Qwen3_5MoeExperts / Qwen3NextExperts / - # Qwen3VLMoeTextExperts respectively. - # - gemma4_moe.py marks Gemma4TextExperts (current) or - # Gemma4TextMoEBlock (legacy). - # - glm4_moe.py marks Glm4MoeLiteNaiveMoe. - # - deepseek_v3_moe.py marks DeepseekV3NaiveMoe. - # - gpt_oss.py:patch_gpt_oss_moe_for_lora marks GptOssExperts. - # Each cell skips a target if the transformers version lacks it - # (legitimate version-skew); only patches with at least one - # importable target are exercised. - # Each entry = ((patch_module, patch_fn), targets, env_setup, - # version_gate). env_setup runs before the patch fn (e.g. set - # UNSLOTH_MODEL_NAME for gpt_oss). version_gate is a callable - # returning True when the patch SHOULD run on this transformers; - # if False, the test skips with a documented reason. 
- def _v5_or_later(): - try: - import transformers - major = int(transformers.__version__.split(".")[0]) - return major >= 5 - except Exception: - return False - - MOE_PATCHES = [ - { - "module": "unsloth_zoo.temporary_patches.qwen3_moe", - "fn": "patch_qwen3_moe", - "targets": [ - ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeExperts"), - ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeSparseMoeBlock"), - ], - "env": {}, - "gate": lambda: True, - "gate_reason": "", - }, - { - "module": "unsloth_zoo.temporary_patches.qwen3_5_moe", - "fn": "patch_qwen3_5_moe", - "targets": [ - ("transformers.models.qwen3_5_moe.modeling_qwen3_5_moe", "Qwen3_5MoeExperts"), - ], - "env": {}, "gate": lambda: True, "gate_reason": "", - }, - { - "module": "unsloth_zoo.temporary_patches.qwen3_next_moe", - "fn": "patch_qwen3_next_moe", - "targets": [ - ("transformers.models.qwen3_next.modeling_qwen3_next", "Qwen3NextExperts"), - ], - "env": {}, "gate": lambda: True, "gate_reason": "", - }, - { - "module": "unsloth_zoo.temporary_patches.qwen3_vl_moe", - "fn": "patch_qwen3_vl_moe", - "targets": [ - ("transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe", "Qwen3VLMoeTextExperts"), - ], - "env": {}, "gate": lambda: True, "gate_reason": "", - }, - { - "module": "unsloth_zoo.temporary_patches.gemma4_moe", - "fn": "patch_gemma4_moe", - "targets": [ - ("transformers.models.gemma4.modeling_gemma4", "Gemma4TextExperts"), - ], - "env": {}, "gate": lambda: True, "gate_reason": "", - }, - { - "module": "unsloth_zoo.temporary_patches.glm4_moe", - "fn": "patch_glm4_moe", - "targets": [ - ("transformers.models.glm4_moe.modeling_glm4_moe", "Glm4MoeLiteNaiveMoe"), - ], - "env": {}, "gate": lambda: True, "gate_reason": "", - }, - { - "module": "unsloth_zoo.temporary_patches.deepseek_v3_moe", - "fn": "patch_deepseek_v3_moe", - "targets": [ - ("transformers.models.deepseek_v3.modeling_deepseek_v3", "DeepseekV3NaiveMoe"), - ], - "env": {}, "gate": lambda: True, "gate_reason": "", 
- }, - { - "module": "unsloth_zoo.temporary_patches.gpt_oss", - "fn": "patch_gpt_oss_moe_for_lora", - "targets": [ - ("transformers.models.gpt_oss.modeling_gpt_oss", "GptOssExperts"), - ], - # The patch reads UNSLOTH_MODEL_NAME and only runs when - # "gpt_oss" is in the normalized form. Set it explicitly - # so the gate at gpt_oss.py:1387 passes; otherwise the - # patch silently early-returns and the test would - # spuriously fail. - "env": {"UNSLOTH_MODEL_NAME": "gpt_oss"}, - # Additionally only runs on transformers >= 5 - # (gpt_oss.py:1392 `_is_transformers_v5()` gate). - "gate": _v5_or_later, - "gate_reason": ( - "patch_gpt_oss_moe_for_lora gates on " - "transformers >= 5 (split-LoRA grouped_mm path)" - ), - }, - ] - - - def _resolve_target_classes(targets): - """Return [(qual, cls), ...] for every importable target.""" - out = [] - for mod_path, cls_name in targets: - try: - mod = importlib.import_module(mod_path) - except Exception: - continue - cls = getattr(mod, cls_name, None) - if cls is None: - continue - out.append((f"{mod_path}.{cls_name}", cls)) - return out - - - @pytest.mark.parametrize( - "spec", - MOE_PATCHES, - ids=lambda s: s["fn"], - ) - def test_moe_patch_marks_its_target_when_class_present(spec, monkeypatch): - """If at least one target class is importable AND the - version gate passes, run the patch fn and assert at least - one target is marked patched afterwards. Skips when the - transformers version lacks every target or when the - version gate blocks the patch (legitimate). Fails on - silent patch-fn early-returns (PR #612 class of bug).""" - targets = spec["targets"] - patch_module = spec["module"] - patch_name = spec["fn"] - importable = _resolve_target_classes(targets) - if not importable: - pytest.skip( - f"{patch_name}: no target class importable on this " - f"transformers (looked for {[c for _, c in targets]})." - ) - if not spec["gate"](): - pytest.skip( - f"{patch_name}: version gate blocks this cell. 
" - f"Reason: {spec['gate_reason']}" - ) - for k, v in spec["env"].items(): - monkeypatch.setenv(k, v) - try: - pmod = importlib.import_module(patch_module) - except Exception as e: - pytest.skip( - f"{patch_module} import failed (likely optional dep): " - f"{type(e).__name__}: {e}" - ) - fn = getattr(pmod, patch_name, None) - if fn is None or not callable(fn): - pytest.skip(f"{patch_module} has no callable {patch_name}") - try: - fn() - except Exception as e: - raise AssertionError( - f"{patch_name}() raised on a transformers that " - f"DOES ship at least one target class ({importable}). " - f"This is the silent-failure mode PR #612 fixed: " - f"{type(e).__name__}: {e}" - ) - # At least one importable target must now carry SOME marker - # showing unsloth touched it. Accepted signals (each is set - # by a different patch flow in unsloth_zoo): - # - `_unsloth_already_patched=True` (gemma4, deepseek_v3, glm4) - # - `_unsloth_lora_patched=True` (gpt_oss_moe_for_lora) - # - `_unsloth_lora_extractor_fn` is callable (qwen3_*, glm4_moe) - # - `_original___forward` attr - # (set by patch_function: qwen3_moe SparseMoeBlock, etc.) - # - `_original_forward` attribute (gpt_oss in-place patch) - # Accept any one as "patched". 
- def _is_patched(cls) -> bool: - if getattr(cls, "_unsloth_already_patched", False) is True: - return True - if getattr(cls, "_unsloth_lora_patched", False) is True: - return True - if callable(getattr(cls, "_unsloth_lora_extractor_fn", None)): - return True - if "_original_forward" in dir(cls): - return True - cls_name = cls.__name__ - for attr in dir(cls): - if attr.startswith("_original_") and attr.endswith( - f"_{cls_name}_forward" - ): - return True - return False - - after = _resolve_target_classes(targets) - marked = [qual for qual, cls in after if _is_patched(cls)] - if not marked: - raise AssertionError( - f"{patch_name}() ran without exception but no target " - f"in {importable} carries any of the unsloth markers " - "(_unsloth_already_patched / _unsloth_lora_patched / " - "_unsloth_lora_extractor_fn / _original_*_forward). " - "Patch silently no-op'd (PR #612 class of bug)." - ) - print(f" {patch_name}: marked {marked}") - - - # ---- PR #4934 (TRL 1.0+ GRPO disable_gradient_checkpointing) ---- - - def test_patch_trl_disable_gradient_checkpointing(): - """unsloth/models/rl.py:patch_trl_disable_gradient_checkpointing - must rebind trl.models.utils.disable_gradient_checkpointing to - the unsloth no-op when TRL >= 1.0. Pre-1.0 TRL has no such - symbol -> the patch returns early.""" - try: - import trl.models.utils as _tmu - except ImportError: - pytest.skip("trl not installed") - had_symbol = hasattr(_tmu, "disable_gradient_checkpointing") - try: - from unsloth.models.rl import patch_trl_disable_gradient_checkpointing - except ImportError: - pytest.skip( - "unsloth.models.rl.patch_trl_disable_gradient_checkpointing " - "absent (older unsloth than #4934)" - ) - patch_trl_disable_gradient_checkpointing() - if not had_symbol: - # Pre-1.0 TRL: patch is a no-op early-return. Verify - # nothing broke. - pytest.skip( - "TRL pre-1.0 has no disable_gradient_checkpointing; " - "patch correctly early-returned." 
- ) - fn = getattr(_tmu, "disable_gradient_checkpointing", None) - assert fn is not None, ( - "trl.models.utils.disable_gradient_checkpointing missing " - "after patch -- patch removed the symbol entirely?" - ) - assert getattr(fn, "_unsloth_noop_patched", False) is True, ( - "trl.models.utils.disable_gradient_checkpointing was NOT " - "rebound to the unsloth no-op. PR #4934 regression." - ) - # PR #4934 also walks sys.modules to rebind trl.* modules - # that imported the symbol by reference. Verify at least the - # canonical trainer modules picked up the rebinding when - # they re-export it. - import sys - checked = 0 - missed = [] - for mod_name, mod in list(sys.modules.items()): - if not mod_name.startswith("trl."): - continue - bound = getattr(mod, "disable_gradient_checkpointing", None) - if bound is None: - continue - checked += 1 - if not getattr(bound, "_unsloth_noop_patched", False): - missed.append(mod_name) - print(f" rebound disable_gradient_checkpointing in {checked} trl.* modules") - assert not missed, ( - "trl.* modules that imported disable_gradient_checkpointing " - f"by reference but did not get rebound: {missed}" - ) - - - # ---- PR #3598 (gradient_accumulation loss-scaling rewrite) ---- - - def test_patch_gradient_accumulation_fix_runs_on_synthetic_trainer(): - """patch_gradient_accumulation_fix rewrites a Trainer's - `training_step` source via inspect+exec when the signature - carries `num_items_in_batch`. PR #3598 fixed the rewrite - path to not double-scale for trainers with - `accepts_loss_kwargs=False`. Verify the patch fn runs - without raising on a synthetic Trainer carrying that - signature.""" - try: - from unsloth.models._utils import patch_gradient_accumulation_fix - except ImportError: - pytest.skip( - "unsloth.models._utils.patch_gradient_accumulation_fix absent" - ) - try: - from transformers import Trainer - except ImportError: - pytest.skip("transformers.Trainer absent") - # The patch reads the live Trainer.training_step source. 
We - # exercise the standard transformers.Trainer here -- if the - # bug is reintroduced in the source rewriter (e.g. broken - # exec, missing import injection), the patch fn raises. - try: - patch_gradient_accumulation_fix(Trainer) - except Exception as e: - raise AssertionError( - "patch_gradient_accumulation_fix raised on a vanilla " - f"transformers.Trainer: {type(e).__name__}: {e}" - ) - # Idempotency: second call must not raise either (the rewrite - # adds `_unsloth_training_step` marker so the second call - # short-circuits per _utils.py:1692-1693). - patch_gradient_accumulation_fix(Trainer) - - - # ---- unsloth/kernels/moe/grouped_gemm AST smoke ---- - - def _walk_py_files(root: pathlib.Path): - for p in root.rglob("*.py"): - if "__pycache__" in p.parts: - continue - yield p - - - def test_unsloth_kernels_moe_grouped_gemm_ast_parses(): - """unsloth/kernels/moe/grouped_gemm hosts the Triton MoE - kernels (GPU-only at runtime). A SyntaxError or stray token - at the SOURCE level still surfaces as ImportError on every - install, so AST-parse the .py files without executing.""" - # Locate `unsloth/kernels/moe/grouped_gemm` via the installed - # `unsloth` package. - import unsloth as _unsloth - kernel_root = ( - pathlib.Path(_unsloth.__file__).parent - / "kernels" / "moe" / "grouped_gemm" - ) - if not kernel_root.exists(): - pytest.skip( - f"{kernel_root} not present in this unsloth checkout." 
- ) - fail = [] - ok = 0 - for p in _walk_py_files(kernel_root): - try: - ast.parse(p.read_text(encoding="utf-8"), filename=str(p)) - ok += 1 - except SyntaxError as e: - fail.append((str(p), f"SyntaxError: {e}")) - except Exception as e: - fail.append((str(p), f"{type(e).__name__}: {e}")) - print(f"AST-parsed {ok} grouped_gemm files; failed={len(fail)}") - for path, err in fail: - print(f" AST FAIL {path}: {err}") - assert not fail, ( - f"AST parse failed for {len(fail)} grouped_gemm files" - ) - # Sanity: the directory MUST contain at least the interface - # + kernels + reference subtrees as documented. - expected = [ - "interface.py", - "kernels/forward.py", - "kernels/backward.py", - "reference/moe_block.py", - "reference/moe_ops.py", - ] - missing = [e for e in expected if not (kernel_root / e).is_file()] - assert not missing, ( - "grouped_gemm directory layout regressed; missing: " - f"{missing}" - ) - PY - python -m pytest -q --tb=short -s tests/_moe_coverage_shim.py - rm -f tests/_moe_coverage_shim.py - - - name: Summary - if: always() - run: | - echo "::group::Versions" - python -c "import sys, platform; print(sys.version); print(platform.platform())" - python -c "import torch; print('torch', torch.__version__, 'cuda?', torch.cuda.is_available())" - python -c "import transformers; print('transformers', transformers.__version__)" - # `pip show` instead of `import unsloth_zoo` — its __init__ raises - # without an accelerator and the spoof harness only kicks in under - # pytest. Cheap and accurate. - pip show unsloth_zoo - echo "::endgroup::" - echo "Consolidated job done. Coverage:" - echo " - 16 unsloth Bucket-A tests under tests/saving/ + tests/utils/" - echo " - unsloth_zoo @ ${UNSLOTH_ZOO_REF} pytest tests/ (5 GPU cases deselected)" - echo " - unsloth_zoo.compiler.test_apply_fused_lm_head" - - llama-cpp-smoke: - # Standalone llama.cpp build + smoke. 
Earlier this lived inside every - # consolidated matrix cell and re-cmake'd llama.cpp ~5 min per cell -- - # 3 cells x 275 s = ~14 min of duplicated CPU on every PR for an - # artefact that has nothing to do with the (transformers, TRL) combo. - # `install_llama_cpp` clones ggml-org/llama.cpp at a pinned commit and - # builds the LLAMA_CPP_TARGETS list; the result is independent of the - # HF stack version. Run once, gate the PR. - name: llama.cpp build + smoke - runs-on: ubuntu-latest - timeout-minutes: 25 - env: - UNSLOTH_ZOO_REF: ${{ inputs.unsloth_zoo_ref || 'main' }} - # Same env contract the matrix cells use: protobuf python parser - # (transformers' bundled *_pb2.py needs it), studio on PYTHONPATH, - # compile-disable + UNSLOTH_IS_PRESENT so unsloth_zoo's __init__ - # bootstrap accepts a pure-import. - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python - PYTHONPATH: ${{ github.workspace }}/studio - UNSLOTH_COMPILE_DISABLE: '1' - UNSLOTH_IS_PRESENT: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Install runtime deps for unsloth_zoo.llama_cpp - # unsloth_zoo's `__init__` imports `temporary_patches`, which - # in turn pulls per-architecture submodules (gemma3n, gemma4, - # qwen3_*_moe, glm4_moe, deepseek_v3_moe, pixtral, ministral, - # mxfp4, bitsandbytes, flex_attention_bwd) -- many of those - # transitively touch transformers and peft / accelerate. Mirror - # the matrix job's install minus the heavy bits that have no - # bearing on `install_llama_cpp` itself: studio.txt's FastAPI - # stack, bitsandbytes (CUDA-only build dependency), triton, - # mammoth/unpdf (PDF tools), datasets, sqlalchemy/cryptography, - # pytest (we run no tests). The remaining pin shape matches - # studio-backend-ci.yml's "Repo tests (CPU)" baseline. 
- run: | - set -euxo pipefail - python -m pip install --upgrade pip - # Match the matrix job's torch path so unsloth_zoo's - # `import torch` resolves to the same CPU build. - pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch>=2.4,<2.11' 'torchvision<0.26' - pip install \ - 'numpy<3' protobuf sentencepiece \ - requests tqdm psutil packaging safetensors \ - 'peft>=0.18,<0.20' 'accelerate>=0.34,<2' - # transformers + trl come from pyproject.toml's pinned line - # so this job stays in sync with whatever the consolidated - # `__from_pyproject__` matrix cell is using. - pip install transformers trl - pip install -e . --no-deps - - - name: Clone unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }} - # Same shallow clone as the matrix job; we install editable so - # `unsloth_zoo.llama_cpp` resolves to the cloned tree (and any - # main-branch fixes flow into the smoke without a release). - run: | - set -euxo pipefail - # github.com occasionally 500s on the git fetch; retry so a - # single upstream blip does not fail CI. - for attempt in 1 2 3; do - rm -rf "$RUNNER_TEMP/unsloth-zoo" - if git clone --depth=1 --branch="$UNSLOTH_ZOO_REF" \ - https://github.com/unslothai/unsloth-zoo \ - "$RUNNER_TEMP/unsloth-zoo"; then - break - fi - if [ "$attempt" -eq 3 ]; then - echo "::error::git clone unsloth-zoo failed after 3 attempts" - exit 1 - fi - delay=$((5 * attempt)) - echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..." - sleep "$delay" - done - pip install -e "$RUNNER_TEMP/unsloth-zoo" --no-deps - pip show unsloth_zoo - - - name: llama.cpp install via unsloth_zoo.llama_cpp + `llama-cli --help` smoke - # Exercise the canonical `unsloth_zoo.llama_cpp.install_llama_cpp` - # flow that GGUF export uses at runtime: clone ggml-org/llama.cpp - # into ~/.unsloth/llama.cpp, build the LLAMA_CPP_TARGETS list - # (llama-quantize, llama-cli, llama-mtmd-cli, llama-gguf-split, - # llama-server) via cmake, then run `llama-cli --help`. 
- # - # This replaces the previous "download upstream prebuilt zip" - # approach, which silently exited 0 with the message - # "no ubuntu-x64 prebuilt asset" when ggml-org's release-asset - # naming drifted (the regex `bin-ubuntu-x64.*\.zip$` no longer - # matched their current asset names). The build path is the same - # one Unsloth users hit in production via `model.save_pretrained_gguf`. - # - # Wall-time budget: ~3-5 min cold, dominated by cmake build of - # 5 targets on the runner's 4 cores. Apt-package install is - # handled by `install_llama_cpp` itself via its - # `check_build_requirements` -> `install_package` chain. - run: | - set -euxo pipefail - # libssl-dev / libcurl4-openssl-dev are needed by llama.cpp's - # cmake build for HTTPS support; install up-front so the - # `install_llama_cpp` requirement-check is a no-op. - sudo apt-get update -qq - sudo apt-get install -y -qq build-essential cmake git curl \ - libgomp1 libssl-dev libcurl4-openssl-dev - python <<'PY' - import os, shutil, subprocess, sys, pathlib - # Apply the same CPU spoof the pytest shims use BEFORE any - # unsloth_zoo import: unsloth_zoo/__init__.py calls - # device_type.get_device_type() at module load and raises - # `NotImplementedError: Unsloth cannot find any torch - # accelerator` on a GPU-less runner. The spoof flips - # torch.cuda.is_available() to True so the device probe takes - # the cuda branch; we never actually run CUDA tensor ops in - # this step (just clone+cmake+--help on the binaries). - sys.path.insert(0, str(pathlib.Path("tests").resolve())) - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - from unsloth_zoo.llama_cpp import ( - install_llama_cpp, - LLAMA_CPP_DEFAULT_DIR, - LLAMA_CPP_TARGETS, - ) - print(f"Unsloth llama.cpp default dir: {LLAMA_CPP_DEFAULT_DIR}") - print(f"Build targets: {LLAMA_CPP_TARGETS}") - # install_llama_cpp returns (quantizer_path, converter_script_path). 
- # The quantizer's directory is the `llama.cpp` install root, which - # also holds llama-cli after build/bin/llama-* gets copied up - # (llama_cpp.py:867-871). - quantizer, converter = install_llama_cpp(print_output=True) - assert quantizer and os.path.exists(quantizer), ( - f"install_llama_cpp returned quantizer={quantizer!r} but file missing" - ) - assert converter and os.path.isfile(converter), ( - f"install_llama_cpp returned converter={converter!r} but missing" - ) - install_root = os.path.dirname(quantizer) - cli = os.path.join(install_root, "llama-cli") - assert os.path.exists(cli), ( - f"llama-cli not found at {cli!r} after build. Build root contents: " - f"{sorted(p for p in os.listdir(install_root) if p.startswith('llama-'))[:20]}" - ) - assert os.access(cli, os.X_OK), f"{cli!r} not executable" - # `llama-cli --help` exits non-zero on some builds; the contract - # is that recognizable help text appears on stdout/stderr. - proc = subprocess.run( - [cli, "--help"], capture_output=True, text=True, timeout=30, - ) - combined = (proc.stdout or "") + (proc.stderr or "") - print("--- llama-cli --help (first 30 lines) ---") - print("\n".join(combined.splitlines()[:30])) - assert any( - tok in combined.lower() - for tok in ("usage", "--help", "--model", "-m,") - ), ( - f"llama-cli --help produced no recognizable help text. " - f"exit={proc.returncode}\nstdout: {proc.stdout[:400]!r}\n" - f"stderr: {proc.stderr[:400]!r}" - ) - # Also exercise the quantizer the way GGUF export does: --help - # round-trip on the binary that does the actual heavy lifting. - q = subprocess.run( - [quantizer, "--help"], capture_output=True, text=True, timeout=15, - ) - q_combined = (q.stdout or "") + (q.stderr or "") - assert "usage" in q_combined.lower() or "type" in q_combined.lower(), ( - f"llama-quantize --help produced no help text. 
" - f"exit={q.returncode}\nstdout: {q.stdout[:400]!r}\n" - f"stderr: {q.stderr[:400]!r}" - ) - print( - f"\nOK: install_llama_cpp produced a working llama-cli at {cli} " - f"and llama-quantize at {quantizer}." - ) - PY diff --git a/.github/workflows/lint-ci.yml b/.github/workflows/lint-ci.yml deleted file mode 100644 index 00e6e357e2..0000000000 --- a/.github/workflows/lint-ci.yml +++ /dev/null @@ -1,321 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Whole-repo, multi-language source-lint gate. Runs on every PR -# (no path filter) because each step is sub-second to a few seconds -# and together they catch a class of breakage the focused build -# workflows would miss: -# -# - Python syntax + ruff + leftover debugger calls (across 350+ -# committed .py files, not just studio/backend). -# - Shell `bash -n` parse for every committed *.sh. -# - `yaml.safe_load` and `json.loads` round-trip for every -# committed YAML / JSON config. -# -# TypeScript and Rust are NOT duplicated here on purpose: -# - Studio Frontend CI runs `npm run typecheck` (= `tsc --noEmit`) -# and `npm run build` (vite/swc) on every studio/frontend/** -# change, which is a full TS AST + type check. -# - Studio Tauri CI runs `tauri build --debug --no-bundle` on -# every studio/src-tauri/** or studio/frontend/** change, which -# compiles the Rust crate (= cargo check + cargo build). -# Each is a stricter check than a parse-only step would be, so a -# fast-fail duplicate here would only burn cache; the dedicated -# workflows already block merges on Rust / TS regressions. 
- -name: Lint CI - -on: - pull_request: - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - source-lint: - name: Source lint (Python + shell + YAML + JSON + safety nets) - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - # Pin ruff to match .pre-commit-config.yaml so a CI-only ruff - # bump cannot disagree with what pre-commit accepted. - # codespell is pinned for the same reason: a reviewer should - # never see a typo report appear and disappear depending on - # which codespell version the runner happened to install. - - run: pip install 'ruff==0.15.12' 'pyyaml>=6' 'codespell>=2.3,<3' - - - name: Linux deps for shellcheck - run: sudo apt-get update -qq && sudo apt-get install -y --no-install-recommends shellcheck - - - name: Python AST/syntax check (every committed .py must compile) - # python -m compileall uses the same parser the interpreter - # uses, so anything broken here would also crash at - # `import X` on a user's machine. Sub-second across 350+ - # files. Hard gate. - run: | - python -m compileall -q -j 0 \ - unsloth unsloth_cli studio tests cli.py unsloth-cli.py - - - name: Python ruff check (whole repo) - # The narrow rule set in pyproject.toml [tool.ruff.lint] - # selects E9 / F63 / F7 / F82 -- syntax errors, broken - # comparisons, undefined names. The whole repo passes today, - # so this is a hard gate. - run: | - ruff check unsloth unsloth_cli studio tests cli.py unsloth-cli.py - - - name: No leftover debugger / pdb / breakpoint calls - # Catches the "I'll just stick a breakpoint() here" mistake - # before it ships. 
AST-based so commented-out debugger - # markers don't false-positive (a bare grep would; there - # are three commented `# breakpoint()` markers in - # unsloth/models/rl* today). Sub-second. - run: | - python <<'PY' - import ast, pathlib, sys - - SKIP_PARTS = {".venv", "venv", "build", "dist", ".git", - "unsloth_compiled_cache", "node_modules", - "unsloth.egg-info"} - - bad = [] - scanned = 0 - for path in sorted(pathlib.Path(".").rglob("*.py")): - if any(part in SKIP_PARTS for part in path.parts): - continue - scanned += 1 - try: - tree = ast.parse(path.read_text(encoding="utf-8", errors="replace")) - except SyntaxError: - continue # compileall step above already failed this - for node in ast.walk(tree): - if not isinstance(node, ast.Call): - continue - fn = node.func - if isinstance(fn, ast.Name) and fn.id == "breakpoint": - bad.append((path, node.lineno, "breakpoint()")) - elif (isinstance(fn, ast.Attribute) and fn.attr == "set_trace" - and isinstance(fn.value, ast.Name) - and fn.value.id in {"pdb", "ipdb"}): - bad.append((path, node.lineno, f"{fn.value.id}.set_trace()")) - - if bad: - for path, lineno, what in bad: - print(f"::error file={path},line={lineno}::leftover {what} -- remove before merging") - sys.exit(1) - print(f"no leftover debugger calls (scanned {scanned} files)") - PY - - - name: License-header drift (informational; whole repo) - # Three header families are accepted across the repo: - # 1. SPDX one-liner: `# SPDX-License-Identifier: ...` - # Used across studio/ (AGPL-3.0-only) and a few new - # files elsewhere. - # 2. Apache-2.0 long form, marker phrase - # "Licensed under the Apache License". Used across - # unsloth/ and unsloth_cli/. - # 3. GNU long form, marker phrase "General Public License". - # That single substring covers GPL, LGPL ("GNU Lesser - # General Public License") and AGPL ("GNU Affero - # General Public License") preambles, all three of - # which appear in unsloth/kernels/* (LGPL/AGPL) without - # the SPDX line. 
- # Empty files (mainly empty __init__.py) are skipped. - # Surfaced as a warning; cleaning up the actual misses is a - # follow-up PR, not a CI fix. - continue-on-error: true - run: | - python <<'PY' - import pathlib - - ACCEPTED = ( - "SPDX-License-Identifier", # any SPDX line - "Licensed under the Apache License", # Apache-2.0 long form - "General Public License", # GPL / LGPL / AGPL long form - ) - SKIP_PARTS = {".venv", "venv", "build", "dist", ".git", - "unsloth_compiled_cache", "node_modules", - "unsloth.egg-info"} - - studio_missing = [] - other_missing = [] - for path in sorted(pathlib.Path(".").rglob("*.py")): - if any(part in SKIP_PARTS for part in path.parts): - continue - text = path.read_text(encoding="utf-8", errors="replace") - if not text.strip(): - continue # empty __init__.py etc. - head = "\n".join(text.splitlines()[:25]) - if any(marker in head for marker in ACCEPTED): - continue - if "studio" in path.parts: - studio_missing.append(path) - else: - other_missing.append(path) - - total = len(studio_missing) + len(other_missing) - if total == 0: - print("every committed .py has a recognised license header") - else: - print(f"::warning::{total} Python files have no recognised license " - f"header (SPDX / Apache-2.0 / GNU long form): " - f"studio={len(studio_missing)}, other={len(other_missing)}") - for path in (studio_missing + other_missing)[:30]: - print(f" {path}") - if total > 30: - print(f" ... and {total - 30} more") - PY - - - name: Shell scripts parse cleanly (`bash -n`) - # Same idea as Python's compileall: parse-only check that - # every committed *.sh would not blow up at `bash script.sh` - # invocation time on a release box. tests/sh/ is the largest - # cluster (the install.sh shape tests). - run: | - shopt -s globstar - fail=0 - for f in $(git ls-files '*.sh'); do - if ! 
bash -n "$f"; then - echo "::error file=$f::shell parse error" - fail=1 - fi - done - if [ "$fail" -ne 0 ]; then - exit 1 - fi - n=$(git ls-files '*.sh' | wc -l) - echo "$n shell scripts parse cleanly" - - - name: YAML files parse cleanly (yaml.safe_load) - # Catches truncated workflow files, broken indents in - # dependabot.yml / pre-commit configs, etc. Includes - # .github/workflows/*.yml so a typo in the file we just - # added shows up immediately. - run: | - python <<'PY' - import pathlib, sys, yaml - - SKIP_PARTS = {".venv", "venv", "build", "dist", ".git", - "node_modules", "unsloth_compiled_cache", - "unsloth.egg-info"} - - bad = [] - scanned = 0 - for path in sorted(list(pathlib.Path(".").rglob("*.yml")) - + list(pathlib.Path(".").rglob("*.yaml"))): - if any(part in SKIP_PARTS for part in path.parts): - continue - scanned += 1 - try: - with path.open("r", encoding="utf-8") as fh: - list(yaml.safe_load_all(fh)) - except Exception as exc: - bad.append((path, exc)) - - if bad: - for path, exc in bad: - print(f"::error file={path}::YAML parse failed: {exc}") - sys.exit(1) - print(f"{scanned} YAML files parse cleanly") - PY - - - name: JSON files parse cleanly (json.loads) - # Catches malformed package.json, biome.json, etc. Skips: - # - huge npm/bun lockfiles (machine-generated, slow to - # parse, no value). - # - tsconfig*.json: TypeScript convention is JSONC (JSON - # with `/* ... */` comments), which standard json.loads - # rejects. Strip-and-validate would need json5 or a - # hand-rolled comment scrubber for marginal value, since - # `tsc --noEmit` already validates these in Frontend CI. 
- run: | - python <<'PY' - import fnmatch, json, pathlib, sys - - SKIP_PARTS = {".venv", "venv", "build", "dist", ".git", - "node_modules", "unsloth_compiled_cache", - "unsloth.egg-info"} - SKIP_NAMES = {"package-lock.json", "bun.lock"} - SKIP_PATTERNS = ("tsconfig*.json",) - - bad = [] - scanned = 0 - for path in sorted(pathlib.Path(".").rglob("*.json")): - if any(part in SKIP_PARTS for part in path.parts): - continue - if path.name in SKIP_NAMES: - continue - if any(fnmatch.fnmatch(path.name, pat) for pat in SKIP_PATTERNS): - continue - scanned += 1 - try: - json.loads(path.read_text(encoding="utf-8")) - except Exception as exc: - bad.append((path, exc)) - - if bad: - for path, exc in bad: - print(f"::error file={path}::JSON parse failed: {exc}") - sys.exit(1) - print(f"{scanned} JSON files parse cleanly") - PY - - - name: codespell typo check (informational) - # Catches typos in code, comments, and docs across the repo. - # Skips lockfiles, generated assets, binary artefacts, and - # the LICENSE files (US/UK spelling drift in legal text is - # not ours to second-guess). The ignore-words-list pulls - # out short identifiers + valid technical terms that - # codespell's default dictionary would otherwise flag - # (e.g. `ans` as a math-quiz variable name in - # tests/utils/aime_eval.py, `parm`/`parms` in PyTorch - # nn.Module idioms). Non-blocking until the surfaced typos - # are fixed; drop continue-on-error after the cleanup. 
- continue-on-error: true - run: | - codespell \ - --skip='*.lock,*.lockb,*.json,*.svg,*.png,*.jpg,*.jpeg,*.gif,*.ico,*.woff*,*.ttf,*.eot,*.zip,*.gz,*.gguf,*.safetensors,*.bin,node_modules,.git,build,dist,unsloth_compiled_cache,unsloth.egg-info,target,studio/frontend/dist,*.pyc,*-licenses.txt,LICENSE*' \ - --ignore-words-list='ans,bu,hel,fo,te,ot,hist,ned,sav,recurser,datas,nin,parm,parms,checkin,nd,fr,inout,donot,uint' \ - --quiet-level=2 - - - name: shellcheck on committed *.sh (informational) - # Goes beyond `bash -n` (which only parses): catches subtle - # shell bugs like unquoted variable expansions, useless - # `cat`, command substitutions inside `[[`, etc. The - # install/setup scripts are critical-path so the signal is - # worth surfacing. Non-blocking until install.sh's - # hand-rolled patterns get cleaned up; drop continue-on-error - # afterwards. - continue-on-error: true - run: | - # Exclude SC1090 ("source not followable") -- legitimate - # for installer scripts that source files at runtime - # paths shellcheck cannot resolve statically. - # SC2034 ("variable assigned but never used") fires on - # the export-only assignment idiom we use in install.sh. - shellcheck -e SC1090,SC2034 $(git ls-files '*.sh') - - - name: ruff format drift (informational) - # The canonical formatter is scripts/run_ruff_format.py - # = ruff format + scripts/enforce_kwargs_spacing.py, so plain - # `ruff format --check` reports the kwarg-spacing diff as - # drift. Surface the count for visibility but keep - # non-blocking until the custom pipeline is wired in here. - continue-on-error: true - run: | - ruff format --check unsloth unsloth_cli studio tests cli.py unsloth-cli.py diff --git a/.github/workflows/mlx-ci.yml b/.github/workflows/mlx-ci.yml deleted file mode 100644 index 75940832a0..0000000000 --- a/.github/workflows/mlx-ci.yml +++ /dev/null @@ -1,430 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. 
- -# Focused PR gate for the MLX dispatch surface, running on a real -# Apple Silicon runner. -# -# Runner: macos-14 (M1, 3 vCPU / 7 GB / Apple Silicon standard runner -# -- FREE for public repositories per the GitHub Actions billing -# reference; larger variants like macos-14-large/-xlarge are paid so -# we deliberately avoid those). -# -# Why a single Mac job (no Linux+spoof leg): the dispatch tests are -# 100% spoofed monkeypatches and run identically on any host, so the -# Linux leg was duplicating the matrix tests already covered on Mac -# while missing everything Apple-specific. The Mac job runs the SAME -# spoofed matrix PLUS three things only a real Apple Silicon host -# can prove: -# -# 1. unsloth._IS_MLX flips True on Darwin+arm64 with mlx genuinely -# installed (no spoof). -# 2. Every PR-A MLX-only unsloth_zoo module (mlx_loader, mlx_trainer, -# mlx_compile, mlx_utils, mlx_cce, gated_delta_vjp) imports -# against the real `mlx` + `mlx-lm` + `mlx-vlm` PyPI wheels -- -# each does `import mlx.core as mx` at module top level, so this -# catches a future change that breaks the real wheels without -# needing a Mac developer in the loop. -# 3. The hardware-dispatch spoofs do not collide with the real -# environment (the test fixture installs a MetaPathFinder that -# blocks `import mlx.core` for "no-mlx" profiles, faithfully -# simulating a Mac without mlx even when mlx IS installed). -# 4. End-to-end MLX training + inference smoke test: -# run_real_mlx_smoke.py trains unsloth/gemma-3-270m-it for 7 -# deterministic LoRA steps on a single repeated text row, then -# verifies the trained model can complete the prompt and that -# losses + grad norms are finite and well-behaved. This is the -# only place in CI that exercises a real MLX backward pass + -# optimizer step + inference call. 
-# -# Three dispatch test files documented in tests/studio/README.md: -# - test_hardware_dispatch_matrix.py parametrized 7-profile matrix -# + 2 dispatch-priority canaries -# - test_is_mlx_dispatch_gate.py AST + runtime guard on -# unsloth._IS_MLX -# - test_mlx_training_worker_behaviors.py AST contract checks on -# studio/backend/core/training/worker.py -# -# Surfaces a single PR check ("MLX CI on Mac M1 / dispatch"). -# -# Security audit footprint: every package this workflow installs is -# already covered by .github/workflows/security-audit.yml -- the deps -# come from studio/backend/requirements/studio.txt and unsloth-zoo's -# pyproject (resolved transitively). The git+ install of unsloth-zoo -# is intentionally skipped by the audit (pip-audit cannot resolve a -# git URL through PyPI metadata; the audit comment in security-audit.yml -# documents this). No new package is introduced solely by MLX CI. - -name: MLX CI on Mac M1 - -on: - pull_request: - paths: - - 'unsloth/__init__.py' - - 'unsloth/_gpu_init.py' - - 'studio/backend/utils/hardware/**' - - 'studio/backend/core/training/worker.py' - - 'studio/backend/core/inference/mlx_inference.py' - - 'tests/studio/test_hardware_dispatch_matrix.py' - - 'tests/studio/test_is_mlx_dispatch_gate.py' - - 'tests/studio/test_mlx_training_worker_behaviors.py' - - 'tests/studio/run_real_mlx_smoke.py' - - 'tests/conftest.py' - - '.github/workflows/mlx-ci.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - dispatch: - name: dispatch - runs-on: macos-14 - # 25 min: dispatch + spoofed matrix + 7-step real LoRA training is - # under 2 min; GGUF export builds llama.cpp via cmake on Apple - # Silicon (~5-7 min), so we budget headroom. 
- timeout-minutes: 25 - steps: - # harden-runner audit mode: macOS runners cannot use blocking mode - # today (eBPF egress enforcement is Linux-only), but audit mode is - # supported cross-platform and surfaces the egress destinations in - # the runner log. This produces the data needed to graduate this - # job to a block-mode allowlist once macOS support lands. - - name: Harden runner (audit) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: audit - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - # macOS install ladder, validated locally against a Linux - # mac-sim venv (platform spoofed + mlx_simulation shim + real - # datasets/transformers/structlog). - # - # 1. studio/backend/requirements/studio.txt brings structlog, - # fastapi, etc. The hardware probe imports structlog at - # module top level. - # 2. Same pytest / numpy / httpx stack the rest of the repo CI - # uses. - # 3. torch is explicitly installed: unsloth-zoo's pyproject - # deliberately excludes torch on darwin+arm64 (mlx replaces - # it for runtime use), but the dispatch tests spoof - # torch.cuda / torch.xpu / torch.backends.mps via monkeypatch - # and so the test process needs torch importable. We pull - # from the PyTorch CPU index so Apple Silicon gets the - # explicit cpu+MPS arm64 wheel rather than something the - # default PyPI resolver might pick up. The CPU index hosts - # macosx_*_arm64 wheels alongside the Linux x86_64 ones. - # 4. unsloth-zoo from git main (NOT PyPI), WITH deps. PR-A's - # MLX support landed after the most recent unsloth-zoo PyPI - # release; the wheel still raises NotImplementedError on - # Apple Silicon when device_type.get_device_type() runs - # unguarded. 
Studio's own install.sh overlays unsloth-zoo - # from git main for the same reason. Pulling deps lets pip - # resolve the platform-conditional MLX-only wheels (mlx, - # mlx-lm, mlx-vlm gated on darwin+arm64 in unsloth-zoo's - # pyproject) AND the shared deps (datasets, transformers, - # sentencepiece, ...) that unsloth's MLX branch loads via - # dataprep/raw_text.py. - # 5. unsloth -e . --no-deps so the editable install does not - # fight the unsloth-zoo dep set. - # - # All explicit pip installs are version-pinned to a single - # released version (the latest as of 2026-05-07 within each - # project's existing constraint range). bump alongside the rest - # of the security audit when a new release lands. - - name: Install deps - run: | - python -m pip install --upgrade pip - pip install -r studio/backend/requirements/studio.txt - pip install \ - 'python-multipart==0.0.27' \ - 'aiofiles==25.1.0' \ - 'sqlalchemy==2.0.49' \ - 'cryptography==48.0.0' \ - 'pyyaml==6.0.3' \ - 'jinja2==3.1.6' \ - 'mammoth==1.12.0' \ - 'unpdf==1.0.0' \ - 'requests==2.33.1' \ - 'typer==0.25.1' \ - 'numpy==2.4.4' \ - 'pytest==9.0.3' \ - 'pytest-asyncio==1.3.0' \ - 'httpx==0.28.1' - pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch==2.10.0' - # github.com occasionally 500s on the git fetch; retry the - # zoo install so a single upstream blip does not fail CI. - for attempt in 1 2 3; do - if pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo"; then - break - fi - if [ "$attempt" -eq 3 ]; then - echo "::error::pip install unsloth_zoo failed after 3 attempts" - exit 1 - fi - delay=$((5 * attempt)) - echo "::warning::unsloth_zoo install failed (attempt $attempt/3), retrying in ${delay}s..." - sleep "$delay" - done - pip install -e . --no-deps - - # Real Apple Silicon sanity: confirm _IS_MLX activates on real - # hardware with no platform spoof. 
- - name: Verify _IS_MLX flips True on real Apple Silicon - run: | - python -c " - import platform - assert platform.system() == 'Darwin', platform.system() - assert platform.machine() == 'arm64', platform.machine() - import unsloth - assert unsloth._IS_MLX is True, f'expected _IS_MLX=True on real Apple Silicon, got {unsloth._IS_MLX}' - print('OK: _IS_MLX activated on real Apple Silicon') - " - - # Real Apple Silicon sanity: confirm every PR-A MLX-only module - # loads against real mlx + mlx-lm + mlx-vlm wheels. - - name: Smoke-import every MLX-only unsloth_zoo module - run: | - python -c " - import importlib - for name in [ - 'unsloth_zoo.mlx_loader', - 'unsloth_zoo.mlx_trainer', - 'unsloth_zoo.mlx_compile', - 'unsloth_zoo.mlx_utils', - 'unsloth_zoo.mlx_cce', - 'unsloth_zoo.gated_delta_vjp', - ]: - importlib.import_module(name) - print('OK:', name) - from unsloth_zoo.mlx_loader import FastMLXModel - from unsloth_zoo.mlx_trainer import MLXTrainer, MLXTrainingConfig - assert hasattr(FastMLXModel, 'from_pretrained') - print('OK: FastMLXModel + MLXTrainer surface present') - " - - # Spoofed dispatch matrix. Runs on the real Mac too -- the - # test fixture installs a MetaPathFinder that blocks - # `import mlx.core` for "no-mlx" profiles, so the spoofs - # faithfully simulate every supported hardware combo regardless - # of whether mlx is installed for real. - - name: MLX dispatch tests (3 files, 36 tests) - env: - PYTHONPATH: ${{ github.workspace }}/studio - UNSLOTH_COMPILE_DISABLE: '1' - run: | - python -m pytest -v --tb=short \ - tests/studio/test_hardware_dispatch_matrix.py \ - tests/studio/test_is_mlx_dispatch_gate.py \ - tests/studio/test_mlx_training_worker_behaviors.py - - # Studio prebuilt llama.cpp install + GGUF inference. 
Drives the - # exact path Studio's setup.sh takes on macOS: invokes - # studio/install_llama_prebuilt.py with --published-repo - # ggml-org/llama.cpp and --published-release-tag b9049 (the - # latest llama.cpp release at the time this step was added; bump - # via UNSLOTH_LLAMA_TAG / DEFAULT_LLAMA_TAG when refreshing). - # The installer downloads llama-b9049-bin-macos-arm64.tar.gz, - # which is the universal Apple Silicon (arm64) build -- the - # same artifact works on M1/M2/M3/M4 because llama.cpp compiles - # against the ARMv8.2 baseline. - # - # The b9049 release also publishes: - # - llama-b9049-bin-macos-arm64-kleidiai.tar.gz - # KleidiAI dispatches at runtime; on M1 it falls back where - # ISA features (e.g. I8MM) are missing, so this asset also - # runs on M1 -- Studio just doesn't choose it by default. - # - llama-b9049-bin-macos-x64.tar.gz - # Intel-only; would only run on M1 via Rosetta 2 emulation, - # which we explicitly avoid. - # - iOS XCFramework - # iOS-app build artifact, unrelated to a macOS desktop CI. - # - # After install, downloads a small published GGUF - # (unsloth/gemma-3-270m-it-GGUF, Q4_K_M) from HuggingFace and - # runs the prebuilt llama-cli on it. Asserts the prompt echo - # appears in stdout. If the install fails OR the binary exits - # non-zero, that's an Unsloth/Studio bug. - - name: Studio prebuilt llama.cpp install + GGUF inference (Mac M1) - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - # install_llama_prebuilt.py hits the GitHub releases API to - # resolve the asset URL. Anonymous calls share the runner-IP - # rate-limit bucket and 403 quickly -- pass the workflow's - # automatic GITHUB_TOKEN to bump us to the 5000/hr authenticated - # bucket. 
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -euo pipefail - INSTALL_DIR="$HOME/.unsloth-studio-prebuilt-test/llama.cpp" - rm -rf "$INSTALL_DIR" - # --simple-policy is required when --published-repo points - # at upstream ggml-org/llama.cpp; that repo doesn't ship the - # llama-prebuilt-manifest.json asset Studio's default policy - # expects, so the simple platform-specific policy maps - # Darwin+arm64 -> bin-macos-arm64 directly. studio/setup.sh - # passes both --published-repo ggml-org/llama.cpp AND - # --simple-policy automatically on macOS, so this CI step - # exercises the same code path users hit when they run - # `curl -fsSL https://unsloth.ai/install.sh | sh`. - python studio/install_llama_prebuilt.py \ - --install-dir "$INSTALL_DIR" \ - --published-repo ggml-org/llama.cpp \ - --published-release-tag b9049 \ - --simple-policy - - # Studio bundles only llama-server + llama-quantize from the - # prebuilt (not llama-cli) -- inference goes through - # llama-server's HTTP /completion endpoint. Validate both: - # llama-quantize --help proves the dynamic libs link, then - # spin up llama-server and POST a /completion request on a - # tiny published GGUF. 
- LLAMA_SERVER="$INSTALL_DIR/build/bin/llama-server" - LLAMA_QUANT="$INSTALL_DIR/build/bin/llama-quantize" - [ -x "$LLAMA_SERVER" ] || { echo "::error::llama-server missing at $LLAMA_SERVER"; find "$INSTALL_DIR/build" -type f | head -40; exit 1; } - [ -x "$LLAMA_QUANT" ] || { echo "::error::llama-quantize missing at $LLAMA_QUANT"; exit 1; } - echo "llama-server : $LLAMA_SERVER" - echo "llama-quantize: $LLAMA_QUANT" - "$LLAMA_QUANT" --help >/dev/null && echo " llama-quantize loads OK" - - mkdir -p /tmp/ggufs - bash .github/scripts/hf-download-with-retry.sh \ - 'unsloth/gemma-3-270m-it-GGUF' \ - 'gemma-3-270m-it-Q4_K_M.gguf' \ - /tmp/ggufs - - PORT=18080 - echo "=== starting llama-server on 127.0.0.1:$PORT ===" - "$LLAMA_SERVER" \ - -m /tmp/ggufs/gemma-3-270m-it-Q4_K_M.gguf \ - --host 127.0.0.1 \ - --port "$PORT" \ - -c 256 \ - -n 16 \ - --no-warmup \ - > /tmp/llama-server.log 2>&1 & - SERVER_PID=$! - trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT - - # Wait for /health to come up - for i in $(seq 1 30); do - if curl -sf "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then - echo " server up after ${i}s" - break - fi - sleep 1 - done - if ! 
curl -sf "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then - echo "::error::llama-server never became healthy" - tail -40 /tmp/llama-server.log - exit 1 - fi - - PROMPT="Hello, my name is" - echo "=== POST /completion ===" - RESP=$(curl -sf -X POST "http://127.0.0.1:$PORT/completion" \ - -H 'Content-Type: application/json' \ - -d "{\"prompt\":\"$PROMPT\",\"n_predict\":16,\"temperature\":0,\"seed\":3407}") - echo "raw response (head): $(echo "$RESP" | head -c 600)" - CONTENT=$(echo "$RESP" | python -c "import json,sys; print(json.loads(sys.stdin.read()).get('content',''))") - echo "completion content: $CONTENT" - - if [ -z "$CONTENT" ]; then - echo "::error::llama-server /completion returned empty content" - tail -40 /tmp/llama-server.log - exit 1 - fi - echo "OK: Studio prebuilt llama.cpp on Mac M1 + GGUF /completion works" - - # Real MLX training + inference smoke test. Trains - # unsloth/gemma-3-270m-it for 7 deterministic LoRA steps - # (batch_size=2, gradient_accumulation_steps=3) on a single - # repeated row ("<> My name is Unsloth!"), then saves - # the trained model in 3 export formats. The `train` subcommand - # captures per-phase timing + peak GPU + peak RSS into - # train_metrics.json so we can detect regressions across CI runs. - - name: MLX export round-trip — TRAIN + SAVE 3 formats - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - UNSLOTH_COMPILE_DISABLE: '1' - run: | - mkdir -p mlx_workdir - python tests/studio/run_real_mlx_smoke.py train \ - --workdir "$PWD/mlx_workdir" - - # Each reload step runs in a FRESH Python process to confirm - # the cold-start path users would hit in production also works - # (not just the in-memory continuation of a still-running - # trainer). FastMLXModel.from_pretrained gets called from - # scratch; mx.random is re-seeded; per-step timing + peak - # memory are emitted to {format}_reload_metrics.json next to - # the saved dir. 
- - name: MLX export round-trip — RELOAD LoRA (fresh process) - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - UNSLOTH_COMPILE_DISABLE: '1' - run: | - python tests/studio/run_real_mlx_smoke.py reload \ - --format lora \ - --dir "$PWD/mlx_workdir/lora" - - - name: MLX export round-trip — RELOAD merged_16bit (fresh process) - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - UNSLOTH_COMPILE_DISABLE: '1' - run: | - python tests/studio/run_real_mlx_smoke.py reload \ - --format merged \ - --dir "$PWD/mlx_workdir/merged_16bit" - - # GGUF reload uses the llama-cli binary that save_pretrained_gguf - # built. If save_pretrained_gguf was skipped during train (e.g. - # llama.cpp's convert_hf_to_gguf asserts on the model's tokenizer - # vocab -- a downstream llama.cpp limitation, not an unsloth_zoo - # bug), this step emits a workflow warning and exits 0 so the - # LoRA + merged_16bit assertions remain the gating signal. - - name: MLX export round-trip — RELOAD GGUF via llama-cli (fresh process) - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - if python -c "import json,sys; m=json.load(open('mlx_workdir/train_metrics.json')); sys.exit(0 if m.get('gguf_supported') else 1)"; then - python tests/studio/run_real_mlx_smoke.py reload \ - --format gguf \ - --dir "$PWD/mlx_workdir/gguf" - else - REASON=$(python -c "import json; m=json.load(open('mlx_workdir/train_metrics.json')); print(m.get('gguf_skip_reason') or 'unknown')") - echo "::warning title=GGUF round-trip skipped::${REASON}" - echo "GGUF export was skipped during the train phase. Reason:" - echo " ${REASON}" - echo "Continuing without failing the job; the LoRA + merged_16bit" - echo "reload assertions are still gating this PR." - fi - - # Print all metrics JSON files so regressions are visible in the - # job log. always() so we get telemetry even if a reload step - # asserted gibberish. 
- - name: MLX export round-trip — aggregate metrics - if: always() - run: | - for f in mlx_workdir/train_metrics.json \ - mlx_workdir/lora_reload_metrics.json \ - mlx_workdir/merged_reload_metrics.json \ - mlx_workdir/gguf_reload_metrics.json; do - echo "=== $f ===" - cat "$f" 2>/dev/null || echo "(missing)" - echo - done diff --git a/.github/workflows/notebooks-ci.yml b/.github/workflows/notebooks-ci.yml deleted file mode 100644 index 673b2f3cc5..0000000000 --- a/.github/workflows/notebooks-ci.yml +++ /dev/null @@ -1,440 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. -# -# Cross-repo notebook validator. Lives in unslothai/unsloth (this repo) -# and inspects every notebook in unslothai/notebooks at HEAD (or the -# ref dispatched in via repository_dispatch). -# -# Catches the bug classes that landed in: -# - unslothai/notebooks#258 Colab torchao 0.10 vs peft 0.19 floor -# - unslothai/notebooks#260 DONT_UPDATE_EXCEPTIONS coverage drift -# - unslothai/notebooks#261 torch/torchcodec ABI; --no-deps tokenizers -# - unslothai/notebooks#264 --no-deps transformers + Colab tokenizers drift -# - unslothai/notebooks#221 git+ HEAD installs in install cells -# - unslothai/notebooks commit 51b1462 template/notebook drift -# -# CPU-only by design. Layer 2 (api-introspect) reuses the existing -# tests/_zoo_aggressive_cuda_spoof.py harness so `import unsloth` -# succeeds on a GPU-less ubuntu-latest runner. - -name: Notebooks CI - -on: - pull_request: - paths: - - 'unsloth/**' - - 'scripts/notebook_validator.py' - - 'scripts/notebook_to_python.py' - - 'scripts/data/colab_pip_freeze.gpu.txt' - - 'scripts/data/colab_to_cpu_pin.json' - - 'tests/notebooks/**' - - 'tests/_zoo_aggressive_cuda_spoof.py' - - '.github/workflows/notebooks-ci.yml' - schedule: - # Daily 06:17 UTC. Catches Colab preinstall bumps (the upstream image - # is rebuilt roughly weekly) without us waiting on a PR. 
Off the - # :00/:30 fleet-collision spots. - - cron: '17 6 * * *' - workflow_dispatch: - inputs: - notebooks_ref: - description: 'unslothai/notebooks ref to lint (branch / SHA / tag)' - default: 'main' - include_smoke: - description: 'Also run the install-cell smoke matrix (longer)' - type: boolean - default: false - repository_dispatch: - # Fired by a tiny companion workflow on unslothai/notebooks. - types: [notebooks_pr_opened, notebooks_main_pushed] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -env: - NOTEBOOKS_REF: >- - ${{ github.event.inputs.notebooks_ref || - github.event.client_payload.ref || - 'main' }} - -jobs: - static: - name: static (drift + lint + exceptions) - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - # Validate the dispatched ref before it reaches actions/checkout's `ref:` - # input. Reading via env (NOT direct ${{ ... }} interpolation in the - # regex test) closes the GitHub-Actions-injection class where a - # client_payload.ref like `main"; rm -rf / #` would be embedded into the - # shell command. NOTEBOOKS_REF defaults to 'main' on non-dispatch - # events, but only repository_dispatch can supply attacker-controlled - # values, so we gate this check on that event type. - - name: Validate client_payload.ref shape - if: github.event_name == 'repository_dispatch' - env: - NOTEBOOKS_REF: ${{ github.event.client_payload.ref }} - run: | - if ! 
printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then - echo "::error::client_payload.ref contains disallowed characters" >&2 - exit 1 - fi - - - name: Checkout unsloth (this PR) - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - path: unsloth - persist-credentials: false - - - name: Checkout unslothai/notebooks @ ${{ env.NOTEBOOKS_REF }} - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - repository: unslothai/notebooks - ref: ${{ env.NOTEBOOKS_REF }} - path: notebooks - fetch-depth: 0 # drift check needs git status / diff - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Install validator deps - run: | - python -m pip install --upgrade pip - # nbformat + nbconvert come from the converter's requirements; - # spellchecker + huggingface_hub are imported at module top of - # update_all_notebooks.py. - pip install \ - 'nbformat>=5.10' 'nbconvert>=7.16' 'pyspellchecker>=0.8' \ - 'huggingface_hub>=0.34' 'tqdm>=4.66' - - - name: Refresh Colab pip-freeze (best-effort; falls back to snapshot) - run: | - python unsloth/scripts/notebook_validator.py refresh-colab \ - --out unsloth/scripts/data/colab_pip_freeze.gpu.txt \ - || echo "::warning::refresh-colab failed; using committed snapshot" - - - name: Diff Colab oracle vs committed snapshots (advisory) - # Pulls pip-freeze.gpu.txt + apt-list-gpu.txt + os-info-gpu.txt - # from googlecolab/backend-info and prints NEW / REMOVED / - # CHANGED entries against scripts/data/colab_*.txt. Non-blocking - # on PRs; the daily cron job below runs the same step with - # --strict so upstream rotations surface within ~24h. 
- continue-on-error: true - working-directory: ${{ github.workspace }} - run: | - python unsloth/scripts/notebook_validator.py colab-diff \ - --snapshot-dir unsloth/scripts/data - - - name: Drift check (re-run update_all_notebooks.py + git diff) - working-directory: ${{ github.workspace }} - # Reported as non-blocking until the upstream `unslothai/notebooks` - # tree is regenerated. The first run on @main surfaces ~463 files - # of drift (7359 / 9634 line delta), which is a real backlog the - # notebooks-side maintainers need to clear in their own repo -- - # this PR's role is to surface the count, not auto-fix it. - continue-on-error: true - run: | - python unsloth/scripts/notebook_validator.py drift \ - --notebooks-dir notebooks - - - name: Convert sanity (every nb / kaggle / original_template -> .py) - # Same rationale as Drift: a handful of upstream notebooks fail - # the converter (custom magics, malformed JSON, etc). Surface - # the count without blocking; the team triages in unslothai/notebooks. - continue-on-error: true - run: | - python unsloth/scripts/notebook_validator.py convert \ - --notebooks-dir notebooks \ - --out _converted - - - name: Lint (install cells + AST scan, env-scoped) - # Reported as non-blocking (continue-on-error: true) until the - # backlog of pre-existing findings on unslothai/notebooks@main is - # cleared. Same pattern PR #5298 used for biome:check on the - # frontend. As of this commit the live tree surfaces 27 errors + - # 6 warnings, all real (peft/torchao floor missing in 6 nb/ - # notebooks, 14 git+ HEAD installs in hand-tuned exception - # notebooks, 6 torch/torchcodec ABI mismatches, 1 - # transformers/tokenizers --no-deps drift). The count surfaces - # in the PR check UI. Drop continue-on-error once it hits zero. 
- continue-on-error: true - run: | - python unsloth/scripts/notebook_validator.py lint \ - --notebooks-dir notebooks \ - --colab-pin unsloth/scripts/data/colab_pip_freeze.gpu.txt \ - --no-pypi - # --no-pypi skips R-INST-002 (transitive resolve via PyPI metadata). - # Layer 1 keeps PR-time wall-clock predictable; the daily cron run - # below drops --no-pypi and refreshes the cache. - - - name: DONT_UPDATE_EXCEPTIONS coverage - run: | - python unsloth/scripts/notebook_validator.py exceptions \ - --notebooks-dir notebooks - - static-with-pypi: - name: static + transitive resolve (cron / dispatch only) - if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - # See `static.Validate client_payload.ref shape` for rationale. This - # job's `if:` excludes repository_dispatch today, so the validation - # step is a defence-in-depth no-op until that gate ever relaxes. - - name: Validate client_payload.ref shape - if: github.event_name == 'repository_dispatch' - env: - NOTEBOOKS_REF: ${{ github.event.client_payload.ref }} - run: | - if ! 
printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then - echo "::error::client_payload.ref contains disallowed characters" >&2 - exit 1 - fi - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - path: unsloth - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - repository: unslothai/notebooks - ref: ${{ env.NOTEBOOKS_REF }} - path: notebooks - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: { python-version: '3.12', cache: 'pip' } - - name: Install - run: pip install -U pip - - name: Refresh Colab oracle - run: | - python unsloth/scripts/notebook_validator.py refresh-colab \ - --out unsloth/scripts/data/colab_pip_freeze.gpu.txt - - name: Diff Colab oracle vs committed snapshots (--strict on cron) - # Cron-only escalation of the advisory PR-time check. Fails if - # any of pip-freeze.gpu.txt / apt-list-gpu.txt / os-info-gpu.txt - # has drifted from scripts/data/colab_*.txt; refresh the - # snapshots in this repo to acknowledge. - run: | - python unsloth/scripts/notebook_validator.py colab-diff \ - --snapshot-dir unsloth/scripts/data --strict - - name: Lint with live PyPI metadata - run: | - python unsloth/scripts/notebook_validator.py lint \ - --notebooks-dir notebooks \ - --colab-pin unsloth/scripts/data/colab_pip_freeze.gpu.txt - - api-introspect: - name: api surface (under CUDA spoof) - runs-on: ubuntu-latest - timeout-minutes: 12 - steps: - - name: Validate client_payload.ref shape - if: github.event_name == 'repository_dispatch' - env: - NOTEBOOKS_REF: ${{ github.event.client_payload.ref }} - run: | - if ! 
printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then - echo "::error::client_payload.ref contains disallowed characters" >&2 - exit 1 - fi - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - path: unsloth - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - repository: unslothai/notebooks - ref: ${{ env.NOTEBOOKS_REF }} - path: notebooks - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: { python-version: '3.12', cache: 'pip' } - - - name: Install CPU torch + pinned unsloth + trl + converter deps - run: | - python -m pip install --upgrade pip - # CPU torch + torchvision. torchvision is required because - # unsloth_zoo.vision_utils imports PIL at module top, and the - # easiest way to get a torch-compatible PIL on a CPU runner is - # to let torchvision pull the right Pillow version. - pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch>=2.8,<2.11' 'torchvision<0.26' - # Pin to the same versions update_all_notebooks.py installs in - # generated notebooks. Keep these in lockstep with PIN_TRL / - # PIN_TRANSFORMERS in unslothai/notebooks/update_all_notebooks.py. - # `triton` is added because unsloth/_gpu_init.py:232 does an - # unconditional `import triton`; the PyPI wheel installs cleanly - # on Linux x86_64 even without CUDA (same rationale as - # consolidated-tests-ci.yml line 192-205). - # Pillow is listed explicitly as a defensive belt-and-braces - # next to torchvision (vision_utils crashes ModuleNotFoundError - # if torchvision skipped its Pillow dep for any reason). - pip install 'transformers>=4.56,<5.6' 'trl>=0.22,<0.26' 'accelerate>=1.0' \ - 'datasets>=3.4,<5' 'peft>=0.15,<0.20' \ - 'bitsandbytes>=0.43' 'sentencepiece' 'protobuf' triton \ - Pillow safetensors tqdm packaging psutil - # Converter deps (nbformat for notebook_to_python.py). 
- pip install 'nbformat>=5.10' 'nbconvert>=7.16' - # Install unsloth from the LOCAL checkout (the PR head), not PyPI. - # The PR-time CI must validate the code in this PR; PyPI unsloth - # may lag the in-repo CPU-torch fallback in unsloth/kernels/utils.py - # (lines 162-170) that handles missing torch._C._cuda_getCurrentRawStream. - pip install --no-deps unsloth_zoo - pip install --no-deps -e ./unsloth - - - name: Convert notebooks for AST scan - # Same upstream-conversion-error tolerance as the static job. - continue-on-error: true - run: | - python unsloth/scripts/notebook_validator.py convert \ - --notebooks-dir notebooks --out _converted - - - name: Dump unsloth + trl API surface (under CUDA spoof) - run: | - PYTHONPATH=unsloth/tests python -u - <<'PY' - import sys, json, inspect - import _zoo_aggressive_cuda_spoof as _spoof - _spoof.apply() - import unsloth - import trl - surface = {} - for cls_name in ("FastLanguageModel", "FastVisionModel", "FastModel"): - cls = getattr(unsloth, cls_name, None) - if cls is None: - continue - surface[cls_name] = sorted(n for n in dir(cls) if not n.startswith("_")) - surface["SFTConfig_kwargs"] = sorted(inspect.signature(trl.SFTConfig.__init__).parameters) - json.dump(surface, open("_api_surface.json", "w"), indent=2) - print("dumped surface for:", list(surface)) - PY - - - name: Run API rule against converted notebooks - run: | - python unsloth/scripts/notebook_validator.py api \ - --converted-dir _converted \ - --surface _api_surface.json - - smoke-install: - name: smoke install (Colab-shaped venv, opt-in) - if: ${{ github.event.inputs.include_smoke == 'true' || github.event_name == 'schedule' }} - runs-on: ubuntu-latest - timeout-minutes: 25 - strategy: - fail-fast: false - matrix: - # One representative notebook per installation_*_content template. - # Add rows when a new install template lands in update_all_notebooks.py. 
- notebook: - - 'nb/Llama3.1_(8B)-Alpaca.ipynb' # installation_content - - 'nb/Gemma3_(4B)-Vision.ipynb' # installation_content + vision - - 'nb/Llama3.1_(8B)-GRPO.ipynb' # installation_extra_grpo_content - - 'nb/gpt-oss-(20B)-Fine-tuning.ipynb' # installation_gpt_oss_content - - 'nb/Qwen3_5_(4B)_Vision.ipynb' # installation_qwen3_5_content - - 'nb/Nemotron-3-Nano-30B-A3B_A100.ipynb' # installation_nemotron_nano_content - - 'nb/Whisper.ipynb' # installation_whisper_content - - 'nb/Synthetic_Data_Hackathon.ipynb' # installation_synthetic_data_content - steps: - - name: Validate client_payload.ref shape - if: github.event_name == 'repository_dispatch' - env: - NOTEBOOKS_REF: ${{ github.event.client_payload.ref }} - run: | - if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then - echo "::error::client_payload.ref contains disallowed characters" >&2 - exit 1 - fi - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - path: unsloth - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - repository: unslothai/notebooks - ref: ${{ env.NOTEBOOKS_REF }} - path: notebooks - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: { python-version: '3.12' } - - - name: Seed Colab-shaped venv from pip-freeze (CPU-mapped) - run: | - # Strip cu128 local versions, route torch/torchvision to the CPU - # wheel index, drop CUDA-specific deps the runner can't use. 
- python -u - <<'PY' > /tmp/seed_pins.txt - import json, re - mapping = json.load(open("unsloth/scripts/data/colab_to_cpu_pin.json")) - rewrite = mapping["rewrite"] - skip = set(mapping["skip"]) - spoof = set(mapping["module_spoof"]) - out = [] - for line in open("unsloth/scripts/data/colab_pip_freeze.gpu.txt"): - line = line.strip() - if not line or line.startswith("#"): - continue - m = re.match(r"^([A-Za-z0-9._-]+)\s*==\s*(.+)$", line) - if not m: - continue - name, ver = m.group(1).lower(), m.group(2) - if name in skip: - continue - if name in spoof: - continue - if name in rewrite: - ver = re.sub(r"[+\-].+$", "", ver) - out.append(f"{name}=={ver}") - else: - ver = re.sub(r"[+\-].+$", "", ver) - out.append(f"{name}=={ver}") - print("\n".join(out)) - PY - head -5 /tmp/seed_pins.txt - wc -l /tmp/seed_pins.txt - - - name: Install Colab-shaped venv - run: | - python -m pip install --upgrade pip - # Best-effort: any single line that fails to resolve on CPU is - # tolerated; the smoke contract is "the install cell + the unsloth - # import works", not "the entire Colab venv reproduces." - while IFS= read -r spec; do - pip install "$spec" --index-url https://download.pytorch.org/whl/cpu \ - --extra-index-url https://pypi.org/simple || \ - echo "::warning::pin failed: $spec" - done < /tmp/seed_pins.txt - - - name: Run install cell - run: | - python unsloth/scripts/notebook_validator.py convert \ - --notebooks-dir notebooks --out _converted - # Take the converted .py and run the install cell only. - BASE="$(basename '${{ matrix.notebook }}' .ipynb | tr -d '()' | tr -c '[:alnum:]_' _)" - PY="_converted/${BASE}.py" - [ -f "$PY" ] || { echo "::error::$PY not found"; ls _converted | head; exit 1; } - # Truncate at the first `from unsloth import` so we run install + - # core imports only. 
- awk '/^from unsloth import/ { print "import sys; sys.exit(0)"; exit } { print }' "$PY" > _smoke.py - PYTHONPATH=unsloth/tests python -u - <<'PY' - import _zoo_aggressive_cuda_spoof as _s; _s.apply() - # Stub torchcodec for cells that import it — no CPU wheel exists. - import sys, types - if "torchcodec" not in sys.modules: - sys.modules["torchcodec"] = types.ModuleType("torchcodec") - exec(open("_smoke.py").read(), {"__name__": "__main__"}) - PY - - - name: Verify imports under spoof - run: | - PYTHONPATH=unsloth/tests python -u - <<'PY' - import sys, types - if "torchcodec" not in sys.modules: - sys.modules["torchcodec"] = types.ModuleType("torchcodec") - import _zoo_aggressive_cuda_spoof as _s; _s.apply() - import unsloth, peft, torch, torchao, transformers, tokenizers - print("OK: imports pass under CUDA spoof") - PY diff --git a/.github/workflows/release-desktop.yml b/.github/workflows/release-desktop.yml deleted file mode 100644 index 810bb644ba..0000000000 --- a/.github/workflows/release-desktop.yml +++ /dev/null @@ -1,902 +0,0 @@ -name: Release Desktop App - -on: - workflow_dispatch: - inputs: - studio_version: - description: 'Studio version tag to release (for example, v0.1.39-beta)' - type: string - required: true - pypi_version: - description: 'Exact PyPI unsloth version just published/stamped (for example, 2026.5.3); leave blank to use MIN_DESKTOP_BACKEND_VERSION' - type: string - required: false - draft: - description: 'Create as draft release; draft runs do not advance desktop-latest updater channel' - type: boolean - default: true - -permissions: - contents: read - -concurrency: - group: release-desktop-${{ github.repository }} - cancel-in-progress: false - -jobs: - prepare-version: - name: Prepare release versions - runs-on: ubuntu-latest - outputs: - studio_version: ${{ steps.prepare.outputs.studio_version }} - app_version: ${{ steps.prepare.outputs.app_version }} - desktop_release_tag: ${{ steps.prepare.outputs.desktop_release_tag }} - 
prerelease: ${{ steps.prepare.outputs.prerelease }} - pypi_version: ${{ steps.prepare.outputs.pypi_version }} - - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - with: - persist-credentials: false - - - name: Validate release versions - id: prepare - shell: bash - env: - INPUT_STUDIO_VERSION: ${{ inputs.studio_version }} - INPUT_PYPI_VERSION: ${{ inputs.pypi_version }} - run: | - python3 <<'PY' - import os - import pathlib - import re - import sys - - studio_version = os.environ['INPUT_STUDIO_VERSION'].strip() - if not studio_version: - sys.exit('studio_version is required, for example v0.1.39-beta') - if re.fullmatch(r'v?20\d{2}\.\d+\.\d+(?:[-+][0-9A-Za-z.-]+)?', studio_version): - sys.exit(f'studio_version must be a Studio SemVer tag, not a date-style backend version: {studio_version}') - - semver_tag = re.compile( - r'^v(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)' - r'(?:-[0-9A-Za-z.][0-9A-Za-z.-]*)?$' - ) - if not semver_tag.fullmatch(studio_version): - sys.exit(f'studio_version must be a SemVer tag with leading v, for example v0.1.39-beta: {studio_version}') - - app_version = studio_version.removeprefix('v') - desktop_release_tag = f'desktop-v{app_version}' - prerelease = 'true' if '-' in app_version.split('+', 1)[0] else 'false' - - def parse_backend_version(version): - match = re.fullmatch( - r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)' - r'(?:([a-zA-Z]|\.dev|dev|\.rc|rc|\.post|post)(\d*))?' 
- r'(?:[-+]([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?', - version, - ) - if not match: - return None - major, minor, patch, suffix_name, suffix_number, suffix_text = match.groups() - if suffix_name: - normalized = suffix_name.lower().lstrip('.') - order = {'dev': 0, 'a': 1, 'b': 2, 'rc': 3, 'post': 5}.get(normalized) - if order is None: - return None - number = int(suffix_number or '0') - elif suffix_text: - order = 3 if version[version.find(suffix_text) - 1] == '-' else 4 - number = 0 - else: - order = 4 - number = 0 - return (int(major), int(minor), int(patch), order, number) - - preflight = pathlib.Path('studio/src-tauri/src/preflight/version.rs').read_text() - match = re.search(r'MIN_DESKTOP_BACKEND_VERSION:\s*&str\s*=\s*"([^"]+)"', preflight) - if not match: - sys.exit('Could not read MIN_DESKTOP_BACKEND_VERSION') - min_backend_version = match.group(1) - - input_pypi_version = os.environ.get('INPUT_PYPI_VERSION', '').strip() - parsed_min_backend = parse_backend_version(min_backend_version) - if parsed_min_backend is None: - sys.exit(f'MIN_DESKTOP_BACKEND_VERSION is not a supported backend package version: {min_backend_version}') - - pypi_version = input_pypi_version or min_backend_version - parsed_pypi = parse_backend_version(pypi_version) - if parsed_pypi is None: - sys.exit(f'pypi_version is not a supported backend package version: {pypi_version}') - if parsed_pypi < parsed_min_backend: - sys.exit( - f'pypi_version {pypi_version} is lower than desktop minimum ' - f'MIN_DESKTOP_BACKEND_VERSION {min_backend_version}' - ) - - if input_pypi_version: - print( - 'Using exact PyPI unsloth version from pypi_version input: ' - f'{pypi_version} (desktop minimum: {min_backend_version})' - ) - else: - print( - 'Using exact PyPI unsloth version from MIN_DESKTOP_BACKEND_VERSION: ' - f'{pypi_version}' - ) - - with open(os.environ['GITHUB_OUTPUT'], 'a', encoding='utf-8') as output: - print(f'studio_version={studio_version}', file=output) - print(f'app_version={app_version}', 
file=output) - print(f'desktop_release_tag={desktop_release_tag}', file=output) - print(f'prerelease={prerelease}', file=output) - print(f'pypi_version={pypi_version}', file=output) - PY - - - name: Verify PyPI package and Studio stamp - shell: bash - env: - STUDIO_VERSION: ${{ steps.prepare.outputs.studio_version }} - PYPI_VERSION: ${{ steps.prepare.outputs.pypi_version }} - run: | - set -euo pipefail - python3 <<'PY' - import json - import os - import pathlib - import sys - import time - import urllib.error - import urllib.request - - pypi_version = os.environ['PYPI_VERSION'] - dist_dir = pathlib.Path(os.environ['RUNNER_TEMP'], 'pypi-unsloth-dist') - dist_dir.mkdir(parents=True, exist_ok=True) - metadata_url = f'https://pypi.org/pypi/unsloth/{pypi_version}/json' - - last_error = None - for attempt in range(1, 6): - try: - with urllib.request.urlopen(metadata_url, timeout=30) as response: - metadata = json.load(response) - break - except Exception as exc: - last_error = exc - if attempt < 5: - time.sleep(10 * attempt) - else: - sys.exit(f'Publish unsloth=={pypi_version} to PyPI before the desktop release ({last_error})') - - files = metadata.get('urls') or [] - if not files: - sys.exit(f'PyPI returned no distribution files for unsloth=={pypi_version}') - - for file_info in files: - filename = file_info.get('filename') - url = file_info.get('url') - if not filename or '/' in filename or not url: - sys.exit(f'Unexpected PyPI file entry for unsloth=={pypi_version}: {file_info!r}') - target = dist_dir / filename - for attempt in range(1, 4): - try: - with urllib.request.urlopen(url, timeout=60) as response: - target.write_bytes(response.read()) - break - except Exception as exc: - last_error = exc - if attempt < 3: - time.sleep(5 * attempt) - else: - sys.exit(f'Could not download {filename} from PyPI ({last_error})') - PY - - if [ -f scripts/stamp_studio_release.py ]; then - mapfile -t dists < <(find "$RUNNER_TEMP/pypi-unsloth-dist" -type f \( -name '*.whl' -o -name 
'*.tar.gz' \) | sort) - if [ "${#dists[@]}" -eq 0 ]; then - echo "No PyPI wheel/sdist artifacts downloaded for unsloth==$PYPI_VERSION" >&2 - exit 1 - fi - python3 scripts/stamp_studio_release.py --verify-dist "$RUNNER_TEMP/pypi-unsloth-dist" --expected "$STUDIO_VERSION" - else - echo "scripts/stamp_studio_release.py not found; release-desktop requires #5308 to verify the PyPI Studio stamp." >&2 - exit 1 - fi - - - name: Guard public updater channel version - if: ${{ !inputs.draft }} - shell: bash - env: - GH_REPO: ${{ github.repository }} - GH_TOKEN: ${{ github.token }} - APP_VERSION: ${{ steps.prepare.outputs.app_version }} - run: | - set -euo pipefail - mkdir -p "$RUNNER_TEMP/desktop-current" - if ! gh release download desktop-latest --pattern latest.json --dir "$RUNNER_TEMP/desktop-current" --clobber 2>/dev/null; then - echo "No existing desktop-latest latest.json found; allowing first channel publish." - exit 0 - fi - python3 <<'PY' - import json - import os - import pathlib - import re - import sys - - def parse(value: str): - value = value.removeprefix('v') - match = re.fullmatch( - r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)' - r'(?:-([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?' 
- r'(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?', - value, - ) - if not match: - sys.exit(f'desktop-latest latest.json has invalid version: {value}') - major, minor, patch, prerelease = match.groups() - return (int(major), int(minor), int(patch), prerelease) - - def numeric_tail(identifier: str) -> tuple[str, int] | None: - match = re.fullmatch(r'([A-Za-z-]+)(\d+)', identifier) - if not match: - return None - return (match.group(1).lower(), int(match.group(2))) - - def compare_identifier(left: str, right: str) -> int: - left_num = left.isdigit() - right_num = right.isdigit() - if left_num and right_num: - return (int(left) > int(right)) - (int(left) < int(right)) - if left_num: - return -1 - if right_num: - return 1 - - left_tail = numeric_tail(left) - right_tail = numeric_tail(right) - if left_tail and right_tail and left_tail[0] == right_tail[0]: - return (left_tail[1] > right_tail[1]) - (left_tail[1] < right_tail[1]) - - return (left > right) - (left < right) - - def compare_prerelease(left: str | None, right: str | None) -> int: - if left == right: - return 0 - if left is None: - return 1 - if right is None: - return -1 - left_parts = left.split('.') - right_parts = right.split('.') - for left_part, right_part in zip(left_parts, right_parts): - order = compare_identifier(left_part, right_part) - if order: - return order - return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts)) - - def compare(left: str, right: str) -> int: - left_major, left_minor, left_patch, left_pre = parse(left) - right_major, right_minor, right_patch, right_pre = parse(right) - left_core = (left_major, left_minor, left_patch) - right_core = (right_major, right_minor, right_patch) - if left_core != right_core: - return (left_core > right_core) - (left_core < right_core) - return compare_prerelease(left_pre, right_pre) - - current_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-current', 'latest.json') - current = 
json.loads(current_path.read_text()).get('version') - next_version = os.environ['APP_VERSION'] - if not isinstance(current, str): - sys.exit('desktop-latest latest.json has missing version') - if compare(next_version, current) < 0: - sys.exit( - f'Refusing to publish {next_version}; desktop-latest currently points at newer version {current}.' - ) - PY - - build: - # TODO: split into a "build (no secrets)" + "publish (secrets)" job pair - # with actions/upload-artifact handoff so the matrix build cannot - # publish a Release on its own. The current matrix runs across - # Linux/macOS/Windows in a single job, so the split needs artefact - # collection across the OS matrix and is out of scope for this - # hardening pass. - permissions: - contents: write # tauri-apps/tauri-action creates / uploads a GitHub Release - strategy: - fail-fast: false - max-parallel: 1 - matrix: - include: - - platform: macos-latest - args: '--target aarch64-apple-darwin' - label: macOS (Apple Silicon) - # - platform: macos-latest - # args: '--target x86_64-apple-darwin' - # label: macOS (Intel) - - platform: ubuntu-22.04 - args: '' - label: Linux (x64) - - platform: windows-latest - args: '' - label: Windows (x64) - - name: Build ${{ matrix.label }} - needs: prepare-version - runs-on: ${{ matrix.platform }} - - env: - FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true - APP_VERSION: ${{ needs.prepare-version.outputs.app_version }} - STUDIO_VERSION: ${{ needs.prepare-version.outputs.studio_version }} - DESKTOP_RELEASE_TAG: ${{ needs.prepare-version.outputs.desktop_release_tag }} - DESKTOP_PRERELEASE: ${{ needs.prepare-version.outputs.prerelease }} - - steps: - # harden-runner in audit mode: surfaces every egress destination in - # the runner log so the allowlist for a future `egress-policy: block` - # promotion can be derived from observed traffic. 
Audit mode is - # cross-platform (Linux / macOS / Windows runners); blocking mode is - # currently Linux-only, so we deliberately stay in audit until the - # macOS + Windows codesign paths have been observed. - - name: Harden runner (audit) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: audit - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - with: - persist-credentials: false - - # ── Linux dependencies ── - - name: Install Linux dependencies - if: matrix.platform == 'ubuntu-22.04' - run: | - sudo apt-get update - sudo apt-get install -y libwebkit2gtk-4.1-dev libayatana-appindicator3-dev librsvg2-dev libxdo-dev libssl-dev patchelf - - # ── Node.js ── - - name: Setup Node.js - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e - with: - node-version: 24 - - - name: Install pinned Tauri CLI - # Lifecycle scripts (esbuild native-binary postinstall, etc.) are - # required for `vite build`. The pre-install lockfile structural - # audit (lockfile_supply_chain_audit.py) is the practical defence - # against the npm postinstall-dropper class -- it fires BEFORE any - # tarball runs, on the injection pattern itself rather than an - # advisory-DB lookup. 
- run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 --no-fund --no-audit - - - name: Verify pinned Tauri CLI - shell: bash - run: | - out="$(npx --prefix studio tauri --version)" - echo "$out" - if [ "$out" != "tauri-cli 2.10.1" ]; then - echo "Expected tauri-cli 2.10.1, got $out" >&2 - exit 1 - fi - - - name: Verify desktop updater and Linux package config - shell: bash - run: | - node <<'JS' - const { readFileSync } = require('node:fs'); - - const expected = 'https://github.com/unslothai/unsloth/releases/download/desktop-latest/latest.json'; - const config = JSON.parse(readFileSync('studio/src-tauri/tauri.conf.json', 'utf8')); - const endpoints = config.plugins?.updater?.endpoints; - if (!Array.isArray(endpoints) || endpoints.length !== 1) { - throw new Error('Expected exactly one desktop updater endpoint'); - } - if (endpoints[0] !== expected) { - throw new Error('Desktop updater endpoint must be ' + expected + ', got ' + endpoints[0]); - } - if (endpoints.some((endpoint) => endpoint.includes('/releases/latest/'))) { - throw new Error('Desktop updater endpoint must not use repo-wide /releases/latest/'); - } - - const targets = config.bundle?.targets; - if (Array.isArray(targets) && targets.some((target) => String(target).toLowerCase() === 'rpm')) { - throw new Error('Desktop release must not target RPM packages'); - } - if (config.bundle?.linux?.rpm) { - throw new Error('bundle.linux.rpm must not be configured'); - } - - const workflow = readFileSync('.github/workflows/release-desktop.yml', 'utf8'); - const lines = workflow.split(/\r?\n/); - const releaseBodies = []; - for (let i = 0; i < lines.length; i += 1) { - const match = lines[i].match(/^(\s*)releaseBody:\s*\|\s*$/); - if (!match) continue; - const baseIndent = match[1].length; - const bodyLines = []; - i += 1; - for (; i < lines.length; i += 1) { - const line = lines[i]; - if (line.trim() === '') { - bodyLines.push(''); - continue; - } - const indent = line.match(/^\s*/)[0].length; - if 
(indent <= baseIndent) { - i -= 1; - break; - } - bodyLines.push(line.slice(baseIndent + 2)); - } - releaseBodies.push(bodyLines.join('\n')); - } - if (releaseBodies.length === 0) { - throw new Error('Expected at least one desktop release body'); - } - for (const body of releaseBodies) { - if (/\brpm\b|\.rpm/i.test(body)) { - throw new Error('Desktop release body must not advertise RPM packages'); - } - } - JS - - - name: Install frontend dependencies - working-directory: studio/frontend - # Lifecycle scripts (esbuild native-binary postinstall, etc.) are - # required for `vite build`. The pre-install lockfile structural - # audit (lockfile_supply_chain_audit.py) is the practical defence - # against the npm postinstall-dropper class -- it fires BEFORE any - # tarball runs, on the injection pattern itself rather than an - # advisory-DB lookup. - run: npm install --no-fund --no-audit - - # ── Rust ── - - name: Install Rust stable - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable @ 2026-03-27 - with: - targets: ${{ matrix.platform == 'macos-latest' && 'aarch64-apple-darwin,x86_64-apple-darwin' || '' }} - - - name: Patch desktop app version - shell: bash - working-directory: studio/src-tauri - run: | - set -euo pipefail - if command -v python3 >/dev/null 2>&1; then - PYTHON=python3 - else - PYTHON=python - fi - "$PYTHON" <<'PY' - import os - import pathlib - import re - import sys - - app_version = os.environ['APP_VERSION'] - if not app_version: - sys.exit('APP_VERSION is required') - - cargo_toml = pathlib.Path('Cargo.toml') - lines = cargo_toml.read_text().splitlines(keepends=True) - in_package = False - patched = False - for index, line in enumerate(lines): - stripped = line.strip() - if stripped == '[package]': - in_package = True - continue - if stripped.startswith('[') and stripped.endswith(']'): - in_package = False - if in_package and re.fullmatch(r'version\s*=\s*"[^"]+"\s*', stripped): - lines[index] = f'version = 
"{app_version}"\n' - patched = True - break - if not patched: - sys.exit('Could not patch [package] version in Cargo.toml') - cargo_toml.write_text(''.join(lines)) - - cargo_lock = pathlib.Path('Cargo.lock') - lock_text = cargo_lock.read_text() - lock_text, count = re.subn( - r'(?m)(^\[\[package\]\]\nname = "unsloth-studio"\nversion = ")[^"]+(")', - lambda match: f'{match.group(1)}{app_version}{match.group(2)}', - lock_text, - ) - if count != 1: - sys.exit(f'Could not patch unsloth-studio version in Cargo.lock (matches={count})') - cargo_lock.write_text(lock_text) - PY - - cargo metadata --locked --no-deps --format-version 1 > "$RUNNER_TEMP/cargo-metadata.json" - "$PYTHON" <<'PY' - import json - import os - import pathlib - import sys - - app_version = os.environ['APP_VERSION'] - metadata = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'cargo-metadata.json').read_text()) - versions = [package['version'] for package in metadata.get('packages', []) if package.get('name') == 'unsloth-studio'] - if versions != [app_version]: - sys.exit(f'cargo metadata unsloth-studio version mismatch: expected {app_version}, got {versions}') - PY - - git diff -- Cargo.toml Cargo.lock - - - name: Rust cache - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 - with: - workspaces: 'studio/src-tauri -> target' - - # ── macOS: import signing certificate ── - - name: Import Apple certificate - if: matrix.platform == 'macos-latest' - env: - APPLE_CERTIFICATE: ${{ secrets.APPLE_CERTIFICATE }} - APPLE_CERTIFICATE_PASSWORD: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }} - KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }} - run: | - echo $APPLE_CERTIFICATE | base64 --decode > certificate.p12 - security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain - security default-keychain -s build.keychain - security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain - security set-keychain-settings -t 3600 -u build.keychain - security import certificate.p12 -k build.keychain 
-P "$APPLE_CERTIFICATE_PASSWORD" -T /usr/bin/codesign - security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain - security find-identity -v -p codesigning build.keychain - rm -f certificate.p12 - - # ── Windows: install Azure Trusted Signing CLI ── - - name: Install trusted-signing-cli - if: matrix.platform == 'windows-latest' - run: | - cargo install trusted-signing-cli --version 0.9.0 --locked - echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - # ── Windows: verify signing CLI is accessible ── - - name: Verify trusted-signing-cli - if: matrix.platform == 'windows-latest' - run: | - Write-Output "PATH: $env:PATH" - Get-Command trusted-signing-cli -ErrorAction SilentlyContinue || Write-Output "trusted-signing-cli NOT in PATH" - trusted-signing-cli --version || Write-Output "trusted-signing-cli failed to run" - - # ── Linux: build + sign + upload ── - - name: Build Linux app - if: matrix.platform == 'ubuntu-22.04' - uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} - TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }} - with: - projectPath: studio - tauriScript: npx --prefix . tauri - tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }} - releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}' - releaseBody: | - Desktop app for Unsloth Studio. - - **macOS**: Download the Apple Silicon `.dmg`. - **Windows**: Download the `-setup.exe` installer. - **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal). - - > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package. 
- > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64` - > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually. - releaseDraft: ${{ inputs.draft }} - prerelease: ${{ needs.prepare-version.outputs.prerelease }} - args: -v ${{ matrix.args }} - - # ── macOS: build + sign + notarize + upload ── - - name: Build macOS app - if: matrix.platform == 'macos-latest' - uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} - TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }} - APPLE_SIGNING_IDENTITY: ${{ secrets.APPLE_SIGNING_IDENTITY }} - APPLE_ID: ${{ secrets.APPLE_ID }} - APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }} - APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} - with: - projectPath: studio - tauriScript: npx --prefix . tauri - tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }} - releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}' - releaseBody: | - Desktop app for Unsloth Studio. - - **macOS**: Download the Apple Silicon `.dmg`. - **Windows**: Download the `-setup.exe` installer. - **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal). - - > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package. - > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64` - > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually. 
- releaseDraft: ${{ inputs.draft }} - prerelease: ${{ needs.prepare-version.outputs.prerelease }} - args: -v ${{ matrix.args }} - - # ── Windows: build + sign + upload ── - - name: Build Windows app - if: matrix.platform == 'windows-latest' - uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} - TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }} - AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} - AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} - AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} - AZURE_TRUSTED_SIGNING_ACCOUNT_NAME: ${{ secrets.AZURE_TRUSTED_SIGNING_ACCOUNT_NAME }} - AZURE_CERTIFICATE_PROFILE_NAME: ${{ secrets.AZURE_CERTIFICATE_PROFILE_NAME }} - with: - projectPath: studio - tauriScript: npx --prefix . tauri - tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }} - releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}' - releaseBody: | - Desktop app for Unsloth Studio. - - **macOS**: Download the Apple Silicon `.dmg`. - **Windows**: Download the `-setup.exe` installer. - **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal). - - > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package. - > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64` - > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually. - releaseDraft: ${{ inputs.draft }} - prerelease: ${{ needs.prepare-version.outputs.prerelease }} - args: -v ${{ matrix.args }} - - # Release process note: only non-draft workflow runs advance the public - # desktop-latest updater channel. 
Draft builds are for private review; if a - # draft is manually published later, this channel intentionally remains - # unchanged until a narrow manual channel-publish flow is added or a public - # desktop release is created by running this workflow with draft=false. - publish-updater-channel: - name: Publish desktop updater channel - needs: [prepare-version, build] - if: ${{ !inputs.draft }} - runs-on: ubuntu-latest - permissions: - contents: write - env: - GH_REPO: ${{ github.repository }} - APP_VERSION: ${{ needs.prepare-version.outputs.app_version }} - STUDIO_VERSION: ${{ needs.prepare-version.outputs.studio_version }} - DESKTOP_RELEASE_TAG: ${{ needs.prepare-version.outputs.desktop_release_tag }} - DESKTOP_PRERELEASE: ${{ needs.prepare-version.outputs.prerelease }} - - steps: - - name: Download versioned updater metadata - shell: bash - env: - GH_TOKEN: ${{ github.token }} - run: | - set -euo pipefail - mkdir -p "$RUNNER_TEMP/desktop-updater" - gh api "repos/${GITHUB_REPOSITORY}/releases/tags/${DESKTOP_RELEASE_TAG}" > "$RUNNER_TEMP/source-release.json" - python3 <<'PY' - import json - import os - import pathlib - import sys - - source = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'source-release.json').read_text()) - expected_tag = os.environ['DESKTOP_RELEASE_TAG'] - if source.get('tag_name') != expected_tag: - sys.exit(f'Expected source release {expected_tag}, got {source.get("tag_name")}') - if source.get('draft'): - sys.exit(f'Source desktop release {expected_tag} is draft; refusing to publish public updater channel') - PY - gh release download "$DESKTOP_RELEASE_TAG" --pattern latest.json --dir "$RUNNER_TEMP/desktop-updater" --clobber - test -s "$RUNNER_TEMP/desktop-updater/latest.json" - - - name: Validate versioned updater metadata - shell: bash - run: | - python3 <<'PY' - import json - import os - import pathlib - import re - import sys - - app_version = os.environ['APP_VERSION'] - release_tag = os.environ['DESKTOP_RELEASE_TAG'] - latest_path = 
pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-updater', 'latest.json') - data = json.loads(latest_path.read_text()) - if not isinstance(data, dict): - sys.exit('latest.json must be a JSON object') - - version = data.get('version') - if not isinstance(version, str) or not version: - sys.exit('latest.json missing version') - if not re.fullmatch(r'v?\d+\.\d+\.\d+(?:[-+][0-9A-Za-z.-]+)?', version): - sys.exit(f'latest.json version is not SemVer-like: {version}') - if version.removeprefix('v') != app_version: - sys.exit(f'latest.json version {version} does not match desktop app version {app_version}') - - platforms = data.get('platforms') - if not isinstance(platforms, dict) or not platforms: - sys.exit('latest.json missing platforms') - - required_families = { - 'darwin-aarch64': False, - 'linux-x86_64': False, - 'windows-x86_64': False, - } - expected_prefix = f'https://github.com/unslothai/unsloth/releases/download/{release_tag}/' - forbidden_fragments = ('/releases/latest/', '/releases/download/desktop-latest/') - - for platform, entry in platforms.items(): - if not isinstance(entry, dict): - sys.exit(f'Platform {platform} must be an object') - url = entry.get('url') - signature = entry.get('signature') - if not isinstance(url, str) or not url.strip(): - sys.exit(f'Platform {platform} missing url') - if not isinstance(signature, str) or not signature.strip(): - sys.exit(f'Platform {platform} missing signature') - if any(fragment in url for fragment in forbidden_fragments): - sys.exit(f'Platform {platform} points at a moving updater channel: {url}') - if not url.startswith(expected_prefix): - sys.exit(f'Platform {platform} URL must point at {release_tag}: {url}') - for family in required_families: - if platform == family or platform.startswith(family + '-'): - required_families[family] = True - - missing = [family for family, found in required_families.items() if not found] - if missing: - sys.exit('latest.json missing required platform families: ' + ', 
'.join(missing)) - PY - - - name: Ensure desktop updater channel release - shell: bash - env: - GH_TOKEN: ${{ github.token }} - run: | - set -euo pipefail - channel_json="$RUNNER_TEMP/desktop-latest-release.json" - if ! gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$channel_json" 2>/dev/null; then - gh release create desktop-latest \ - --title "Unsloth Studio Desktop updater channel" \ - --notes "Machine-managed desktop updater channel; latest.json is replaced by release-desktop.yml." \ - --prerelease \ - --latest=false \ - --target "$GITHUB_SHA" - gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$channel_json" - fi - - python3 <<'PY' - import json - import os - import pathlib - import sys - - channel = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-latest-release.json').read_text()) - if channel.get('draft'): - sys.exit('desktop-latest release is draft; refusing to publish updater channel') - if channel.get('immutable'): - sys.exit('desktop-latest release is immutable; cannot replace latest.json') - if not channel.get('prerelease'): - sys.exit('desktop-latest release must be a prerelease so it cannot compete with repo-wide latest') - PY - - - name: Prevent updater channel downgrade - shell: bash - env: - GH_TOKEN: ${{ github.token }} - run: | - set -euo pipefail - mkdir -p "$RUNNER_TEMP/desktop-current" - if ! gh release download desktop-latest --pattern latest.json --dir "$RUNNER_TEMP/desktop-current" --clobber 2>/dev/null; then - echo "No existing desktop-latest latest.json found; allowing first channel publish." - exit 0 - fi - python3 <<'PY' - import json - import os - import pathlib - import re - import sys - - def parse(value: str): - value = value.removeprefix('v') - match = re.fullmatch( - r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)' - r'(?:-([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?' 
- r'(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?', - value, - ) - if not match: - sys.exit(f'desktop-latest latest.json has invalid version: {value}') - major, minor, patch, prerelease = match.groups() - return (int(major), int(minor), int(patch), prerelease) - - def numeric_tail(identifier: str) -> tuple[str, int] | None: - match = re.fullmatch(r'([A-Za-z-]+)(\d+)', identifier) - if not match: - return None - return (match.group(1).lower(), int(match.group(2))) - - def compare_identifier(left: str, right: str) -> int: - left_num = left.isdigit() - right_num = right.isdigit() - if left_num and right_num: - return (int(left) > int(right)) - (int(left) < int(right)) - if left_num: - return -1 - if right_num: - return 1 - - left_tail = numeric_tail(left) - right_tail = numeric_tail(right) - if left_tail and right_tail and left_tail[0] == right_tail[0]: - return (left_tail[1] > right_tail[1]) - (left_tail[1] < right_tail[1]) - - return (left > right) - (left < right) - - def compare_prerelease(left: str | None, right: str | None) -> int: - if left == right: - return 0 - if left is None: - return 1 - if right is None: - return -1 - left_parts = left.split('.') - right_parts = right.split('.') - for left_part, right_part in zip(left_parts, right_parts): - order = compare_identifier(left_part, right_part) - if order: - return order - return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts)) - - def compare(left: str, right: str) -> int: - left_major, left_minor, left_patch, left_pre = parse(left) - right_major, right_minor, right_patch, right_pre = parse(right) - left_core = (left_major, left_minor, left_patch) - right_core = (right_major, right_minor, right_patch) - if left_core != right_core: - return (left_core > right_core) - (left_core < right_core) - return compare_prerelease(left_pre, right_pre) - - current_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-current', 'latest.json') - next_path = pathlib.Path(os.environ['RUNNER_TEMP'], 
'desktop-updater', 'latest.json') - current = json.loads(current_path.read_text()).get('version') - next_version = json.loads(next_path.read_text()).get('version') - if not isinstance(current, str) or not isinstance(next_version, str): - sys.exit('Could not compare desktop-latest channel versions') - if compare(next_version, current) < 0: - sys.exit( - f'Refusing to move desktop-latest from {current} to older version {next_version}.' - ) - PY - - - name: Publish desktop updater channel metadata - shell: bash - env: - GH_TOKEN: ${{ github.token }} - run: | - set -euo pipefail - gh release upload desktop-latest "$RUNNER_TEMP/desktop-updater/latest.json" --clobber - gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$RUNNER_TEMP/desktop-latest-release.json" - python3 <<'PY' - import json - import os - import pathlib - import sys - - channel = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-latest-release.json').read_text()) - assets = [asset for asset in channel.get('assets', []) if asset.get('name') == 'latest.json'] - if len(assets) != 1: - sys.exit(f'Expected exactly one desktop-latest latest.json asset, found {len(assets)}') - expected_url = f'https://github.com/{os.environ["GITHUB_REPOSITORY"]}/releases/download/desktop-latest/latest.json' - actual_url = assets[0].get('browser_download_url') - if actual_url != expected_url: - sys.exit(f'desktop-latest latest.json URL mismatch: expected {expected_url}, got {actual_url}') - PY diff --git a/.github/workflows/security-audit.yml b/.github/workflows/security-audit.yml deleted file mode 100644 index a1e7b2efa6..0000000000 --- a/.github/workflows/security-audit.yml +++ /dev/null @@ -1,1126 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Multi-language supply-chain audit. 
Triggers: -# - PRs touching any dependency manifest (Python / npm / Cargo) or -# this workflow file, -# - push to main / pip, -# - nightly @ 04:13 UTC so newly-published advisories surface even -# when no PR opens, -# - workflow_dispatch for ad-hoc invocations. -# -# Two jobs: -# - advisory-audit: one runner that runs pip-audit + npm audit + -# cargo audit back-to-back. All three are -# advisory-DB lookups -- fast, lockfile-driven, -# no archive download. Setting up the python / -# node / rust toolchains on one runner and -# running the three commands serially is -# cheaper than spinning up three runners. -# - pip-scan-packages: 3-shard matrix that downloads + pattern-scans -# every PyPI archive in the transitive closure. -# This is the expensive job (~6 min/shard, -# running in parallel) and it must stay -# independent so a CVE-DB hit in advisory-audit -# does not block the supply-chain pattern scan -# (or vice versa). -# -# All steps are non-blocking initially. The default branch already -# carries a known-vuln backlog (the dependabot banner shows 17 today, -# pip-audit catches 2 more, npm/cargo will catch their own); a hard -# gate now would block every PR on a baseline we have not triaged. -# As each baseline closes, drop continue-on-error per step. -# -# Dependency coverage: -# - unsloth core (pyproject.toml [project.dependencies]) -# - unsloth `huggingfacenotorch` extras (the canonical install path -# for fine-tuning users; pulls transformers / peft / accelerate / -# trl / datasets / diffusers / sentence-transformers / etc.) -# - all six Studio backend requirements files -# - Studio frontend (npm) and Tauri shell (cargo) -# Each Python step builds a filtered dep list from pyproject.toml + -# requirements/*.txt before auditing. 
We do NOT install any of these -# -- pip-audit resolves through PyPI metadata, scan_packages.py -# downloads sdist/wheel archives and inspects them without running -# install hooks, so an attacker who has compromised a transitive dep -# cannot execute code in this workflow. - -name: Security audit - -on: - pull_request: - paths: - - 'studio/backend/requirements/**' - - 'studio/frontend/package.json' - - 'studio/frontend/package-lock.json' - - 'studio/src-tauri/Cargo.toml' - - 'studio/src-tauri/Cargo.lock' - - 'pyproject.toml' - - 'scripts/scan_packages.py' - - 'scripts/scan_npm_packages.py' - - '.github/workflows/security-audit.yml' - push: - branches: [main, pip] - schedule: - - cron: '13 4 * * *' # 04:13 UTC daily, off the cron rush - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - # ───────────────────────────────────────────────────────────────────── - # Combined advisory-DB audit: pip-audit + npm audit + cargo audit - # all on one runner. Each step is continue-on-error so a finding in - # one toolchain does not suppress the others. - # ───────────────────────────────────────────────────────────────────── - advisory-audit: - name: advisory audit (pip + npm + cargo) - runs-on: ubuntu-latest - timeout-minutes: 25 - steps: - # step-security/harden-runner installs an eBPF-based egress - # firewall on the runner. In `audit` mode it logs every outbound - # connection without blocking; in `block` mode it rejects - # anything outside `allowed-endpoints`. We run audit-only - # initially: the next time this job hits a real PyPI advisory or - # an attacker-funded archive in pip-scan-packages, the audit log - # tells us exactly which hosts were dialed and we promote the - # allowlist to block. Would have *contained* the litellm exfil - # even if scan_packages had missed the .pth payload. 
- # SHA-pinned (not @v2): the litellm 1.82.7 attack chain hijacked - # mutable tags on aquasecurity/trivy-action and would have hit - # anyone using @v0 / @v2 / @latest references. Pinning to a 40- - # char SHA freezes this action at known-good code; Dependabot's - # github-actions ecosystem will auto-bump the SHA. - # v2.19.1 commit: - # Per-job allowlist: advisory-audit hits PyPI, npm registry, - # crates.io advisories, GitHub release artefacts (osv-scanner - # binary), Semgrep registry, and TruffleHog's own GitHub action. - - name: Harden runner (egress block) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: block - disable-sudo: true - allowed-endpoints: > - api.github.com:443 - github.com:443 - codeload.github.com:443 - objects.githubusercontent.com:443 - raw.githubusercontent.com:443 - release-assets.githubusercontent.com:443 - registry.npmjs.org:443 - pypi.org:443 - files.pythonhosted.org:443 - static.rust-lang.org:443 - index.crates.io:443 - static.crates.io:443 - crates.io:443 - semgrep.dev:443 - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - # Full history so TruffleHog can diff base..head; without - # this it sees only the latest commit and reports nothing. - fetch-depth: 0 - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable @ 2026-03-27 - - - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2.9.1 - with: - workspaces: studio/src-tauri -> target - - - name: Install pip-audit + cargo-audit - # cargo-audit pulls advisories from the RustSec advisory-db on - # first run and caches them under ~/.cargo/advisory-db. 
Pin - # --locked so the version we install matches Cargo.lock - # determinism. cargo-audit 0.22 supports the CVSS 4.0 schema - # used in 2026 advisories (e.g. RUSTSEC-2026-0073); 0.21 - # crashes with a TOML parse error on that file. - # npm audit is bundled with the node toolchain, no install. - run: | - python -m pip install --upgrade pip 'pip-audit>=2.7' - cargo install --locked --version '^0.22' cargo-audit - - # ───────────────────────────────────────────────────────────── - # Python: pip-audit - # ───────────────────────────────────────────────────────────── - - name: Build filtered Python requirements set - # Two transforms: - # (1) Generate audit-reqs/unsloth-deps.txt from pyproject.toml - # so pip-audit sees the unsloth pip package's own dep set - # (core + huggingfacenotorch extras: transformers / peft / - # accelerate / trl / datasets / diffusers / - # sentence-transformers / huggingface_hub / hf_transfer / - # etc.). - # (2) Copy each studio/backend/requirements/*.txt into - # audit-reqs/ with `git+` lines stripped. pip-audit's `-r` - # mode does a dry-run resolve against PyPI metadata; a - # `git+https://...` spec forces it to clone, which is - # both slow and outside the threat model (we audit - # PyPI-served archives; a git ref is whatever HEAD says - # on the runner). A comment line is left in place so the - # skipped specs are obvious in the artifact. - # The `huggingface` extra is `huggingfacenotorch` plus torch / - # torchvision / triton, deliberately skipped: Studio backend - # already pins a torch and the +cu* / +cpu local-version tags - # trip up the PyPI resolver in `-r` mode. 
- run: | - mkdir -p audit-reqs - python <<'PY' > audit-reqs/unsloth-deps.txt - import tomllib - with open("pyproject.toml", "rb") as f: - d = tomllib.load(f) - core = d["project"]["dependencies"] - extras = d["project"]["optional-dependencies"]["huggingfacenotorch"] - print("# Auto-generated from pyproject.toml by security-audit.yml.") - print("# core deps + huggingfacenotorch extras.") - for spec in core + extras: - print(spec) - PY - for f in studio.txt extras.txt extras-no-deps.txt \ - no-torch-runtime.txt overrides.txt triton-kernels.txt; do - python < "audit-reqs/$f" - src = "studio/backend/requirements/$f" - with open(src) as fh: - for line in fh: - stripped = line.strip() - before_comment = stripped.split("#", 1)[0] - if "git+" in before_comment: - print(f"# [security-audit] skipped git+ spec: {stripped}") - continue - print(line.rstrip("\n")) - PY - done - - - name: pip-audit (declared Python deps, no install) - # `-r requirements.txt` resolves the requirements through pip's - # dependency resolver against PyPI metadata and audits the - # resolved tree without ever executing setup.py / install - # hooks. Way faster than installing the full Studio runtime - # and -- critically -- safer: an attacker who has compromised - # a transitive dep cannot run code in this job. - # - # extras.txt + extras-no-deps.txt have legacy setup.py - # packages (notably openai-whisper) whose setup.py imports - # `pkg_resources`, which the isolated build env's current - # setuptools no longer ships. PIP_CONSTRAINT pins an older - # setuptools into the build env so those builds resolve. - # Per-file loop so one bad file doesn't take out the whole - # audit. 
- continue-on-error: true - env: - PIP_CONSTRAINT: ${{ github.workspace }}/audit-reqs/build-constraints.txt - run: | - set +e - cat > audit-reqs/build-constraints.txt <<'CONSTRAINTS' - setuptools<78 - wheel - CONSTRAINTS - : > logs-pip-audit.txt - for f in unsloth-deps studio extras extras-no-deps \ - no-torch-runtime overrides triton-kernels; do - if ! grep -qE '^[^#[:space:]]' "audit-reqs/$f.txt"; then - echo "[security-audit] $f.txt has no PyPI specs after git+ filter, skipping" \ - | tee -a logs-pip-audit.txt - continue - fi - echo "::group::pip-audit -r audit-reqs/$f.txt" - { - echo - echo "=== $f ===" - pip-audit -r "audit-reqs/$f.txt" --format=columns - echo "=== end $f (rc=$?) ===" - } 2>&1 | tee -a logs-pip-audit.txt - echo "::endgroup::" - done - { - echo "## pip-audit (Python)" - echo - echo '### Coverage' - echo '- unsloth core + `huggingfacenotorch` extras (pyproject.toml)' - echo '- studio/backend/requirements/{studio,extras,extras-no-deps,no-torch-runtime,overrides,triton-kernels}.txt' - echo '- `git+` specs are stripped before audit (out of scope: we audit PyPI archives)' - echo - echo '### Findings' - echo '```' - cat logs-pip-audit.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # Pre-install lockfile supply-chain audit (npm + cargo). - # Catches structural anomalies (non-registry resolved URLs, - # missing integrity hashes, known IOC strings) BEFORE `npm - # audit` or OSV-Scanner consult the advisory DB. The advisory - # path is reactive -- there is a window between a malicious - # publication and the GHSA landing. This step fires on the - # injection pattern itself so it catches the same class of - # attack the moment the lockfile shape becomes wrong. 
- # ───────────────────────────────────────────────────────────── - - name: Lockfile supply-chain audit (pre-install scan) - run: | - python3 scripts/lockfile_supply_chain_audit.py - { - echo "## Lockfile supply-chain audit" - echo - echo "Scanned: studio/frontend/package-lock.json + studio/src-tauri/Cargo.lock" - echo - echo "No structural anomalies or known IOC strings." - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # npm: Studio frontend - # ───────────────────────────────────────────────────────────── - - name: npm audit (Studio frontend) - # `npm audit` resolves the lockfile through the npmjs.com - # advisory DB. `--audit-level=high` filters the noise floor - # to only HIGH and CRITICAL. We do NOT pass --omit=dev: a - # malicious dev-only dep can still steal secrets from a CI - # runner, so dev deps need to be in the audit surface. - continue-on-error: true - working-directory: studio/frontend - run: | - set +e - npm audit --audit-level=high | tee ../../logs-npm-audit.txt - # Always also write the full JSON for grep-ability. - npm audit --json > ../../logs-npm-audit.json || true - { - echo "## npm audit (Studio frontend)" - echo - echo '```' - tail -200 ../../logs-npm-audit.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # cargo: Studio Tauri shell - # ───────────────────────────────────────────────────────────── - - name: cargo audit (Studio Tauri) - # `--deny warnings` would make the job fail on any advisory. - # Keep non-blocking initially; drop continue-on-error after - # the baseline closes. 
- continue-on-error: true - working-directory: studio/src-tauri - run: | - set +e - cargo audit | tee ../../logs-cargo-audit.txt - { - echo "## cargo audit (Studio Tauri)" - echo - echo '```' - tail -200 ../../logs-cargo-audit.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # OSV-Scanner: cross-ecosystem advisory DB (PyPI + npm + cargo) - # ───────────────────────────────────────────────────────────── - - name: OSV-Scanner (PyPI + npm + cargo, cross-ecosystem advisories) - # OSV's advisory feed is a superset of GitHub-Advisory + RustSec - # + npm advisories; running it alongside the per-ecosystem audit - # tools catches CVEs that haven't propagated to the per-ecosystem - # DBs yet (e.g. langchain-core CVE-2025-68664 was on OSV before - # GitHub Advisory). Single binary, one transitive resolver, all - # three lockfile types in one pass. Non-blocking until baselines - # close. - continue-on-error: true - run: | - set +e - # OSV-Scanner ships a raw binary (no tarball) in v2.x. 
- curl -fsSL -o /tmp/osv-scanner \ - https://github.com/google/osv-scanner/releases/download/v2.0.2/osv-scanner_linux_amd64 - chmod +x /tmp/osv-scanner - /tmp/osv-scanner --version - /tmp/osv-scanner scan source \ - --lockfile=studio/frontend/package-lock.json \ - --lockfile=studio/src-tauri/Cargo.lock \ - --lockfile=requirements.txt:audit-reqs/unsloth-deps.txt \ - --lockfile=requirements.txt:audit-reqs/studio.txt \ - --lockfile=requirements.txt:audit-reqs/no-torch-runtime.txt \ - --lockfile=requirements.txt:audit-reqs/overrides.txt \ - --lockfile=requirements.txt:audit-reqs/extras.txt \ - --lockfile=requirements.txt:audit-reqs/extras-no-deps.txt \ - --format=table 2>&1 | tee logs-osv-scanner.txt - { - echo "## OSV-Scanner (cross-ecosystem)" - echo - echo '```' - tail -200 logs-osv-scanner.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # Semgrep: design-flaw detection (catches what regex-pattern - # scanning of malicious authors cannot — first-party logic bugs - # like langchain-core CVE-2025-68664 dumps/dumpd injection, - # n8n CVE-2025-68668 _pyodide.eval_code sandbox escape, marimo - # CVE-2026-39987 unauth WebSocket). - # ───────────────────────────────────────────────────────────── - - name: Semgrep (supply-chain + python rule packs) - continue-on-error: true - run: | - set +e - python -m pip install --quiet 'semgrep>=1.95' - semgrep --version - semgrep scan \ - --config p/supply-chain \ - --config p/python \ - --config p/javascript \ - --config p/security-audit \ - --severity ERROR --severity WARNING \ - --metrics off \ - --timeout 120 \ - studio/backend unsloth scripts \ - 2>&1 | tee logs-semgrep.txt - { - echo "## Semgrep (supply-chain + python + javascript rules)" - echo - echo '```' - tail -200 logs-semgrep.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # Lockfile pin verifier. 
The litellm 1.82.7 attack window was - # ~40 minutes; anyone resolving with `>=` got the malicious - # version automatically. Flag every spec in the requirements - # files that does not pin to an exact `==` (or `@` for git - # refs, or `===` for arbitrary equality). Warning-only for now; - # graduate to blocking once the baseline is clean. - # ───────────────────────────────────────────────────────────── - - name: Lockfile pin verifier (Python requirements) - continue-on-error: true - run: | - python <<'PY' | tee logs-pin-verifier.txt - import re - from pathlib import Path - - # Specs that look like `pkg==1.2.3` or `pkg @ git+...` or - # bare comments / -r lines are pinned-or-not-applicable. - PINNED = re.compile(r"^\s*[A-Za-z0-9_.\-]+\s*(?:===|==)\s*[^,;]+\s*$") - GIT_OR_URL = re.compile(r"^\s*[A-Za-z0-9_.\-]+\s*@\s*(?:git\+|https?://)") - - unpinned = [] - for f in sorted(Path("studio/backend/requirements").glob("*.txt")): - for i, raw in enumerate(f.read_text().splitlines(), 1): - line = raw.strip() - if not line or line.startswith("#") or line.startswith("-"): - continue - spec = line.split("#", 1)[0].strip().split(";", 1)[0].strip() - if not spec: - continue - if "git+" in spec or PINNED.match(spec) or GIT_OR_URL.match(spec): - continue - unpinned.append((str(f), i, line)) - - print(f"::group::Lockfile pin status") - if unpinned: - print(f"WARN: {len(unpinned)} non-`==` specs across requirements/*.txt") - print("(litellm 1.82.7 wave hit anyone on `>=`; tighten when feasible.)") - for f, i, line in unpinned[:80]: - print(f" {f}:{i}: {line}") - if len(unpinned) > 80: - print(f" ... and {len(unpinned) - 80} more") - else: - print("OK: every spec is exact-pinned.") - print("::endgroup::") - PY - { - echo "## Lockfile pin verifier" - echo - echo '```' - cat logs-pin-verifier.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # Trivy is deliberately NOT installed here. 
Trivy was the entry - # point for the litellm 1.82.7 supply-chain compromise (March - # 2026): attackers force-rewrote 76 of 77 tags in - # aquasecurity/trivy-action to point at malicious commits; - # anyone running the action with a tag ref auto-pulled a - # credential-harvesting payload. By design a security scanner - # has broad read access to runner secrets, which is exactly - # what made it the ideal pivot. We pick up Trivy's CVE coverage - # from OSV-Scanner (NVD + GHSA + GitLab) and its secret - # detection from TruffleHog. IaC misconfig detection (Trivy's - # one unique value-add) is unfilled for now -- revisit with - # checkov / kics when we ship a Dockerfile or k8s manifests. - # See https://docs.litellm.ai/blog/security-update-march-2026 - # and the Microsoft / Trend Micro / Snyk incident write-ups. - # ───────────────────────────────────────────────────────────── - - # ───────────────────────────────────────────────────────────── - # TruffleHog secret-leak scan on the PR diff. Catches API keys - # / tokens / cred files committed accidentally. --only-verified - # filters out probabilistic findings, so we only flag tokens - # that the source provider confirmed are live. On push to main - # / pip we scan the full repo; on PR we scan base..head. - # SHA-pinned for the same reason as harden-runner above. - # v3.95.2 commit: - # ───────────────────────────────────────────────────────────── - - name: TruffleHog (secrets in diff) - continue-on-error: true - uses: trufflesecurity/trufflehog@37b77001d0174ebec2fcca2bd83ff83a6d45a3ab # v3.95.3 - with: - path: ./ - base: ${{ github.event.pull_request.base.sha || '' }} - head: ${{ github.event.pull_request.head.sha || github.sha }} - # The action passes --no-update internally; passing it here - # too triggers `flag 'no-update' cannot be repeated`. Stick - # with --only-verified so we only flag tokens the source - # provider confirmed are live (no probabilistic findings). 
- extra_args: --only-verified - - # ───────────────────────────────────────────────────────────── - # CycloneDX SBOM. Lets downstream consumers audit what's - # actually shipped in unsloth wheels and the Studio backend - # runtime. Generates one JSON file per requirements input plus - # a combined SBOM keyed off pyproject.toml; uploads as a build - # artifact (and a future step can attest it via SLSA). - # ───────────────────────────────────────────────────────────── - - name: Generate CycloneDX SBOM - continue-on-error: true - run: | - set +e - python -m pip install --quiet 'cyclonedx-bom>=4.6' - mkdir -p sbom - # Per-requirements-file SBOM (the audit-reqs/ files are the - # filtered, git+-stripped views built earlier in this job). - # cyclonedx-py 4.x uses `--sv` for spec version and `-o` for - # the output file; the older `--schema-version`/`--outfile` - # spellings are not accepted. - for f in audit-reqs/*.txt; do - base=$(basename "$f" .txt) - if grep -qE '^[^#[:space:]]' "$f"; then - cyclonedx-py requirements "$f" \ - --sv 1.6 \ - --of JSON \ - -o "sbom/sbom-$base.json" 2>&1 | tail -5 || true - fi - done - # Project-level SBOM from pyproject.toml. - cyclonedx-py environment \ - --sv 1.6 \ - --of JSON \ - -o sbom/sbom-environment.json 2>&1 | tail -5 || true - ls -la sbom/ - { - echo "## CycloneDX SBOM" - echo - echo "Generated SBOM files:" - ls sbom/ | sed 's/^/- sbom\//' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # GitHub Actions pinning verifier. tj-actions/changed-files - # was compromised in March 2025; anyone using `@v4` (a mutable - # ref) auto-shipped the malicious version. Catch every - # non-SHA-pinned `uses:` across the workflows tree. Warn-only - # initially so the existing baseline doesn't block PRs. 
- # ───────────────────────────────────────────────────────────── - - name: GitHub Actions pinning verifier - continue-on-error: true - run: | - python <<'PY' | tee logs-actions-pinning.txt - import re - from pathlib import Path - # SHA pin = 40 hex chars after @ - SHA_PIN = re.compile(r"@[0-9a-f]{40}\b") - # First-party / GitHub-published actions get a softer pass - # (still recommended to pin; not a security gate). - FIRST_PARTY = re.compile(r"^\s*-\s*uses:\s*(actions|github)/[^@]+@") - USES = re.compile(r"^\s*-\s*uses:\s*([^@\s]+)@(\S+)") - unpinned_third = [] - unpinned_first = [] - for f in sorted(Path(".github/workflows").glob("*.yml")): - for i, line in enumerate(f.read_text().splitlines(), 1): - m = USES.match(line) - if not m: - continue - name, ref = m.group(1), m.group(2) - if SHA_PIN.search(line): - continue - bucket = unpinned_first if FIRST_PARTY.match(line) else unpinned_third - bucket.append((str(f), i, name, ref)) - print("::group::Action pinning status") - print(f"third-party actions on mutable refs: {len(unpinned_third)}") - for f, i, n, r in unpinned_third: - print(f" HIGH {f}:{i}: {n}@{r}") - print() - print(f"first-party (actions/* | github/*) on mutable refs: {len(unpinned_first)}") - for f, i, n, r in unpinned_first[:30]: - print(f" WARN {f}:{i}: {n}@{r}") - if len(unpinned_first) > 30: - print(f" ... and {len(unpinned_first) - 30} more") - print() - print("Recommendation: pin third-party actions to a 40-char SHA.") - print("Dependabot's github-actions ecosystem will auto-bump them.") - print("::endgroup::") - PY - { - echo "## GitHub Actions pinning verifier" - echo - echo '```' - cat logs-actions-pinning.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - # ───────────────────────────────────────────────────────────── - # Hash-pin verifier. 
`==` pinning protects against version - # drift but not against a re-uploaded malicious wheel at the - # same version (PyPI lets a yanked release be re-published with - # different bytes for ~5 minutes via `--filename` collision). - # `pip install --require-hashes` rejects any download whose - # SHA-256 doesn't match. Inspector step that reports how many - # specs would gain from a hash pin -- conversion is a roadmap - # item (needs pip-tools / uv pip compile --generate-hashes). - # ───────────────────────────────────────────────────────────── - - name: Hash-pin verifier (Python requirements) - continue-on-error: true - run: | - python <<'PY' | tee logs-hash-verifier.txt - import re - from pathlib import Path - PINNED = re.compile(r"^\s*[A-Za-z0-9_.\-]+\s*==\s*[^,;]+\s*$") - HASH_LINE = re.compile(r"--hash=sha256:[0-9a-f]{64}") - total_pinned = 0 - with_hash = 0 - for f in sorted(Path("studio/backend/requirements").glob("*.txt")): - text = f.read_text() - for raw in text.splitlines(): - line = raw.strip() - if not line or line.startswith("#") or line.startswith("-"): - continue - spec = line.split("#", 1)[0].strip().split(";", 1)[0] - if PINNED.match(spec): - total_pinned += 1 - if HASH_LINE.search(raw): - with_hash += 1 - print(f"::group::Hash-pin status") - print(f" exact == pins: {total_pinned}") - print(f" with --hash=sha256: {with_hash}") - print(f" without --hash: {total_pinned - with_hash}") - print() - print("Roadmap: convert to hash-locked installs via") - print("`uv pip compile --generate-hashes` and `pip install --require-hashes`.") - print("Hash-locked installs would have refused a republished") - print("malicious litellm 1.82.7 wheel even at the same version.") - print("::endgroup::") - PY - { - echo "## Hash-pin verifier" - echo - echo '```' - cat logs-hash-verifier.txt - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - if: always() - with: - name: advisory-audit-logs - 
path: | - logs-pip-audit.txt - logs-npm-audit.txt - logs-npm-audit.json - logs-cargo-audit.txt - logs-osv-scanner.txt - logs-semgrep.txt - logs-pin-verifier.txt - logs-actions-pinning.txt - logs-hash-verifier.txt - audit-reqs/ - sbom/ - retention-days: 30 - - # ───────────────────────────────────────────────────────────────────── - # Python: pre-install package scan (no install, no execution) - # ───────────────────────────────────────────────────────────────────── - pip-scan-packages: - # Downloads each declared dep WITHOUT installing it and inspects - # the archive contents for known malicious patterns: weaponized - # .pth files, credential stealers, obfuscated payloads, - # install-time droppers, suspicious subprocess / network / - # base64-blob combinations. - # - # This is the kind of check that would have caught: - # - litellm 1.82.7 / 1.82.8 (March 2026, supply-chain compromise) - # - the typo-squat campaign against PyTorch Lightning - # before either landed in the install path. pip-audit only knows - # about CVE-published vulnerabilities, so it does NOT see novel - # malicious uploads. scan_packages.py runs deterministic regex - # pattern matching, no LLM calls. - # - # `--with-deps` makes the scan transitive: every package the - # declared set resolves to gets fetched and pattern-scanned, not - # just the top-level pins. Resolving the full transitive closure - # of the unsloth + Studio dep tree downloads several hundred - # archives, hence the longer timeout. - # - # Sharded across runners for wall-clock parallelism. Each shard - # runs scan_packages.py once with --with-deps so its own slice - # benefits from pip's deduped transitive resolve. Shard - # composition tries to balance load: - # - hf-stack: pyproject extras + no-torch-runtime - # (~150 archives, transformers/peft/accelerate/...) 
- # - studio: FastAPI/Studio backend + overrides + extras-no-deps - # (~150 archives, smaller scientific stack) - # - extras: the heavy openai-whisper / scikit-learn / librosa - # stack (~250 archives, dominant cost) - # triton-kernels.txt is git+-only, fully skipped. - name: ${{ matrix.shard.name }} - runs-on: ubuntu-latest - timeout-minutes: 25 - strategy: - fail-fast: false - matrix: - shard: - - name: 'pip scan-packages :: hf-stack' - id: hf-stack - files: 'unsloth-deps no-torch-runtime' - - name: 'pip scan-packages :: studio' - id: studio - files: 'studio overrides extras-no-deps' - - name: 'pip scan-packages :: extras' - id: extras - files: 'extras' - steps: - # Egress block on every shard. Each shard pulls hundreds of - # PyPI archives -- if a malicious wheel ever phones home from - # within the scanner sandbox (it shouldn't; we never execute - # the archive), harden-runner now rejects the connect outright. - # Per-job allowlist: pip-scan-packages only fetches PyPI archives - # via scan_packages.py + pip download. No npm or cargo traffic. - - name: Harden runner (egress block) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: block - disable-sudo: true - allowed-endpoints: > - api.github.com:443 - github.com:443 - codeload.github.com:443 - objects.githubusercontent.com:443 - pypi.org:443 - files.pythonhosted.org:443 - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Install scan_packages.py runtime deps - # scan_packages.py imports requests + packaging at runtime to - # talk to PyPI's JSON API and to parse version specifiers. We - # do not install the packages it scans -- those are downloaded - # raw and inspected without ever touching `pip install`. 
- run: python -m pip install --upgrade pip requests packaging - - - name: Build filtered requirements set - # Mirrors the advisory-audit job's input transform: pyproject.toml - # extraction + git+ stripping. scan_packages.py downloads - # PyPI archives without building, so it tolerates legacy - # setup.py packages (no resolver dry-run); but `--with-deps` - # delegates resolution to a single `pip download` call that - # cannot satisfy `git+` specs without git operations, so we - # strip them here too. - run: | - mkdir -p audit-reqs - python <<'PY' > audit-reqs/unsloth-deps.txt - import tomllib - with open("pyproject.toml", "rb") as f: - d = tomllib.load(f) - core = d["project"]["dependencies"] - extras = d["project"]["optional-dependencies"]["huggingfacenotorch"] - print("# Auto-generated from pyproject.toml by security-audit.yml.") - print("# core deps + huggingfacenotorch extras.") - for spec in core + extras: - print(spec) - PY - for f in studio.txt extras.txt extras-no-deps.txt \ - no-torch-runtime.txt overrides.txt triton-kernels.txt; do - python < "audit-reqs/$f" - src = "studio/backend/requirements/$f" - with open(src) as fh: - for line in fh: - stripped = line.strip() - before_comment = stripped.split("#", 1)[0] - if "git+" in before_comment: - print(f"# [security-audit] skipped git+ spec: {stripped}") - continue - print(line.rstrip("\n")) - PY - done - - - name: Sanity-check scan_packages.py - # The scanner lives at scripts/scan_packages.py in this repo - # so we don't depend on a network fetch at job time. - run: | - test -f scripts/scan_packages.py - head -3 scripts/scan_packages.py - grep -q "Standalone pre-install package scanner" scripts/scan_packages.py - - - name: Scan declared + transitive Python deps - # scan_packages.py exits 1 on CRITICAL/HIGH findings, 0 on - # clean. We swallow the exit because the baseline isn't - # triaged yet; surface the findings in the workflow summary. - # Drop continue-on-error after the first clean run on main. 
- # - # `--with-deps` walks PyPI metadata to enumerate every - # transitive dep the declared set would install, then scans - # them all. Without this flag, we'd only catch a malicious - # *direct* dep -- and supply-chain attacks usually land - # several hops down (litellm 1.82.7 was a dep of a dep for - # most users). - # - # This step runs once per matrix shard. Within a shard, every - # -r file is fed to a single `pip download` call so pip - # intersects version constraints and yields a deduped - # transitive set (no point fetching the same transformers - # wheel five times). Across shards we accept some redundant - # downloads in exchange for wall-clock parallelism. - env: - SHARD_FILES: ${{ matrix.shard.files }} - run: | - set +e - mkdir -p logs - LOG="logs-scan-packages-${{ matrix.shard.id }}.txt" - echo "::group::shard ${{ matrix.shard.id }} input files" - REQ_ARGS=() - for f in $SHARD_FILES; do - if grep -qE '^[^#[:space:]]' "audit-reqs/$f.txt"; then - echo " + audit-reqs/$f.txt" - REQ_ARGS+=( -r "audit-reqs/$f.txt" ) - else - echo " - audit-reqs/$f.txt (empty after git+ filter, skipping)" - fi - done - echo "::endgroup::" - if [ ${#REQ_ARGS[@]} -eq 0 ]; then - echo "[security-audit] shard ${{ matrix.shard.id }}: no PyPI specs, nothing to scan" \ - | tee "$LOG" - else - python scripts/scan_packages.py --with-deps "${REQ_ARGS[@]}" \ - 2>&1 | tee "$LOG" - fi - { - echo "## scan_packages :: shard ${{ matrix.shard.id }}" - echo - echo "### Files in this shard" - for f in $SHARD_FILES; do echo "- audit-reqs/$f.txt"; done - echo - echo '### Findings (tail)' - echo '```' - tail -200 "$LOG" - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - if: always() - with: - name: scan-packages-log-${{ matrix.shard.id }} - path: | - logs-scan-packages-${{ matrix.shard.id }}.txt - audit-reqs/ - retention-days: 30 - - # ───────────────────────────────────────────────────────────────────── - # npm: 
pre-install tarball content scan. - # ───────────────────────────────────────────────────────────────────── - npm-scan-packages: - # Counterpart to pip-scan-packages for the npm side. Reads - # studio/frontend/package-lock.json, downloads each resolved - # tarball DIRECTLY from registry.npmjs.org (never via `npm - # install` -- no lifecycle scripts ever run), verifies the - # lockfile integrity hash, unpacks each tarball into a sandboxed - # temp dir behind size / count / path-escape / symlink guards, - # and pattern-scans the extracted file contents for the - # signatures common to npm supply-chain attacks: - # - # - lifecycle (preinstall / install / postinstall / prepare) - # scripts in any package.json that fetch + execute external - # code, - # - C2 / exfiltration hosts (getsession.org, AWS IMDS, - # Kubernetes ServiceAccount token paths, GitHub Actions OIDC, - # HashiCorp Vault endpoints), - # - credential-stealing references (.npmrc, .aws/credentials, - # GITHUB_TOKEN / NPM_TOKEN in JS sources), - # - known IOC filenames (router_init.js, tanstack_runner.js, - # router_runtime.js), - # - obfuscation shapes (Function/eval against base64 blobs). - # - # Threat model: every tarball is hostile. Safety guarantees are - # documented at scripts/scan_npm_packages.py top-of-file. The - # script is stdlib-only so adding it does not increase the - # transitive supply-chain surface. - name: npm scan-packages (Studio frontend tarballs) - runs-on: ubuntu-latest - timeout-minutes: 30 - needs: [] - steps: - # Per-job allowlist: npm-scan-packages only fetches tarballs from - # registry.npmjs.org. GitHub endpoints retained for checkout + - # setup-python action machinery. 
- - name: Harden runner (egress block) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: block - disable-sudo: true - allowed-endpoints: > - api.github.com:443 - github.com:443 - codeload.github.com:443 - objects.githubusercontent.com:443 - registry.npmjs.org:443 - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - - name: Sanity-check scan_npm_packages.py - run: | - test -f scripts/scan_npm_packages.py - python3 -c "import ast; ast.parse(open('scripts/scan_npm_packages.py').read())" - - - name: Scan npm tarballs (declared + transitive, no install) - # The script exits 1 on HIGH/CRITICAL findings; we capture the - # full log and surface it in the step summary either way. It - # never runs `npm install`, never executes anything from a - # downloaded tarball, and only fetches from registry.npmjs.org. - # Initially non-blocking so the baseline can settle; drop - # continue-on-error once the baseline is clean for a week. - run: | - set -o pipefail - LOG=logs-scan-npm.txt - python3 scripts/scan_npm_packages.py 2>&1 | tee "$LOG" - { - echo "## scan_npm_packages" - echo - echo '### Findings (tail)' - echo '```' - tail -300 "$LOG" - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - if: always() - with: - name: scan-npm-packages-log - path: logs-scan-npm.txt - retention-days: 30 - - # ───────────────────────────────────────────────────────────────────── - # Workflow-trigger lint. Refuses two patterns that together powered the - # TanStack GHSA-g7cv-rxg3-hmpx supply-chain compromise: - # - # 1. `pull_request_target` -- runs a fork's workflow YAML against - # the base repository's secrets. 
There is no safe use of this - # trigger for a public open-source project. - # - # 2. Shared cache keys between PR-triggered workflows and the - # publish workflow. A fork PR can poison the cache; the publish - # workflow then restores the poisoned cache on next run. - # - # Cheap pure-Python lint, runs in seconds. Fail-closed. - # ───────────────────────────────────────────────────────────────────── - workflow-trigger-lint: - name: workflow-trigger lint (pull_request_target / cache-poisoning) - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - name: Harden runner (egress block) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: block - disable-sudo: true - allowed-endpoints: > - api.github.com:443 - github.com:443 - codeload.github.com:443 - objects.githubusercontent.com:443 - pypi.org:443 - files.pythonhosted.org:443 - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - - name: Install PyYAML - run: pip install pyyaml - - - name: Lint workflow triggers + cache keys - run: python3 scripts/lint_workflow_triggers.py - - # ───────────────────────────────────────────────────────────────────── - # Regression tests: pin scanner IOC tables and pre-install fixtures. - # Hard gate (no continue-on-error) so future drift in the IOC tables - # or scanner exit semantics fails this PR at review time. 
- # ───────────────────────────────────────────────────────────────────── - tests-security: - name: pytest tests/security - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - - name: Harden runner (egress block) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: block - disable-sudo: true - allowed-endpoints: > - api.github.com:443 - github.com:443 - codeload.github.com:443 - objects.githubusercontent.com:443 - pypi.org:443 - files.pythonhosted.org:443 - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - - name: Install pytest + PyYAML - # PyYAML is imported by scripts/lint_workflow_triggers.py, which the - # `tests/security/test_lint_workflow_triggers.py` regression suite - # exercises as a subprocess. Without it the lint script bails with - # `ERROR: PyYAML is required` (exit 2) and the 5 lint regression - # tests fail. Pinned the same way pytest is pinned. - run: pip install pytest==9.0.3 pyyaml==6.0.2 - - - name: Run security regression tests - run: python3 -m pytest tests/security -v - - # ───────────────────────────────────────────────────────────────────── - # npm provenance + new install-script diff. Catches the two npm - # supply-chain levers we don't yet gate on: - # - # 1. `npm audit signatures` validates the registry-signed - # provenance of every tarball laid down in node_modules. Pulled - # from the public npm transparency log; surfaces unsigned or - # mis-signed deps. Informational for now (continue-on-error) - # while the baseline settles. - # - # 2. `check_new_install_scripts.py` diffs the PR's lockfile - # against the base ref and refuses any newly-added dep that - # ships a postinstall hook. 
Every recent npm supply-chain - # compromise leveraged a postinstall as the execution lever, so - # blocking new ones at PR time is a small, high-signal gate. - # ───────────────────────────────────────────────────────────────────── - npm-provenance-and-install-scripts: - name: npm provenance + new install-script diff - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - name: Harden runner (egress block) - uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 - with: - egress-policy: audit - disable-sudo: true - allowed-endpoints: > - api.github.com:443 - github.com:443 - codeload.github.com:443 - objects.githubusercontent.com:443 - registry.npmjs.org:443 - - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - # Need the base commit accessible for `git show - # :studio/frontend/package-lock.json` below. - fetch-depth: 0 - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - - name: Install Studio frontend deps (--ignore-scripts) - # `npm audit signatures` requires node_modules to be populated. - # `--ignore-scripts` is mandatory: this is exactly the lever the - # new-install-script gate below protects against, and we must - # not run any third-party hook to set up the audit. - working-directory: studio/frontend - run: npm ci --ignore-scripts - - - name: npm audit signatures (informational) - # Surfaces unsigned / mis-signed packages from the npm - # transparency log. continue-on-error during baseline-build - # phase; promote to hard gate once the lockfile is fully - # signed (most major maintainers signed by mid-2025). 
- working-directory: studio/frontend - continue-on-error: true - run: | - set -o pipefail - LOG=logs-audit-signatures.txt - npm audit signatures 2>&1 | tee "$LOG" - { - echo "## npm audit signatures" - echo - echo '```' - tail -200 "$LOG" - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - - - name: Extract base-ref lockfile (PR triggers only) - if: github.event_name == 'pull_request' - run: | - set -e - BASE_SHA="${{ github.event.pull_request.base.sha }}" - git show "$BASE_SHA:studio/frontend/package-lock.json" \ - > /tmp/base-package-lock.json - - - name: Diff for newly-added install-script deps - if: github.event_name == 'pull_request' - run: | - python3 scripts/check_new_install_scripts.py \ - --base /tmp/base-package-lock.json \ - --head studio/frontend/package-lock.json - - - name: Skip install-script diff (non-PR trigger) - if: github.event_name != 'pull_request' - run: | - echo "Not a pull_request event; install-script diff requires a base ref." - echo "This step is intentionally a no-op outside PR triggers." - - - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - if: always() - with: - name: npm-audit-signatures-log - path: studio/frontend/logs-audit-signatures.txt - if-no-files-found: ignore - retention-days: 30 diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml deleted file mode 100644 index 1a4cf841d0..0000000000 --- a/.github/workflows/stale.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: 'Inactive Issue Pinger' - -on: - schedule: - - cron: '30 5 * * *' # Runs at 5:30 UTC every day - -jobs: - stale: - runs-on: ubuntu-latest - permissions: - issues: write - - steps: - - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0 - with: - # The message to post on stale issues. - # This message will ping the issue author. - # Note: The stale bot action does not currently support a direct placeholder for the last commenter. - # As a workaround, this message encourages any participant to reply. 
- stale-issue-message: > - Is this issue still important to you? - Apologies in advance we might have missed this issue as well. - For faster response times, please post on our Reddit server - https://www.reddit.com/r/unsloth or our Discord - https://discord.com/invite/unsloth - - # The number of days of inactivity before an issue is considered stale. - days-before-issue-stale: 9999 - - # Set to -1 to never close stale issues. - days-before-issue-close: -1 - - # A label to apply to stale issues. - stale-issue-label: 'inactive' - - # The number of operations to perform per run to avoid rate limiting. - operations-per-run: 500 - - enable-statistics: false diff --git a/.github/workflows/studio-api-smoke.yml b/.github/workflows/studio-api-smoke.yml deleted file mode 100644 index 53514e2ce1..0000000000 --- a/.github/workflows/studio-api-smoke.yml +++ /dev/null @@ -1,166 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Studio API & Auth Tests -- HTTP-level integration tests for the -# FastAPI surface. No Playwright, no model UI; tests/studio/test_studio_api_smoke.py -# runs ~30 s and asserts: -# - CORS hardening (no wildcard + credentials, no bootstrap leak) -# - /api/system + /api/system/hardware require auth -# - Auth state machine + JWT expiry -# - API key lifecycle E2E (create / list / use / delete / reject) -# - Auth file-mode hardening (Linux only) -# - Inference lifecycle (force reload, bogus variant, /v1/models, /v1/embeddings, /v1/responses) -# - Endpoint-by-endpoint auth audit -# -# Reuses the GGUF cache key from studio-ui-smoke.yml so the model -# download is one cache-hit on the second job. 
- -name: Studio API CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - - 'tests/studio/**' - - '.github/workflows/studio-api-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - api-smoke: - name: Studio API & Auth Tests - runs-on: ubuntu-latest - timeout-minutes: 12 - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18893' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux deps - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - # Same key as studio-ui-smoke.yml so the two jobs share a - # single GGUF download across CI. 
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Install pyjwt for the JWT-expiry forge test - run: pip install 'pyjwt>=2.6' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - - - name: Pass bootstrap password + rotated targets to the test - # The test does its own bootstrap-login + rotation to exercise - # the auth state machine; we just pre-mint two random rotated - # passwords for it. Mask them so the log is clean. 
- run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "::add-mask::$NEW2" - echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV" - echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV" - - - name: Run Studio API & Auth tests - # The script is named WITHOUT a `test_` prefix so it isn't - # auto-collected by pytest in Backend CI's `tests/` walk - # (which doesn't set BASE_URL and would crash at import). - env: - BASE_URL: http://127.0.0.1:18893 - STUDIO_AUTH_DIR: /home/runner/.unsloth/studio/auth - run: python tests/studio/studio_api_smoke.py - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - - - name: Upload API smoke logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: studio-api-smoke-log - path: | - logs/install.log - logs/studio.log - retention-days: 7 diff --git a/.github/workflows/studio-backend-ci.yml b/.github/workflows/studio-backend-ci.yml deleted file mode 100644 index 63eb70f7f1..0000000000 --- a/.github/workflows/studio-backend-ci.yml +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Runs the existing studio/backend/tests/ suite (~860 tests, all CPU-friendly) -# on every PR that touches the backend or unsloth library. Until this lands, -# none of those tests run automatically. Verified locally on Python 3.13 with -# the surgical exclusions below: 861 pass, 4 skipped. -# -# Exclusions: -# - tests/test_studio_api.py: end-to-end against a live model + GGUF download, -# too heavy for free runners. Run separately when GPU CI is available. 
-# - -k 'not llama_cpp_load_progress_live': spawns a real llama.cpp process, -# not appropriate for CPU-only runners. -# -# Two jobs: -# - pytest matrix (3.10/3.11/3.12/3.13) over studio/backend/tests -# - repo-cpu-tests: auto-discovered tests/ + state-isolated spoof files -# -# Whole-repo Python lint (syntax + ruff + debugger-leftover scan) -# moved to the dedicated `Lint CI` workflow (.github/workflows/lint-ci.yml) -# so it fires on every PR rather than only on studio/unsloth/tests -# path changes. - -name: Backend CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'tests/**' - - 'pyproject.toml' - - '.github/workflows/studio-backend-ci.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - pytest: - name: (Python ${{ matrix.python }}) - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - python: ['3.10', '3.11', '3.12', '3.13'] - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '${{ matrix.python }}' - cache: 'pip' - - - name: Install backend test dependencies (CPU only) - run: | - python -m pip install --upgrade pip - # Studio's declared backend deps: - pip install -r studio/backend/requirements/studio.txt - # Extras that studio.txt does not list but the import chain needs - # (python-multipart for FastAPI form/file uploads, sqlalchemy/cryptography - # for the auth DB, yaml/jinja2 for utils.models.model_config, etc.): - pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests \ - 'numpy<3' pytest pytest-asyncio httpx - # Torch CPU + transformers are required by a chunk of the backend test - # suite (gpu_selection, kv_cache_estimation, 
utils). CPU-only torch - # keeps the install ~250 MB / ~1 min on a clean runner. - pip install --index-url https://download.pytorch.org/whl/cpu 'torch>=2.4,<2.11' - pip install 'transformers>=4.51,<5.5' - - - name: Backend tests - working-directory: studio/backend - # Locally validated against this dep set: 831 passed, 5 skipped, 35 deselected. - # Deselections (all environment-specific, would never pass on a GPU-less - # `ubuntu-latest` runner regardless of code correctness): - # - llama_cpp_load_progress_live: spawns a real llama.cpp process - # - TestGpuAutoSelection / TestPreSpawnGpuResolution / TestPerGpuFitGuardAllCounts: - # require live transformers config introspection on real GPUs - # - TestTransformersIntrospection: same - # - test_returns_cuda_when_cuda_available / test_calls_cuda_cache_when_cuda: - # assume CUDA-capable GPU - run: | - python -m pytest tests/ -q --tb=short \ - --ignore=tests/test_studio_api.py \ - -k 'not llama_cpp_load_progress_live and not TestGpuAutoSelection and not TestPreSpawnGpuResolution and not TestPerGpuFitGuardAllCounts and not TestTransformersIntrospection and not test_returns_cuda_when_cuda_available and not test_calls_cuda_cache_when_cuda' - - repo-cpu-tests: - # Auto-discover everything under tests/ that is not GPU-bound by - # design. New tests added in covered directories are picked up - # without a workflow edit. Locally validated: 760 passed, 1 skipped, - # 23 deselected. tests/conftest.py (mirroring unsloth-zoo PR #624) - # pre-loads unsloth_zoo.device_type and unsloth.device_type under a - # mocked torch.cuda.is_available so the unsloth import chain - # succeeds on CPU. 
- name: Repo tests (CPU) - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - # node + uv unlock ~60 tests that previously skipped on CI: - # - 9 tests in test_chat_preset_builtin_invariants.py need node to - # compile a tiny TS harness against the frontend chat sources. - # - tests/python/* spawn fresh `uv venv`s to verify the no-torch - # install path; they self-skip when uv is missing. - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - name: Install uv (for tests/python/* sandboxed venvs) - run: pip install uv - - - name: Install deps (shared shape with backend pytest job) - run: | - python -m pip install --upgrade pip - pip install -r studio/backend/requirements/studio.txt - pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests typer \ - 'numpy<3' pytest pytest-asyncio httpx - # torchvision: unsloth_zoo.vision_utils imports it at module scope. - pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch>=2.4,<2.11' 'torchvision<0.26' - pip install 'transformers>=4.51,<5.5' - # bitsandbytes: hard import in unsloth/models/_utils.py. Recent - # versions ship a CPU build that imports cleanly on Linux. - pip install 'bitsandbytes>=0.45' - # unsloth.device_type imports unsloth_zoo.utils.Version at module - # scope, so the conftest preload needs unsloth_zoo even though - # it is an optional dep of unsloth. - pip install 'unsloth_zoo>=2026.5.1' - pip install -e . --no-deps - - - name: Repo tests (CPU, auto-discovered) - env: - # tests/python/* import install_python_stack from studio/. 
- PYTHONPATH: ${{ github.workspace }}/studio - # Skip lazy compilation work the unsloth import chain wants to - # do at import time on a real GPU. - UNSLOTH_COMPILE_DISABLE: '1' - # --ignore: GPU-bound directories (qlora/saving need real weights; - # tests/sh is the shell suite the next step handles; tests/utils - # is a helpers folder); tests/vllm_compat + tests/version_compat - # are dedicated multi-version drift canaries with their own job - # in version-compat-ci.yml that installs the heavier dep set - # (torchcodec, full transformers/peft/bnb pins) those tests need. - # State-sensitive hardware-spoofing files run in isolation in the - # next step because they mutate hardware.py module globals. - # -m: honour markers from tests/python/conftest.py (`server` = - # needs studio venv, `e2e` = needs network). - # --deselect: - # - test_model_registration / test_all_model_registration: - # hit huggingface_hub for live model existence checks. - # - test_autoconfig_works_with_no_torch_runtime / test_autoconfig_succeeds: - # fail because no-torch-runtime.txt does not pin tokenizers - # and the latest tokenizers (0.23.1) is incompatible with the - # transformers it resolves to. Tracked separately; this is a - # real bug in the no-torch install path, not a CI issue. 
- run: | - python -m pytest tests/ -q --tb=short \ - --ignore=tests/qlora \ - --ignore=tests/saving \ - --ignore=tests/utils \ - --ignore=tests/sh \ - --ignore=tests/studio/test_hardware_dispatch_matrix.py \ - --ignore=tests/studio/test_is_mlx_dispatch_gate.py \ - --ignore=tests/vllm_compat \ - --ignore=tests/version_compat \ - -m 'not server and not e2e' \ - --deselect tests/test_model_registry.py::test_model_registration \ - --deselect tests/test_model_registry.py::test_all_model_registration \ - --deselect 'tests/python/test_tokenizers_and_torch_constraint.py::TestE2ETokenizersFix::test_autoconfig_works_with_no_torch_runtime' \ - --deselect 'tests/python/test_tokenizers_and_torch_constraint.py::TestE2EFullNoTorchSandbox::test_autoconfig_succeeds' - - - name: Hardware-spoof tests (state-sensitive, run in isolation) - env: - PYTHONPATH: ${{ github.workspace }}/studio - UNSLOTH_COMPILE_DISABLE: '1' - # These two files mutate hardware.py module globals at runtime - # via the spoof fixtures, which leaks state into any other test - # that imports hardware. Run them in their own pytest invocation - # so the leak does not cross file boundaries. - run: | - python -m pytest -q --tb=short \ - tests/studio/test_hardware_dispatch_matrix.py \ - tests/studio/test_is_mlx_dispatch_gate.py - - - name: Shell installer tests - # Subset that does not depend on a writable / pristine install.sh - # tree; test_install_host_defaults.sh checks install.ps1 layout - # which has drifted (separate followup). 
- run: | - set -e - for s in \ - tests/sh/test_get_torch_index_url.sh \ - tests/sh/test_mac_intel_compat.sh \ - tests/sh/test_tauri_install_exit_order.sh \ - tests/sh/test_torch_constraint.sh; do - echo "::group::$s" - bash "$s" - echo "::endgroup::" - done - diff --git a/.github/workflows/studio-frontend-ci.yml b/.github/workflows/studio-frontend-ci.yml deleted file mode 100644 index 1270a57ef6..0000000000 --- a/.github/workflows/studio-frontend-ci.yml +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Frontend PR gate: lockfile freshness, typecheck, build, and a bundle grep -# that catches the 2026.5.1 chat-history regression at the JS level. -# -# biome runs as non-blocking for now: the codebase currently has accumulated -# ~470 errors and ~1650 warnings against the existing biome config. Surfacing -# the count in CI lets us drive it down without forcing a fleet-wide cleanup -# in the same PR. Drop `continue-on-error` once that number is zero. - -name: Frontend CI - -on: - pull_request: - paths: - - 'studio/frontend/**' - - 'scripts/check_frontend_dep_removal.py' - - 'tests/studio/test_frontend_dep_removal.py' - - '.github/workflows/studio-frontend-ci.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - build: - name: Frontend build + bundle sanity - runs-on: ubuntu-latest - timeout-minutes: 10 - defaults: - run: - working-directory: studio/frontend - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - # FIXME: drop this step once @assistant-ui/* and assistant-stream - # leave 0.x -- on 1.x, caret ranges are conventional. 
Until then, - # every 0.minor on this surface is a SemVer-major (this is exactly - # how 2026.5.1 shipped a broken chat runtime: ^0.12.19 quietly - # resolved to 0.12.28). - - name: '@assistant-ui must be pinned exactly (no caret/tilde)' - working-directory: ${{ github.workspace }} - run: | - set -e - if grep -nE '"(@assistant-ui/[a-z-]+|assistant-stream)":[[:space:]]*"[\^~]' studio/frontend/package.json; then - echo "::error file=studio/frontend/package.json::These packages must be pinned to exact versions until they leave 0.x. Drop the leading ^ or ~." - exit 1 - fi - echo "All assistant-ui packages are pinned exactly." - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - # Run the structural lockfile scan BEFORE npm ci. A compromised - # tarball runs its `prepare` / `postinstall` during `npm ci`, - # so any catch has to fire upstream of that. The scanner is - # pure-Python read-only; safe to call ahead of every install. - - name: Lockfile supply-chain audit (pre-install scan) - working-directory: ${{ github.workspace }} - run: python3 scripts/lockfile_supply_chain_audit.py - - - name: Lockfile must agree with package.json (npm ci is strict) - # Lifecycle scripts (esbuild native-binary postinstall, etc.) are - # required for `vite build`. The pre-install lockfile structural - # audit (lockfile_supply_chain_audit.py) is the practical defence - # against the npm postinstall-dropper class -- it fires BEFORE any - # tarball runs, on the injection pattern itself rather than an - # advisory-DB lookup. - run: npm ci --no-fund --no-audit - - - name: npm ci must not have modified the working tree - working-directory: ${{ github.workspace }} - run: | - if ! git diff --quiet -- studio/frontend; then - echo "::error::npm ci modified files; commit the updated lockfile" - git status -- studio/frontend - exit 1 - fi - - # Catch the common foot-gun: a dep dropped from package.json that is - # still imported somewhere. 
The script walks the lockfile dep graph - # from the new top-level deps and only counts top-level node_modules - # paths as valid resolution targets for bare src/ imports. - # - # actions/checkout uses fetch-depth: 1 by default, so the base branch - # is not available locally. Fetch the single base commit with an - # explicit refspec so origin/ is reliably created (a bare - # `git fetch origin ` only updates FETCH_HEAD in some configs). - - name: Dependency removal safety check - if: github.event_name == 'pull_request' - working-directory: ${{ github.workspace }} - run: | - git fetch --no-tags --depth=1 origin \ - "${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}" - python3 scripts/check_frontend_dep_removal.py \ - --base "origin/${{ github.base_ref }}" \ - --enumerate-dead - python3 tests/studio/test_frontend_dep_removal.py - - - name: Typecheck - run: npm run typecheck - - - name: Build - run: npm run build - - - name: Built bundle must not contain Studio's unstable_Provider call site - run: | - set -e - JS=$(ls dist/assets/index-*.js | head -1) - HITS=$(grep -c 'unstable_Provider:' "$JS" || echo 0) - echo "main bundle: $JS" - echo "unstable_Provider: hits=$HITS (assistant-ui internals contribute up to 3)" - if [ "$HITS" -gt 3 ]; then - echo "::error file=studio/frontend/src/features/chat/runtime-provider.tsx::Studio bundle still passes unstable_Provider through useRemoteThreadListRuntime; this is the 2026.5.1 chat-history regression. Pass adapters directly into useLocalRuntime instead." - exit 1 - fi - - - name: Bundle size budget (75 MB) - run: | - SIZE=$(du -sb dist | cut -f1) - BUDGET=$((75 * 1024 * 1024)) - echo "dist size: $SIZE bytes ($((SIZE/1024/1024)) MB), budget: $BUDGET bytes (75 MB)" - if [ "$SIZE" -gt "$BUDGET" ]; then - echo "::error::studio/frontend/dist/ exceeded the 75 MB budget. Drop dead deps (e.g. the unused next dep) or split chunks." 
- exit 1 - fi - - - name: Biome (non-blocking until accumulated drift is cleared) - continue-on-error: true - run: npm run biome:check - - - name: Upload built dist - # Always upload so a green run is reviewable too -- the dist - # output catches "tests passed but bundle changed unexpectedly" - # regressions that would be invisible if we only kept artifacts - # on failure. - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: studio-frontend-dist - path: studio/frontend/dist - retention-days: 3 diff --git a/.github/workflows/studio-inference-smoke.yml b/.github/workflows/studio-inference-smoke.yml deleted file mode 100644 index 775363e73c..0000000000 --- a/.github/workflows/studio-inference-smoke.yml +++ /dev/null @@ -1,887 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Three end-to-end smoke jobs that boot a freshly-installed Studio and -# exercise the surfaces real users hit through the OpenAI / Anthropic -# SDKs and curl. Each job picks the smallest model that exercises the -# behaviour under test, primes HF_HOME via actions/cache, and shares -# the install.sh --local --no-torch bootstrap. -# -# 1. OpenAI, Anthropic API tests -# gemma-3-270m-it UD-Q4_K_XL (~254 MiB). -# Password rotation via /api/auth/change-password (old fails, -# new works), then OpenAI + Anthropic Python SDKs against /v1/* -# with temperature=0 and a fixed seed. Asserts the four-turn -# conversation is deterministic across two runs. -# -# 2. Tool calling Tests -# Qwen3.5-2B UD-IQ3_XXS (~890 MiB). OpenAI function calling, -# server-side tools (python, terminal, web_search) via -# enable_tools / enabled_tools, and enable_thinking on/off. -# -# 3. JSON, images -# gemma-4-E2B-it UD-IQ3_XXS (~2.4 GiB) + mmproj-F16 (~986 MiB). -# response_format JSON-schema decoding and OpenAI image_url -# (data URI) plus Anthropic source/base64 image inputs. 
-# -# All three jobs run in parallel. Total wall time is dominated by job 3 -# on a cold cache; warm cache cuts that to ~3 min. - -name: Studio GGUF CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - - '.github/workflows/studio-inference-smoke.yml' - push: - branches: [main, pip] - # Manual trigger for pre-warming HF_HOME caches on main, or re-running - # against an arbitrary branch without pushing a no-op commit. - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - # ───────────────────────────────────────────────────────────────────── - # Job 1: OpenAI, Anthropic API tests - # ───────────────────────────────────────────────────────────────────── - openai-anthropic: - name: OpenAI, Anthropic API tests - runs-on: ubuntu-latest - timeout-minutes: 25 - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18888' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux deps for llama.cpp prebuilt - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: 
steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Install OpenAI + Anthropic Python SDKs - run: pip install 'openai>=1.50' 'anthropic>=0.40' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json - exit 0 - fi - sleep 1 - done - echo "Studio did not become healthy in 180s" - tail -200 logs/studio.log - exit 1 - - - name: Password rotation (old must fail, new must work) - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - # 1. Login with the bootstrap password. 
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; } - # 2. Rotate to a fresh random password. - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - # 3. Old password must now be rejected (HTTP 401). - OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \ - -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}") - if [ "$OLD_STATUS" != "401" ]; then - echo "::error::Login with old password returned $OLD_STATUS, expected 401" - exit 1 - fi - # 4. New password must succeed; capture the JWT for downstream steps. 
- NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; } - echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV" - echo "password rotation OK (old=401, new=200)" - - - name: Load the GGUF (HF repo + variant, served from HF_HOME cache) - run: | - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_gguf, context_length}' - - - name: Multi-turn determinism via OpenAI + Anthropic SDKs - env: - BASE_URL: http://127.0.0.1:18888 - run: | - python - <<'PY' - import json - import os - from openai import OpenAI - from anthropic import Anthropic - - BASE = os.environ["BASE_URL"] - KEY = os.environ["TOKEN"] # JWT also accepted as Bearer on /v1/* - SEED = 3407 - - # Four-turn conversation: the second and fourth turns can only be - # answered correctly if the model sees the prior turns, so this - # also exercises the conversation-history wiring. 
- PROMPTS = [ - "What is 1+1?", - "What did I ask before?", - "What is the capital of France?", - "Repeat the city name", - ] - - def run_openai(): - client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY) - history, replies = [], [] - for prompt in PROMPTS: - history.append({"role": "user", "content": prompt}) - resp = client.chat.completions.create( - model = "default", - messages = history, - temperature = 0.0, - max_tokens = 80, - seed = SEED, - extra_body = {"enable_thinking": False}, - ) - text = resp.choices[0].message.content or "" - replies.append(text) - history.append({"role": "assistant", "content": text}) - return replies - - def run_anthropic(): - # Two SDK quirks vs. Studio: - # 1. base_url must NOT include /v1 -- the SDK appends - # /v1/messages itself; otherwise the request hits - # /v1/v1/messages and 405s. - # 2. The SDK sends `x-api-key` by default, but Studio's - # auth layer is HTTPBearer-only. Override via - # default_headers so Authorization: Bearer ... is - # sent instead. 
- client = Anthropic( - base_url = BASE, - api_key = "unused", - default_headers = {"Authorization": f"Bearer {KEY}"}, - ) - history, replies = [], [] - for prompt in PROMPTS: - history.append({"role": "user", "content": prompt}) - msg = client.messages.create( - model = "default", - max_tokens = 80, - messages = history, - temperature = 0.0, - extra_body = {"seed": SEED, "enable_thinking": False}, - ) - text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text") - replies.append(text) - history.append({"role": "assistant", "content": text}) - return replies - - for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)): - first = runner() - second = runner() - for i, (a, b) in enumerate(zip(first, second), start = 1): - print(f"[{label} turn {i}] {a!r}") - assert a, f"{label}: empty turn {i} response" - assert a == b, ( - f"{label} non-deterministic at turn {i} with temperature=0.0:\n" - f" run1: {a!r}\n run2: {b!r}" - ) - # Sanity: turn-2 reply should mention the earlier question, and - # turn-4 reply should mention Paris (model echoes the city it - # produced for turn 3). Lower-cased substring checks keep the - # assertion robust to formatting jitter. - joined = " ".join(first).lower() - assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}" - assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}" - print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded") - PY - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload logs - # Always upload so green runs are still reviewable. 
- if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: openai-anthropic-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 - - # ───────────────────────────────────────────────────────────────────── - # Job 2: Tool calling Tests - # ───────────────────────────────────────────────────────────────────── - tool-calling: - name: Tool calling Tests - runs-on: ubuntu-latest - timeout-minutes: 25 - env: - # Tool calling is the highest-volume GGUF in this workflow - # (Qwen3.5-2B at IQ3_XXS = ~890 MiB). Caching HF_HOME would - # store xet chunks + blobs + snapshots = ~4 GiB compressed -- - # 4-5x file-size inflation, dominated by xet chunks. Use main's - # `--local-dir gguf-cache` pattern to cache the flat .gguf only. - # Studio's /api/inference/load accepts either a HF repo (which - # uses HF_HOME) or an absolute file path; passing the absolute - # path keeps the test off HF_HOME entirely so the cache size - # tracks the GGUF file 1:1. The OpenAI/Anth and JSON+images - # jobs still cover the gguf_variant resolution path. 
- GGUF_REPO: unsloth/Qwen3.5-2B-GGUF - GGUF_FILE: Qwen3.5-2B-UD-IQ3_XXS.gguf - STUDIO_PORT: '18889' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux deps for llama.cpp prebuilt - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore GGUF model file - id: cache-gguf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Download GGUF if cache miss - id: download-gguf - if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p gguf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache - - - name: Save GGUF model file - if: always() && steps.download-gguf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Reset auth + boot Studio (API-only, default tool policy) - # We deliberately use the API-only mode rather than - # `unsloth studio run` because the latter calls - # `set_tool_policy(...)` with a resolved bool: on loopback the - # default resolves to True, which 
forces every request through - # the server-side agentic loop and breaks the standard - # function-calling test below. API-only mode leaves - # tool_policy=None so each request's `enable_tools` field is - # honoured. - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health, log in, change password, load model - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}" - ls -lh "$GGUF_PATH" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name}' - - - name: Tool calling, server-side tools, 
thinking on/off - env: - BASE_URL: http://127.0.0.1:18889 - run: | - python - <<'PY' - import json - import os - import urllib.request - - BASE = os.environ["BASE_URL"] - KEY = os.environ["API_KEY"] - SEED = 3407 - - def post(path, body, *, timeout = 240): - """Plain JSON POST. For requests that don't go through - the server-side agentic loop, the response is one JSON - object.""" - data = json.dumps(body).encode() - req = urllib.request.Request( - f"{BASE}{path}", - data = data, - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - with urllib.request.urlopen(req, timeout = timeout) as resp: - return resp.status, json.loads(resp.read().decode()) - - def post_sse(path, body, *, timeout = 600): - """POST a streaming request and accumulate the assistant - text deltas. The server-side agentic loop ALWAYS returns - SSE regardless of the request's `stream` field, so any - call with enable_tools=true must use this helper.""" - body = {**body, "stream": True} - data = json.dumps(body).encode() - req = urllib.request.Request( - f"{BASE}{path}", - data = data, - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - parts = [] - with urllib.request.urlopen(req, timeout = timeout) as resp: - for raw in resp: - line = raw.decode().strip() - if not line.startswith("data: "): - continue - payload = line[6:] - if payload == "[DONE]": - break - try: - chunk = json.loads(payload) - except json.JSONDecodeError: - continue - for choice in chunk.get("choices", []): - delta = choice.get("delta", {}) or {} - if delta.get("content"): - parts.append(delta["content"]) - return "".join(parts) - - # ── 1. 
Standard OpenAI function calling ────────────────────── - weather_tool = { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather for a city.", - "parameters": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - }, - } - - status, data = post("/v1/chat/completions", { - "messages": [{"role": "user", "content": "What is the weather in Paris?"}], - "tools": [weather_tool], - "tool_choice": "required", - "stream": False, - "temperature": 0.0, - "seed": SEED, - "max_tokens": 120, - }) - assert status == 200, f"tool call status {status}: {data}" - choice = data["choices"][0] - assert choice["finish_reason"] == "tool_calls", f"finish_reason={choice['finish_reason']!r}" - tc = choice["message"]["tool_calls"][0] - assert tc["function"]["name"] == "get_weather" - args = json.loads(tc["function"]["arguments"]) - assert args.get("city"), f"missing city arg: {args}" - print(f"[tools] PASS function calling -> {tc['function']['name']}({args})") - - # ── 2. Server-side python tool ─────────────────────────────── - # 123 * 456 = 56088. The agentic loop streams SSE; we - # accumulate the assistant text and look for the answer. We - # accept "56088" or "56,088" since the model may format it. - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}], - "enable_tools": True, - "enabled_tools": ["python"], - "session_id": "ci-tool-calling-py", - "temperature": 0.0, - "seed": SEED, - "max_tokens": 600, - }) - assert "56088" in content or "56,088" in content, ( - f"expected 56088 in python-tool answer, got: {content!r}" - ) - print(f"[tools] PASS python tool ({len(content)} chars)") - - # ── 3. 
Server-side bash (terminal) tool ────────────────────── - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output."}], - "enable_tools": True, - "enabled_tools": ["terminal"], - "session_id": "ci-tool-calling-bash", - "temperature": 0.0, - "seed": SEED, - "max_tokens": 600, - }) - assert "hello-bash-tool" in content, ( - f"expected 'hello-bash-tool' in terminal-tool answer, got: {content!r}" - ) - print(f"[tools] PASS bash/terminal tool ({len(content)} chars)") - - # ── 4. Server-side web_search tool ─────────────────────────── - # DuckDuckGo is flaky from CI runners and small Qwen3.5-2B - # may not actually search. Only assert that the SSE stream - # opens and yields any data; HTTP / parser failures already - # raise above. - try: - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}], - "enable_tools": True, - "enabled_tools": ["web_search"], - "session_id": "ci-tool-calling-web", - "temperature": 0.0, - "seed": SEED, - "max_tokens": 400, - }) - print(f"[tools] PASS web_search stream ({len(content)} chars)") - except Exception as exc: - print(f"[tools] WARN web_search probe failed (non-blocking): {exc}") - - # ── 5. Thinking on / off ───────────────────────────────────── - # Studio strips think blocks from message.content for tools-mode - # responses, so we toggle plain chat (no enable_tools) and look - # at the surfaced reasoning_content / message.thinking field. - def thinking_call(enable): - status, data = post("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}], - "stream": False, - "enable_thinking": enable, - "temperature": 0.0, - "seed": SEED, - "max_tokens": 300, - }) - assert status == 200 - msg = data["choices"][0]["message"] - # Studio surfaces thinking via reasoning_content (OpenAI - # extension). 
Fall back to inline markers for - # robustness across template versions. - raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "") - return raw - - on_text = thinking_call(True) - off_text = thinking_call(False) - had_think_on = ("" in on_text) or len(on_text) > 80 - had_think_off = ("" in off_text) and len(off_text) > 0 - assert had_think_on, ( - f"enable_thinking=True produced no thinking signal: {on_text!r}" - ) - # Off-mode should not contain the literal marker. - assert "" not in off_text, ( - f"enable_thinking=False but still present: {off_text!r}" - ) - print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)") - PY - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload logs - # Always upload so green runs are still reviewable. - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: tool-calling-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 - - # ───────────────────────────────────────────────────────────────────── - # Job 3: JSON, images - # ───────────────────────────────────────────────────────────────────── - json-images: - name: JSON, images - runs-on: ubuntu-latest - timeout-minutes: 30 - env: - GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF - GGUF_VARIANT: UD-IQ3_XXS - GGUF_FILE: gemma-4-E2B-it-UD-IQ3_XXS.gguf - MMPROJ_FILE: mmproj-F16.gguf - STUDIO_PORT: '18890' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux deps for llama.cpp prebuilt - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: 
actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj) - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1 - - - name: Prime HF_HOME with the GGUF + mmproj - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj) - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Install OpenAI + Anthropic Python SDKs - run: pip install 'openai>=1.50' 'anthropic>=0.40' - - - name: Reset auth + boot Studio (API-only) - # See Job 2's comment: API-only mode keeps tool_policy=None so - # response_format requests aren't routed through the agentic - # tool loop. - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health, log in, change password, load model - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - # Load the GGUF (mmproj is auto-detected via the HF repo - # lookup, the cached file is pulled out of HF_HOME). 
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 900 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_vision}' - - - name: JSON schema decoding + image input - env: - BASE_URL: http://127.0.0.1:18890 - run: | - python - <<'PY' - import base64 - import json - import os - import urllib.request - from openai import OpenAI - from anthropic import Anthropic - - BASE = os.environ["BASE_URL"] - KEY = os.environ["API_KEY"] - SEED = 3407 - - def post(path, body, *, timeout = 240): - req = urllib.request.Request( - f"{BASE}{path}", - data = json.dumps(body).encode(), - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - with urllib.request.urlopen(req, timeout = timeout) as resp: - return resp.status, json.loads(resp.read().decode()) - - # ── 1. response_format = json_object (JSON mode) ───────────── - # llama.cpp's HTTP server supports OpenAI-compatible JSON - # mode: `response_format: {"type": "json_object"}` constrains - # the model to emit syntactically-valid JSON. We use raw HTTP - # rather than the OpenAI SDK so that the field shape Studio - # forwards to llama-server is unambiguous (the SDK rewrites - # response_format depending on which variant it recognises). - # We deliberately do NOT pass a strict JSON schema -- on - # small Gemma-4 quants the GBNF-from-schema path occasionally - # produces empty output, and JSON mode is the surface we care - # about exposing through Studio. - status, data = post("/v1/chat/completions", { - "model": "default", - "messages": [ - {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. 
Output ONLY the JSON, nothing else.'}, - {"role": "user", "content": "What is the capital of France?"}, - ], - "temperature": 0.0, - "max_tokens": 200, - "seed": SEED, - "stream": False, - "enable_thinking": False, - "response_format": {"type": "json_object"}, - }, timeout = 600) - assert status == 200, f"json status {status}: {data}" - content = (data["choices"][0]["message"].get("content") or "").strip() - # Some chat templates wrap JSON in ```json fences even in JSON - # mode -- strip those before parsing. - if content.startswith("```"): - content = content.split("```", 2)[1] - if content.startswith("json"): - content = content[4:] - content = content.strip("`\n ") - parsed = json.loads(content) - assert "paris" in str(parsed.get("city", "")).lower(), ( - f"city != Paris: {parsed}" - ) - print(f"[json] PASS json_object -> {parsed}") - - # ── 2. OpenAI image_url (data URI base64) ─────────────────── - # 64x64 solid-red PNG. stb_image (used by Studio's image - # normaliser at routes/inference.py:3410) rejects 4x4 or - # smaller PNGs as truncated, so we go up to 64x64 -- still - # tiny in token cost. The assertion is loose: any non-empty - # response from the vision path proves multimodal end-to-end - # wiring; small VL quants are weak at colour identification. - PNG_64X64_RED_B64 = ( - "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k" - "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA" - "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII=" - ) - data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}" - - client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY) - openai_resp = client.chat.completions.create( - model = "default", - temperature = 0.0, - max_tokens = 80, - seed = SEED, - messages = [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_uri}}, - {"type": "text", "text": "What colour dominates this image? 
Reply in one word."}, - ], - }], - ) - openai_text = (openai_resp.choices[0].message.content or "").lower() - print(f"[image/openai] reply: {openai_text!r}") - assert openai_text, "OpenAI image_url returned empty content" - # We do not strictly require 'red' -- some quants of small VL - # models are weak at colour names. Just require a non-empty - # answer; the vision path is the part under test. - print("[image/openai] PASS image_url accepted, non-empty response") - - # ── 3. Anthropic source/base64 image ──────────────────────── - # Two SDK quirks vs. Studio: base_url must NOT include /v1 - # (the SDK appends it itself; otherwise /v1/v1/messages -> 405), - # and Studio's auth is HTTPBearer-only so the SDK's default - # x-api-key header is ignored -- send Authorization: Bearer - # via default_headers. - anthropic = Anthropic( - base_url = BASE, - api_key = "unused", - default_headers = {"Authorization": f"Bearer {KEY}"}, - ) - a_msg = anthropic.messages.create( - model = "default", - max_tokens = 80, - temperature = 0.0, - extra_body = {"seed": SEED}, - messages = [{ - "role": "user", - "content": [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": PNG_64X64_RED_B64, - }, - }, - {"type": "text", "text": "Describe this image briefly."}, - ], - }], - ) - a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text") - print(f"[image/anthropic] reply: {a_text!r}") - assert a_text, "Anthropic source/base64 returned empty content" - print("[image/anthropic] PASS source/base64 accepted, non-empty response") - PY - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload logs - # Always upload so green runs are still reviewable. 
- if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: json-images-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 diff --git a/.github/workflows/studio-mac-api-smoke.yml b/.github/workflows/studio-mac-api-smoke.yml deleted file mode 100644 index b4e274155e..0000000000 --- a/.github/workflows/studio-mac-api-smoke.yml +++ /dev/null @@ -1,153 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Mac counterpart to studio-api-smoke.yml. Same tests/studio/ -# studio_api_smoke.py exercise (CORS hardening, auth state machine, -# JWT expiry, API key lifecycle, /v1/models / /v1/embeddings / -# /v1/responses, endpoint-by-endpoint auth audit) but on a real -# Apple Silicon (macos-14, M1) runner. Drops the apt-get block; -# GitHub-hosted macos-14 ships curl + jq. - -name: Mac Studio API CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - - 'tests/studio/**' - - '.github/workflows/studio-mac-api-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - api-smoke: - name: Studio API & Auth Tests - runs-on: macos-14 - timeout-minutes: 25 - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18895' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for 
${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert install.sh used the Mac llama.cpp prebuilt - run: | - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - - - name: Install pyjwt for the JWT-expiry forge test - run: pip install 'pyjwt>=2.6' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - - - name: Pass bootstrap password + rotated targets to the test - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "::add-mask::$NEW2" - echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV" - echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV" - - - name: Run Studio API & Auth tests - env: - BASE_URL: http://127.0.0.1:18895 - STUDIO_AUTH_DIR: /Users/runner/.unsloth/studio/auth - run: python tests/studio/studio_api_smoke.py - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - - - name: Upload API smoke logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: mac-studio-api-smoke-log - path: | - logs/install.log - logs/studio.log - retention-days: 7 diff --git a/.github/workflows/studio-mac-inference-smoke.yml b/.github/workflows/studio-mac-inference-smoke.yml deleted file mode 100644 index 2d6864e0cb..0000000000 --- a/.github/workflows/studio-mac-inference-smoke.yml +++ /dev/null @@ -1,1042 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Three end-to-end smoke jobs that boot a freshly-installed Studio and -# exercise the surfaces real users hit through the OpenAI / Anthropic -# SDKs and curl. 
Each job picks the smallest model that exercises the -# behaviour under test, primes a model cache via actions/cache, and -# shares the install.sh --local --no-torch bootstrap. -# -# 1. OpenAI, Anthropic API tests -# gemma-3-270m-it UD-Q4_K_XL (~254 MiB). -# Password rotation via /api/auth/change-password (old fails, -# new works), then OpenAI + Anthropic Python SDKs against /v1/* -# with temperature=0 and a fixed seed. Asserts the four-turn -# conversation is deterministic across two runs. -# -# 2. Tool calling Tests -# Qwen3.5-2B UD-IQ3_XXS (~890 MiB). OpenAI function calling, -# server-side tools (python, terminal, web_search) via -# enable_tools / enabled_tools, and enable_thinking on/off. -# -# 3. JSON, images -# gemma-4-E2B-it UD-IQ3_XXS (~2.4 GiB) + mmproj-F16 (~986 MiB). -# response_format JSON-schema decoding and OpenAI image_url -# (data URI) plus Anthropic source/base64 image inputs. -# -# All three jobs run in parallel. Total wall time is dominated by job 3 -# on a cold cache; warm cache cuts that to ~3 min. - -name: Mac Studio GGUF CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - - '.github/workflows/studio-mac-inference-smoke.yml' - push: - branches: [main, pip] - # Manual trigger for pre-warming model caches on main, or re-running - # against an arbitrary branch without pushing a no-op commit. 
- workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - # ───────────────────────────────────────────────────────────────────── - # Job 1: OpenAI, Anthropic API tests - # ───────────────────────────────────────────────────────────────────── - openai-anthropic: - name: OpenAI, Anthropic API tests - runs-on: macos-14 - timeout-minutes: 25 - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18888' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - # Save partial caches on cancel/timeout -- hf download resumes by - # content hash. `outcome != skipped` keeps cache-hit a no-op. 
- - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome != 'skipped' && hashFiles('hf-cache/**/*.gguf') != '' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert install.sh used the Mac llama.cpp prebuilt - run: | - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - - - name: Install OpenAI + Anthropic Python SDKs - run: pip install 'openai>=1.50' 'anthropic>=0.40' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json - exit 0 - fi - sleep 1 - done - echo "Studio did not become healthy in 180s" - tail -200 logs/studio.log - exit 1 - - - name: Password rotation (old must fail, new must work) - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - # 1. Login with the bootstrap password. 
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; } - # 2. Rotate to a fresh random password. - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - # 3. Old password must now be rejected (HTTP 401). - OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \ - -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}") - if [ "$OLD_STATUS" != "401" ]; then - echo "::error::Login with old password returned $OLD_STATUS, expected 401" - exit 1 - fi - # 4. New password must succeed; capture the JWT for downstream steps. 
- NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; } - echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV" - echo "password rotation OK (old=401, new=200)" - - - name: Load the GGUF (HF repo + variant, served from HF_HOME cache) - run: | - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_gguf, context_length}' - - - name: Multi-turn determinism via OpenAI + Anthropic SDKs - env: - BASE_URL: http://127.0.0.1:18888 - run: | - python - <<'PY' - import json - import os - from openai import OpenAI - from anthropic import Anthropic - - BASE = os.environ["BASE_URL"] - KEY = os.environ["TOKEN"] # JWT also accepted as Bearer on /v1/* - SEED = 3407 - - # Four-turn conversation: the second and fourth turns can only be - # answered correctly if the model sees the prior turns, so this - # also exercises the conversation-history wiring. 
- PROMPTS = [ - "What is 1+1?", - "What did I ask before?", - "What is the capital of France?", - "Repeat the city name", - ] - - def run_openai(): - client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY) - history, replies = [], [] - for prompt in PROMPTS: - history.append({"role": "user", "content": prompt}) - resp = client.chat.completions.create( - model = "default", - messages = history, - temperature = 0.0, - max_tokens = 80, - seed = SEED, - extra_body = {"enable_thinking": False}, - ) - text = resp.choices[0].message.content or "" - replies.append(text) - history.append({"role": "assistant", "content": text}) - return replies - - def run_anthropic(): - # Two SDK quirks vs. Studio: - # 1. base_url must NOT include /v1 -- the SDK appends - # /v1/messages itself; otherwise the request hits - # /v1/v1/messages and 405s. - # 2. The SDK sends `x-api-key` by default, but Studio's - # auth layer is HTTPBearer-only. Override via - # default_headers so Authorization: Bearer ... is - # sent instead. 
- client = Anthropic( - base_url = BASE, - api_key = "unused", - default_headers = {"Authorization": f"Bearer {KEY}"}, - ) - history, replies = [], [] - for prompt in PROMPTS: - history.append({"role": "user", "content": prompt}) - msg = client.messages.create( - model = "default", - max_tokens = 80, - messages = history, - temperature = 0.0, - extra_body = {"seed": SEED, "enable_thinking": False}, - ) - text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text") - replies.append(text) - history.append({"role": "assistant", "content": text}) - return replies - - for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)): - first = runner() - second = runner() - for i, (a, b) in enumerate(zip(first, second), start = 1): - print(f"[{label} turn {i}] {a!r}") - assert a, f"{label}: empty turn {i} response" - assert a == b, ( - f"{label} non-deterministic at turn {i} with temperature=0.0:\n" - f" run1: {a!r}\n run2: {b!r}" - ) - # Sanity: turn-2 reply should mention the earlier question, and - # turn-4 reply should mention Paris (model echoes the city it - # produced for turn 3). Lower-cased substring checks keep the - # assertion robust to formatting jitter. - joined = " ".join(first).lower() - assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}" - assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}" - print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded") - PY - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload logs - # Always upload so green runs are still reviewable. 
- if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: openai-anthropic-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 - - # ───────────────────────────────────────────────────────────────────── - # Job 2: Tool calling Tests - # ───────────────────────────────────────────────────────────────────── - tool-calling: - name: Tool calling Tests - runs-on: macos-14 - timeout-minutes: 25 - env: - # Tool calling is the highest-volume GGUF in this workflow - # (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB on Mac, where IQ3_XXS - # collapses for tool-call grammar under Metal at temperature=0). - # Caching HF_HOME stores xet chunks + blobs + snapshots = ~4.6 - # GiB compressed -- 3.6x file-size inflation. Use main's - # `--local-dir gguf-cache` pattern to cache the flat .gguf only. - # The OpenAI/Anth and JSON+images jobs still cover the - # gguf_variant resolution path. - GGUF_REPO: unsloth/Qwen3.5-2B-GGUF - GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf - STUDIO_PORT: '18898' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore GGUF model file - id: cache-gguf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Download GGUF if cache miss - id: download-gguf - if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p gguf-cache - bash .github/scripts/hf-download-with-retry.sh 
"$GGUF_REPO" "$GGUF_FILE" gguf-cache - - # Save partial caches on cancel; next run resumes via content hash. - - name: Save GGUF model file - if: always() && steps.download-gguf.outcome != 'skipped' && hashFiles('gguf-cache/**/*.gguf') != '' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert install.sh used the Mac llama.cpp prebuilt - run: | - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - - - name: Reset auth + boot Studio (API-only, default tool policy) - # We deliberately use the API-only mode rather than - # `unsloth studio run` because the latter calls - # `set_tool_policy(...)` with a resolved bool: on loopback the - # default resolves to True, which forces every request through - # the server-side agentic loop and breaks the standard - # function-calling test below. API-only mode leaves - # tool_policy=None so each request's `enable_tools` field is - # honoured. - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health, log in, change password, load model - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}" - ls -lh "$GGUF_PATH" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name}' - - - name: Tool calling, server-side tools, thinking on/off - env: - BASE_URL: http://127.0.0.1:18898 - run: | - python - <<'PY' - import json - import os - import urllib.request - - BASE = os.environ["BASE_URL"] - KEY = os.environ["API_KEY"] - SEED = 3407 - - def post(path, body, *, timeout = 240): - """Plain JSON POST. 
For requests that don't go through - the server-side agentic loop, the response is one JSON - object.""" - data = json.dumps(body).encode() - req = urllib.request.Request( - f"{BASE}{path}", - data = data, - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - with urllib.request.urlopen(req, timeout = timeout) as resp: - return resp.status, json.loads(resp.read().decode()) - - def post_sse(path, body, *, timeout = 600): - """POST a streaming request and accumulate the assistant - text deltas. The server-side agentic loop ALWAYS returns - SSE regardless of the request's `stream` field, so any - call with enable_tools=true must use this helper.""" - body = {**body, "stream": True} - data = json.dumps(body).encode() - req = urllib.request.Request( - f"{BASE}{path}", - data = data, - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - parts = [] - with urllib.request.urlopen(req, timeout = timeout) as resp: - for raw in resp: - line = raw.decode().strip() - if not line.startswith("data: "): - continue - payload = line[6:] - if payload == "[DONE]": - break - try: - chunk = json.loads(payload) - except json.JSONDecodeError: - continue - for choice in chunk.get("choices", []): - delta = choice.get("delta", {}) or {} - if delta.get("content"): - parts.append(delta["content"]) - return "".join(parts) - - # ── 1. Standard OpenAI function calling ────────────────────── - weather_tool = { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather for a city.", - "parameters": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - }, - } - - # Mac Metal at temperature=0 is pathological for these small - # quants (Qwen3.5-2B emits ',,,,,,...' or 'The The The...'), - # gemma-4-E2B emits '' tokens). The Linux CPU - # backend hides the issue. 
Use a small non-zero temperature - # with a fixed seed so we stay deterministic but escape the - # degenerate sampling trap. - TEMP = 0.2 - - status, data = post("/v1/chat/completions", { - "messages": [{"role": "user", "content": "What is the weather in Paris?"}], - "tools": [weather_tool], - "tool_choice": "required", - "stream": False, - "temperature": TEMP, - "seed": SEED, - # tool_choice='required' constrains the grammar so the - # model emits a tool_call quickly when it works at all; - # 128 tokens is enough for `{"city":"Paris"}` plus the - # JSON envelope. - "max_tokens": 128, - }, timeout = 180) - assert status == 200, f"tool call status {status}: {data}" - choice = data["choices"][0] - tool_calls = (choice.get("message") or {}).get("tool_calls") or [] - # Studio's contract: when tool_choice='required', llama.cpp's - # grammar should force a tool_calls payload. On Mac that - # contract is sometimes broken by the underlying quant; the - # PASS path is "tool_calls present + correct schema", the - # WARN path documents Studio still returned 200 with a - # well-formed choices[] envelope. - if tool_calls: - tc = tool_calls[0] - assert tc["function"]["name"] == "get_weather", ( - f"unexpected tool name: {tc['function']['name']!r}" - ) - args = json.loads(tc["function"]["arguments"]) - assert args.get("city"), f"missing city arg: {args}" - print(f"[tools] PASS function calling -> {tc['function']['name']}({args}) finish={choice.get('finish_reason')!r}") - else: - # Infrastructure path is correct; model output drifted. - print( - f"[tools] WARN function calling: no tool_calls (finish_reason=" - f"{choice.get('finish_reason')!r}); HTTP path OK, this is a " - f"Mac Metal quant degeneracy." - ) - - # ── 2. Server-side python tool ─────────────────────────────── - # 123 * 456 = 56088. The agentic loop streams SSE; we - # accumulate the assistant text and look for the answer. 
On - # Mac the model often loses the tool calling contract before - # producing the answer; accept either the answer OR a - # non-empty SSE stream as proof the path completes. - # macos-14 free runner is ~10 tok/s on Qwen3.5-2B Q4_K_XL; - # cap max_tokens tightly so each SSE round stays under ~30s - # even when the model stalls in a degenerate output state. - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}], - "enable_tools": True, - "enabled_tools": ["python"], - "session_id": "ci-tool-calling-py", - "temperature": TEMP, - "seed": SEED, - "max_tokens": 128, - }, timeout = 180) - if "56088" in content or "56,088" in content: - print(f"[tools] PASS python tool ({len(content)} chars, found 56088)") - else: - # Empty stream is a known Mac-quant degeneracy too; log - # but do not fail. - print( - f"[tools] WARN python tool: SSE OK ({len(content)} chars) but " - f"model didn't return 56088 -- Mac quant drift" - ) - - # NOTE: the dedicated "Server-side bash (terminal) tool" axis - # was dropped in favour of the python axis above. Both share - # the SAME server-side agentic loop wiring (only the registry - # entry differs); the python axis is the canonical proof. On - # macos-14 the duplicated SSE round was the dominant cost in - # this step, so collapsing the two saves ~30-60 s wallclock - # without losing distinct coverage. - - # ── 3. Server-side web_search tool ─────────────────────────── - # DuckDuckGo is flaky from CI runners and small Qwen3.5-2B - # may not actually search. Only assert that the SSE stream - # opens and yields any data; HTTP / parser failures already - # raise above. 
- try: - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}], - "enable_tools": True, - "enabled_tools": ["web_search"], - "session_id": "ci-tool-calling-web", - "temperature": TEMP, - "seed": SEED, - "max_tokens": 96, - }, timeout = 180) - print(f"[tools] PASS web_search stream ({len(content)} chars)") - except Exception as exc: - print(f"[tools] WARN web_search probe failed (non-blocking): {exc}") - - # ── 4. Thinking on / off ───────────────────────────────────── - # Studio strips think blocks from message.content for tools-mode - # responses, so we toggle plain chat (no enable_tools) and look - # at the surfaced reasoning_content / message.thinking field. - def thinking_call(enable): - status, data = post("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}], - "stream": False, - "enable_thinking": enable, - "temperature": TEMP, - "seed": SEED, - # 80 tokens lands within the 25-minute job timeout - # on the macos-14 free runner. 17 is small; this is - # plenty of room for either "Yes" + brief reasoning - # or a degenerate empty completion. - "max_tokens": 80, - }, timeout = 180) - assert status == 200 - msg = data["choices"][0]["message"] - # Studio surfaces thinking via reasoning_content (OpenAI - # extension). Fall back to inline markers for - # robustness across template versions. - raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "") - return raw - - on_text = thinking_call(True) - off_text = thinking_call(False) - # Mac quant drift: the model may produce empty / degenerate - # output regardless of enable_thinking. Assert ONLY that the - # endpoint returned 200 (already enforced inside thinking_call) - # and that toggling the flag doesn't surface a hard - # marker when off. 
- had_think_on = ("" in on_text) or len(on_text) > 80 - if not had_think_on: - print( - f"[tools] WARN enable_thinking=True produced no thinking signal: " - f"{on_text[:200]!r} -- Mac quant drift" - ) - # Off-mode should not contain the literal marker. - assert "" not in off_text, ( - f"enable_thinking=False but still present: {off_text!r}" - ) - print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)") - PY - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload logs - # Always upload so green runs are still reviewable. - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: tool-calling-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 - - # ───────────────────────────────────────────────────────────────────── - # Job 3: JSON, images - # ───────────────────────────────────────────────────────────────────── - json-images: - name: JSON, images - runs-on: macos-14 - timeout-minutes: 30 - env: - GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF - # Linux smoke uses UD-IQ3_XXS, but on Mac Metal that gemma-4 - # quant emits sentinel tokens () for any prompt at - # temperature=0 -- inference path is fine, the quant itself is - # broken on Metal. UD-Q4_K_XL is the smallest published variant - # that generates real text on M1. - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-4-E2B-it-UD-Q4_K_XL.gguf - MMPROJ_FILE: mmproj-F16.gguf - STUDIO_PORT: '18899' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - # Cache flat .gguf + mmproj (Job 2's pattern). 
HF_HOME inflates - # ~3.6x via xet/blobs/snapshots, which made macOS saves never land. - # mmproj is auto-detected as a sibling via detect_mmproj_file - # (studio/backend/utils/models/model_config.py). - - name: Restore GGUF + mmproj files - id: cache-gguf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-${{ env.MMPROJ_FILE }}-v2 - - - name: Verify cache contains BOTH gguf + mmproj - id: verify-cache - if: steps.cache-gguf.outputs.cache-hit == 'true' - run: | - if [[ -f "gguf-cache/$GGUF_FILE" && -f "gguf-cache/$MMPROJ_FILE" ]]; then - echo "ok=true" >> "$GITHUB_OUTPUT" - else - echo "Partial cache hit -- forcing re-download." - echo "ok=false" >> "$GITHUB_OUTPUT" - fi - - - name: Download GGUF + mmproj if cache miss or partial - id: download-gguf - if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.verify-cache.outputs.ok != 'true' - # Authenticated + parallel: shared macos-14 NAT egress stalls - # multi-GB anonymous downloads. - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p gguf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache & - MODEL_PID=$! - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE" gguf-cache & - MMPROJ_PID=$! - wait "$MODEL_PID" - wait "$MMPROJ_PID" - # Fail loud on a partial download instead of in the next step. - ls -lh "gguf-cache/$GGUF_FILE" "gguf-cache/$MMPROJ_FILE" - - # Save partial caches on cancel. hashFiles guard avoids a hard - # save failure when the download step exits with no files. The - # additional mmproj-presence check stops a partial save from - # poisoning the cache for the next run. 
- - name: Save GGUF + mmproj files - if: always() && steps.download-gguf.outcome != 'skipped' && hashFiles('gguf-cache/**/*.gguf') != '' && hashFiles(format('gguf-cache/{0}', env.MMPROJ_FILE)) != '' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-${{ env.MMPROJ_FILE }}-v2 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert install.sh used the Mac llama.cpp prebuilt - run: | - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - - - name: Install OpenAI + Anthropic Python SDKs - run: pip install 'openai>=1.50' 'anthropic>=0.40' - - - name: Reset auth + boot Studio (API-only) - # See Job 2's comment: API-only mode keeps tool_policy=None so - # response_format requests aren't routed through the agentic - # tool loop. - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health, log in, change password, load model - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - # Load via local file path; mmproj sibling auto-detected by - # detect_mmproj_file (model_config.py). gguf_variant omitted - # -- it routes through _find_local_gguf_by_variant which - # expects a directory, not a file path. 
- GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}" - MMPROJ_PATH="$GITHUB_WORKSPACE/gguf-cache/${MMPROJ_FILE}" - ls -lh "$GGUF_PATH" "$MMPROJ_PATH" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 900 \ - -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_vision}' - - - name: JSON schema decoding + image input - env: - BASE_URL: http://127.0.0.1:18899 - run: | - python - <<'PY' - import base64 - import json - import os - import urllib.request - from openai import OpenAI - from anthropic import Anthropic - - BASE = os.environ["BASE_URL"] - KEY = os.environ["API_KEY"] - SEED = 3407 - # Mac Metal degenerates these gemma-4 quants at temperature=0 - # (any prompt yields '...' padding tokens). Use a - # small non-zero temperature with the same seed so we stay - # deterministic-enough but escape the trap. - TEMP = 0.2 - - def post(path, body, *, timeout = 240): - req = urllib.request.Request( - f"{BASE}{path}", - data = json.dumps(body).encode(), - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - with urllib.request.urlopen(req, timeout = timeout) as resp: - return resp.status, json.loads(resp.read().decode()) - - # ── 1. response_format = json_object (JSON mode) ───────────── - # llama.cpp's HTTP server supports OpenAI-compatible JSON - # mode: `response_format: {"type": "json_object"}` constrains - # the model to emit syntactically-valid JSON. We use raw HTTP - # rather than the OpenAI SDK so that the field shape Studio - # forwards to llama-server is unambiguous (the SDK rewrites - # response_format depending on which variant it recognises). 
- # We deliberately do NOT pass a strict JSON schema -- on - # small Gemma-4 quants the GBNF-from-schema path occasionally - # produces empty output, and JSON mode is the surface we care - # about exposing through Studio. - status, data = post("/v1/chat/completions", { - "model": "default", - "messages": [ - {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'}, - {"role": "user", "content": "What is the capital of France?"}, - ], - "temperature": TEMP, - # Trimmed for Mac runner timeout budget; json_object - # grammar terminates quickly when working. - "max_tokens": 200, - "seed": SEED, - "stream": False, - "enable_thinking": False, - "response_format": {"type": "json_object"}, - }, timeout = 240) - assert status == 200, f"json status {status}: {data}" - # Verify the response envelope shape -- this is what we - # actually want to exercise on Mac. The model output quality - # downstream of this is a Mac-Metal-quant artefact. - assert ( - isinstance(data.get("choices"), list) - and data["choices"] - and "message" in data["choices"][0] - ), f"json response envelope malformed: {data}" - content = (data["choices"][0]["message"].get("content") or "").strip() - print(f"[json] raw json_object content: {content!r}") - # Some chat templates wrap JSON in ```json fences even in JSON - # mode -- strip those before parsing. 
- if content.startswith("```"): - content = content.split("```", 2)[1] - if content.startswith("json"): - content = content[4:] - content = content.strip("`\n ") - if content: - try: - parsed = json.loads(content) - if "paris" in str(parsed.get("city", "")).lower(): - print(f"[json] PASS json_object -> {parsed}") - else: - print(f"[json] WARN json_object decoded but city!=Paris: {parsed}") - except json.JSONDecodeError as exc: - print(f"[json] WARN json_object content not parseable ({exc}); content={content!r}") - else: - print("[json] WARN json_object produced empty content on this Mac quant") - # Cross-check: same prompt without response_format. We care - # that the inference path stays healthy (status 200 + envelope - # shape OK); model output quality is a separate concern. - status2, data2 = post("/v1/chat/completions", { - "model": "default", - "messages": [{"role": "user", "content": "What is the capital of France? Answer with one word."}], - "temperature": TEMP, - # 1-word answer doesn't need 400 tokens; trim so a - # degenerate streaming model doesn't burn through the - # job's wallclock budget. - "max_tokens": 150, - "seed": SEED, - "stream": False, - "enable_thinking": False, - }, timeout = 240) - assert status2 == 200, f"plain status {status2}: {data2}" - plain = (data2["choices"][0]["message"].get("content") or "").lower() - print(f"[json] plain capital-of-france reply: {plain!r}") - if "paris" in plain: - print("[json] PASS plain inference path (paris mentioned)") - else: - print( - f"[json] WARN plain inference returned no 'paris' -- Mac quant " - f"degeneracy. HTTP path validated separately above." - ) - - # ── 2. OpenAI image_url (data URI base64) ─────────────────── - # 64x64 solid-red PNG. stb_image (used by Studio's image - # normaliser at routes/inference.py:3410) rejects 4x4 or - # smaller PNGs as truncated, so we go up to 64x64 -- still - # tiny in token cost. 
The assertion is loose: any non-empty - # response from the vision path proves multimodal end-to-end - # wiring; small VL quants are weak at colour identification. - PNG_64X64_RED_B64 = ( - "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k" - "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA" - "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII=" - ) - data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}" - - # The Mac prebuilt llama.cpp server has a known crash when - # processing image inputs alongside the gemma-4-E2B mmproj - # (server disconnects mid-completion). This is upstream - # llama.cpp behaviour, not Studio. Wrap both SDK calls in - # try/except so an upstream crash registers as a WARN rather - # than failing the whole job. Studio's contract (OpenAI/ - # Anthropic image fields are accepted and forwarded) is - # validated by the request body Studio constructs, not by - # whether llama.cpp can decode it on Mac Metal. - client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY) - try: - openai_resp = client.chat.completions.create( - model = "default", - temperature = TEMP, - max_tokens = 80, - seed = SEED, - messages = [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_uri}}, - {"type": "text", "text": "What colour dominates this image? Reply in one word."}, - ], - }], - ) - openai_text = (openai_resp.choices[0].message.content or "").lower() - print(f"[image/openai] reply: {openai_text!r}") - if openai_text: - print("[image/openai] PASS image_url accepted, non-empty response") - else: - print("[image/openai] WARN image_url accepted but empty content -- Mac quant drift") - except Exception as exc: - print( - f"[image/openai] WARN image_url SDK call raised: {type(exc).__name__}: " - f"{exc}. Likely upstream llama.cpp Mac+vision crash, NOT a Studio " - f"regression. Studio successfully forwarded the request." - ) - - # ── 3. 
Anthropic source/base64 image ──────────────────────── - # Two SDK quirks vs. Studio: base_url must NOT include /v1 - # (the SDK appends it itself; otherwise /v1/v1/messages -> 405), - # and Studio's auth is HTTPBearer-only so the SDK's default - # x-api-key header is ignored -- send Authorization: Bearer - # via default_headers. - anthropic = Anthropic( - base_url = BASE, - api_key = "unused", - default_headers = {"Authorization": f"Bearer {KEY}"}, - ) - try: - a_msg = anthropic.messages.create( - model = "default", - max_tokens = 80, - temperature = TEMP, - extra_body = {"seed": SEED}, - messages = [{ - "role": "user", - "content": [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": PNG_64X64_RED_B64, - }, - }, - {"type": "text", "text": "Describe this image briefly."}, - ], - }], - ) - a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text") - print(f"[image/anthropic] reply: {a_text!r}") - if a_text: - print("[image/anthropic] PASS source/base64 accepted, non-empty response") - else: - print("[image/anthropic] WARN source/base64 accepted but empty content -- Mac quant drift") - except Exception as exc: - print( - f"[image/anthropic] WARN anthropic image SDK call raised: " - f"{type(exc).__name__}: {exc}. Likely upstream llama.cpp Mac+vision " - f"crash, NOT a Studio regression." - ) - PY - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - ss -tln | grep ":${STUDIO_PORT}" || true - - - name: Upload logs - # Always upload so green runs are still reviewable. 
- if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: json-images-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 diff --git a/.github/workflows/studio-mac-ui-smoke.yml b/.github/workflows/studio-mac-ui-smoke.yml deleted file mode 100644 index 510c3543d2..0000000000 --- a/.github/workflows/studio-mac-ui-smoke.yml +++ /dev/null @@ -1,343 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Mac counterpart to studio-ui-smoke.yml. Same Playwright + Chromium -# end-to-end chat UI flow, but on macos-14 (M1) so we catch -# Mac-specific frontend / backend wiring regressions that the Linux -# job would miss (e.g. the Mac Tauri shell loading the same React -# bundle, or the Mac llama.cpp prebuilt's HTTP layer behaving -# differently from the Linux build). - -name: Mac Studio UI CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - - 'tests/studio/**' - - '.github/workflows/studio-mac-ui-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - ui-smoke: - name: Chat UI Tests - runs-on: macos-14 - timeout-minutes: 35 - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18896' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO 
}} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert install.sh used the Mac llama.cpp prebuilt - run: | - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - - - name: Install Playwright + Chromium - # No --with-deps on Mac: that flag installs Linux apt packages. - # GitHub-hosted macos-14 ships the system frameworks Chromium - # needs already. - # Pinned <1.58 because all 1.55-1.58 drivers ship Node 24 on - # macos-14 and intermittently hit 'SyntaxError: Unexpected end - # of JSON input' in pipeTransport.js. Run 25491698868 showed - # the crash hitting 100% of three retry attempts -- not a - # rare race but a hard reproduction. 
Belt-and-suspenders fix: - # the test scripts pass --single-process to Chromium (see - # tests/studio/playwright_chat_ui.py) AND we patch - # pipeTransport.js below to swallow JSON parse errors instead - # of crashing the driver Node process. Both together let the - # in-script retry recover from any residual flakes. - run: | - pip install 'playwright>=1.55,<1.58' - python -m playwright install chromium - - - name: Patch Playwright pipeTransport.js to tolerate malformed JSON - # In Playwright 1.55-1.58, pipeTransport.js does - # `JSON.parse(message)` with no try/catch; when Chromium dies - # mid-write the partial buffer crashes the driver Node - # process and the test script exits with 'Connection closed - # while reading from the driver'. Newer Playwright versions - # added a try/catch upstream. Backport that here. - run: | - python - <<'PY' - import os, re, sys - import playwright - driver_dir = os.path.join(os.path.dirname(playwright.__file__), "driver", "package", "lib", "server") - path = os.path.join(driver_dir, "pipeTransport.js") - src = open(path).read() - # Wrap both `this.onmessage.call(null, JSON.parse(...))` sites in try/catch. - patched = re.sub( - r"this\.onmessage\.call\(null, JSON\.parse\((message2?)\)\);", - r"try { this.onmessage.call(null, JSON.parse(\1)); } " - r"catch (e) { /* swallow malformed JSON from a crashing browser */ }", - src, - ) - if patched == src: - # Already patched, or upstream changed -- either way, don't fail the build. - print(f"pipeTransport.js: no JSON.parse calls matched at {path}; skipping.") - else: - open(path, "w").write(patched) - print(f"pipeTransport.js: patched JSON.parse calls in {path}") - PY - - - name: Reset auth + boot Studio - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - - - name: Pass bootstrap password to the Playwright step - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "::add-mask::$NEW2" - echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV" - echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV" - - - name: Drive the chat UI with Playwright - env: - BASE_URL: http://127.0.0.1:18896 - PW_ART_DIR: logs/playwright - STUDIO_UI_STRICT: '1' - # macos-14 free runner is 3 vCPU / 7 GB / no Metal-accel - # available to llama.cpp from CI; gemma-3-270m turn latency - # has been observed to crowd the 180s default. Triple it. - STUDIO_UI_TURN_TIMEOUT_MS: '540000' - # Retry up to 3 times to absorb the racy Playwright Node 24 - # pipeTransport.js 'Unexpected end of JSON input' crash that - # fires intermittently on macos-14 free runners (Chromium - # browser process dies mid-test → driver Node process can't - # parse the truncated JSON-RPC line and exits). The retry - # FULLY resets Studio (kill, reset-password, reboot, wait - # /api/health, re-export bootstrap pw) before re-running the - # script so the change-password flow finds a fresh bootstrap. - # A real test failure (assertion / timeout) does NOT match the - # JSON pattern so it bypasses retry and surfaces immediately. 
- run: | - mkdir -p logs/playwright - attempt=1 - max_attempts=3 - while : ; do - set +e - python tests/studio/playwright_chat_ui.py 2>&1 | tee logs/playwright_attempt_${attempt}.log - rc=${PIPESTATUS[0]} - set -e - if [ "$rc" -eq 0 ]; then - break - fi - if grep -q "Unexpected end of JSON input" logs/playwright_attempt_${attempt}.log \ - && [ "$attempt" -lt "$max_attempts" ]; then - echo "::warning::Playwright pipeTransport JSON crash on attempt ${attempt}; resetting Studio and retrying..." - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - unsloth studio reset-password - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > "logs/studio_retry_${attempt}.log" 2>&1 & - STUDIO_PID=$! - echo "STUDIO_PID=$STUDIO_PID" >> "$GITHUB_ENV" - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json \ - && jq -e '.status == "healthy"' /tmp/health.json >/dev/null; then - break - fi - sleep 1 - done - STUDIO_OLD_PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - STUDIO_NEW_PW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - STUDIO_NEW2_PW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$STUDIO_OLD_PW" - echo "::add-mask::$STUDIO_NEW_PW" - echo "::add-mask::$STUDIO_NEW2_PW" - export STUDIO_OLD_PW STUDIO_NEW_PW STUDIO_NEW2_PW - attempt=$((attempt + 1)) - sleep 3 - continue - fi - exit "$rc" - done - - - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders) - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - - - name: Reset auth + boot Studio for extra UI tests (port 18897) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \ - > logs/studio_extra.log 2>&1 & - echo "STUDIO_EXTRA_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health on 18897 - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json; then - jq -e '.status == "healthy"' /tmp/health2.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health2.json - - - name: Pass bootstrap pw for extra UI test - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV" - - - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright - env: - BASE_URL: http://127.0.0.1:18897 - STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }} - STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }} - PW_ART_DIR: logs/playwright_extra - STUDIO_UI_STRICT: '1' - # See "Drive the chat UI" step. - STUDIO_UI_TURN_TIMEOUT_MS: '540000' - GGUF_REPO: ${{ env.GGUF_REPO }} - GGUF_VARIANT: ${{ env.GGUF_VARIANT }} - # Same pipeTransport JSON-crash retry shape as "Drive the chat - # UI with Playwright" -- see comment there. - run: | - mkdir -p logs/playwright_extra - attempt=1 - max_attempts=3 - while : ; do - set +e - python tests/studio/playwright_extra_ui.py 2>&1 | tee logs/playwright_extra_attempt_${attempt}.log - rc=${PIPESTATUS[0]} - set -e - if [ "$rc" -eq 0 ]; then - break - fi - if grep -q "Unexpected end of JSON input" logs/playwright_extra_attempt_${attempt}.log \ - && [ "$attempt" -lt "$max_attempts" ]; then - echo "::warning::Playwright pipeTransport JSON crash on attempt ${attempt}; resetting Studio and retrying..." - kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true - sleep 2 - unsloth studio reset-password - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \ - > "logs/studio_extra_retry_${attempt}.log" 2>&1 & - STUDIO_EXTRA_PID=$! 
- echo "STUDIO_EXTRA_PID=$STUDIO_EXTRA_PID" >> "$GITHUB_ENV" - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json \ - && jq -e '.status == "healthy"' /tmp/health2.json >/dev/null; then - break - fi - sleep 1 - done - STUDIO_OLD_PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - STUDIO_NEW_PW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$STUDIO_OLD_PW" - echo "::add-mask::$STUDIO_NEW_PW" - export STUDIO_OLD_PW STUDIO_NEW_PW - attempt=$((attempt + 1)) - sleep 3 - continue - fi - exit "$rc" - done - - - name: Stop second Studio - if: always() - run: | - kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true - sleep 2 - - - name: Upload Playwright artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: mac-studio-ui-smoke-artifacts - path: | - logs/studio.log - logs/studio_extra.log - logs/install.log - logs/playwright - logs/playwright_extra - retention-days: 7 diff --git a/.github/workflows/studio-mac-update-smoke.yml b/.github/workflows/studio-mac-update-smoke.yml deleted file mode 100644 index 07d26b9ab3..0000000000 --- a/.github/workflows/studio-mac-update-smoke.yml +++ /dev/null @@ -1,150 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Mac counterpart to studio-update-smoke.yml. Verifies that on a real -# Apple Silicon (macos-14, M1) runner: -# -# 1. install.sh --local --no-torch installs Studio AND auto-fetches -# the prebuilt llama.cpp Mac binary (llama-bNNNN-bin-macos-arm64 -# from ggml-org/llama.cpp). Hitting the source-build fallback is -# treated as an Unsloth bug -- Studio must always pick the -# prebuilt on Mac. -# 2. unsloth studio update --local is idempotent. Two consecutive -# runs both report "prebuilt up to date and validated", no -# source-build fallback. -# 3. 
The installed Studio still boots and /api/health returns -# healthy after the update path. - -name: Mac Studio Update CI - -on: - pull_request: - paths: - - 'install.sh' - - 'studio/setup.sh' - - 'studio/install_python_stack.py' - - 'studio/install_llama_prebuilt.py' - - 'studio/backend/requirements/**' - - 'unsloth_cli/commands/studio.py' - - 'pyproject.toml' - - '.github/workflows/studio-mac-update-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - update-idempotency: - name: Studio Updating Tests - runs-on: macos-14 - timeout-minutes: 30 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Assert install.sh used the Mac llama.cpp prebuilt - run: | - # Mac install must take the prebuilt path. Source-build - # fallback here is an Unsloth bug. - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if ! grep -qE "prebuilt installed and validated|prebuilt up to date and validated|bin-macos-arm64" logs/install.log; then - echo "::error::no Mac prebuilt llama.cpp marker in install.log." 
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - echo "install.sh installed the Mac prebuilt llama.cpp" - - - name: First update should be a no-op (prebuilt already validated) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -o pipefail - unsloth studio update --local 2>&1 | tee logs/update.log - if grep -q "falling back to source build" logs/update.log; then - echo "::error::studio update fell back to source-build llama.cpp on Mac." - grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60 - exit 1 - fi - if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then - echo "::error::no prebuilt up-to-date marker in update.log." - grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60 - exit 1 - fi - echo "update path took the prebuilt fast path" - - - name: Second update must also be a no-op - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -o pipefail - unsloth studio update --local 2>&1 | tee logs/update2.log - grep -q "falling back to source build" logs/update2.log && { - echo "::error::second update fell back to source build on Mac" - tail -60 logs/update2.log; exit 1; } || true - grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log - echo "second update was clean" - - - name: Boot Studio briefly to confirm the install is still usable - run: | - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \ - > logs/studio.log 2>&1 & - PID=$! 
- HEALTHY="" - for i in $(seq 1 60); do - if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then - if python3 -c "import json,sys; d=json.load(open('/tmp/health.json')); sys.exit(0 if d.get('status')=='healthy' else 1)"; then - HEALTHY=1 - break - fi - fi - sleep 1 - done - if [ -z "$HEALTHY" ]; then - echo "Studio failed to come up after \`update\`" - tail -200 logs/studio.log - kill "$PID" 2>/dev/null || true - exit 1 - fi - kill "$PID" 2>/dev/null || true - echo "post-update Studio /api/health OK" - - - name: Upload update logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: mac-studio-update-log - path: | - logs/install.log - logs/update.log - logs/update2.log - logs/studio.log - retention-days: 7 diff --git a/.github/workflows/studio-tauri-smoke.yml b/.github/workflows/studio-tauri-smoke.yml deleted file mode 100644 index 1156c264ae..0000000000 --- a/.github/workflows/studio-tauri-smoke.yml +++ /dev/null @@ -1,128 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# PR-time smoke for the Tauri desktop wrapper. Builds the frontend and the -# Tauri Linux debug binary, with no codesigning. Catches: -# - tauri.conf.json drift -# - src-tauri Cargo.toml or rust source breakage -# - Tauri CLI version drift (we pin 2.10.1, matching release-desktop.yml) -# - frontend output not picked up by Tauri's distDir -# -# Linux-only on a free `ubuntu-latest` runner. Mac and Windows desktop builds -# stay in release-desktop.yml (manual `workflow_dispatch`) because they need -# code-signing secrets and ~30 min of runner time each. - -name: Studio Tauri CI - -on: - pull_request: - paths: - - 'studio/frontend/**' - - 'studio/src-tauri/**' - # CLI rename / signature change can break Tauri's spawned - # `unsloth studio` -- include unsloth_cli in the trigger set. 
- - 'unsloth_cli/**' - - '.github/workflows/studio-tauri-smoke.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - linux-debug-build: - name: Tauri Linux debug build (no codesign) - runs-on: ubuntu-22.04 - timeout-minutes: 25 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux native deps for Tauri / WebKit2GTK - run: | - sudo apt-get update - sudo apt-get install -y \ - libwebkit2gtk-4.1-dev libayatana-appindicator3-dev \ - librsvg2-dev libxdo-dev libssl-dev patchelf - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '24' - - - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable @ 2026-03-27 - - - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2.9.1 - with: - workspaces: studio/src-tauri -> target - - - name: Install pinned Tauri CLI (matches release-desktop.yml) - # Lifecycle scripts (esbuild native-binary postinstall, etc.) are - # required for `vite build`. The pre-install lockfile structural - # audit (lockfile_supply_chain_audit.py) is the practical defence - # against the npm postinstall-dropper class -- it fires BEFORE any - # tarball runs, on the injection pattern itself rather than an - # advisory-DB lookup. 
- run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 --no-fund --no-audit - - - name: Verify pinned Tauri CLI version - run: | - out="$(npx --prefix studio tauri --version)" - echo "$out" - [ "$out" = "tauri-cli 2.10.1" ] || { echo "::error::expected tauri-cli 2.10.1, got $out"; exit 1; } - - - name: Lockfile supply-chain audit (pre-install scan) - run: python3 scripts/lockfile_supply_chain_audit.py - - - name: Frontend build (npm ci, vite) - working-directory: studio/frontend - # Lifecycle scripts (esbuild native-binary postinstall, etc.) are - # required for `vite build`. The pre-install lockfile structural - # audit (lockfile_supply_chain_audit.py) is the practical defence - # against the npm postinstall-dropper class -- it fires BEFORE any - # tarball runs, on the injection pattern itself rather than an - # advisory-DB lookup. - run: | - npm ci --no-fund --no-audit - npm run build - test -f dist/index.html - - - name: Tauri debug build (Linux, no bundle, no codesign) - # `--debug` + `--no-bundle` keeps this lean: compiles the Rust crate, - # confirms the frontend dist is wired into Tauri, but skips the AppImage - # / .deb production. Code signing is irrelevant because we never produce - # a distributable artifact. - env: - TAURI_SIGNING_PRIVATE_KEY: '' - TAURI_SIGNING_PRIVATE_KEY_PASSWORD: '' - run: npx --prefix studio tauri build --debug --no-bundle - - - name: Inspect produced binary - run: | - BIN=$(find studio/src-tauri/target/debug -maxdepth 1 -type f -executable 2>/dev/null \ - | grep -Ev '\.(d|so|dylib|dll)$' \ - | grep -Ev '/(deps|build|examples)$' \ - | head -1) - echo "binary: $BIN" - if [ -z "$BIN" ]; then - echo "::error::Tauri debug binary not produced" - ls -la studio/src-tauri/target/debug/ || true - exit 1 - fi - file "$BIN" - du -h "$BIN" - - - name: Upload Tauri debug build - # Always upload so a green run leaves the binary inspectable too. 
- if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: tauri-debug-build - path: | - studio/src-tauri/target/debug - studio/frontend/dist - retention-days: 3 diff --git a/.github/workflows/studio-ui-smoke.yml b/.github/workflows/studio-ui-smoke.yml deleted file mode 100644 index 455fe4b7e1..0000000000 --- a/.github/workflows/studio-ui-smoke.yml +++ /dev/null @@ -1,293 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# End-to-end Studio chat UI smoke via Playwright + Chromium against a -# headless Linux runner. Boots Studio with the smallest GGUF -# (gemma-3-270m-it UD-Q4_K_XL, ~254 MiB), drives the actual frontend -# bundle, and asserts the full bootstrap-password / change-password / -# send-message / persist-on-reload journey works end to end. -# -# This is the only workflow that catches regressions in the wiring -# between the React frontend and the FastAPI backend, e.g. assistant-ui -# version drift, /api/auth response shape changes, runtime-provider -# regressions, or chat-history persistence breaking. Backend-only and -# frontend-only CI happily pass while the actual user-visible UI is -# broken (cf. the 2026.5.1 chat-history release). - -name: Studio UI CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.sh' - - 'pyproject.toml' - # The Playwright test files themselves -- a PR that ONLY edits - # the test must still trigger UI CI. 
- - 'tests/studio/**' - - '.github/workflows/studio-ui-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - ui-smoke: - name: Chat UI Tests - runs-on: ubuntu-latest - timeout-minutes: 25 - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18892' - HF_HOME: ${{ github.workspace }}/hf-cache - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux deps - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Install Studio (--local, --no-torch) - env: - GH_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: Install Playwright + Chromium - run: | - pip install 'playwright>=1.45' - # --with-deps installs the OS-level runtime libs Chromium - # needs (libnss3, libxkbcommon, etc.). About 30 s on a - # warm runner. - python -m playwright install --with-deps chromium - - - name: Reset auth + boot Studio - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health - # 180 s -- a cold runner with venv warm-up + lazy imports has - # been seen to exceed 60 s. Failing the wait is more expensive - # than waiting an extra two minutes. - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - - - name: Pass bootstrap password to the Playwright step - # The Playwright test does its OWN /change-password through the - # UI (Setup your account / Choose a new password), then loads - # the model via page.evaluate against /api/inference/load with - # the JWT it got from change-password. So the only thing we - # have to hand it is the bootstrap password (so it can verify - # post-rotation that the OLD bootstrap pw now returns 401). - # - # NEW + NEW2 are generated freshly per CI run via secrets.token_urlsafe - # rather than hardcoded. If a workflow gets compromised, the - # attacker can't replay a known-good rotated password against - # any future / parallel Studio install -- the rotated value - # only ever exists for the lifetime of this single job, masked - # in the log via ::add-mask::. 
- run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "::add-mask::$NEW2" - echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV" - echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV" - - - name: Drive the chat UI with Playwright - env: - BASE_URL: http://127.0.0.1:18892 - # The test file lives in the repo so it can be run locally - # against a freshly-installed Studio (BASE_URL=...; STUDIO_OLD_PW= - # $(cat ~/.unsloth/studio/auth/.bootstrap_password); python ...). - PW_ART_DIR: logs/playwright - # Strict mode: in CI a missing button / nav / dialog must - # FAIL the test. Locally the test still runs against partial - # Studio installs without STUDIO_UI_STRICT. - STUDIO_UI_STRICT: '1' - run: | - mkdir -p logs/playwright - python tests/studio/playwright_chat_ui.py - - - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders) - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - - # The chat UI test ends by clicking the Shutdown menuitem, which - # leaves the server dead. The extra UI test (Compare / Recipes / - # Export / Studio / Settings) needs a fresh Studio, so we boot a - # second one on a different port. Boot is fast (~3-5s on the - # warm install we already did) so this adds little wall time. - - name: Reset auth + boot Studio for extra UI tests (port 18894) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18894 \ - > logs/studio_extra.log 2>&1 & - echo "STUDIO_EXTRA_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health on 18894 - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:18894/api/health" > /tmp/health2.json; then - jq -e '.status == "healthy"' /tmp/health2.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health2.json - - - name: Pass bootstrap pw for extra UI test - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV" - - - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright - env: - BASE_URL: http://127.0.0.1:18894 - STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }} - STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }} - PW_ART_DIR: logs/playwright_extra - STUDIO_UI_STRICT: '1' - GGUF_REPO: ${{ env.GGUF_REPO }} - GGUF_VARIANT: ${{ env.GGUF_VARIANT }} - run: | - mkdir -p logs/playwright_extra - python tests/studio/playwright_extra_ui.py - - - name: Stop second Studio - if: always() - run: | - kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true - sleep 2 - - # IME + multilingual paste regression (issue #5318 / PR #5327). - # Third Studio on its own port so a hang here cannot poison the - # earlier UI tests. No GGUF -- the bug surface is the composer. - - name: Reset auth + boot Studio for IME / i18n tests (port 18896) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18896 \ - > logs/studio_ime.log 2>&1 & - echo "STUDIO_IME_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health on 18896 - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:18896/api/health" > /tmp/health3.json; then - jq -e '.status == "healthy"' /tmp/health3.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health3.json - - - name: Pass bootstrap pw for IME / i18n test - # IME smoke does the change-password against the bootstrap that - # Studio's frontend injects into the page, so it only needs the - # NEW password. - run: | - NEW="CIIme-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$NEW" - echo "STUDIO_IME_NEW_PW=$NEW" >> "$GITHUB_ENV" - - - name: Drive IME + multilingual paste regression with Playwright - env: - BASE_URL: http://127.0.0.1:18896 - STUDIO_NEW_PW: ${{ env.STUDIO_IME_NEW_PW }} - PW_ART_DIR: logs/playwright_ime - STUDIO_UI_STRICT: '1' - run: | - mkdir -p logs/playwright_ime - python tests/studio/playwright_chat_ime_i18n.py - - - name: Stop third Studio - if: always() - run: | - kill "${STUDIO_IME_PID}" 2>/dev/null || true - sleep 2 - - - name: Upload Playwright artifacts - # Always upload so a green run's screenshots stay reviewable -- - # catches "passed but the UI is silently broken" regressions. - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: studio-ui-smoke-artifacts - path: | - logs/studio.log - logs/studio_extra.log - logs/studio_ime.log - logs/install.log - logs/playwright - logs/playwright_extra - logs/playwright_ime - retention-days: 7 diff --git a/.github/workflows/studio-update-smoke.yml b/.github/workflows/studio-update-smoke.yml deleted file mode 100644 index 1c353e933a..0000000000 --- a/.github/workflows/studio-update-smoke.yml +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. 
- -# Verifies that `unsloth studio update --local` is idempotent: a fresh -# install via install.sh, followed by `unsloth studio update --local`, -# succeeds and is a no-op for the llama.cpp prebuilt (it should report -# "prebuilt up to date and validated", not re-run the source build). -# -# This catches regressions in setup.sh's update path that the existing -# GGUF / wheel jobs would miss because they only invoke install.sh once. - -name: Studio Update CI - -on: - pull_request: - paths: - - 'install.sh' - - 'studio/setup.sh' - - 'studio/install_python_stack.py' - - 'studio/install_llama_prebuilt.py' - - 'studio/backend/requirements/**' - - 'unsloth_cli/commands/studio.py' - - 'pyproject.toml' - - '.github/workflows/studio-update-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - update-idempotency: - name: Studio Updating Tests - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Linux deps for llama.cpp prebuilt - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - libcurl4-openssl-dev libssl-dev jq - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - # Don't cache pip: this job runs `bash install.sh` and - # `unsloth studio update --local` which both go through - # `uv` and never populate ~/.cache/pip. setup-python's - # post-step then fatal-errors with "Cache folder path is - # retrieved for pip but doesn't exist on disk". 
- - - name: Install Studio (--local, --no-torch) - # Pass the workflow token so the llama.cpp prebuilt installer's - # GitHub-API call to list releases isn't rate-limited (60/hr - # unauthenticated). Without this, three consecutive install + - # update + update calls in this job exceed the limit and the - # prebuilt path falls back to source build. - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - mkdir -p logs - set -o pipefail - bash install.sh --local --no-torch 2>&1 | tee logs/install.log - - - name: First update should be a no-op (prebuilt already validated) - # `unsloth studio update --local` runs studio/setup.sh against - # the local repo. Right after install.sh the llama.cpp prebuilt - # has just been installed and validated, so the second run must - # take the "prebuilt up to date and validated" code path. Any - # source-build fallback or re-download here means setup.sh's - # idempotency regressed. - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -o pipefail - unsloth studio update --local 2>&1 | tee logs/update.log - if grep -q "falling back to source build" logs/update.log; then - echo "::error::studio update fell back to source-build llama.cpp on a fresh install. setup.sh idempotency regressed." - grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60 - exit 1 - fi - if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then - echo "::error::no prebuilt up-to-date marker in update.log. Did setup.sh skip the prebuilt path on update?" - grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60 - exit 1 - fi - echo "update path took the prebuilt fast path" - - - name: Second update must also be a no-op - # Two consecutive `update`s back-to-back is the usual desktop - # flow (auto-update, then user-triggered update). Asserting the - # second run is also clean rules out hidden state changes from - # the first one. 
- env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -o pipefail - unsloth studio update --local 2>&1 | tee logs/update2.log - grep -q "falling back to source build" logs/update2.log && { - echo "::error::second update fell back to source build" - tail -60 logs/update2.log; exit 1; } || true - grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log - echo "second update was clean" - - - name: Boot Studio briefly to confirm the install is still usable - # If `update --local` accidentally broke the venv or wiped the - # llama-server binary, the server would fail to start here. - run: | - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \ - > logs/studio.log 2>&1 & - PID=$! - for i in $(seq 1 60); do - if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json - break - fi - sleep 1 - done - if ! jq -e '.status == "healthy"' /tmp/health.json 2>/dev/null; then - echo "Studio failed to come up after `update`" - tail -200 logs/studio.log - kill "$PID" 2>/dev/null || true - exit 1 - fi - kill "$PID" 2>/dev/null || true - echo "post-update Studio /api/health OK" - - - name: Upload update logs - # Always upload so a green run still leaves the install + two - # update logs reviewable. - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: studio-update-log - path: | - logs/install.log - logs/update.log - logs/update2.log - logs/studio.log - retention-days: 7 diff --git a/.github/workflows/studio-windows-api-smoke.yml b/.github/workflows/studio-windows-api-smoke.yml deleted file mode 100644 index 1d12ea6f90..0000000000 --- a/.github/workflows/studio-windows-api-smoke.yml +++ /dev/null @@ -1,246 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Windows counterpart to studio-api-smoke.yml / studio-mac-api-smoke.yml. 
-# Same tests/studio/studio_api_smoke.py exercise (CORS hardening, auth -# state machine, JWT expiry, API key lifecycle, /v1/models / -# /v1/embeddings / /v1/responses, endpoint-by-endpoint auth audit) but -# on the FREE windows-latest runner. The file-mode hardening section -# (Section 6) is Linux-only and short-circuits on non-POSIX; the rest -# is platform-portable. - -name: Windows Studio API CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.ps1' - - 'pyproject.toml' - - 'tests/studio/**' - - '.github/workflows/studio-windows-api-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - api-smoke: - name: Studio API & Auth Tests - runs-on: windows-latest - timeout-minutes: 30 - defaults: - run: - shell: bash - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18895' - HF_HOME: ${{ github.workspace }}/hf-cache - # Force UTF-8 for stdio (Windows defaults to cp1252; hf - # download prints a "✓" checkmark and crashes otherwise). 
- PYTHONIOENCODING: utf-8 - PYTHONUTF8: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Pre-install Windows tweaks (npm 11 + Defender exclusions) - shell: pwsh - # See studio-windows-update-smoke.yml for the full rationale. - # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node - # reinstall, and Defender's real-time scan dominates the - # frontend / uv-pip-extract steps. - run: | - $ProgressPreference = 'SilentlyContinue' - Write-Host "npm version before upgrade: $(npm -v)" - npm install -g 'npm@^11' 2>&1 | Out-Host - Write-Host "npm version after upgrade: $(npm -v)" - # NOTE: do NOT pre-create these directories. 
See - # studio-windows-update-smoke.yml for the full rationale -- - # creating an empty studio/frontend/dist trips setup.ps1's - # mtime-based staleness check into "frontend up to date, skip - # rebuild" and Studio boots with an empty dist directory. - # Add-MpPreference accepts paths that do not yet exist. - foreach ($p in @( - "$env:USERPROFILE\.unsloth", - "$env:USERPROFILE\AppData\Local\uv", - "$env:GITHUB_WORKSPACE\studio\frontend\node_modules", - "$env:GITHUB_WORKSPACE\studio\frontend\dist" - )) { - try { - Add-MpPreference -ExclusionPath $p -ErrorAction Stop - Write-Host "Defender exclusion added: $p" - } catch { - Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p" - } - } - - - name: Install Studio (--local, --no-torch) - shell: pwsh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - New-Item -ItemType Directory -Force -Path logs | Out-Null - # *>&1 captures Write-Host (Information stream) output; - # plain 2>&1 does not. setup.ps1 emits "prebuilt installed - # and validated" via Write-Host, and we grep for that. - $ProgressPreference = 'SilentlyContinue' - & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log - - - name: Assert install.ps1 used the Windows llama.cpp prebuilt - run: | - # Filesystem-based check (setup.ps1's stream output isn't - # captured back through this parent step's pipeline; see - # studio-windows-ui-smoke.yml for full explanation). - LLAMA_DIR=~/.unsloth/llama.cpp - INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json" - BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe" - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.ps1 fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if [ ! -f "$INFO" ]; then - echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO." - ls -la "$LLAMA_DIR" || true - exit 1 - fi - if [ ! -f "$BIN" ]; then - echo "::error::no llama-server.exe at $BIN." 
- ls -la "$LLAMA_DIR/build/bin" || true - exit 1 - fi - echo "install.ps1 installed the Windows prebuilt llama.cpp:" - cat "$INFO" - - - name: Add Studio shim to GITHUB_PATH - # install.ps1's User-PATH update doesn't propagate to a - # running Git Bash session; export the shim dir so the - # next `unsloth ...` invocation finds it. - run: | - SHIM_DIR=~/.unsloth/studio/bin - if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then - echo "::error::unsloth.exe shim not found at $SHIM_DIR" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH" - - - name: Patch Studio venv with full typer / pydantic dep trees - # Belt-and-suspenders: install.ps1's --no-deps install of - # no-torch-runtime.txt drops typer's and pydantic's runtime - # deps unless explicitly pinned. Re-install the ones whose - # deps don't pull torch. - run: | - STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe - if [ ! -f "$STUDIO_PY" ]; then - echo "::error::Studio venv python not at $STUDIO_PY" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub - - - name: Install pyjwt for the JWT-expiry forge test - run: python -m pip install 'pyjwt>=2.6' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - - - name: Pass bootstrap password + rotated targets to the test - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "::add-mask::$NEW2" - echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV" - echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV" - - - name: Run Studio API & Auth tests - # Do NOT pin STUDIO_AUTH_DIR here. The Mac/Linux mirrors - # hardcode runner-specific paths (/Users/runner/..., - # /home/runner/...), but on Windows the path is - # C:\Users\runneradmin\.unsloth\studio\auth and varies by - # runner image. studio_api_smoke.py defaults to - # Path.home()/".unsloth"/"studio"/"auth" when the env is - # unset, which is correct on every OS. 
- env: - BASE_URL: http://127.0.0.1:18895 - run: python tests/studio/studio_api_smoke.py - - - name: Stop Studio - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - - - name: Upload API smoke logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: windows-studio-api-smoke-log - path: | - logs/install.log - logs/studio.log - retention-days: 7 diff --git a/.github/workflows/studio-windows-inference-smoke.yml b/.github/workflows/studio-windows-inference-smoke.yml deleted file mode 100644 index 01bf4127a7..0000000000 --- a/.github/workflows/studio-windows-inference-smoke.yml +++ /dev/null @@ -1,1167 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Three end-to-end smoke jobs that boot a freshly-installed Studio and -# exercise the surfaces real users hit through the OpenAI / Anthropic -# SDKs and curl, on the FREE windows-latest runner. Each job picks the -# smallest model that exercises the behaviour under test, primes -# HF_HOME via actions/cache, and shares the install.ps1 --local -# --no-torch bootstrap. -# -# 1. OpenAI, Anthropic API tests -# gemma-3-270m-it UD-Q4_K_XL (~254 MiB). -# 2. Tool calling Tests -# Qwen3.5-2B UD-Q4_K_XL (~890 MiB). -# 3. JSON, images -# gemma-4-E2B-it UD-Q4_K_XL + mmproj-F16 (~3.4 GiB total). -# Within the 14 GB windows-latest SSD budget. 
- -name: Windows Studio GGUF CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.ps1' - - 'pyproject.toml' - - '.github/workflows/studio-windows-inference-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - # ───────────────────────────────────────────────────────────────────── - # Job 1: OpenAI, Anthropic API tests - # ───────────────────────────────────────────────────────────────────── - openai-anthropic: - name: OpenAI, Anthropic API tests - runs-on: windows-latest - timeout-minutes: 30 - defaults: - run: - shell: bash - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18888' - HF_HOME: ${{ github.workspace }}/hf-cache - # Force UTF-8 for stdio (Windows defaults to cp1252; hf - # download / Studio CLI print "✓" checkmarks and crash - # otherwise). - PYTHONIOENCODING: utf-8 - PYTHONUTF8: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - # Split restore + save (rather than the one-step actions/cache) so a - # transient restore-side failure does not kill the whole job. v5 has a - # known flake where it logs "Cache hit for: " and then exits - # non-zero without actually extracting the archive (see - # actions/cache#1621 and github community discussion #163260). - # continue-on-error on restore masks that failure so the Prime step - # below can re-download from HF and the job keeps running. 
Save then - # populates the cache key on a real miss only; cache keys are - # immutable, so a corrupted cached entry persists until the -v1 - # suffix below is bumped. - - name: Restore HF_HOME cache for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - # Run on a real cache miss AND on the silent-restore-failure mode - # described above (outcome != success). - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME cache for ${{ env.GGUF_REPO }} - # Only write a fresh cache entry when we actually rebuilt the - # directory (Prime ran and succeeded). Skipping when Prime is - # skipped avoids "already exists" save warnings on the happy path. - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Pre-install Windows tweaks (npm 11 + Defender exclusions) - shell: pwsh - # See studio-windows-update-smoke.yml for the full rationale. - # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node - # reinstall, and Defender's real-time scan dominates the - # frontend / uv-pip-extract steps. - run: | - $ProgressPreference = 'SilentlyContinue' - Write-Host "npm version before upgrade: $(npm -v)" - npm install -g 'npm@^11' 2>&1 | Out-Host - Write-Host "npm version after upgrade: $(npm -v)" - # NOTE: do NOT pre-create these directories. 
See - # studio-windows-update-smoke.yml for the full rationale -- - # creating an empty studio/frontend/dist trips setup.ps1's - # mtime-based staleness check into "frontend up to date, skip - # rebuild" and Studio boots with an empty dist directory. - # Add-MpPreference accepts paths that do not yet exist. - foreach ($p in @( - "$env:USERPROFILE\.unsloth", - "$env:USERPROFILE\AppData\Local\uv", - "$env:GITHUB_WORKSPACE\studio\frontend\node_modules", - "$env:GITHUB_WORKSPACE\studio\frontend\dist" - )) { - try { - Add-MpPreference -ExclusionPath $p -ErrorAction Stop - Write-Host "Defender exclusion added: $p" - } catch { - Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p" - } - } - - - name: Install Studio (--local, --no-torch) - shell: pwsh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - New-Item -ItemType Directory -Force -Path logs | Out-Null - # *>&1 captures Write-Host (Information stream) output; - # plain 2>&1 does not. setup.ps1 emits "prebuilt installed - # and validated" via Write-Host, and we grep for that. - $ProgressPreference = 'SilentlyContinue' - & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log - - - name: Assert install.ps1 used the Windows llama.cpp prebuilt - run: | - # Filesystem check; setup.ps1's stream output isn't captured. - LLAMA_DIR=~/.unsloth/llama.cpp - INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json" - BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe" - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.ps1 fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if [ ! -f "$INFO" ]; then - echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO." - ls -la "$LLAMA_DIR" || true - exit 1 - fi - if [ ! -f "$BIN" ]; then - echo "::error::no llama-server.exe at $BIN." 
- ls -la "$LLAMA_DIR/build/bin" || true - exit 1 - fi - echo "install.ps1 installed the Windows prebuilt llama.cpp:" - cat "$INFO" - - - name: Add Studio shim to GITHUB_PATH - run: | - SHIM_DIR=~/.unsloth/studio/bin - if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then - echo "::error::unsloth.exe shim not found at $SHIM_DIR" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH" - - - name: Patch Studio venv with full typer / pydantic dep trees - # Belt-and-suspenders: install.ps1's --no-deps install of - # no-torch-runtime.txt drops typer's and pydantic's runtime - # deps unless explicitly pinned. Re-install the ones whose - # deps don't pull torch. - run: | - STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe - if [ ! -f "$STUDIO_PY" ]; then - echo "::error::Studio venv python not at $STUDIO_PY" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub - - - name: Install OpenAI + Anthropic Python SDKs - run: python -m pip install 'openai>=1.50' 'anthropic>=0.40' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json - exit 0 - fi - sleep 1 - done - echo "Studio did not become healthy in 180s" - tail -200 logs/studio.log - exit 1 - - - name: Password rotation (old must fail, new must work) - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; } - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \ - -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}") - if [ "$OLD_STATUS" != "401" ]; then - echo "::error::Login with old password returned $OLD_STATUS, expected 401" - exit 1 - fi - NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; } - echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV" - echo "password rotation OK (old=401, new=200)" - - - name: Load the GGUF (HF repo + variant, served from HF_HOME cache) - run: | - curl -fs -X POST 
"http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_gguf, context_length}' - - - name: Multi-turn determinism via OpenAI + Anthropic SDKs - env: - BASE_URL: http://127.0.0.1:18888 - run: | - python - <<'PY' - import json - import os - from openai import OpenAI - from anthropic import Anthropic - - BASE = os.environ["BASE_URL"] - KEY = os.environ["TOKEN"] - SEED = 3407 - - PROMPTS = [ - "What is 1+1?", - "What did I ask before?", - "What is the capital of France?", - "Repeat the city name", - ] - - def run_openai(): - client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY) - history, replies = [], [] - for prompt in PROMPTS: - history.append({"role": "user", "content": prompt}) - resp = client.chat.completions.create( - model = "default", - messages = history, - temperature = 0.0, - max_tokens = 80, - seed = SEED, - extra_body = {"enable_thinking": False}, - ) - text = resp.choices[0].message.content or "" - replies.append(text) - history.append({"role": "assistant", "content": text}) - return replies - - def run_anthropic(): - client = Anthropic( - base_url = BASE, - api_key = "unused", - default_headers = {"Authorization": f"Bearer {KEY}"}, - ) - history, replies = [], [] - for prompt in PROMPTS: - history.append({"role": "user", "content": prompt}) - msg = client.messages.create( - model = "default", - max_tokens = 80, - messages = history, - temperature = 0.0, - extra_body = {"seed": SEED, "enable_thinking": False}, - ) - text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text") - replies.append(text) - history.append({"role": "assistant", "content": text}) - return replies - - for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)): - first = runner() - second = runner() - 
for i, (a, b) in enumerate(zip(first, second), start = 1): - print(f"[{label} turn {i}] {a!r}") - assert a, f"{label}: empty turn {i} response" - assert a == b, ( - f"{label} non-deterministic at turn {i} with temperature=0.0:\n" - f" run1: {a!r}\n run2: {b!r}" - ) - joined = " ".join(first).lower() - assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}" - assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}" - print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded") - PY - - - name: Stop Studio - if: always() - # Run as cmd so we are not running through the Git Bash shell; - # Git Bash on windows-latest has been observed to exit 143 - # (SIGTERM) from any inline kill/sleep block, masking a green - # test run. The runner reclaims the Studio child process at - # job end either way, so just emit a marker and exit 0. - shell: cmd - run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end) - - - name: Upload logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: windows-openai-anthropic-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 - - # ───────────────────────────────────────────────────────────────────── - # Job 2: Tool calling Tests - # ───────────────────────────────────────────────────────────────────── - tool-calling: - name: Tool calling Tests - runs-on: windows-latest - timeout-minutes: 30 - defaults: - run: - shell: bash - env: - # Tool calling is the highest-volume GGUF in this workflow - # (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB). The previous HF_HOME - # cache stored xet chunks + blobs + snapshots = ~4.7 GiB -- - # 3.7x file-size inflation, dominating the post-step upload - # (211 s on first run; subsequent runs hit the cache, but the - # one-time cost recurs every time the cache key bumps). 
Use - # main's `--local-dir gguf-cache` pattern: cache the flat .gguf - # only, pass an absolute path to Studio's /api/inference/load. - # The OpenAI/Anth and JSON+images jobs still cover the - # gguf_variant resolution path. - GGUF_REPO: unsloth/Qwen3.5-2B-GGUF - GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf - STUDIO_PORT: '18898' - # Force UTF-8 for stdio (Windows defaults to cp1252; hf - # download / Studio CLI print "✓" checkmarks and crash - # otherwise). - PYTHONIOENCODING: utf-8 - PYTHONUTF8: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - # Split restore + save so a transient restore-side failure does not - # kill the whole job. See the matching block in the tool-calling job - # above for the full rationale (actions/cache#1621). 
- - name: Restore GGUF model cache - id: cache-gguf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Download GGUF if cache miss - id: download-gguf - if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p gguf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache - - - name: Save GGUF model cache - if: always() && steps.download-gguf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: gguf-cache - key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1 - - - name: Pre-install Windows tweaks (npm 11 + Defender exclusions) - shell: pwsh - # See studio-windows-update-smoke.yml for the full rationale. - # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node - # reinstall, and Defender's real-time scan dominates the - # frontend / uv-pip-extract steps. - run: | - $ProgressPreference = 'SilentlyContinue' - Write-Host "npm version before upgrade: $(npm -v)" - npm install -g 'npm@^11' 2>&1 | Out-Host - Write-Host "npm version after upgrade: $(npm -v)" - # NOTE: do NOT pre-create these directories. See - # studio-windows-update-smoke.yml for the full rationale -- - # creating an empty studio/frontend/dist trips setup.ps1's - # mtime-based staleness check into "frontend up to date, skip - # rebuild" and Studio boots with an empty dist directory. - # Add-MpPreference accepts paths that do not yet exist. 
- foreach ($p in @( - "$env:USERPROFILE\.unsloth", - "$env:USERPROFILE\AppData\Local\uv", - "$env:GITHUB_WORKSPACE\studio\frontend\node_modules", - "$env:GITHUB_WORKSPACE\studio\frontend\dist" - )) { - try { - Add-MpPreference -ExclusionPath $p -ErrorAction Stop - Write-Host "Defender exclusion added: $p" - } catch { - Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p" - } - } - - - name: Install Studio (--local, --no-torch) - shell: pwsh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - New-Item -ItemType Directory -Force -Path logs | Out-Null - # *>&1 captures Write-Host (Information stream) output; - # plain 2>&1 does not. setup.ps1 emits "prebuilt installed - # and validated" via Write-Host, and we grep for that. - $ProgressPreference = 'SilentlyContinue' - & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log - - - name: Assert install.ps1 used the Windows llama.cpp prebuilt - run: | - # Filesystem check; setup.ps1's stream output isn't captured. - LLAMA_DIR=~/.unsloth/llama.cpp - INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json" - BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe" - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.ps1 fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if [ ! -f "$INFO" ]; then - echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO." - ls -la "$LLAMA_DIR" || true - exit 1 - fi - if [ ! -f "$BIN" ]; then - echo "::error::no llama-server.exe at $BIN." - ls -la "$LLAMA_DIR/build/bin" || true - exit 1 - fi - echo "install.ps1 installed the Windows prebuilt llama.cpp:" - cat "$INFO" - - - name: Add Studio shim to GITHUB_PATH - run: | - SHIM_DIR=~/.unsloth/studio/bin - if [ ! 
-f "$SHIM_DIR/unsloth.exe" ]; then - echo "::error::unsloth.exe shim not found at $SHIM_DIR" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH" - - - name: Patch Studio venv with full typer / pydantic dep trees - # Belt-and-suspenders: install.ps1's --no-deps install of - # no-torch-runtime.txt drops typer's and pydantic's runtime - # deps unless explicitly pinned. Re-install the ones whose - # deps don't pull torch. - run: | - STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe - if [ ! -f "$STUDIO_PY" ]; then - echo "::error::Studio venv python not at $STUDIO_PY" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub - - - name: Reset auth + boot Studio (API-only, default tool policy) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health, log in, change password, load model - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - TOKEN=$(curl -fs -X POST 
"http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - # GITHUB_WORKSPACE on windows-latest is a Windows path with - # backslashes ("D:\a\unsloth\unsloth"). Bash handles it as a - # raw string, but we cannot embed `\a` etc. in JSON without - # JSON-string-escaping every backslash. Replace `\` with `/` - # via bash parameter expansion -- pathlib.Path on Windows - # accepts forward slashes natively, so Studio's loader sees - # a normal path. - GGUF_PATH="${GITHUB_WORKSPACE//\\//}/gguf-cache/${GGUF_FILE}" - ls -lh "$GGUF_PATH" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name}' - - - name: Tool calling, server-side tools, thinking on/off - env: - BASE_URL: http://127.0.0.1:18898 - run: | - python - <<'PY' - import json - import os - import urllib.request - - BASE = os.environ["BASE_URL"] - KEY = os.environ["API_KEY"] - SEED = 3407 - # Same temperature shim as the Mac job. Small Qwen3.5-2B - # quants can degenerate at temperature=0; a small non-zero - # temperature with a fixed seed keeps the test deterministic - # while escaping the trap. 
- TEMP = 0.2 - - def post(path, body, *, timeout = 240): - data = json.dumps(body).encode() - req = urllib.request.Request( - f"{BASE}{path}", - data = data, - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - with urllib.request.urlopen(req, timeout = timeout) as resp: - return resp.status, json.loads(resp.read().decode()) - - def post_sse(path, body, *, timeout = 600): - body = {**body, "stream": True} - data = json.dumps(body).encode() - req = urllib.request.Request( - f"{BASE}{path}", - data = data, - method = "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - parts = [] - with urllib.request.urlopen(req, timeout = timeout) as resp: - for raw in resp: - line = raw.decode().strip() - if not line.startswith("data: "): - continue - payload = line[6:] - if payload == "[DONE]": - break - try: - chunk = json.loads(payload) - except json.JSONDecodeError: - continue - for choice in chunk.get("choices", []): - delta = choice.get("delta", {}) or {} - if delta.get("content"): - parts.append(delta["content"]) - return "".join(parts) - - # ── 1. 
Standard OpenAI function calling ────────────────────── - weather_tool = { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather for a city.", - "parameters": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - }, - } - - status, data = post("/v1/chat/completions", { - "messages": [{"role": "user", "content": "What is the weather in Paris?"}], - "tools": [weather_tool], - "tool_choice": "required", - "stream": False, - "temperature": TEMP, - "seed": SEED, - "max_tokens": 600, - }) - assert status == 200, f"tool call status {status}: {data}" - choice = data["choices"][0] - tool_calls = (choice.get("message") or {}).get("tool_calls") or [] - if tool_calls: - tc = tool_calls[0] - assert tc["function"]["name"] == "get_weather", ( - f"unexpected tool name: {tc['function']['name']!r}" - ) - args = json.loads(tc["function"]["arguments"]) - assert args.get("city"), f"missing city arg: {args}" - print(f"[tools] PASS function calling -> {tc['function']['name']}({args}) finish={choice.get('finish_reason')!r}") - else: - print( - f"[tools] WARN function calling: no tool_calls (finish_reason=" - f"{choice.get('finish_reason')!r}); HTTP path OK, model output drift." - ) - - # ── 2. Server-side python tool ─────────────────────────────── - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}], - "enable_tools": True, - "enabled_tools": ["python"], - "session_id": "ci-tool-calling-py", - "temperature": TEMP, - "seed": SEED, - "max_tokens": 600, - }) - if "56088" in content or "56,088" in content: - print(f"[tools] PASS python tool ({len(content)} chars, found 56088)") - else: - assert content, "python tool: SSE stream empty" - print( - f"[tools] WARN python tool: SSE OK ({len(content)} chars) but " - f"model didn't return 56088 -- model output drift" - ) - - # ── 3. 
Server-side bash (terminal) tool ────────────────────── - # On Windows the terminal tool resolves to the system shell - # (cmd.exe wrapper) and `echo hello-bash-tool` works the same - # way it does on POSIX. The model still has to choose to - # invoke the tool; assert non-empty SSE if it doesn't. - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output."}], - "enable_tools": True, - "enabled_tools": ["terminal"], - "session_id": "ci-tool-calling-bash", - "temperature": TEMP, - "seed": SEED, - "max_tokens": 600, - }) - if "hello-bash-tool" in content: - print(f"[tools] PASS terminal tool ({len(content)} chars)") - else: - assert content, "terminal tool: SSE stream empty" - print( - f"[tools] WARN terminal tool: SSE OK ({len(content)} chars) but " - f"model didn't echo 'hello-bash-tool' -- model output drift" - ) - - # ── 4. Server-side web_search tool ─────────────────────────── - # DuckDuckGo can be flaky from CI runners; only assert that - # the SSE stream opens and yields any data. - try: - content = post_sse("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}], - "enable_tools": True, - "enabled_tools": ["web_search"], - "session_id": "ci-tool-calling-web", - "temperature": TEMP, - "seed": SEED, - "max_tokens": 400, - }) - print(f"[tools] PASS web_search stream ({len(content)} chars)") - except Exception as exc: - print(f"[tools] WARN web_search probe failed (non-blocking): {exc}") - - # ── 5. 
Thinking on / off ───────────────────────────────────── - def thinking_call(enable): - status, data = post("/v1/chat/completions", { - "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}], - "stream": False, - "enable_thinking": enable, - "temperature": TEMP, - "seed": SEED, - "max_tokens": 300, - }) - assert status == 200 - msg = data["choices"][0]["message"] - raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "") - return raw - - on_text = thinking_call(True) - off_text = thinking_call(False) - had_think_on = ("<think>" in on_text) or len(on_text) > 80 - if not had_think_on: - print( - f"[tools] WARN enable_thinking=True produced no thinking signal: " - f"{on_text[:200]!r}" - ) - assert "<think>" not in off_text, ( - f"enable_thinking=False but <think> still present: {off_text!r}" - ) - print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)") - PY - - - name: Stop Studio - if: always() - # Run as cmd so we are not running through the Git Bash shell; - # Git Bash on windows-latest has been observed to exit 143 - # (SIGTERM) from any inline kill/sleep block, masking a green - # test run. The runner reclaims the Studio child process at - # job end either way, so just emit a marker and exit 0. 
- shell: cmd - run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end) - - - name: Upload logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: windows-tool-calling-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 - - # ───────────────────────────────────────────────────────────────────── - # Job 3: JSON, images - # ───────────────────────────────────────────────────────────────────── - json-images: - name: JSON, images - runs-on: windows-latest - timeout-minutes: 35 - defaults: - run: - shell: bash - env: - GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-4-E2B-it-UD-Q4_K_XL.gguf - MMPROJ_FILE: mmproj-F16.gguf - STUDIO_PORT: '18899' - HF_HOME: ${{ github.workspace }}/hf-cache - # Force UTF-8 for stdio (Windows defaults to cp1252; hf - # download / Studio CLI print "✓" checkmarks and crash - # otherwise). - PYTHONIOENCODING: utf-8 - PYTHONUTF8: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - # Split restore + save so a transient restore-side failure does not - # kill the whole job. See the matching block in the tool-calling job - # for the full rationale (actions/cache#1621). This is the block that - # actually broke in run 25713577488: "Cache hit for: " was - # logged, the step exited non-zero in ~0.3 s without extracting the - # 3.4 GiB archive, and steps 6-15 were skipped. 
- - name: Restore HF_HOME cache for ${{ env.GGUF_REPO }} (model + mmproj) - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1 - - - name: Prime HF_HOME with the GGUF + mmproj - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE" - - - name: Save HF_HOME cache for ${{ env.GGUF_REPO }} (model + mmproj) - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1 - - - name: Pre-install Windows tweaks (npm 11 + Defender exclusions) - shell: pwsh - # See studio-windows-update-smoke.yml for the full rationale. - # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node - # reinstall, and Defender's real-time scan dominates the - # frontend / uv-pip-extract steps. - run: | - $ProgressPreference = 'SilentlyContinue' - Write-Host "npm version before upgrade: $(npm -v)" - npm install -g 'npm@^11' 2>&1 | Out-Host - Write-Host "npm version after upgrade: $(npm -v)" - # NOTE: do NOT pre-create these directories. See - # studio-windows-update-smoke.yml for the full rationale -- - # creating an empty studio/frontend/dist trips setup.ps1's - # mtime-based staleness check into "frontend up to date, skip - # rebuild" and Studio boots with an empty dist directory. - # Add-MpPreference accepts paths that do not yet exist. 
- foreach ($p in @( - "$env:USERPROFILE\.unsloth", - "$env:USERPROFILE\AppData\Local\uv", - "$env:GITHUB_WORKSPACE\studio\frontend\node_modules", - "$env:GITHUB_WORKSPACE\studio\frontend\dist" - )) { - try { - Add-MpPreference -ExclusionPath $p -ErrorAction Stop - Write-Host "Defender exclusion added: $p" - } catch { - Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p" - } - } - - - name: Install Studio (--local, --no-torch) - shell: pwsh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - New-Item -ItemType Directory -Force -Path logs | Out-Null - # *>&1 captures Write-Host (Information stream) output; - # plain 2>&1 does not. setup.ps1 emits "prebuilt installed - # and validated" via Write-Host, and we grep for that. - $ProgressPreference = 'SilentlyContinue' - & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log - - - name: Assert install.ps1 used the Windows llama.cpp prebuilt - run: | - # Filesystem check; setup.ps1's stream output isn't captured. - LLAMA_DIR=~/.unsloth/llama.cpp - INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json" - BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe" - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.ps1 fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if [ ! -f "$INFO" ]; then - echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO." - ls -la "$LLAMA_DIR" || true - exit 1 - fi - if [ ! -f "$BIN" ]; then - echo "::error::no llama-server.exe at $BIN." - ls -la "$LLAMA_DIR/build/bin" || true - exit 1 - fi - echo "install.ps1 installed the Windows prebuilt llama.cpp:" - cat "$INFO" - - - name: Add Studio shim to GITHUB_PATH - run: | - SHIM_DIR=~/.unsloth/studio/bin - if [ ! 
-f "$SHIM_DIR/unsloth.exe" ]; then - echo "::error::unsloth.exe shim not found at $SHIM_DIR" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH" - - - name: Patch Studio venv with full typer / pydantic dep trees - # Belt-and-suspenders: install.ps1's --no-deps install of - # no-torch-runtime.txt drops typer's and pydantic's runtime - # deps unless explicitly pinned. Re-install the ones whose - # deps don't pull torch. - run: | - STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe - if [ ! -f "$STUDIO_PY" ]; then - echo "::error::Studio venv python not at $STUDIO_PY" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub - - - name: Install OpenAI + Anthropic Python SDKs - run: python -m pip install 'openai>=1.50' 'anthropic>=0.40' - - - name: Reset auth + boot Studio (API-only) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" 
>> "$GITHUB_ENV" - - - name: Wait for /api/health, log in, change password, load model - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token) - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \ - -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \ - -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null - TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \ - -H 'content-type: application/json' \ - -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) - echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 900 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_vision}' - - - name: JSON schema decoding + image input - env: - BASE_URL: http://127.0.0.1:18899 - run: | - python - <<'PY' - import base64 - import json - import os - import urllib.request - from openai import OpenAI - from anthropic import Anthropic - - BASE = os.environ["BASE_URL"] - KEY = os.environ["API_KEY"] - SEED = 3407 - TEMP = 0.2 - - def post(path, body, *, timeout = 240): - req = urllib.request.Request( - f"{BASE}{path}", - data = json.dumps(body).encode(), - method 
= "POST", - headers = { - "Authorization": f"Bearer {KEY}", - "Content-Type": "application/json", - }, - ) - with urllib.request.urlopen(req, timeout = timeout) as resp: - return resp.status, json.loads(resp.read().decode()) - - # ── 1. response_format = json_object (JSON mode) ───────────── - status, data = post("/v1/chat/completions", { - "model": "default", - "messages": [ - {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'}, - {"role": "user", "content": "What is the capital of France?"}, - ], - "temperature": TEMP, - "max_tokens": 600, - "seed": SEED, - "stream": False, - "enable_thinking": False, - "response_format": {"type": "json_object"}, - }, timeout = 600) - assert status == 200, f"json status {status}: {data}" - assert ( - isinstance(data.get("choices"), list) - and data["choices"] - and "message" in data["choices"][0] - ), f"json response envelope malformed: {data}" - content = (data["choices"][0]["message"].get("content") or "").strip() - print(f"[json] raw json_object content: {content!r}") - if content.startswith("```"): - content = content.split("```", 2)[1] - if content.startswith("json"): - content = content[4:] - content = content.strip("`\n ") - if content: - try: - parsed = json.loads(content) - if "paris" in str(parsed.get("city", "")).lower(): - print(f"[json] PASS json_object -> {parsed}") - else: - print(f"[json] WARN json_object decoded but city!=Paris: {parsed}") - except json.JSONDecodeError as exc: - print(f"[json] WARN json_object content not parseable ({exc}); content={content!r}") - else: - print("[json] WARN json_object produced empty content") - - status2, data2 = post("/v1/chat/completions", { - "model": "default", - "messages": [{"role": "user", "content": "What is the capital of France? 
Answer with one word."}], - "temperature": TEMP, - "max_tokens": 400, - "seed": SEED, - "stream": False, - "enable_thinking": False, - }, timeout = 600) - assert status2 == 200, f"plain status {status2}: {data2}" - plain = (data2["choices"][0]["message"].get("content") or "").lower() - print(f"[json] plain capital-of-france reply: {plain!r}") - if "paris" in plain: - print("[json] PASS plain inference path (paris mentioned)") - else: - print( - f"[json] WARN plain inference returned no 'paris' -- " - f"model output drift. HTTP path validated separately above." - ) - - # ── 2. OpenAI image_url (data URI base64) ─────────────────── - PNG_64X64_RED_B64 = ( - "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k" - "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA" - "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII=" - ) - data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}" - - # On Windows + the gemma-4-E2B mmproj, llama.cpp's vision - # path runs on CPU (no Metal involvement). The wrapper is - # kept for resilience but the vision path is expected to - # work on Windows; an exception here is a real regression. - client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY) - try: - openai_resp = client.chat.completions.create( - model = "default", - temperature = TEMP, - max_tokens = 80, - seed = SEED, - messages = [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_uri}}, - {"type": "text", "text": "What colour dominates this image? Reply in one word."}, - ], - }], - ) - openai_text = (openai_resp.choices[0].message.content or "").lower() - print(f"[image/openai] reply: {openai_text!r}") - if openai_text: - print("[image/openai] PASS image_url accepted, non-empty response") - else: - print("[image/openai] WARN image_url accepted but empty content") - except Exception as exc: - print( - f"[image/openai] WARN image_url SDK call raised: {type(exc).__name__}: " - f"{exc}. 
Studio successfully forwarded the request; failure here is " - f"upstream llama.cpp vision behaviour." - ) - - # ── 3. Anthropic source/base64 image ──────────────────────── - anthropic = Anthropic( - base_url = BASE, - api_key = "unused", - default_headers = {"Authorization": f"Bearer {KEY}"}, - ) - try: - a_msg = anthropic.messages.create( - model = "default", - max_tokens = 80, - temperature = TEMP, - extra_body = {"seed": SEED}, - messages = [{ - "role": "user", - "content": [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": PNG_64X64_RED_B64, - }, - }, - {"type": "text", "text": "Describe this image briefly."}, - ], - }], - ) - a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text") - print(f"[image/anthropic] reply: {a_text!r}") - if a_text: - print("[image/anthropic] PASS source/base64 accepted, non-empty response") - else: - print("[image/anthropic] WARN source/base64 accepted but empty content") - except Exception as exc: - print( - f"[image/anthropic] WARN anthropic image SDK call raised: " - f"{type(exc).__name__}: {exc}. Likely upstream llama.cpp vision " - f"behaviour, NOT a Studio regression." - ) - PY - - - name: Stop Studio - if: always() - # Run as cmd so we are not running through the Git Bash shell; - # Git Bash on windows-latest has been observed to exit 143 - # (SIGTERM) from any inline kill/sleep block, masking a green - # test run. The runner reclaims the Studio child process at - # job end either way, so just emit a marker and exit 0. 
- shell: cmd - run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end) - - - name: Upload logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: windows-json-images-log - path: | - logs/studio.log - logs/install.log - retention-days: 7 diff --git a/.github/workflows/studio-windows-ui-smoke.yml b/.github/workflows/studio-windows-ui-smoke.yml deleted file mode 100644 index e5ab9f8ab7..0000000000 --- a/.github/workflows/studio-windows-ui-smoke.yml +++ /dev/null @@ -1,342 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Windows counterpart to studio-ui-smoke.yml / studio-mac-ui-smoke.yml. -# Same Playwright + Chromium end-to-end chat UI flow + extra UI flow, -# but on the FREE windows-latest runner so we catch Windows-specific -# regressions in the install path (install.ps1), the Studio CLI's -# Windows process-management branches, and the llama.cpp prebuilt's -# Windows HTTP layer. - -name: Windows Studio UI CI - -on: - pull_request: - paths: - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - 'install.ps1' - - 'pyproject.toml' - - 'tests/studio/**' - - '.github/workflows/studio-windows-ui-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - ui-smoke: - name: Chat UI Tests - runs-on: windows-latest - timeout-minutes: 45 - # Default every step's shell to Git Bash. windows-latest's default - # shell is pwsh; without this each curl / heredoc / `kill $PID` - # step would need its own `shell: bash`. Steps that genuinely - # need PowerShell (install.ps1 invocation) override per-step. 
- defaults: - run: - shell: bash - env: - GGUF_REPO: unsloth/gemma-3-270m-it-GGUF - GGUF_VARIANT: UD-Q4_K_XL - GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf - STUDIO_PORT: '18896' - HF_HOME: ${{ github.workspace }}/hf-cache - # Force UTF-8 for stdio so Python tools (hf download, Studio - # CLI, etc.) can print Unicode characters like the success - # checkmark "✓". Windows defaults to cp1252 / charmap and - # any tool that prints "OK ✓" hits a UnicodeEncodeError. - PYTHONIOENCODING: utf-8 - PYTHONUTF8: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - # No `cache: 'npm'`. setup-node's npm cache restore silently - # aborts the entire job on Windows runners when the npm cache - # path (`C:\npm\cache` per `npm config get cache`) doesn't yet - # exist on a fresh runner -- the step exits without an error - # message and every following step gets skipped. See - # npm/cli#7308. The frontend `npm ci` is fast enough without - # the cache that the reliability gain is worth the ~30s. - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - # No `cache: 'pip'`. install.ps1 / setup.ps1 use uv and - # never populate ~/.cache/pip; setup-python's post-step - # then fatal-errors with "Cache folder path is retrieved - # for pip but doesn't exist on disk". 
- - - name: Restore HF_HOME for ${{ env.GGUF_REPO }} - id: cache-hf - uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - continue-on-error: true - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Prime HF_HOME with the GGUF - id: prime-hf - if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - python -m pip install --upgrade huggingface_hub - mkdir -p hf-cache - bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" - - - name: Save HF_HOME for ${{ env.GGUF_REPO }} - if: always() && steps.prime-hf.outcome == 'success' - uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: hf-cache - key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1 - - - name: Pre-install Windows tweaks (npm 11 + Defender exclusions) - shell: pwsh - # See studio-windows-update-smoke.yml for the full rationale. - # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node - # reinstall, and Defender's real-time scan dominates the - # frontend / uv-pip-extract steps. - run: | - $ProgressPreference = 'SilentlyContinue' - Write-Host "npm version before upgrade: $(npm -v)" - npm install -g 'npm@^11' 2>&1 | Out-Host - Write-Host "npm version after upgrade: $(npm -v)" - # NOTE: do NOT pre-create these directories. See - # studio-windows-update-smoke.yml for the full rationale -- - # creating an empty studio/frontend/dist trips setup.ps1's - # mtime-based staleness check into "frontend up to date, skip - # rebuild" and Studio boots with an empty dist directory. - # Add-MpPreference accepts paths that do not yet exist. 
- foreach ($p in @( - "$env:USERPROFILE\.unsloth", - "$env:USERPROFILE\AppData\Local\uv", - "$env:GITHUB_WORKSPACE\studio\frontend\node_modules", - "$env:GITHUB_WORKSPACE\studio\frontend\dist" - )) { - try { - Add-MpPreference -ExclusionPath $p -ErrorAction Stop - Write-Host "Defender exclusion added: $p" - } catch { - Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p" - } - } - - - name: Install Studio (--local, --no-torch) - # install.ps1 is the supported Windows installer. install.sh - # has no Windows branch (apt-get / brew calls). The PS1 - # script's `Install-UnslothStudio @args` line at the bottom - # forwards `--local --no-torch` correctly. - shell: pwsh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - New-Item -ItemType Directory -Force -Path logs | Out-Null - # *>&1 redirects ALL PowerShell streams (stdout, stderr, - # warning, verbose, debug, information) into the success - # stream so Tee-Object captures everything. install.ps1 - # and setup.ps1 emit step/substep markers via Write-Host - # which lands on the Information stream (PS 5+); without - # the wildcard redirect, those markers (including - # "prebuilt installed and validated") never reach - # logs/install.log and the post-step grep asserter fails. - $ProgressPreference = 'SilentlyContinue' - & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log - - - name: Assert install.ps1 used the Windows llama.cpp prebuilt - run: | - # install.ps1's setup.ps1 child writes "prebuilt installed - # and validated" to its own console host -- that output - # does NOT come back through this parent step's stdout - # pipeline (no matter how aggressively we redirect: *>&1, - # tee, etc.). Verify the install via the filesystem - # instead. setup.ps1 writes UNSLOTH_PREBUILT_INFO.json - # next to the install dir on success, and lays the - # binaries under build/bin/Release/ on Windows. 
- STUDIO_HOME=~/.unsloth/studio - LLAMA_DIR=~/.unsloth/llama.cpp - INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json" - BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe" - # Source-build fallback grep stays as a fast bail-out. - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.ps1 fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if [ ! -f "$INFO" ]; then - echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO; setup.ps1 didn't install the prebuilt." - ls -la "$LLAMA_DIR" || true - exit 1 - fi - if [ ! -f "$BIN" ]; then - echo "::error::no llama-server.exe at $BIN; prebuilt extraction incomplete." - ls -la "$LLAMA_DIR/build/bin" || true - ls -la "$LLAMA_DIR/build/bin/Release" || true - exit 1 - fi - echo "install.ps1 installed the Windows prebuilt llama.cpp:" - cat "$INFO" - - - name: Add Studio shim to GITHUB_PATH - # install.ps1 puts unsloth.exe at $StudioHome\bin\unsloth.exe - # and adds that dir to the User PATH via the Windows registry. - # Registry-level PATH updates don't propagate to a running - # Git Bash session, so the next step's `unsloth ...` invocation - # would hit "command not found". Re-export the shim dir to - # GITHUB_PATH so every subsequent step in this job sees it. - run: | - SHIM_DIR=~/.unsloth/studio/bin - if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then - echo "::error::unsloth.exe shim not found at $SHIM_DIR" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - # GITHUB_PATH wants Windows-style paths; convert via cygpath. - cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH" - echo "Added Studio shim dir to PATH: $(cygpath -w "$SHIM_DIR")" - - - name: Patch Studio venv with full typer / pydantic dep trees - # Belt-and-suspenders: install.ps1's --no-deps install of - # no-torch-runtime.txt drops typer's and pydantic's runtime - # deps unless explicitly pinned. Re-install the ones whose - # deps don't pull torch. 
- run: | - STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe - if [ ! -f "$STUDIO_PY" ]; then - echo "::error::Studio venv python not at $STUDIO_PY" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub - - - name: Install Playwright + Chromium - # No --with-deps on Windows: that flag installs Linux apt - # packages. windows-latest ships the system frameworks - # Chromium needs (Edge / WebView2) already. - run: | - python -m pip install 'playwright>=1.45' - python -m playwright install chromium - - - name: Reset auth + boot Studio - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \ - > logs/studio.log 2>&1 & - echo "STUDIO_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then - jq -e '.status == "healthy"' /tmp/health.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health.json - - - name: Pass bootstrap password to the Playwright step - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "::add-mask::$NEW2" - echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV" - echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV" - - - name: Drive the chat UI with Playwright - env: - BASE_URL: http://127.0.0.1:18896 - PW_ART_DIR: logs/playwright - STUDIO_UI_STRICT: '1' - # windows-latest free runner is 4 vCPU / 16 GB; gemma-3- - # 270m turn latency under llama-server's CPU backend can - # crowd the 180s default (slower than ubuntu-latest on - # the same model). Keep the same generous budget the Mac - # job uses. 
- STUDIO_UI_TURN_TIMEOUT_MS: '540000' - run: | - mkdir -p logs/playwright - python tests/studio/playwright_chat_ui.py - - - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders) - if: always() - run: | - kill "${STUDIO_PID}" 2>/dev/null || true - sleep 2 - - - name: Reset auth + boot Studio for extra UI tests (port 18897) - run: | - unsloth studio reset-password - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \ - > logs/studio_extra.log 2>&1 & - echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV" - - - name: Wait for /api/health on 18897 - run: | - for i in $(seq 1 180); do - if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json; then - jq -e '.status == "healthy"' /tmp/health2.json && break - fi - sleep 1 - done - jq -e '.status == "healthy"' /tmp/health2.json - - - name: Pass bootstrap pw for extra UI test - run: | - OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password) - NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')" - echo "::add-mask::$OLD" - echo "::add-mask::$NEW" - echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV" - echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV" - - - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright - env: - BASE_URL: http://127.0.0.1:18897 - STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }} - STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }} - PW_ART_DIR: logs/playwright_extra - STUDIO_UI_STRICT: '1' - STUDIO_UI_TURN_TIMEOUT_MS: '540000' - GGUF_REPO: ${{ env.GGUF_REPO }} - GGUF_VARIANT: ${{ env.GGUF_VARIANT }} - run: | - mkdir -p logs/playwright_extra - python tests/studio/playwright_extra_ui.py - - - name: Stop second Studio - if: always() - run: | - kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true - sleep 2 - - - name: Upload Playwright artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: windows-studio-ui-smoke-artifacts - path: | - logs/studio.log - 
logs/studio_extra.log - logs/install.log - logs/playwright - logs/playwright_extra - retention-days: 7 diff --git a/.github/workflows/studio-windows-update-smoke.yml b/.github/workflows/studio-windows-update-smoke.yml deleted file mode 100644 index 157874d404..0000000000 --- a/.github/workflows/studio-windows-update-smoke.yml +++ /dev/null @@ -1,279 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Windows counterpart to studio-update-smoke.yml / -# studio-mac-update-smoke.yml. Verifies that on the FREE -# windows-latest runner: -# -# 1. install.ps1 --local --no-torch installs Studio AND auto-fetches -# the prebuilt llama.cpp Windows binary (llama-bNNNN-bin-win-cpu- -# x64 from ggml-org/llama.cpp). Hitting the source-build fallback -# is treated as an Unsloth bug -- Studio must always pick the -# prebuilt on Windows. -# 2. unsloth studio update --local is idempotent. Two consecutive -# runs both report "prebuilt up to date and validated", no -# source-build fallback. The CLI's _find_setup_script picks -# setup.ps1 on Windows automatically. -# 3. The installed Studio still boots and /api/health returns -# healthy after the update path. 
- -name: Windows Studio Update CI - -on: - pull_request: - paths: - - 'install.ps1' - - 'studio/setup.ps1' - - 'studio/setup.bat' - - 'studio/install_python_stack.py' - - 'studio/install_llama_prebuilt.py' - - 'studio/backend/requirements/**' - - 'unsloth_cli/commands/studio.py' - - 'pyproject.toml' - - '.github/workflows/studio-windows-update-smoke.yml' - push: - branches: [main, pip] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - update-idempotency: - name: Studio Updating Tests - runs-on: windows-latest - timeout-minutes: 30 - defaults: - run: - shell: bash - env: - # Force UTF-8 for stdio (Windows defaults to cp1252; hf - # download / Studio CLI print "✓" checkmarks and crash - # otherwise). - PYTHONIOENCODING: utf-8 - PYTHONUTF8: '1' - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - # Don't cache pip: install.ps1 + setup.ps1 go through uv - # and never populate ~/.cache/pip; setup-python's post-step - # then fatal-errors with "Cache folder path is retrieved - # for pip but doesn't exist on disk". - - - name: Pre-install Windows tweaks (npm 11 + Defender exclusions) - shell: pwsh - # Two surgical fixes against measured Windows-only install - # waste (vs Mac/Linux on the same SHA): - # - # (1) npm. setup.ps1 line 1109-1145 requires Node 22.12+ (or - # 20.19+ / 23+) AND npm >=11 because Vite 8 needs both. - # actions/setup-node@v4 with `node-version: '22'` lands - # Node 22.22.2 + the npm 10.9.7 it bundles, so the npm - # check fails and setup.ps1 falls through to the - # "winget install Node.js LTS" branch -- a ~35 s reinstall - # of Node we don't need. 
`npm install -g npm@^11` updates - # the bundled npm in-place in ~5 s, which makes setup.ps1 - # short-circuit on the existing Node. - # - # (2) Defender. windows-latest's real-time scan opens / hashes - # every file Studio writes during install (Vite output = - # thousands of small chunks, uv pip = wheel-extraction = - # thousands of small files). The latency dominates the - # 200 s frontend build and the 90 s deps install. Adding - # ExclusionPath entries for the directories the install - # writes to drops per-file open latency from ~ms to ~us. - # Add-MpPreference needs admin; the runneradmin user has - # it, but wrap in try/catch so a permission flake leaves - # the install otherwise unaffected. - run: | - $ProgressPreference = 'SilentlyContinue' - Write-Host "npm version before upgrade: $(npm -v)" - npm install -g 'npm@^11' 2>&1 | Out-Host - Write-Host "npm version after upgrade: $(npm -v)" - # NOTE: do NOT pre-create these directories before adding the - # exclusion -- creating an empty studio/frontend/dist trips - # setup.ps1 line 1281-1296's mtime-based "is the frontend - # stale?" check into "up to date, skip rebuild", because the - # newly-created dist's mtime is younger than every source - # file. Studio then boots with an empty dist and 500s on - # GET / with FileNotFoundError: dist\index.html. See run - # 25546676715 / job 74984469728. - # Add-MpPreference accepts paths that do not yet exist; the - # exclusion is registered and applies when the path - # materialises. 
- foreach ($p in @( - "$env:USERPROFILE\.unsloth", - "$env:USERPROFILE\AppData\Local\uv", - "$env:GITHUB_WORKSPACE\studio\frontend\node_modules", - "$env:GITHUB_WORKSPACE\studio\frontend\dist" - )) { - try { - Add-MpPreference -ExclusionPath $p -ErrorAction Stop - Write-Host "Defender exclusion added: $p" - } catch { - Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p" - } - } - - - name: Install Studio (--local, --no-torch) - shell: pwsh - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - New-Item -ItemType Directory -Force -Path logs | Out-Null - # *>&1 captures Write-Host (Information stream) output; - # plain 2>&1 does not. setup.ps1 emits "prebuilt installed - # and validated" via Write-Host, and we grep for that. - $ProgressPreference = 'SilentlyContinue' - & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log - - - name: Assert install.ps1 used the Windows llama.cpp prebuilt - run: | - # Filesystem-based check (setup.ps1's stream output isn't - # captured back through the parent pipeline). - LLAMA_DIR=~/.unsloth/llama.cpp - INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json" - BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe" - if grep -q "falling back to source build" logs/install.log; then - echo "::error::install.ps1 fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60 - exit 1 - fi - if [ ! -f "$INFO" ]; then - echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO." - ls -la "$LLAMA_DIR" || true - exit 1 - fi - if [ ! -f "$BIN" ]; then - echo "::error::no llama-server.exe at $BIN." - ls -la "$LLAMA_DIR/build/bin" || true - exit 1 - fi - echo "install.ps1 installed the Windows prebuilt llama.cpp:" - cat "$INFO" - - - name: Add Studio shim to GITHUB_PATH - run: | - SHIM_DIR=~/.unsloth/studio/bin - if [ ! 
-f "$SHIM_DIR/unsloth.exe" ]; then - echo "::error::unsloth.exe shim not found at $SHIM_DIR" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH" - - - name: Patch Studio venv with full typer / pydantic dep trees - # install.ps1 runs `uv pip install --no-deps -r - # no-torch-runtime.txt` to keep torch out of transitive - # resolution from accelerate/peft/trl. That also drops - # typer's and pydantic's runtime deps unless they're - # explicitly pinned in no-torch-runtime.txt. We pin the - # known ones (click, shellingham, annotated-doc, rich, - # pydantic-core, annotated-types, typing-inspection, ...) - # but typer / pydantic minor versions can introduce new - # transitive deps that are NOT in our pin list. - # - # Belt-and-suspenders: re-install typer + pydantic + - # huggingface_hub WITH their deps into the Studio venv. - # `pip install --upgrade` only adds missing packages; it - # never down-shifts an installed version. Cannot pull - # torch (none of typer / pydantic / huggingface_hub depend - # on it). - run: | - STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe - if [ ! -f "$STUDIO_PY" ]; then - echo "::error::Studio venv python not at $STUDIO_PY" - ls -la ~/.unsloth/studio/ || true - exit 1 - fi - "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub - - - name: First update should be a no-op (prebuilt already validated) - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -o pipefail - unsloth studio update --local 2>&1 | tee logs/update.log - if grep -q "falling back to source build" logs/update.log; then - echo "::error::studio update fell back to source-build llama.cpp on Windows." - grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60 - exit 1 - fi - if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then - echo "::error::no prebuilt up-to-date marker in update.log." 
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60 - exit 1 - fi - echo "update path took the prebuilt fast path" - - - name: Second update must also be a no-op - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -o pipefail - unsloth studio update --local 2>&1 | tee logs/update2.log - grep -q "falling back to source build" logs/update2.log && { - echo "::error::second update fell back to source build on Windows" - tail -60 logs/update2.log; exit 1; } || true - grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log - echo "second update was clean" - - - name: Boot Studio briefly to confirm the install is still usable - run: | - mkdir -p logs - UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \ - > logs/studio.log 2>&1 & - PID=$! - HEALTHY="" - # Use jq (a Git Bash builtin) instead of `python -c - # open('/tmp/health.json')` to read the saved health - # response. Bash on windows-latest is MSYS Git Bash, which - # resolves `/tmp/...` against the MSYS root, while the - # python interpreter is Windows-native and resolves it - # against the current drive's root. The two paths don't - # agree, so python never finds the file curl just wrote. - # jq reads through MSYS, so the path matches. Mirrors what - # studio-windows-api-smoke.yml and the other Windows smoke - # workflows already do. 
- for i in $(seq 1 60); do - if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then - if jq -e '.status == "healthy"' /tmp/health.json >/dev/null; then - HEALTHY=1 - break - fi - fi - sleep 1 - done - if [ -z "$HEALTHY" ]; then - echo "Studio failed to come up after \`update\`" - tail -200 logs/studio.log - kill "$PID" 2>/dev/null || true - exit 1 - fi - kill "$PID" 2>/dev/null || true - echo "post-update Studio /api/health OK" - - - name: Upload update logs - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: windows-studio-update-log - path: | - logs/install.log - logs/update.log - logs/update2.log - logs/studio.log - retention-days: 7 diff --git a/.github/workflows/version-compat-ci.yml b/.github/workflows/version-compat-ci.yml deleted file mode 100644 index 599b53df1d..0000000000 --- a/.github/workflows/version-compat-ci.yml +++ /dev/null @@ -1,312 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. -# -# Cross-version compat canary for the four upstream packages whose -# release cadence regularly breaks unsloth + unsloth-zoo: -# -# 1. vLLM (LoRA worker manager, BnB loader, cumem allocator) -# 2. TRL / GRPO (trainer source rewriters in unsloth.models.rl*) -# 3. PEFT (LoraConfig, get_peft_model, LoraLayer, bnb integration) -# 4. sentence-transformers (Transformer/Pooling/Normalize, Trainer) -# 5. bitsandbytes (Linear4bit, dequantize_4bit) -# -# Strategy: GitHub raw-fetch + symbol grep against every tracked -# version (no pip install, CPU-only). When upstream renames a symbol -# we depend on, the matching test fails BEFORE a user hits it. The -# `main` branch entries give us a few-day lead on PyPI releases. 
-# -# Cross-references: -# tests/vllm_compat/test_vllm_pinned_symbols.py (vLLM symbols) -# tests/version_compat/test_trl_grpo_pinned_symbols.py -# tests/version_compat/test_peft_pinned_symbols.py -# tests/version_compat/test_sentence_transformers_pinned_symbols.py -# tests/version_compat/test_bitsandbytes_pinned_symbols.py - -name: Version Compat CI - -on: - pull_request: - # Trigger on any unsloth source change, not just the three previously - # named files. The symbol-existence tests verify that EVERY pinned - # upstream reference in unsloth still resolves; a new - # `from peft.foo import Bar` added in unsloth/kernels/whatever.py - # is just as much a compat regression risk as one added in - # unsloth/models/rl.py. - paths: - - 'unsloth/**' - - 'tests/vllm_compat/**' - - 'tests/version_compat/**' - - 'pyproject.toml' - - '.github/workflows/version-compat-ci.yml' - schedule: - # Daily 06:43 UTC. Catches upstream PyPI releases roughly within - # 24 h. Off the :00 / :30 fleet-collision spots. - - cron: '43 6 * * *' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - vllm-pinned-symbols: - name: vLLM pinned-symbol matrix (≥ 0.9.0 + main) - runs-on: ubuntu-latest - timeout-minutes: 12 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest only - # The test fetches from raw.githubusercontent.com and greps - # source. No pip install of vllm / torch / transformers is - # needed — that's the whole point of this canary. - run: | - python -m pip install --upgrade pip - pip install 'pytest>=8' - - name: Run vllm-compat suite - env: - # Authenticated requests get a 5000-req/h quota on raw - # fetches; unauthenticated is 60/h and trips on the matrix. 
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - python -m pytest tests/vllm_compat/test_vllm_pinned_symbols.py -v --tb=short - - trl-grpo-pinned-symbols: - name: TRL / GRPO pinned-symbol matrix - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest only - run: | - python -m pip install --upgrade pip - pip install 'pytest>=8' - - name: Run trl-compat suite - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # PYTHONPATH=. so `from tests.version_compat._fetch import …` - # works without an editable install of unsloth itself. - PYTHONPATH=. python -m pytest \ - tests/version_compat/test_trl_grpo_pinned_symbols.py \ - -v --tb=short - - peft-pinned-symbols: - name: PEFT pinned-symbol matrix (pyproject window + main) - runs-on: ubuntu-latest - timeout-minutes: 8 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest only - run: | - python -m pip install --upgrade pip - pip install 'pytest>=8' - - name: Run peft-compat suite - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - PYTHONPATH=. 
python -m pytest \ - tests/version_compat/test_peft_pinned_symbols.py \ - tests/version_compat/test_unsloth_zoo_save_merged_pinned_symbols.py \ - -v --tb=short - - st-pinned-symbols: - name: sentence-transformers pinned-symbol matrix - runs-on: ubuntu-latest - timeout-minutes: 8 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest only - run: | - python -m pip install --upgrade pip - pip install 'pytest>=8' - - name: Run sentence-transformers compat suite - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - PYTHONPATH=. python -m pytest \ - tests/version_compat/test_sentence_transformers_pinned_symbols.py \ - -v --tb=short - - bitsandbytes-pinned-symbols: - name: bitsandbytes pinned-symbol matrix - runs-on: ubuntu-latest - timeout-minutes: 8 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest only - run: | - python -m pip install --upgrade pip - pip install 'pytest>=8' - - name: Run bitsandbytes compat suite - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - PYTHONPATH=. 
python -m pytest \ - tests/version_compat/test_bitsandbytes_pinned_symbols.py \ - -v --tb=short - - transformers-pinned-symbols: - name: transformers pinned-symbol matrix (4.57.6 + 5.x + main) - runs-on: ubuntu-latest - timeout-minutes: 12 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest only - run: | - python -m pip install --upgrade pip - pip install 'pytest>=8' - - name: Run transformers compat suite - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - PYTHONPATH=. python -m pytest \ - tests/version_compat/test_transformers_pinned_symbols.py \ - -v --tb=short - - # Optional second layer: actually `pip install` ONE representative - # version of each package and verify unsloth + unsloth-zoo modules - # import on it under the existing CUDA spoof. CPU-only, runs on - # ubuntu-latest. Catches the small set of breakages that the static - # symbol check misses (e.g. import-time side effects). - zoo-imports-under-spoof: - name: unsloth_zoo vllm/grpo/peft/st modules import under CUDA spoof - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - path: unsloth - - name: Clone unsloth-zoo @ main - run: | - # github.com occasionally 500s on the git fetch; retry so a - # single upstream blip does not fail CI. - for attempt in 1 2 3; do - rm -rf "$RUNNER_TEMP/unsloth-zoo" - if git clone --depth=1 https://github.com/unslothai/unsloth-zoo \ - "$RUNNER_TEMP/unsloth-zoo"; then - break - fi - if [ "$attempt" -eq 3 ]; then - echo "::error::git clone unsloth-zoo failed after 3 attempts" - exit 1 - fi - delay=$((5 * attempt)) - echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..." 
- sleep "$delay" - done - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install CPU torch + supported pkg pins - run: | - python -m pip install --upgrade pip - # CPU torch (vllm/peft/st all depend on it). - pip install --index-url https://download.pytorch.org/whl/cpu \ - 'torch>=2.4,<2.11' 'torchvision<0.26' 'torchcodec<0.10' - # torchcodec is a hard requirement on transformers 5.x: - # transformers/audio_utils.py:55 does - # `importlib.metadata.version("torchcodec")` UNCONDITIONALLY, - # which raises PackageNotFoundError on a CPU runner that - # otherwise has no audio path -- and that error trickles up - # through every `import unsloth_zoo.` because - # unsloth-zoo's vision_utils transitively pulls - # transformers.processing_utils (-> audio_utils). The 0.10 - # cap mirrors the torch 2.10 / torchvision 0.26 ABI window - # we already pin above. - # Ladder of supported floor versions per pyproject.toml. - pip install \ - 'transformers>=4.56,<5.6' 'trl>=0.22,<0.26' \ - 'peft>=0.18.0' 'sentence-transformers>=5.0' \ - 'accelerate>=1.0' 'datasets>=3.4,<5' \ - 'bitsandbytes>=0.45.5' \ - sentencepiece protobuf safetensors numpy 'pytest>=8' \ - 'huggingface_hub>=0.34' tqdm packaging psutil triton Pillow - # Editable-install both repos so the test imports the - # checkouts (not whatever stale PyPI version pip resolved). - pip install --no-deps -e "$RUNNER_TEMP/unsloth-zoo" - pip install --no-deps -e ./unsloth - - name: Run vllm_compat zoo-imports tests under spoof - env: - UNSLOTH_IS_PRESENT: '1' - UNSLOTH_COMPILE_DISABLE: '1' - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python - run: | - cd unsloth - # tests/vllm_compat/test_unsloth_zoo_imports.py: narrow vllm/grpo - # import gates (5 tests). 
- # tests/vllm_compat/test_extended_module_imports.py: full sweep - # of unsloth_zoo + unsloth.models.* modules + RL dispatch - # table population + FastModel API surface under spoof - # (~30 tests). Catches transformers / peft / bnb symbol pin - # drift at module-top BEFORE any runtime call. - PYTHONPATH=. python -m pytest \ - tests/vllm_compat/test_unsloth_zoo_imports.py \ - tests/vllm_compat/test_extended_module_imports.py \ - -v --tb=short - - # Daily-only: same suites but with --strict on importable upstream - # tags. Schedule-only so PR jobs stay fast; cron tolerates a flake. - daily-fresh-fetch: - name: daily fresh-fetch sweep (cron only) - if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest - timeout-minutes: 20 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - cache: 'pip' - - name: Install pytest - run: pip install 'pytest>=8' - - name: Run all version-compat suites in one process (no cache) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - PYTHONPATH=. python -m pytest \ - tests/vllm_compat/test_vllm_pinned_symbols.py \ - tests/version_compat/ \ - -v --tb=short diff --git a/.github/workflows/wheel-smoke.yml b/.github/workflows/wheel-smoke.yml deleted file mode 100644 index 3de3c33ca2..0000000000 --- a/.github/workflows/wheel-smoke.yml +++ /dev/null @@ -1,136 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. - -# Builds the PyPI wheel from the PR branch, then verifies the built wheel -# actually contains what we expect to ship and does NOT contain the broken -# Studio bundle that 2026.5.1 published. This is the single workflow that -# would have blocked the 2026.5.1 release before twine upload. 
-# -# Verified locally end-to-end against this branch: -# - python -m build produces unsloth--py3-none-any.whl in 13s -# - wheel content sanity passes: -# lockfile shipped, frontend dist shipped, -# no node_modules in wheel, no bun.lock in wheel, -# main bundle has unstable_Provider hits=1 (assistant-ui internals only). -# - Studio backend imports cleanly from the installed wheel with the -# lightweight dep set below. - -name: Wheel CI - -on: - pull_request: - paths: - - 'pyproject.toml' - - 'studio/**' - - 'unsloth/**' - - 'unsloth_cli/**' - - '.github/workflows/wheel-smoke.yml' - push: - branches: [main, pip] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - wheel: - name: Wheel build + content sanity + import smoke - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 - with: - node-version: '22' - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - - name: Lockfile supply-chain audit (pre-install scan) - run: python3 scripts/lockfile_supply_chain_audit.py - - - name: Build frontend - # Lifecycle scripts (esbuild native-binary postinstall, etc.) are - # required for `vite build`. The pre-install lockfile structural - # audit (lockfile_supply_chain_audit.py) is the practical defence - # against the npm postinstall-dropper class -- it fires BEFORE any - # tarball runs, on the injection pattern itself rather than an - # advisory-DB lookup. 
- run: | - cd studio/frontend - npm ci --no-fund --no-audit - npm run build - - - name: Build wheel + sdist - run: | - python -m pip install --upgrade pip build - rm -rf dist build ./*.egg-info - python -m build - - - name: Wheel content sanity - run: | - python - <<'PY' - import zipfile, glob, sys - w = glob.glob("dist/unsloth-*.whl") - if not w: - print("FAIL: no wheel produced"); sys.exit(2) - w = w[0] - print(f"wheel: {w}") - with zipfile.ZipFile(w) as z: - n = z.namelist() - checks = { - "lockfile shipped": any(s.endswith("studio/frontend/package-lock.json") for s in n), - "frontend dist shipped": any(s.endswith("studio/frontend/dist/index.html") for s in n), - "no node_modules": not any("studio/frontend/node_modules/" in s for s in n), - "no bun.lock": not any(s.endswith("studio/frontend/bun.lock") for s in n), - } - js = [s for s in n - if "studio/frontend/dist/assets/" in s - and s.endswith(".js") - and "/index-" in s] - if not js: - print("FAIL: no main bundle index-*.js in wheel"); sys.exit(2) - data = z.read(js[0]).decode("utf-8", "replace") - hits = data.count("unstable_Provider:") - print(f"main bundle: {js[0]}") - print(f"unstable_Provider hits: {hits} (>=4 indicates 2026.5.1 regression)") - checks["bundle has no Studio unstable_Provider call site"] = (hits < 4) - - print() - for k, v in checks.items(): - print(f" [{'PASS' if v else 'FAIL'}] {k}") - sys.exit(0 if all(checks.values()) else 1) - PY - - - name: Studio backend import smoke - # Imports `studio.backend.main:app` from the freshly-installed wheel in - # a clean venv. This catches the class of bug that 2026.5.1 shipped with: - # frontend dist missing, package-lock.json missing, or the wheel's Python - # source tree broken in a way that surfaces only at app construction time. 
- run: | - python -m venv /tmp/v - /tmp/v/bin/pip install --upgrade pip - /tmp/v/bin/pip install -r studio/backend/requirements/studio.txt - /tmp/v/bin/pip install \ - python-multipart aiofiles sqlalchemy cryptography \ - pyyaml jinja2 mammoth unpdf requests \ - 'numpy<3' - /tmp/v/bin/pip install --no-deps dist/unsloth-*.whl - # Run from /tmp so Python imports the installed package, not the source tree. - cd /tmp - /tmp/v/bin/python -c "from studio.backend.main import app; print('Studio backend OK:', app.title)" - - - name: Upload wheel on failure - if: failure() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: unsloth-wheel - path: dist/ - retention-days: 7 From 4ab750997566bcd87783a318438762161a7578d9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 11:50:20 +0000 Subject: [PATCH 03/84] mlx_parity: fan out across 10 parallel Mac jobs + 3 deeper probes Each probe now runs as its own matrix entry on macos-14, so a single slow / failing probe does not block diagnostic output from the others. Wall time drops from sum-of-probes to max-of-probe. Add three more bisection probes: 8. per-token CE decomposition where is the 1.38x loss gap concentrated? 9. attention mask / lengths inspection do HF and MLX supervise the same positional set? 10. HF SFTTrainer curve on same Mac host control: isolates "MLX vs HF" from "CUDA vs Mac CPU" An aggregate job downloads every probe-N artifact and dumps the JSON to a single log so a maintainer reads one place instead of ten. 
--- .github/workflows/mlx-parity-probe.yml | 168 ++++++++---------- tests/mlx_parity/probe_10_hf_curve_control.py | 144 +++++++++++++++ tests/mlx_parity/probe_8_per_token_loss.py | 87 +++++++++ tests/mlx_parity/probe_9_attention_lengths.py | 103 +++++++++++ 4 files changed, 412 insertions(+), 90 deletions(-) create mode 100644 tests/mlx_parity/probe_10_hf_curve_control.py create mode 100644 tests/mlx_parity/probe_8_per_token_loss.py create mode 100644 tests/mlx_parity/probe_9_attention_lengths.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index d53aa0dcf5..cdde3c59b7 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -1,32 +1,26 @@ # SPDX-License-Identifier: AGPL-3.0-only # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. # -# MLX vs HF parity bisection probes. +# MLX vs HF parity bisection probes -- one Mac M1 job per probe. # -# Why this workflow exists: -# The upstream MLX CI on `unslothai/unsloth` is failing the -# in-memory generation assertion in tests/studio/run_real_mlx_smoke.py -# (model emits "5 lbs!" instead of containing "Unsloth"). The CUDA -# mirror in temp/torchcodec_test/cuda_mirror.py emits "Unsloth" under -# every clip setting tested. Step-1 forward-pass loss is 7.64 on CUDA -# and 10.55 on MLX for the IDENTICAL model + data + seed -- a 1.38x -# pre-optimizer-step discrepancy. The clipping override fixed by -# unsloth-zoo#663 is a real bug but does not explain the loss gap. +# Why parallel matrix: each probe is fully independent (different +# subprocess, different pip state matters not at all because the +# install layer is the same for each). 
Fanning out lets: +# * a single failing probe NOT block the diagnostic data from the +# remaining probes (already had continue-on-error, but matrix gives +# each its own job log + artifact + duration); +# * total wall-time = max(probe_install + probe_run) instead of +# sum across probes; +# * future probes added without touching the existing ones. # -# This workflow runs 7 small probes on a real macos-14-arm64 runner -# that bisect the parity gap across the dispatch path: -# 1. tokenization identical input ids? -# 2. base-model forward logits identical logits? -# 3. loss reduction (synthetic)same CE-mean number? -# 4. LoRA init B=0 in both; A std comparable? -# 5. single backward gradient norms comparable? -# 6. AdamW step (synthetic) same post-step weight? -# 7. 7-step loss curve data dump for follow-up analysis +# Each probe job: +# 1. installs the common dep set (MLX + torch CPU + transformers + zoo) +# 2. runs ONE probe +# 3. always uploads its probe_${N}.json as an artifact # -# continue-on-error: true on each probe so a single divergence does -# not hide the diagnostics for the later probes. -# -# Status: experimental / debug. Surfaces "MLX parity probes" PR check. +# A final aggregate job downloads all artifacts and prints a single +# summary table to its log so the human reader can see all probes +# without clicking into 10 separate job logs. 
name: MLX parity probes @@ -47,10 +41,34 @@ permissions: contents: read jobs: - probes: - name: probes + probe: + name: probe-${{ matrix.id }} runs-on: macos-14 - timeout-minutes: 30 + timeout-minutes: 25 + strategy: + fail-fast: false + matrix: + include: + - id: '1' + script: probe_1_tokenization.py + - id: '2' + script: probe_2_forward_logits.py + - id: '3' + script: probe_3_loss_reduction.py + - id: '4' + script: probe_4_lora_init.py + - id: '5' + script: probe_5_single_grad.py + - id: '6' + script: probe_6_adamw_step.py + - id: '7' + script: probe_7_loss_curve.py + - id: '8' + script: probe_8_per_token_loss.py + - id: '9' + script: probe_9_attention_lengths.py + - id: '10' + script: probe_10_hf_curve_control.py steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -69,99 +87,69 @@ jobs: - name: Install deps run: | python -m pip install --upgrade pip - # MLX + mlx-lm for the MLX-side calls pip install \ 'mlx==0.30.0' \ 'mlx-lm==0.30.0' \ 'numpy==2.4.4' \ 'pytest==9.0.3' - # torch CPU for the HF-side calls; CPU is fine since we're - # measuring losses + grad norms, not throughput pip install --index-url https://download.pytorch.org/whl/cpu \ 'torch==2.10.0' - # transformers + peft for HF SFTTrainer parity pip install \ 'transformers==4.57.6' \ 'peft==0.18.0' \ 'datasets==4.3.0' \ 'accelerate==1.13.0' \ 'sentencepiece==0.2.1' \ - 'huggingface-hub==0.36.2' - # unsloth-zoo from git (provides unsloth_zoo.mlx.* on Apple Silicon) + 'huggingface-hub==0.36.2' \ + 'trl==0.27.0' for attempt in 1 2 3; do if pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo"; then break; fi if [ "$attempt" -eq 3 ]; then exit 1; fi sleep $((5*attempt)) done - - name: Probe 1 — tokenization parity - if: always() - continue-on-error: true - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - cd tests/mlx_parity && python probe_1_tokenization.py - - - name: Probe 2 — base-model forward logits parity - 
if: always() - continue-on-error: true - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - cd tests/mlx_parity && python probe_2_forward_logits.py - - - name: Probe 3 — loss reduction parity (synthetic) - if: always() - continue-on-error: true - run: | - cd tests/mlx_parity && python probe_3_loss_reduction.py - - - name: Probe 4 — LoRA init parity - if: always() - continue-on-error: true + - name: Run probe ${{ matrix.id }} — ${{ matrix.script }} env: HF_TOKEN: ${{ secrets.HF_TOKEN }} + UNSLOTH_COMPILE_DISABLE: '1' run: | - cd tests/mlx_parity && python probe_4_lora_init.py + cd tests/mlx_parity && python ${{ matrix.script }} - - name: Probe 5 — single backward parity + - name: Show JSON output if: always() - continue-on-error: true - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - cd tests/mlx_parity && python probe_5_single_grad.py + F=tests/mlx_parity/.out/probe_${{ matrix.id }}.json + echo "=== ${F} ===" + cat "${F}" 2>/dev/null || echo "(no JSON written)" - - name: Probe 6 — AdamW step parity (synthetic) + - name: Upload probe artifact if: always() - continue-on-error: true - run: | - cd tests/mlx_parity && python probe_6_adamw_step.py + uses: actions/upload-artifact@v4 + with: + name: probe-${{ matrix.id }} + path: tests/mlx_parity/.out/probe_${{ matrix.id }}.json + if-no-files-found: warn - - name: Probe 7 — 7-step MLX loss curve (data dump) - if: always() - continue-on-error: true - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - UNSLOTH_COMPILE_DISABLE: '1' - run: | - cd tests/mlx_parity && python probe_7_loss_curve.py + aggregate: + name: aggregate + needs: probe + if: always() + runs-on: ubuntu-latest + steps: + - name: Download all probe artifacts + uses: actions/download-artifact@v4 + with: + path: probes + pattern: probe-* + merge-multiple: true - - name: Aggregate probe results - if: always() + - name: Summary run: | - set +e - echo "=== Aggregated probe JSON dumps ===" - for i in 1 2 3 4 5 6 7; do - echo "--- probe_${i}.json ---" - cat 
tests/mlx_parity/.out/probe_${i}.json 2>/dev/null || echo "(missing -- probe ${i} did not run or crashed)" + echo "=== probe artifacts ===" + ls -la probes/ || true + echo + for n in 1 2 3 4 5 6 7 8 9 10; do + echo "--- probe_${n}.json ---" + cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" echo done - - - name: Upload probe artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: mlx-parity-probe-output - path: tests/mlx_parity/.out/ - if-no-files-found: warn diff --git a/tests/mlx_parity/probe_10_hf_curve_control.py b/tests/mlx_parity/probe_10_hf_curve_control.py new file mode 100644 index 0000000000..16cbc386f4 --- /dev/null +++ b/tests/mlx_parity/probe_10_hf_curve_control.py @@ -0,0 +1,144 @@ +"""Probe 10 — HF SFTTrainer 7-step loss curve on the SAME Mac host (control). + +The previously-collected HF baseline came from CUDA bf16 on a B200 GPU. +That's a different platform AND a different precision AND a different +optimizer backend. To isolate "MLX vs HF" from "CUDA vs Mac CPU" we +re-run the HF leg here on the same macos-14-arm64 runner in fp32 +(CPU), with the exact same 7 LoRA targets / alpha=16 / hyperparams. + +Compare probe_10.json with probe_7.json: same-host, same-precision +expectations, only the trainer implementation changes. + +Always exits 0 -- data dump for follow-up analysis. +""" + +import json +import sys + +import numpy as np + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 10: HF SFTTrainer 7-step loss curve (control on same host)") + + import torch + from datasets import Dataset + from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + TrainerCallback, + ) + from peft import LoraConfig, get_peft_model + + # TRL is optional on a Mac CPU image; if it is missing, record that and skip the probe.
+ try: + from trl import SFTConfig, SFTTrainer + except ImportError as e: + report("trl not available", str(e)) + out = {"trl_available": False} + (OUT_DIR / "probe_10.json").write_text(json.dumps(out, indent=2)) + return 0 + + torch.manual_seed(SEED) + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32) + model = get_peft_model( + model, + LoraConfig( + r=8, lora_alpha=16, lora_dropout=0.0, bias="none", + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + ), + ) + + rows = [] + class _Logger(TrainerCallback): + def on_log(self, args, state, control, logs=None, **kwargs): + if not logs or "loss" not in logs: + return + rows.append({ + "step": int(state.global_step), + "loss": float(logs["loss"]), + "grad_norm": float(logs["grad_norm"]) if "grad_norm" in logs else None, + }) + + ds = Dataset.from_list([{"text": TRAIN_TEXT}] * 64) + trainer = SFTTrainer( + model=model, + processing_class=tok, + train_dataset=ds, + callbacks=[_Logger()], + args=SFTConfig( + max_length=MAX_SEQ_LEN, + dataset_text_field="text", + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + warmup_steps=0, + max_steps=7, + learning_rate=1e-3, + logging_steps=1, + optim="adamw_torch", + weight_decay=0.0, + lr_scheduler_type="constant", + max_grad_norm=1.0, + seed=SEED, + save_strategy="no", + report_to="none", + packing=False, + bf16=False, + fp16=False, + output_dir=str(OUT_DIR / "probe10_outputs"), + ), + ) + trainer.train() + + section("post-train forward") + model.eval() + with torch.no_grad(): + enc = tok(TRAIN_TEXT, return_tensors="pt") + out = model(**enc, labels=enc["input_ids"].clone()) + post_loss = float(out.loss.detach()) + report("post_train_loss", post_loss) + + section("greedy generation") + model.eval() + with torch.no_grad(): + ginp = tok(PROMPT, return_tensors="pt") + gout = model.generate(**ginp, max_new_tokens=48, do_sample=False) + gen 
= tok.decode(gout[0], skip_special_tokens=True) + report("generation", repr(gen)) + + out = { + "trl_available": True, + "rows": rows, + "post_train_loss": post_loss, + "generation": gen, + "contains_unsloth": "Unsloth" in gen, + } + (OUT_DIR / "probe_10.json").write_text(json.dumps(out, indent=2)) + section("summary") + report("step-1 loss", rows[0]["loss"] if rows else None) + report("step-7 loss", rows[-1]["loss"] if rows else None) + report("post_train_loss", post_loss) + report("contains 'Unsloth'", "Unsloth" in gen) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_8_per_token_loss.py b/tests/mlx_parity/probe_8_per_token_loss.py new file mode 100644 index 0000000000..5be99498c5 --- /dev/null +++ b/tests/mlx_parity/probe_8_per_token_loss.py @@ -0,0 +1,87 @@ +"""Probe 8 — per-token CE decomposition. + +The aggregate step-1 loss gap (HF 7.64 vs MLX 10.55) is a single scalar. +This probe breaks it down per position: + + * tokenize the train row identically + * forward through the base model on both backends (no LoRA) + * compute per-token cross-entropy at every position + * print: tok_idx, token_id, decoded, ce_hf, ce_mlx, abs(ce_hf - ce_mlx) + +If the gap is concentrated on specific positions (BOS, EOS, special +tokens), the divergence is likely a masking / special-token handling +bug. If it is spread evenly, it is a precision / numerics issue across +the whole forward pass. + +Always exits 0 -- diagnostic dump. 
+""" + +import json +import sys + +import numpy as np + +from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 8: per-token CE decomposition") + + from transformers import AutoTokenizer + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + ids = tok.encode(TRAIN_TEXT) + if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id: + ids.append(tok.eos_token_id) + report("token_ids", ids) + L = len(ids) + report("len", L) + + section("HF base forward (fp32)") + import torch + import torch.nn.functional as F + from transformers import AutoModelForCausalLM + hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32) + hf_model.eval() + with torch.no_grad(): + logits = hf_model(input_ids=torch.tensor([ids], dtype=torch.long)).logits[0].float().cpu() + # shift: predict token t+1 from logits[t] + shift_logits = logits[:-1] + shift_targets = torch.tensor(ids[1:], dtype=torch.long) + hf_ce = F.cross_entropy(shift_logits, shift_targets, reduction="none").numpy() + report("hf mean CE", float(hf_ce.mean())) + report("hf sum CE", float(hf_ce.sum())) + + section("MLX base forward (fp32)") + import mlx.core as mx + import mlx.nn as nn + from mlx_lm import load as mlx_load + mlx_model, _ = mlx_load(MODEL_NAME) + mlx_logits = np.asarray(mlx_model(mx.array([ids])).astype(mx.float32))[0] + shift_mlx = mx.array(mlx_logits[:-1]) + shift_tgt = mx.array(np.asarray(ids[1:], dtype=np.int64)) + mlx_ce = np.asarray(nn.losses.cross_entropy(shift_mlx, shift_tgt, reduction="none")) + report("mlx mean CE", float(mlx_ce.mean())) + report("mlx sum CE", float(mlx_ce.sum())) + + section("per-token table") + print(f" {'idx':>3} {'tok_id':>7} {'decoded':<24} {'ce_hf':>9} {'ce_mlx':>9} {'abs_diff':>9}") + for i in range(L - 1): + tid = ids[i + 1] + dec = tok.decode([tid]).replace("\n", "\\n").replace("\t", "\\t")[:24] + print(f" {i:>3} {tid:>7} {dec:<24} {float(hf_ce[i]):>9.4f} 
{float(mlx_ce[i]):>9.4f} {abs(float(hf_ce[i]) - float(mlx_ce[i])):>9.4f}") + + out = { + "token_ids": ids, + "hf_per_token_ce": hf_ce.tolist(), + "mlx_per_token_ce": mlx_ce.tolist(), + "hf_mean": float(hf_ce.mean()), + "mlx_mean": float(mlx_ce.mean()), + "abs_diff_total": float(np.abs(hf_ce - mlx_ce).sum()), + } + (OUT_DIR / "probe_8.json").write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_9_attention_lengths.py b/tests/mlx_parity/probe_9_attention_lengths.py new file mode 100644 index 0000000000..2f2810568c --- /dev/null +++ b/tests/mlx_parity/probe_9_attention_lengths.py @@ -0,0 +1,103 @@ +"""Probe 9 — attention mask / lengths inspection. + +HF SFTTrainer's default collator and MLX trainer's create_batches both +build a (batch, lengths_or_mask) representation. Their masking +semantics may differ in subtle ways: + + HF SFTTrainer: + * attention_mask is a (B, L) 0/1 tensor; 0 marks padding tokens. + * labels = input_ids with padding positions set to -100. + * loss is reduced over labels != -100. + + MLX trainer (unsloth_zoo.mlx): + * batch is (B, L) padded with 0. + * lengths is (B, 2) of [start, end] = [1, L-1] for this dataset + (see trainer.py around batch_lengths.append([1, L-1])). + * labels mirror input_ids with [-100]*pad_len trailing. + * loss mask = (targets != -100) AND length_mask(start, end). + +This probe enumerates what tokens are actually being supervised in +each case for our specific train row and confirms the two paths +supervise the SAME positional set. 
+""" + +import json +import sys + +import numpy as np + +from _common import MODEL_NAME, TRAIN_TEXT, OUT_DIR, banner, section, report, seed_everything + + +def main() -> int: + seed_everything() + banner("Probe 9: attention mask / lengths inspection") + + from transformers import AutoTokenizer + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + ids = tok.encode(TRAIN_TEXT) + if tok.eos_token_id is not None and ids[-1] != tok.eos_token_id: + ids.append(tok.eos_token_id) + L = len(ids) + report("token_ids", ids) + report("len", L) + + section("HF SFTTrainer style supervision mask") + # No padding here (batch of 1, length L) -> attention_mask is all 1s, + # labels mirror ids, all positions are supervised after shift. + attn = [1] * L + labels = list(ids) + shifted_labels = labels[1:] + hf_supervised_positions = list(range(L - 1)) + hf_supervised_tokens = [tok.decode([t]) for t in shifted_labels] + report("attention_mask", attn) + report("shifted target ids", shifted_labels) + report("supervised positions (post-shift)", hf_supervised_positions) + + section("MLX trainer style supervision mask") + # Mirrors the path in unsloth_zoo/mlx/trainer.py: + # batch_lengths.append([1, L - 1]) + # length_mask = (steps >= lengths[:,0]) AND (steps <= lengths[:,1]) + # steps = mx.arange(1, targets.shape[1] + 1) == [1..L-1] + # so length_mask is TRUE for steps in [1, L-1], i.e. all post-shift + # positions for our unpadded batch. 
+ lengths_pair = [1, L - 1] + steps = list(range(1, L)) # = [1..L-1] + length_mask = [(s >= lengths_pair[0]) and (s <= lengths_pair[1]) for s in steps] + targets_mlx = labels[1:] + mask_neg100 = [t != -100 for t in targets_mlx] + combined_mask = [a and b for a, b in zip(length_mask, mask_neg100)] + mlx_supervised_positions = [i for i, m in enumerate(combined_mask) if m] + mlx_supervised_tokens = [tok.decode([targets_mlx[i]]) for i in mlx_supervised_positions] + report("lengths_pair", lengths_pair) + report("steps", steps) + report("length_mask", length_mask) + report("supervised positions (post-shift)", mlx_supervised_positions) + + section("comparison") + matches = hf_supervised_positions == mlx_supervised_positions + report("supervised positions match", matches) + report("hf supervises N tokens", len(hf_supervised_positions)) + report("mlx supervises N tokens", len(mlx_supervised_positions)) + only_hf = set(hf_supervised_positions) - set(mlx_supervised_positions) + only_mlx = set(mlx_supervised_positions) - set(hf_supervised_positions) + if only_hf: + report("only supervised by HF", list(only_hf)) + if only_mlx: + report("only supervised by MLX", list(only_mlx)) + + out = { + "token_ids": ids, + "hf_supervised_positions": hf_supervised_positions, + "mlx_supervised_positions": mlx_supervised_positions, + "match": matches, + "n_supervised_hf": len(hf_supervised_positions), + "n_supervised_mlx": len(mlx_supervised_positions), + "lengths_pair": lengths_pair, + } + (OUT_DIR / "probe_9.json").write_text(json.dumps(out, indent=2)) + return 0 if matches else 2 + + +if __name__ == "__main__": + sys.exit(main()) From 83025daa7b2ba73796eacc38fab3580949f8bd1c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 11:59:04 +0000 Subject: [PATCH 04/84] mlx_parity: fix probes 5 + 10, add probe 11 (fp32 trainer curve) Findings from the first matrix run: * probe 8 -- same-host fp32: HF mean CE 7.72, MLX mean CE 7.74. 
The 1.38x step-1 loss gap (CUDA bf16 7.64 vs MLX fp16 10.55) is a dtype / platform artifact, not an algorithmic divergence. * probe 3 + 6 pass at machine epsilon (loss math + AdamW math are bit-identical between torch and MLX). * probe 5 crashed with `tree_flatten` ValueError (the grads tree contained non-array nodes). Replace tree_flatten with a typed recursive walk. * probe 10 OOM on MPS (macos-14 runners only get 7 GB shared). Force torch to CPU via CUDA_VISIBLE_DEVICES="", torch.set_default_device("cpu"), and SFTConfig(use_cpu=True). Add probe 11: re-run the 7-step MLX training at dtype="float32" to directly test the dtype-artifact hypothesis. If fp32 emits "Unsloth" and fp16 does not, the smoke-test (or trainer default) on Apple Silicon should switch precision. --- .github/workflows/mlx-parity-probe.yml | 4 +- tests/mlx_parity/probe_10_hf_curve_control.py | 21 ++- tests/mlx_parity/probe_11_mlx_fp32_curve.py | 135 ++++++++++++++++++ tests/mlx_parity/probe_5_single_grad.py | 33 ++++- 4 files changed, 185 insertions(+), 8 deletions(-) create mode 100644 tests/mlx_parity/probe_11_mlx_fp32_curve.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index cdde3c59b7..e76777907d 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -69,6 +69,8 @@ jobs: script: probe_9_attention_lengths.py - id: '10' script: probe_10_hf_curve_control.py + - id: '11' + script: probe_11_mlx_fp32_curve.py steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -148,7 +150,7 @@ jobs: echo "=== probe artifacts ===" ls -la probes/ || true echo - for n in 1 2 3 4 5 6 7 8 9 10; do + for n in 1 2 3 4 5 6 7 8 9 10 11; do echo "--- probe_${n}.json ---" cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" echo diff --git a/tests/mlx_parity/probe_10_hf_curve_control.py 
b/tests/mlx_parity/probe_10_hf_curve_control.py index 16cbc386f4..6c5d381d44 100644 --- a/tests/mlx_parity/probe_10_hf_curve_control.py +++ b/tests/mlx_parity/probe_10_hf_curve_control.py @@ -6,6 +6,10 @@ re-run the HF leg here on the same macos-14-arm64 runner in fp32 (CPU), with the exact same 7 LoRA targets / alpha=16 / hyperparams. +Forces torch to CPU because the standard macos-14 GitHub runner has +only 7 GB of shared memory; an fp32 LoRA training on MPS hits the +GPU memory watermark. + Compare probe_10.json with probe_7.json: same-host, same-precision expectations, only the trainer implementation changes. @@ -13,8 +17,15 @@ """ import json +import os import sys +# Hide every accelerator from torch before importing it. macos-14 runners +# expose MPS with a 7 GB shared cap; the fp32 7-module LoRA training +# above does not fit. Force CPU. +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0" +os.environ["CUDA_VISIBLE_DEVICES"] = "" + import numpy as np from _common import ( @@ -54,8 +65,15 @@ def main() -> int: return 0 torch.manual_seed(SEED) + # Force CPU explicitly even if MPS is reported. setting empty + # CUDA_VISIBLE_DEVICES handles CUDA; here we shadow the MPS-pickup + # path by setting torch's default device. 
+ try: + torch.set_default_device("cpu") + except Exception: + pass tok = AutoTokenizer.from_pretrained(MODEL_NAME) - model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32) + model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.float32).to("cpu") model = get_peft_model( model, LoraConfig( @@ -103,6 +121,7 @@ def on_log(self, args, state, control, logs=None, **kwargs): packing=False, bf16=False, fp16=False, + use_cpu=True, output_dir=str(OUT_DIR / "probe10_outputs"), ), ) diff --git a/tests/mlx_parity/probe_11_mlx_fp32_curve.py b/tests/mlx_parity/probe_11_mlx_fp32_curve.py new file mode 100644 index 0000000000..40fcc68f9c --- /dev/null +++ b/tests/mlx_parity/probe_11_mlx_fp32_curve.py @@ -0,0 +1,135 @@ +"""Probe 11 — MLX trainer 7-step loss curve at dtype="float32". + +Probe 7 runs the MLX trainer at dtype="float16" (the smoke-test default). +This probe runs the identical config at dtype="float32" so that the +forward / backward / optimizer are all carried out in fp32, matching +what HF on Mac CPU (probe 10) does. + +Hypothesis: the upstream smoke test's "5 lbs!" / "42!!" generation +collapse is a fp16 numerical artifact, not an algorithmic bug. + +If probe 11's loss curve and generation come out matching the HF curve +in probe 10, the actionable fix is to switch the smoke test (or the +trainer default) to float32 / bfloat16 on Apple Silicon. + +Always exits 0 -- data dump. 
+""" + +import json +import sys + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 11: MLX trainer 7-step loss curve at fp32") + + import mlx.core as mx + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + + section("load + LoRA (fp32)") + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float32", # <-- the only change vs probe 7 + text_only=True, max_seq_length=128, + random_state=SEED, + ) + model = FastMLXModel.get_peft_model( + model, + r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + ) + + config = MLXTrainingConfig( + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + max_steps=7, + learning_rate=1e-3, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=1.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=SEED, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / "probe11_outputs"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + rows = [] + def _on_step(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens, grad_norm): + rows.append({ + "step": int(step), "loss": float(loss), + "lr": float(lr), "grad_norm": None if grad_norm is None else float(grad_norm), + "num_tokens": int(num_tokens), + }) + trainer.add_step_callback(_on_step) + trainer.train() + + section("post-train forward") + from unsloth_zoo.mlx.utils import 
make_baseline_loss_fn + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + section("greedy generation") + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "dtype": "float32", + "rows": rows, + "post_train_loss": post_loss_val, + "generation": gen, + "contains_unsloth": contains, + } + (OUT_DIR / "probe_11.json").write_text(json.dumps(out, indent=2)) + section("summary") + report("step-1 loss", rows[0]["loss"] if rows else None) + report("step-7 loss", rows[-1]["loss"] if rows else None) + report("post_train_loss", post_loss_val) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_5_single_grad.py b/tests/mlx_parity/probe_5_single_grad.py index 9b00617b60..46af01d168 100644 --- a/tests/mlx_parity/probe_5_single_grad.py +++ b/tests/mlx_parity/probe_5_single_grad.py @@ -87,17 +87,38 @@ def main() -> int: lengths = mx.array([[1, L - 1]]) labels_mlx = mx.array([ids]) - import mlx.utils as mxu - def loss_only(model): loss, _ntok = loss_fn(model, batch, lengths, labels_mlx) return loss loss_val, grads = mx.value_and_grad(loss_only)(mlx_model) - flat = mxu.tree_flatten(grads) + + # `grads` is a nested dict tree; walk it manually, robust to mixed + # leaf types (mxu.tree_flatten only accepts pure mx.array leaves). mlx_norms = {} - for n, g in flat: - if (".0." in n or "layers.0" in n) and "q_proj" in n and ("lora_A" in n or "lora_B" in n or "lora_a" in n or "lora_b" in n): - mlx_norms[n.split(".0.")[-1] if ".0." 
in n else n] = float(mx.linalg.norm(g.astype(mx.float32)).item()) + def _walk(tree, path): + if isinstance(tree, dict): + for k, v in tree.items(): + _walk(v, path + (str(k),)) + return + if isinstance(tree, (list, tuple)): + for i, v in enumerate(tree): + _walk(v, path + (str(i),)) + return + if hasattr(tree, "shape") and hasattr(tree, "dtype"): + try: + arr = tree + if hasattr(arr, "astype"): + arr = arr.astype(mx.float32) + norm = float(mx.linalg.norm(arr).item()) + name = ".".join(path) + if "q_proj" in name and (".0." in name or "layers.0" in name): + key = name + if "lora_a" in name.lower() or "lora_b" in name.lower(): + mlx_norms[key] = norm + except Exception: + pass + + _walk(grads, ()) report("mlx grad norms (q_proj.lora_*)", mlx_norms) report("mlx loss", float(loss_val.item())) From 8b4054410f81381bdcad9fa50a186a18b322a3fc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 12:05:23 +0000 Subject: [PATCH 05/84] mlx_parity/probe_5: use mlx.nn.value_and_grad + aggregate norms mx.value_and_grad rejects the PEFT-wrapped model tree because it contains non-array metadata. mlx.nn.value_and_grad takes (model, fn) and internally walks model.trainable_parameters(), bypassing the issue. Simplify the comparison to aggregate gradient norm across all trainable params -- if MLX and HF disagree at >2x there is a parity bug regardless of which leaf carries it. 
--- tests/mlx_parity/probe_5_single_grad.py | 86 +++++++++++++------------ 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/tests/mlx_parity/probe_5_single_grad.py b/tests/mlx_parity/probe_5_single_grad.py index 46af01d168..3ad7d53132 100644 --- a/tests/mlx_parity/probe_5_single_grad.py +++ b/tests/mlx_parity/probe_5_single_grad.py @@ -66,6 +66,7 @@ def main() -> int: # ---------------- MLX side ---------------- section("MLX + unsloth_zoo.mlx backward") import mlx.core as mx + import mlx.nn as mlx_nn mx.random.seed(SEED) from unsloth_zoo.mlx.loader import FastMLXModel from unsloth_zoo.mlx.utils import make_baseline_loss_fn @@ -87,15 +88,23 @@ def main() -> int: lengths = mx.array([[1, L - 1]]) labels_mlx = mx.array([ids]) - def loss_only(model): - loss, _ntok = loss_fn(model, batch, lengths, labels_mlx) + # nn.value_and_grad takes (model, loss_fn) and uses model.trainable_parameters + # internally, avoiding the "argument should contain only arrays" tree_flatten + # error that mx.value_and_grad raises when the model tree has non-array + # metadata (PEFT wrappers). + def loss_for_grad(model, batch, lengths, labels_): + loss, _ntok = loss_fn(model, batch, lengths, labels_) return loss - loss_val, grads = mx.value_and_grad(loss_only)(mlx_model) + loss_and_grad = mlx_nn.value_and_grad(mlx_model, loss_for_grad) + loss_val, grads = loss_and_grad(mlx_model, batch, lengths, labels_mlx) - # `grads` is a nested dict tree; walk it manually, robust to mixed - # leaf types (mxu.tree_flatten only accepts pure mx.array leaves). + # Walk grads recursively (it is now a pure-array tree). Sum a per-name + # norm dict, restricted to layer-0 q_proj LoRA leaves. 
mlx_norms = {} + total_norm_sq = mx.array(0.0, dtype=mx.float32) + n_leaves = 0 def _walk(tree, path): + nonlocal total_norm_sq, n_leaves if isinstance(tree, dict): for k, v in tree.items(): _walk(v, path + (str(k),)) @@ -105,51 +114,48 @@ def _walk(tree, path): _walk(v, path + (str(i),)) return if hasattr(tree, "shape") and hasattr(tree, "dtype"): - try: - arr = tree - if hasattr(arr, "astype"): - arr = arr.astype(mx.float32) - norm = float(mx.linalg.norm(arr).item()) - name = ".".join(path) - if "q_proj" in name and (".0." in name or "layers.0" in name): - key = name - if "lora_a" in name.lower() or "lora_b" in name.lower(): - mlx_norms[key] = norm - except Exception: - pass - + arr = tree.astype(mx.float32) if hasattr(tree, "astype") else tree + total_norm_sq = total_norm_sq + mx.sum(arr * arr) + n_leaves += 1 + name = ".".join(path) + if "q_proj" in name and (".0." in name or "layers.0" in name) and ( + "lora_a" in name.lower() or "lora_b" in name.lower() + ): + mlx_norms[name] = float(mx.linalg.norm(arr).item()) _walk(grads, ()) - report("mlx grad norms (q_proj.lora_*)", mlx_norms) + mlx_total_norm = float(mx.sqrt(total_norm_sq).item()) + report("mlx grad leaves", n_leaves) + report("mlx total grad norm (all trainable)", mlx_total_norm) + report("mlx q_proj.lora_* grad norms", mlx_norms) report("mlx loss", float(loss_val.item())) + # Aggregate HF gradient norm for the same comparison. 
+ hf_total_sq = 0.0 + for _, p in hf_peft.named_parameters(): + if p.grad is not None: + hf_total_sq += float((p.grad.detach().float() ** 2).sum().item()) + hf_total_norm = hf_total_sq ** 0.5 + # ---------------- compare ---------------- section("comparison") - ratio_info = {} - ok = True - for key_hf, val_hf in hf_norms.items(): - # find the corresponding MLX key by suffix match - match = None - for key_mlx in mlx_norms: - if key_hf.lower().replace("default.weight", "") in key_mlx.lower(): - match = key_mlx - break - if match is None: - ratio_info[key_hf] = {"hf": val_hf, "mlx": None} - ok = False - continue - ratio = mlx_norms[match] / max(val_hf, 1e-12) - ratio_info[key_hf] = {"hf": val_hf, "mlx": mlx_norms[match], "ratio_mlx_hf": ratio} - if not (0.5 <= ratio <= 2.0): - ok = False - report("grad-norm ratios", ratio_info) - out = { + ratio = mlx_total_norm / max(hf_total_norm, 1e-12) + report("hf total grad norm (all trainable)", hf_total_norm) + report("mlx total grad norm (all trainable)", mlx_total_norm) + report("ratio mlx/hf", ratio) + report("hf loss", float(out.loss.item())) + report("mlx loss", float(loss_val.item())) + ok = 0.5 <= ratio <= 2.0 + + out_blob = { "hf_loss": float(out.loss.item()) if hasattr(out, "loss") else None, "mlx_loss": float(loss_val.item()), + "hf_total_grad_norm": hf_total_norm, + "mlx_total_grad_norm": mlx_total_norm, + "ratio_mlx_hf": ratio, "hf_norms": hf_norms, "mlx_norms": mlx_norms, - "ratios": ratio_info, } - (OUT_DIR / "probe_5.json").write_text(json.dumps(out, indent=2, default=str)) + (OUT_DIR / "probe_5.json").write_text(json.dumps(out_blob, indent=2, default=str)) return 0 if ok else 2 From aa2a7f5444e9c81b0dc79215103a5bd1dc71d245 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 12:17:50 +0000 Subject: [PATCH 06/84] mlx_parity: probe 12 -- pin unsloth-zoo to parent of PR #634 In-#634 bisection: the probes so far rule out tokenization, loss math, AdamW math, supervised positions, single-step grad norm. 
HF on the same host emits "Unsloth"; MLX does not. The remaining suspect surface is whatever PR #634 (e6d8f7f) changed inside the MLX trainer itself. Probe 12 installs unsloth-zoo at the parent commit f37d510 and re-runs the identical 7-step config. If it emits "Unsloth" the regression is fully inside #634's diff and we can sub-bisect by reverting suspect changes (bias_correction, custom VJP, dtype handling, loss-reduction wiring). Workflow now supports a matrix.zoo_pin field so each probe job picks its own unsloth-zoo ref; defaults to HEAD when unset. --- .github/workflows/mlx-parity-probe.yml | 13 +- tests/mlx_parity/probe_12_zoo_prev634.py | 188 +++++++++++++++++++++++ 2 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 tests/mlx_parity/probe_12_zoo_prev634.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index e76777907d..fe2d4d2877 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -71,6 +71,9 @@ jobs: script: probe_10_hf_curve_control.py - id: '11' script: probe_11_mlx_fp32_curve.py + - id: '12' + script: probe_12_zoo_prev634.py + zoo_pin: 'f37d510' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -104,8 +107,14 @@ jobs: 'sentencepiece==0.2.1' \ 'huggingface-hub==0.36.2' \ 'trl==0.27.0' + ZOO_REF='${{ matrix.zoo_pin }}' + if [ -z "$ZOO_REF" ]; then + ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo' + else + ZOO_SPEC="unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@${ZOO_REF}" + fi for attempt in 1 2 3; do - if pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo"; then break; fi + if pip install "$ZOO_SPEC"; then break; fi if [ "$attempt" -eq 3 ]; then exit 1; fi sleep $((5*attempt)) done @@ -150,7 +159,7 @@ jobs: echo "=== probe artifacts ===" ls -la probes/ || true echo - for n in 1 2 3 4 5 6 7 8 9 10 11; do + for n in 1 
2 3 4 5 6 7 8 9 10 11 12; do echo "--- probe_${n}.json ---" cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" echo diff --git a/tests/mlx_parity/probe_12_zoo_prev634.py b/tests/mlx_parity/probe_12_zoo_prev634.py new file mode 100644 index 0000000000..9aeaa58710 --- /dev/null +++ b/tests/mlx_parity/probe_12_zoo_prev634.py @@ -0,0 +1,188 @@ +"""Probe 12 — pin unsloth-zoo to the parent of PR #634 and rerun. + +Hypothesis we want to nail down: every other parity probe rules out +the obvious axes (loss math, AdamW math, tokenization, supervised +positions, single-step gradient norm), yet HF on the same host +generates "Unsloth" and MLX does not. That points squarely at the +trainer changes in unsloth-zoo PR #634 (`e6d8f7f`). + +This probe assumes the CI workflow installs unsloth-zoo at the +PARENT commit `f37d510` (the commit immediately before #634 landed). +Pre-#634 the layout was flat: `unsloth_zoo.mlx_loader` / +`unsloth_zoo.mlx_trainer`. Post-#634 it's a package: +`unsloth_zoo.mlx.loader` / `unsloth_zoo.mlx.trainer`. Try both, +honor whichever is importable. + +If this probe generates "Unsloth" with the SAME 7-step config that +probe 7 / 11 fail on, the regression is fully INSIDE PR #634's diff +and we can sub-bisect by reverting the suspect changes (bias_correction, +loss reduction, custom VJP, dtype handling). + +Always exits 0 -- data dump. 
+""" + +import json +import sys + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def _import_zoo(): + try: + from unsloth_zoo.mlx_loader import FastMLXModel # pre-#634 + from unsloth_zoo.mlx_trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx_utils import make_baseline_loss_fn + return "pre-#634 flat layout", FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn + except ImportError: + pass + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + return "post-#634 package layout", FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn + + +def main() -> int: + seed_everything() + banner("Probe 12: pinned unsloth-zoo (parent of PR #634)") + + import importlib + import unsloth_zoo + report("unsloth_zoo path", getattr(unsloth_zoo, "__file__", "?")) + try: + report("unsloth_zoo version", getattr(unsloth_zoo, "__version__", "?")) + except Exception: + pass + + layout, FastMLXModel, MLXTrainer, MLXTrainingConfig, make_baseline_loss_fn = _import_zoo() + report("layout detected", layout) + + import mlx.core as mx + + section("load + LoRA (fp32)") + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float32", + text_only=True, max_seq_length=128, random_state=SEED, + ) + model = FastMLXModel.get_peft_model( + model, + r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + ) + + # MLXTrainingConfig at pre-#634 does NOT have max_grad_value, so we + # only pass it if supported. dataclasses.fields tells us. 
+ import dataclasses + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra_kwargs = {} + if "max_grad_value" in fields_supported: + extra_kwargs["max_grad_value"] = None + config = MLXTrainingConfig( + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + max_steps=7, + learning_rate=1e-3, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=1.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=SEED, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / "probe12_outputs"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra_kwargs, + ) + + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + rows = [] + cb_arity_used = None + def _on_step_9(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens, grad_norm): + rows.append({ + "step": int(step), "loss": float(loss), + "grad_norm": None if grad_norm is None else float(grad_norm), + "num_tokens": int(num_tokens), + }) + def _on_step_8(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens): + rows.append({ + "step": int(step), "loss": float(loss), + "grad_norm": None, + "num_tokens": int(num_tokens), + }) + # pre-#634 callback arity was 8; post-#634 is 9. Try the higher + # arity first (post-#634); fall back to 8 if the trainer rejects it. 
+ try: + trainer.add_step_callback(_on_step_9) + cb_arity_used = 9 + except Exception: + trainer.add_step_callback(_on_step_8) + cb_arity_used = 8 + trainer.train() + + section("post-train forward") + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + section("greedy generation") + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "layout": layout, + "callback_arity_used": cb_arity_used, + "rows": rows, + "post_train_loss": post_loss_val, + "generation": gen, + "contains_unsloth": contains, + "dtype": "float32", + } + (OUT_DIR / "probe_12.json").write_text(json.dumps(out, indent=2)) + section("summary") + if rows: + report("step-1 loss", rows[0]["loss"]) + report("step-7 loss", rows[-1]["loss"]) + report("post_train_loss", post_loss_val) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 7b36a6ee80e8729856b45fa1ddc1fc618c0a3eee Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 12:29:25 +0000 Subject: [PATCH 07/84] mlx_parity: prune matrix to 4 Mac jobs, add probe 13 (pure mlx-lm) Mac runners cap at 5 parallel on the free tier. Cut the matrix to the 4 probes that produce new information from here on: * probe 10 - HF SFTTrainer on Mac CPU fp32 (control, passes) * probe 11 - MLX trainer fp32 (known failing) * probe 12 - MLX with unsloth-zoo pinned to parent of PR #634 * probe 13 - PURE mlx-lm inference, no unsloth: "What is 1+1?" and a 7-turn KV-cache-reuse conversation ("What did I ask as my first question?" etc.) 
probe 12 now also uses a variadic callback and dtype="float16" to exactly mirror the green-era smoke test config so its result is directly comparable to the historical CI runs. Other probes (1-9) remain on disk and can be rerun ad-hoc; their results from earlier runs are already pinned in this PR's job logs. --- .github/workflows/mlx-parity-probe.yml | 26 ++-- tests/mlx_parity/probe_12_zoo_prev634.py | 36 +++--- .../mlx_parity/probe_13_pure_mlx_inference.py | 114 ++++++++++++++++++ 3 files changed, 138 insertions(+), 38 deletions(-) create mode 100644 tests/mlx_parity/probe_13_pure_mlx_inference.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index fe2d4d2877..4e4e9d0e54 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -49,24 +49,10 @@ jobs: fail-fast: false matrix: include: - - id: '1' - script: probe_1_tokenization.py - - id: '2' - script: probe_2_forward_logits.py - - id: '3' - script: probe_3_loss_reduction.py - - id: '4' - script: probe_4_lora_init.py - - id: '5' - script: probe_5_single_grad.py - - id: '6' - script: probe_6_adamw_step.py - - id: '7' - script: probe_7_loss_curve.py - - id: '8' - script: probe_8_per_token_loss.py - - id: '9' - script: probe_9_attention_lengths.py + # Mac runners are capped at 5 parallel jobs on the free tier. + # Keep the matrix lean: only probes that produce NEW information + # we have not already pinned via earlier runs. Other probe + # scripts remain on disk for ad-hoc reruns. 
- id: '10' script: probe_10_hf_curve_control.py - id: '11' @@ -74,6 +60,8 @@ jobs: - id: '12' script: probe_12_zoo_prev634.py zoo_pin: 'f37d510' + - id: '13' + script: probe_13_pure_mlx_inference.py steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -159,7 +147,7 @@ jobs: echo "=== probe artifacts ===" ls -la probes/ || true echo - for n in 1 2 3 4 5 6 7 8 9 10 11 12; do + for n in 10 11 12 13; do echo "--- probe_${n}.json ---" cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" echo diff --git a/tests/mlx_parity/probe_12_zoo_prev634.py b/tests/mlx_parity/probe_12_zoo_prev634.py index 9aeaa58710..0e949c9022 100644 --- a/tests/mlx_parity/probe_12_zoo_prev634.py +++ b/tests/mlx_parity/probe_12_zoo_prev634.py @@ -69,9 +69,12 @@ def main() -> int: import mlx.core as mx - section("load + LoRA (fp32)") + # Mirror the SMOKE TEST AT 12295c1f exactly: dtype="float16" + identical LoRA + # config + identical hyperparams. We want to know if pre-#634 trainer + # behavior matches the green CI from that era. + section("load + LoRA (fp16, matches pre-#634 smoke)") model, tokenizer = FastMLXModel.from_pretrained( - MODEL_NAME, load_in_4bit=False, dtype="float32", + MODEL_NAME, load_in_4bit=False, dtype="float16", text_only=True, max_seq_length=128, random_state=SEED, ) model = FastMLXModel.get_peft_model( @@ -124,27 +127,22 @@ def main() -> int: ) rows = [] - cb_arity_used = None - def _on_step_9(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens, grad_norm): + # Variadic callback so we work for both pre-#634 (8 args) and + # post-#634 (9 args). The trainer wraps `cb(...)` in try/except + # Exception, so an arity mismatch on a fixed-arg callback would + # silently no-op the entire logging path. 
+ def _on_step(*args): + # args = (step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens[, grad_norm]) + if len(args) < 3: + return + step, _total, loss = args[0], args[1], args[2] + grad_norm = args[8] if len(args) >= 9 else None rows.append({ "step": int(step), "loss": float(loss), "grad_norm": None if grad_norm is None else float(grad_norm), - "num_tokens": int(num_tokens), }) - def _on_step_8(step, total, loss, lr, tok_s, peak_gb, elapsed, num_tokens): - rows.append({ - "step": int(step), "loss": float(loss), - "grad_norm": None, - "num_tokens": int(num_tokens), - }) - # pre-#634 callback arity was 8; post-#634 is 9. Try the higher - # arity first (post-#634); fall back to 8 if the trainer rejects it. - try: - trainer.add_step_callback(_on_step_9) - cb_arity_used = 9 - except Exception: - trainer.add_step_callback(_on_step_8) - cb_arity_used = 8 + trainer.add_step_callback(_on_step) + cb_arity_used = "variadic" trainer.train() section("post-train forward") diff --git a/tests/mlx_parity/probe_13_pure_mlx_inference.py b/tests/mlx_parity/probe_13_pure_mlx_inference.py new file mode 100644 index 0000000000..023325ea4b --- /dev/null +++ b/tests/mlx_parity/probe_13_pure_mlx_inference.py @@ -0,0 +1,114 @@ +"""Probe 13 — pure mlx-lm inference, NO unsloth involved. + +Two tests: + (a) one-shot: ask "What is 1+1?" and inspect the answer + (b) multi-turn with KV-cache reuse: walk a 7-turn conversation + that requires remembering earlier turns ("What did I ask as + my first question?", "What country did I ask about?", etc.) + +If pure mlx-lm answers correctly, the MLX runtime + the gemma-3-270m-it +weights are fine. The bug in the training path is then necessarily in +the unsloth-zoo MLX trainer wrapper, not in MLX itself. 
+""" + +import json +import sys + +from _common import MODEL_NAME, OUT_DIR, banner, section, report, seed_everything + + +TURNS = [ + "What is 1+1?", + "What is the capital of France?", + "What did I ask as my first question?", + "Create a short Python game", + "Fix bugs in it", + "What country did I ask about?", + "What number did you answer with?", +] + + +def main() -> int: + seed_everything() + banner("Probe 13: pure mlx-lm inference (no unsloth)") + + import mlx.core as mx + from mlx_lm import load as mlx_load, generate + try: + from mlx_lm.models.cache import make_prompt_cache + except Exception: + make_prompt_cache = None + + section("load model") + model, tokenizer = mlx_load(MODEL_NAME) + report("tokenizer class", type(tokenizer).__name__) + + section("(a) one-shot: 'What is 1+1?'") + one_shot_prompt = "What is 1+1?" + if hasattr(tokenizer, "apply_chat_template"): + try: + one_shot_prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": "What is 1+1?"}], + tokenize=False, + add_generation_prompt=True, + ) + except Exception as e: + report("chat_template error -- using raw prompt", str(e)) + out_one_shot = generate(model, tokenizer, prompt=one_shot_prompt, max_tokens=48, verbose=False) + report("answer", repr(out_one_shot)) + + section("(b) multi-turn with KV-cache reuse") + multi_turn_log = [] + history = [] + cache = None + for turn_idx, user_msg in enumerate(TURNS): + history.append({"role": "user", "content": user_msg}) + try: + prompt = tokenizer.apply_chat_template( + history, tokenize=False, add_generation_prompt=True, + ) + except Exception: + prompt = "\n".join(f"{m['role']}: {m['content']}" for m in history) + "\nassistant:" + # For KV-cache reuse: feed only the NEW suffix on subsequent turns. + # mlx-lm's generate accepts `prompt_cache` since 0.18+; if it does, + # we maintain `cache` across turns to demonstrate true reuse. 
+ gen_kwargs = dict(max_tokens=64, verbose=False) + if cache is not None: + gen_kwargs["prompt_cache"] = cache + else: + if make_prompt_cache is not None: + try: + cache = make_prompt_cache(model) + gen_kwargs["prompt_cache"] = cache + except Exception as e: + cache = None + report("cache init error", str(e)) + try: + answer = generate(model, tokenizer, prompt=prompt, **gen_kwargs) + except TypeError: + # mlx-lm older API: no prompt_cache kwarg, fall back without it. + gen_kwargs.pop("prompt_cache", None) + cache = None + answer = generate(model, tokenizer, prompt=prompt, **gen_kwargs) + history.append({"role": "assistant", "content": answer}) + multi_turn_log.append({ + "turn": turn_idx + 1, + "user": user_msg, + "assistant": answer, + "kv_reuse": cache is not None, + }) + report(f"turn {turn_idx+1} user", user_msg) + report(f"turn {turn_idx+1} assistant", repr(answer[:140])) + + out = { + "one_shot_prompt": "What is 1+1?", + "one_shot_answer": out_one_shot, + "multi_turn": multi_turn_log, + "kv_reuse_used": cache is not None, + } + (OUT_DIR / "probe_13.json").write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From f3f95030aea1fc8bce5f820503121107e0f6759e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 12:41:52 +0000 Subject: [PATCH 08/84] mlx_parity: probe 14 + 15 -- test two fix candidates in parallel probe 14 -- unsloth-zoo branch try-bias-correction-false (PR #663 + bias_correction flipped back to MLX default). probe 15 -- unsloth-zoo branch fix-mlx-grad-clip-hf-parity (PR #663 only, bias_correction still True). 
Anchors retained: probe 11 -- HEAD red anchor (post-#634, fp32, fails generation) probe 12 -- pre-#634 green anchor (f37d510, fp16, generates Unsloth) Together these tell us: * 14 succeeds, 15 fails -> bias_correction is the only knob * 14 succeeds, 15 succeeds -> PR #663 alone is sufficient * both fail -> there's a second regression we still need to find --- .github/workflows/mlx-parity-probe.yml | 18 ++- .../probe_14_zoo_bias_correction_false.py | 140 ++++++++++++++++++ tests/mlx_parity/probe_15_zoo_pr663.py | 131 ++++++++++++++++ 3 files changed, 281 insertions(+), 8 deletions(-) create mode 100644 tests/mlx_parity/probe_14_zoo_bias_correction_false.py create mode 100644 tests/mlx_parity/probe_15_zoo_pr663.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 4e4e9d0e54..d0ec02e45a 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -50,18 +50,20 @@ jobs: matrix: include: # Mac runners are capped at 5 parallel jobs on the free tier. - # Keep the matrix lean: only probes that produce NEW information - # we have not already pinned via earlier runs. Other probe - # scripts remain on disk for ad-hoc reruns. - - id: '10' - script: probe_10_hf_curve_control.py + # Active matrix: two known anchors + two fix candidates. + # 12 is the green anchor (pre-#634), 11 is the red anchor (HEAD). + # 14 + 15 test fix candidates against those anchors. 
- id: '11' script: probe_11_mlx_fp32_curve.py - id: '12' script: probe_12_zoo_prev634.py zoo_pin: 'f37d510' - - id: '13' - script: probe_13_pure_mlx_inference.py + - id: '14' + script: probe_14_zoo_bias_correction_false.py + zoo_pin: 'try-bias-correction-false' + - id: '15' + script: probe_15_zoo_pr663.py + zoo_pin: 'fix-mlx-grad-clip-hf-parity' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -147,7 +149,7 @@ jobs: echo "=== probe artifacts ===" ls -la probes/ || true echo - for n in 10 11 12 13; do + for n in 11 12 14 15; do echo "--- probe_${n}.json ---" cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" echo diff --git a/tests/mlx_parity/probe_14_zoo_bias_correction_false.py b/tests/mlx_parity/probe_14_zoo_bias_correction_false.py new file mode 100644 index 0000000000..5eacc8f4e7 --- /dev/null +++ b/tests/mlx_parity/probe_14_zoo_bias_correction_false.py @@ -0,0 +1,140 @@ +"""Probe 14 — pin unsloth-zoo to `try-bias-correction-false` and rerun. + +Hypothesis: PR #634 flipped MLX AdamW `bias_correction` from False to +True (matching torch.AdamW). With bias_correction=True step-1 updates +are ~3x smaller than the historical MLX default; the 7-step smoke +never reaches the "Unsloth" basin. + +This probe installs unsloth-zoo from the experimental branch +`try-bias-correction-false` (which sits on top of PR #663 and ONLY +reverts bias_correction back to False) and re-runs the standard 7-step +config in fp16, byte-matched to the green-era smoke test. + +Outcome: + * generates "Unsloth" => bias_correction=True is the breakage. + * still gibberish => there is a second regression inside #634. 
+""" + +import json +import sys + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 14: MLX with bias_correction=False (experimental fix branch)") + + import mlx.core as mx + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + section("load + LoRA (fp16, smoke parity)") + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float16", + text_only=True, max_seq_length=128, random_state=SEED, + ) + model = FastMLXModel.get_peft_model( + model, + r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + ) + + config = MLXTrainingConfig( + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + max_steps=7, + learning_rate=1e-3, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=1.0, + max_grad_value=None, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=SEED, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / "probe14_outputs"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + rows = [] + def _on_step(*args): + if len(args) < 3: + return + rows.append({ + "step": int(args[0]), + "loss": float(args[2]), + "grad_norm": float(args[8]) if len(args) >= 9 and args[8] is not None else None, + }) + trainer.add_step_callback(_on_step) + trainer.train() + + section("post-train forward") + loss_fn = 
make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + section("greedy generation") + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "branch": "try-bias-correction-false", + "bias_correction": False, + "rows": rows, + "post_train_loss": post_loss_val, + "generation": gen, + "contains_unsloth": contains, + } + (OUT_DIR / "probe_14.json").write_text(json.dumps(out, indent=2)) + section("summary") + if rows: + report("step-1 loss", rows[0]["loss"]) + report("step-7 loss", rows[-1]["loss"]) + report("post_train_loss", post_loss_val) + report("contains 'Unsloth'", contains) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_15_zoo_pr663.py b/tests/mlx_parity/probe_15_zoo_pr663.py new file mode 100644 index 0000000000..b473e22603 --- /dev/null +++ b/tests/mlx_parity/probe_15_zoo_pr663.py @@ -0,0 +1,131 @@ +"""Probe 15 — pin unsloth-zoo to PR #663 head (max_grad_value=None only). + +PR #663 fixes the silent override of max_grad_norm by max_grad_value +but leaves every other #634 change intact (including bias_correction=True). + +If probe 15 generates "Unsloth", #663 alone is sufficient and we don't +need the bias_correction flip in probe 14. + +If probe 15 fails (and probe 14 succeeds), #663 + bias_correction=False +is the minimal fix and PR #663 alone is NOT enough to green CI. 
+""" + +import json +import sys + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 15: MLX with PR #663 only (max_grad_value=None, bias_correction=True)") + + import mlx.core as mx + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float16", + text_only=True, max_seq_length=128, random_state=SEED, + ) + model = FastMLXModel.get_peft_model( + model, + r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=SEED, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + ) + + config = MLXTrainingConfig( + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + max_steps=7, + learning_rate=1e-3, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=1.0, + max_grad_value=None, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=SEED, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / "probe15_outputs"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + rows = [] + def _on_step(*args): + if len(args) < 3: + return + rows.append({ + "step": int(args[0]), + "loss": float(args[2]), + "grad_norm": float(args[8]) if len(args) >= 9 and args[8] is not None else None, + }) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if 
tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "branch": "fix-mlx-grad-clip-hf-parity", + "bias_correction": True, + "rows": rows, + "post_train_loss": post_loss_val, + "generation": gen, + "contains_unsloth": contains, + } + (OUT_DIR / "probe_15.json").write_text(json.dumps(out, indent=2)) + section("summary") + if rows: + report("step-1 loss", rows[0]["loss"]) + report("step-7 loss", rows[-1]["loss"]) + report("post_train_loss", post_loss_val) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From a61bf0f21134e617d3d805df06680ac70f319542 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 12:50:40 +0000 Subject: [PATCH 09/84] mlx_parity: probe 16 -- mlx-lm NATIVE LoRA trainer, no unsloth If mlx-lm's own CLI can train this model in 7 iters and emit "Unsloth", upstream MLX is healthy and the entire regression is inside the unsloth-zoo wrapper. Closes the loop on "did MLX ever work" by exercising the upstream training surface that has zero unsloth code path. Spawns `python -m mlx_lm lora --train ...` as a subprocess, parses per-iter losses from stdout, loads the trained adapter, greedy- decodes the standard prompt. 
--- .github/workflows/mlx-parity-probe.yml | 4 +- .../mlx_parity/probe_16_mlx_lm_native_lora.py | 132 ++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 tests/mlx_parity/probe_16_mlx_lm_native_lora.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index d0ec02e45a..6386816f33 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -64,6 +64,8 @@ jobs: - id: '15' script: probe_15_zoo_pr663.py zoo_pin: 'fix-mlx-grad-clip-hf-parity' + - id: '16' + script: probe_16_mlx_lm_native_lora.py steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -149,7 +151,7 @@ jobs: echo "=== probe artifacts ===" ls -la probes/ || true echo - for n in 11 12 14 15; do + for n in 11 12 14 15 16; do echo "--- probe_${n}.json ---" cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" echo diff --git a/tests/mlx_parity/probe_16_mlx_lm_native_lora.py b/tests/mlx_parity/probe_16_mlx_lm_native_lora.py new file mode 100644 index 0000000000..745c1a1e8c --- /dev/null +++ b/tests/mlx_parity/probe_16_mlx_lm_native_lora.py @@ -0,0 +1,132 @@ +"""Probe 16 — train with mlx-lm's NATIVE LoRA trainer, no unsloth at all. + +If mlx_lm.lora can train this model on the same data and generate +"Unsloth", upstream MLX + the gemma-3-270m-it weights are healthy and +the entire regression is inside the unsloth-zoo MLX trainer wrapper. + +We invoke `python -m mlx_lm lora --train ...` as a subprocess because +the mlx-lm CLI is the canonical entry point. Training writes adapter +files to a temp directory; we then load model + adapter via mlx_lm +and greedy-decode the standard prompt. + +Always exits 0 -- data dump. 
+""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 16: mlx-lm NATIVE LoRA trainer (no unsloth)") + + workdir = Path(tempfile.mkdtemp(prefix="probe16_")) + data_dir = workdir / "data" + adapter_dir = workdir / "adapters" + data_dir.mkdir(parents=True, exist_ok=True) + adapter_dir.mkdir(parents=True, exist_ok=True) + + # mlx-lm's lora trainer expects train.jsonl + valid.jsonl in the data dir + # in "completions" / "chat" / "text" format. Use "text" format for the + # closest analog to the smoke test: a flat string per row. + train_rows = [{"text": TRAIN_TEXT} for _ in range(64)] + valid_rows = [{"text": TRAIN_TEXT}] # 1 row for validation + (data_dir / "train.jsonl").write_text("\n".join(json.dumps(r) for r in train_rows) + "\n") + (data_dir / "valid.jsonl").write_text("\n".join(json.dumps(r) for r in valid_rows) + "\n") + report("data dir", str(data_dir)) + report("adapter dir", str(adapter_dir)) + + # Run the mlx-lm LoRA trainer. Match the smoke test hyperparameters + # as closely as the mlx_lm CLI permits. 
+ cmd = [ + sys.executable, "-m", "mlx_lm", "lora", + "--train", + "--model", MODEL_NAME, + "--data", str(data_dir), + "--adapter-path", str(adapter_dir), + "--iters", "7", + "--batch-size", "2", + "--learning-rate", "1e-3", + "--num-layers", "-1", # train all layers' LoRA + "--steps-per-report", "1", + "--steps-per-eval", "100", # skip eval inside 7 iters + "--seed", str(SEED), + ] + section("invoke mlx_lm.lora trainer") + report("cmd", " ".join(cmd)) + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=600) + report("returncode", proc.returncode) + print("--- mlx_lm.lora stdout ---") + print(proc.stdout) + print("--- mlx_lm.lora stderr ---") + print(proc.stderr) + + losses_per_step = [] + for line in (proc.stdout + "\n" + proc.stderr).splitlines(): + # mlx_lm prints lines like: + # "Iter 1: Train loss 10.123, Learning Rate 1.000e-03, It/sec 1.23, ..." + if "Iter " in line and "Train loss" in line: + try: + num = float(line.split("Train loss")[1].strip().split(",")[0].strip()) + losses_per_step.append(num) + except Exception: + pass + + report("parsed losses", losses_per_step) + + section("load + generate") + from mlx_lm import load as mlx_load, generate + # Pass the adapter dir to mlx_load via the adapter_path kwarg + try: + model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir)) + except TypeError: + # older mlx-lm signature + model, tokenizer = mlx_load(MODEL_NAME) + try: + from mlx_lm.tuner.utils import load_adapters + load_adapters(model, str(adapter_dir)) + except Exception as e: + report("adapter load fallback failed", str(e)) + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "cmd": cmd, + "returncode": proc.returncode, + "losses": losses_per_step, + "generation": gen, + "contains_unsloth": contains, + "stdout_tail": proc.stdout[-2000:], + "stderr_tail": proc.stderr[-2000:], + } 
+ (OUT_DIR / "probe_16.json").write_text(json.dumps(out, indent=2)) + + try: + shutil.rmtree(workdir, ignore_errors=True) + except Exception: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 796abd93cbab301009efd76d504c65e286bd4734 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 13:03:55 +0000 Subject: [PATCH 10/84] mlx_parity/probe_16: pad valid.jsonl so mlx_lm.lora loader accepts it mlx_lm.lora's dataset loader rejects validation sets smaller than batch_size. Write 4 rows instead of 1; training itself still happens on the 64-row train.jsonl as before. --- tests/mlx_parity/probe_16_mlx_lm_native_lora.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/mlx_parity/probe_16_mlx_lm_native_lora.py b/tests/mlx_parity/probe_16_mlx_lm_native_lora.py index 745c1a1e8c..580166443b 100644 --- a/tests/mlx_parity/probe_16_mlx_lm_native_lora.py +++ b/tests/mlx_parity/probe_16_mlx_lm_native_lora.py @@ -47,7 +47,8 @@ def main() -> int: # in "completions" / "chat" / "text" format. Use "text" format for the # closest analog to the smoke test: a flat string per row. train_rows = [{"text": TRAIN_TEXT} for _ in range(64)] - valid_rows = [{"text": TRAIN_TEXT}] # 1 row for validation + # mlx_lm.lora's loader rejects validation sets smaller than batch_size. + valid_rows = [{"text": TRAIN_TEXT} for _ in range(4)] (data_dir / "train.jsonl").write_text("\n".join(json.dumps(r) for r in train_rows) + "\n") (data_dir / "valid.jsonl").write_text("\n".join(json.dumps(r) for r in valid_rows) + "\n") report("data dir", str(data_dir)) From 4acc5e080c4cabcdca800df19dc37cfc8f434f9f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 13:28:08 +0000 Subject: [PATCH 11/84] mlx_parity: extended probes -- longer steps, seeds, mlx-lm 50 iters User asked: is MLX itself broken or is post-#634 just at the wrong side of the convergence horizon for 7 steps? 
probe_17_curve_param.py is a parameterized MLX-trainer curve probe that reads (MLX_STEPS, MLX_SEED, MLX_DTYPE, MLX_BIAS_CORRECTION) from env. Matrix runs four variants: 17a HEAD, 30 steps, seed=3407, bc=True long training, canon seed 17b HEAD, 7 steps, seed=42, bc=True short training, alt seed 17c HEAD, 30 steps, seed=42, bc=True long + alt seed 17d PR #663, 30 steps, seed=3407, bc=False control (proven fix path) probe_18 runs `python -m mlx_lm lora --train --iters 50` (no unsloth at all, upstream MLX framework end to end). Question matrix: 17a +> 17a passes -> MLX healthy, 7 steps insufficient with bc=True 17a +> 17a fails -> MLX has a deeper issue 17b vs 17a -> seed sensitivity 17c -> covers both axes 17d -> PR #663 still works at longer training 18 -> upstream MLX trainer convergence behavior --- .github/workflows/mlx-parity-probe.yml | 72 ++++++--- tests/mlx_parity/probe_17_curve_param.py | 183 +++++++++++++++++++++++ tests/mlx_parity/probe_18_mlx_lm_long.py | 125 ++++++++++++++++ 3 files changed, 357 insertions(+), 23 deletions(-) create mode 100644 tests/mlx_parity/probe_17_curve_param.py create mode 100644 tests/mlx_parity/probe_18_mlx_lm_long.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 6386816f33..5ab143ea0d 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -49,23 +49,43 @@ jobs: fail-fast: false matrix: include: - # Mac runners are capped at 5 parallel jobs on the free tier. - # Active matrix: two known anchors + two fix candidates. - # 12 is the green anchor (pre-#634), 11 is the red anchor (HEAD). - # 14 + 15 test fix candidates against those anchors. 
- - id: '11' - script: probe_11_mlx_fp32_curve.py - - id: '12' - script: probe_12_zoo_prev634.py - zoo_pin: 'f37d510' - - id: '14' - script: probe_14_zoo_bias_correction_false.py - zoo_pin: 'try-bias-correction-false' - - id: '15' - script: probe_15_zoo_pr663.py + # ---- Extended-investigation matrix (5 Mac M1 jobs in parallel) ---- + # Question: is MLX itself broken, or does post-#634 just need more + # steps / different seeds to memorize? + # + # Each probe_17 entry runs the unsloth-zoo HEAD trainer + # (broken default until #663 merges) with a different + # (steps, seed) combination so we can rule in/out a + # convergence-horizon effect. + + # probe_17a: longer training -- 30 steps, canonical seed, HEAD default + - id: '17a' + script: probe_17_curve_param.py + steps: '30' + seed: '3407' + bias_correction: '1' + # probe_17b: different seed -- 7 steps, seed=42, HEAD default + - id: '17b' + script: probe_17_curve_param.py + steps: '7' + seed: '42' + bias_correction: '1' + # probe_17c: combined -- 30 steps + seed=42, HEAD default + - id: '17c' + script: probe_17_curve_param.py + steps: '30' + seed: '42' + bias_correction: '1' + # probe_17d: control with PR #663 head (bc=False default), 30 steps + - id: '17d' + script: probe_17_curve_param.py + steps: '30' + seed: '3407' + bias_correction: '0' zoo_pin: 'fix-mlx-grad-clip-hf-parity' - - id: '16' - script: probe_16_mlx_lm_native_lora.py + # probe_18: mlx-lm native LoRA, 50 iters (upstream MLX trainer, long) + - id: '18' + script: probe_18_mlx_lm_long.py steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -115,22 +135,28 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} UNSLOTH_COMPILE_DISABLE: '1' + MLX_STEPS: ${{ matrix.steps }} + MLX_SEED: ${{ matrix.seed }} + MLX_DTYPE: ${{ matrix.dtype }} + MLX_BIAS_CORRECTION: ${{ matrix.bias_correction }} run: | cd tests/mlx_parity && python ${{ matrix.script }} - name: Show JSON output if: always() run: | - 
F=tests/mlx_parity/.out/probe_${{ matrix.id }}.json - echo "=== ${F} ===" - cat "${F}" 2>/dev/null || echo "(no JSON written)" + echo "=== probe ${{ matrix.id }} JSON output(s) ===" + for f in tests/mlx_parity/.out/probe_*.json; do + echo "--- ${f} ---" + cat "$f" 2>/dev/null || true + done - name: Upload probe artifact if: always() uses: actions/upload-artifact@v4 with: name: probe-${{ matrix.id }} - path: tests/mlx_parity/.out/probe_${{ matrix.id }}.json + path: tests/mlx_parity/.out/probe_*.json if-no-files-found: warn aggregate: @@ -151,8 +177,8 @@ jobs: echo "=== probe artifacts ===" ls -la probes/ || true echo - for n in 11 12 14 15 16; do - echo "--- probe_${n}.json ---" - cat probes/probe_${n}.json 2>/dev/null || echo "(missing -- probe ${n} job did not produce output)" + for f in probes/probe_*.json; do + echo "--- ${f} ---" + cat "$f" 2>/dev/null || echo "(empty)" echo done diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py new file mode 100644 index 0000000000..9308e0ec1d --- /dev/null +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -0,0 +1,183 @@ +"""Probe 17 — parameterized 7+ step MLX training curve. + +Reads env vars so a single matrix entry can be reused with different +(steps, seed, dtype, bias_correction) combinations: + + MLX_STEPS max_steps for MLXTrainer (default 7) + MLX_SEED seed for everything (default 3407) + MLX_DTYPE dtype string for FastMLXModel.from_pretrained + (default "float16") + MLX_BIAS_CORRECTION "1"/"true" -> adam_bias_correction=True + "0"/"false" (default) -> False + +Pin: unsloth-zoo HEAD (broken default at the time the question was +asked) so this probe directly characterizes how the post-#634 code +behaves under longer training / other seeds. + +The probe writes a per-config JSON to .out/probe_17__steps{S}_seed{D}_bc{0/1}.json +so the matrix's `outputs: filename` path is unique. 
+ +Question this answers: + * does increasing max_steps eventually let bias_correction=True + memorize the train row? If yes, MLX is healthy and 7 steps is + just too short for the HF/torch math. + * does varying the seed (data shuffle, LoRA init) change the + basin? If multiple seeds all fail at 7 steps + bc=True, the + issue is structural, not lucky/unlucky init. + +Always exits 0 -- data dump. +""" + +import json +import os +import sys + +import numpy as np + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, +) + + +def _env_bool(name, default=False): + raw = os.environ.get(name, str(default)).strip().lower() + return raw in ("1", "true", "yes", "y") + + +def _env_int(name, default): + try: + return int(os.environ.get(name, default)) + except ValueError: + return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 7) + seed = _env_int("MLX_SEED", 3407) + dtype = os.environ.get("MLX_DTYPE", "float16") + bc = _env_bool("MLX_BIAS_CORRECTION", False) + + banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc}") + + import random + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + import dataclasses + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype=dtype, + text_only=True, max_seq_length=128, random_state=seed, + ) + model = FastMLXModel.get_peft_model( + model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=seed, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + ) + + # Only set adam_bias_correction if the field exists on this version + # of unsloth-zoo. 
HEAD (pre-PR-663) does not have it -- it forces + # True unconditionally, so MLX_BIAS_CORRECTION=0 on HEAD has no + # effect and the run characterizes the upstream broken default. + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = bc + + config = MLXTrainingConfig( + per_device_train_batch_size=2, + gradient_accumulation_steps=3, + max_steps=steps, + learning_rate=1e-3, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=1.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe17_outputs_s{steps}_d{seed}_bc{int(bc)}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: + return + rows.append({ + "step": int(args[0]), + "loss": float(args[2]), + "grad_norm": float(args[8]) if len(args) >= 9 and args[8] is not None else None, + }) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "dtype": dtype, + 
"adam_bias_correction": bc, + "adam_bc_field_supported": "adam_bias_correction" in fields_supported, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_17__s{steps}_d{seed}_bc{int(bc)}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + section("summary") + if rows: + report("step-1 loss", rows[0]["loss"]) + report(f"step-{len(rows)} loss", rows[-1]["loss"]) + report("post_train_loss", post_loss_val) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_18_mlx_lm_long.py b/tests/mlx_parity/probe_18_mlx_lm_long.py new file mode 100644 index 0000000000..afe37a1ccd --- /dev/null +++ b/tests/mlx_parity/probe_18_mlx_lm_long.py @@ -0,0 +1,125 @@ +"""Probe 18 — mlx-lm NATIVE LoRA trainer, 50 iters (long). + +Probe 16 trained for 7 iters and emitted "slslsl..." (no Unsloth). +That's the same iteration count as the upstream smoke; mlx-lm's +recipe + bias_correction=False MLX default may need longer. + +Train for 50 iters with mlx_lm.lora and inspect: + * does loss drop? + * does the trained adapter eventually generate "Unsloth"? + +If yes: MLX framework + mlx-lm native trainer can memorize the row +when given enough steps; the 7-step smoke just sits at the wrong +side of the convergence horizon for mlx-lm's recipe. + +If no: mlx-lm's native LoRA recipe (different LoRA targets, different +loss masking) lands somewhere else entirely, and that's a recipe +issue, not an MLX-framework issue. + +Always exits 0 -- data dump. 
+""" + +import json +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + SEED, + OUT_DIR, + banner, + section, + report, + seed_everything, +) + + +def main() -> int: + seed_everything() + banner("Probe 18: mlx-lm NATIVE LoRA trainer, 50 iters") + + workdir = Path(tempfile.mkdtemp(prefix="probe18_")) + data_dir = workdir / "data" + adapter_dir = workdir / "adapters" + data_dir.mkdir(parents=True, exist_ok=True) + adapter_dir.mkdir(parents=True, exist_ok=True) + + train_rows = [{"text": TRAIN_TEXT} for _ in range(64)] + valid_rows = [{"text": TRAIN_TEXT} for _ in range(4)] + (data_dir / "train.jsonl").write_text("\n".join(json.dumps(r) for r in train_rows) + "\n") + (data_dir / "valid.jsonl").write_text("\n".join(json.dumps(r) for r in valid_rows) + "\n") + + cmd = [ + sys.executable, "-m", "mlx_lm", "lora", + "--train", + "--model", MODEL_NAME, + "--data", str(data_dir), + "--adapter-path", str(adapter_dir), + "--iters", "50", + "--batch-size", "2", + "--learning-rate", "1e-3", + "--num-layers", "-1", + "--steps-per-report", "5", + "--steps-per-eval", "200", + "--seed", str(SEED), + ] + section("invoke mlx_lm.lora trainer (50 iters)") + report("cmd", " ".join(cmd)) + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=1200) + report("returncode", proc.returncode) + print("--- mlx_lm.lora stdout tail ---") + print(proc.stdout[-4000:]) + print("--- mlx_lm.lora stderr tail ---") + print(proc.stderr[-2000:]) + + losses_per_step = [] + for line in (proc.stdout + "\n" + proc.stderr).splitlines(): + if "Iter " in line and "Train loss" in line: + try: + num = float(line.split("Train loss")[1].strip().split(",")[0].strip()) + losses_per_step.append(num) + except Exception: + pass + report("parsed losses", losses_per_step) + + from mlx_lm import load as mlx_load, generate + try: + model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir)) + except 
TypeError: + model, tokenizer = mlx_load(MODEL_NAME) + try: + from mlx_lm.tuner.utils import load_adapters + load_adapters(model, str(adapter_dir)) + except Exception as e: + report("adapter load fallback failed", str(e)) + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "cmd": cmd, + "returncode": proc.returncode, + "iters": 50, + "losses": losses_per_step, + "generation": gen, + "contains_unsloth": contains, + "stdout_tail": proc.stdout[-2000:], + "stderr_tail": proc.stderr[-2000:], + } + (OUT_DIR / "probe_18.json").write_text(json.dumps(out, indent=2)) + try: + shutil.rmtree(workdir, ignore_errors=True) + except Exception: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 49b6e9d3949063daf90bc72d076f52807028520f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 13:39:04 +0000 Subject: [PATCH 12/84] mlx_parity/probe_17: treat empty env-vars as defaults Matrix entries without a key pass empty strings via env, not unset vars; os.environ.get fell through to the wrong path and FastMLXModel got dtype="". Strip + fall back to defaults explicitly. 
--- tests/mlx_parity/probe_17_curve_param.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py index 9308e0ec1d..db71d547ce 100644 --- a/tests/mlx_parity/probe_17_curve_param.py +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -47,21 +47,31 @@ def _env_bool(name, default=False): - raw = os.environ.get(name, str(default)).strip().lower() + raw = (os.environ.get(name) or "").strip().lower() + if not raw: + return default return raw in ("1", "true", "yes", "y") def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default try: - return int(os.environ.get(name, default)) + return int(raw) except ValueError: return default +def _env_str(name, default): + raw = (os.environ.get(name) or "").strip() + return raw if raw else default + + def main() -> int: steps = _env_int("MLX_STEPS", 7) seed = _env_int("MLX_SEED", 3407) - dtype = os.environ.get("MLX_DTYPE", "float16") + dtype = _env_str("MLX_DTYPE", "float16") bc = _env_bool("MLX_BIAS_CORRECTION", False) banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc}") From b721a0f20b5e785018407f7d7ea1e2258904579c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 14:04:57 +0000 Subject: [PATCH 13/84] mlx_parity Round B: scan bc=True convergence boundary + 2 more seeds Round A flipped the working hypothesis: HEAD bc=True with 30 steps emits "Unsloth" (probes 17a, 17c), but PR #663 bc=False with 30 steps fails (probe 17d, post_loss=2.25). So the "fix" of flipping bias_correction back was wrong -- bc=True is the right math; the 7-step smoke just sat at the wrong side of the convergence horizon. 
Round B finds the cutover and verifies seed-robustness: 17e HEAD, 15 steps, seed=3407 17f HEAD, 20 steps, seed=3407 17g HEAD, 50 steps, seed=3407 (stability check past convergence) 17h HEAD, 30 steps, seed=999 17i HEAD, 30 steps, seed=1337 Also fix artifact upload to include the whole .out/ directory so per-config JSONs (probe_17__s{N}_d{S}_bc{B}.json) are captured. --- .github/workflows/mlx-parity-probe.yml | 42 ++++++++++++++------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 5ab143ea0d..ad9ad3933e 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,34 +58,35 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # probe_17a: longer training -- 30 steps, canonical seed, HEAD default - - id: '17a' + # Round B: find the bc=True convergence boundary + verify + # stability across more seeds. Round A established that + # bc=True with 30 steps reliably emits "Unsloth"; 7 steps does + # not. Where is the cutover? 
+ - id: '17e' script: probe_17_curve_param.py - steps: '30' + steps: '15' seed: '3407' bias_correction: '1' - # probe_17b: different seed -- 7 steps, seed=42, HEAD default - - id: '17b' + - id: '17f' script: probe_17_curve_param.py - steps: '7' - seed: '42' + steps: '20' + seed: '3407' bias_correction: '1' - # probe_17c: combined -- 30 steps + seed=42, HEAD default - - id: '17c' + - id: '17g' + script: probe_17_curve_param.py + steps: '50' + seed: '3407' + bias_correction: '1' + - id: '17h' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '999' bias_correction: '1' - # probe_17d: control with PR #663 head (bc=False default), 30 steps - - id: '17d' + - id: '17i' script: probe_17_curve_param.py steps: '30' - seed: '3407' - bias_correction: '0' - zoo_pin: 'fix-mlx-grad-clip-hf-parity' - # probe_18: mlx-lm native LoRA, 50 iters (upstream MLX trainer, long) - - id: '18' - script: probe_18_mlx_lm_long.py + seed: '1337' + bias_correction: '1' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -156,8 +157,11 @@ jobs: uses: actions/upload-artifact@v4 with: name: probe-${{ matrix.id }} - path: tests/mlx_parity/.out/probe_*.json + # Upload whole .out/ so probe scripts can write any filename + # (probe 17 writes per-config JSONs like probe_17__s30_d42_bc1.json) + path: tests/mlx_parity/.out/ if-no-files-found: warn + include-hidden-files: true aggregate: name: aggregate From ba44388afbdfe12a63e11a5a4085cc6521fb7fbb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 14:23:36 +0000 Subject: [PATCH 14/84] mlx_parity Round C: zoom in on bc=True convergence boundary Round B: 7 no, 15 yes, 20 yes, 30 yes (4 seeds), 50 no. Round C narrows the bounds: 17j 10 steps - lower bound at 12 or below? 17k 12 steps - ditto 17l 25 steps - sanity check mid-range 17m 35 steps - upper bound at 40 or below? 17n 40 steps - ditto After this we'll know the smoke test's exact safe step window. 
--- .github/workflows/mlx-parity-probe.yml | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index ad9ad3933e..1ae198c02d 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,34 +58,34 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round B: find the bc=True convergence boundary + verify - # stability across more seeds. Round A established that - # bc=True with 30 steps reliably emits "Unsloth"; 7 steps does - # not. Where is the cutover? - - id: '17e' + # Round C: zoom in on the bc=True convergence boundary. + # Round B established: 7 no, 15 yes, 20 yes, 30 yes, 50 no. + # Round C asks: where exactly does the "Unsloth" basin + # start and end with bc=True default? + - id: '17j' script: probe_17_curve_param.py - steps: '15' + steps: '10' seed: '3407' bias_correction: '1' - - id: '17f' + - id: '17k' script: probe_17_curve_param.py - steps: '20' + steps: '12' seed: '3407' bias_correction: '1' - - id: '17g' + - id: '17l' script: probe_17_curve_param.py - steps: '50' + steps: '25' seed: '3407' bias_correction: '1' - - id: '17h' + - id: '17m' script: probe_17_curve_param.py - steps: '30' - seed: '999' + steps: '35' + seed: '3407' bias_correction: '1' - - id: '17i' + - id: '17n' script: probe_17_curve_param.py - steps: '30' - seed: '1337' + steps: '40' + seed: '3407' bias_correction: '1' steps: - name: Harden runner (audit) From 0ca06a6672cdf7ab3804415e92c77394b906eaae Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 14:44:38 +0000 Subject: [PATCH 15/84] mlx_parity Round D: lock in steps=20 across seeds + 50-step stability Round C established the bc=True basin lives at steps in [15, 40] for seed=3407. The smoke test fix in PR #5498 picks 20. Round D verifies: 17o steps=20, seed=42 -- does 20 work on a different seed? 
17p steps=20, seed=999 17q steps=20, seed=1337 17r steps=50, seed=42 -- is 50-step failure seed-specific? 17s steps=100 -- does the loss eventually re-stabilize? If 17o/p/q all generate "Unsloth", PR #5498's max_steps=20 is seed-robust. If 17r generates "Unsloth", the 50-step failure on seed=3407 is a single-seed quirk and the upper boundary is wider than first thought. --- .github/workflows/mlx-parity-probe.yml | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 1ae198c02d..d462bc51df 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,33 +58,33 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round C: zoom in on the bc=True convergence boundary. - # Round B established: 7 no, 15 yes, 20 yes, 30 yes, 50 no. - # Round C asks: where exactly does the "Unsloth" basin - # start and end with bc=True default? - - id: '17j' + # Round D: lock in max_steps=20 across multiple seeds + check + # 50/100 are the actual upper failure boundary (50 failed on + # seed 3407; does that hold across seeds, or is it a 3407 + # quirk?). Also a third LR to characterize sensitivity. 
+ - id: '17o' script: probe_17_curve_param.py - steps: '10' - seed: '3407' + steps: '20' + seed: '42' bias_correction: '1' - - id: '17k' + - id: '17p' script: probe_17_curve_param.py - steps: '12' - seed: '3407' + steps: '20' + seed: '999' bias_correction: '1' - - id: '17l' + - id: '17q' script: probe_17_curve_param.py - steps: '25' - seed: '3407' + steps: '20' + seed: '1337' bias_correction: '1' - - id: '17m' + - id: '17r' script: probe_17_curve_param.py - steps: '35' - seed: '3407' + steps: '50' + seed: '42' bias_correction: '1' - - id: '17n' + - id: '17s' script: probe_17_curve_param.py - steps: '40' + steps: '100' seed: '3407' bias_correction: '1' steps: From c1bc687a3853399f9e70727c929e7a55a6911a1e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 15:11:18 +0000 Subject: [PATCH 16/84] mlx_parity Round E: max_steps=30 seed-robustness + LR sweep Round D showed seed=1337 still fails at 20 steps even though it worked at 30, and the bc=True basin re-enters memorization at 100 steps after failing at 50. Round E: 17t: reproduce seed=1337 + 20 steps failure (control) 17u: 30 steps + seed=3407 + lr=5e-4 (does smaller LR escape pit?) 17v: 30 steps + seed=3407 + lr=2e-3 (does larger LR escape pit?) 17w: 30 steps + seed=12345 (5th seed at the chosen max_steps) 17x: 30 steps + seed=7777 (6th seed at the chosen max_steps) Wires MLX_LR through probe_17_curve_param.py and tags the per- config artifact filename with the LR so 17u/17v don't collide with prior 30-step runs. 
--- .github/workflows/mlx-parity-probe.yml | 46 ++++++++++++++---------- tests/mlx_parity/probe_17_curve_param.py | 21 ++++++++--- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index d462bc51df..42872fb6df 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,35 +58,44 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round D: lock in max_steps=20 across multiple seeds + check - # 50/100 are the actual upper failure boundary (50 failed on - # seed 3407; does that hold across seeds, or is it a 3407 - # quirk?). Also a third LR to characterize sensitivity. - - id: '17o' + # Round E: lock in max_steps=30 as seed-robust + characterize + # LR sensitivity. Earlier rounds showed: + # - bc=True basin is non-monotonic: works 15-40, fails at 50, + # re-enters at 100 (seed 3407) + # - seed=1337 failed at 20 steps but worked at 30 + # Round E asks: does max_steps=30 hold across more seeds, AND + # is the basin LR-sensitive (would a smaller/larger LR escape + # the 50-step pit)? 
+ - id: '17t' script: probe_17_curve_param.py steps: '20' - seed: '42' + seed: '1337' bias_correction: '1' - - id: '17p' + lr: '1e-3' + - id: '17u' script: probe_17_curve_param.py - steps: '20' - seed: '999' + steps: '30' + seed: '3407' bias_correction: '1' - - id: '17q' + lr: '5e-4' + - id: '17v' script: probe_17_curve_param.py - steps: '20' - seed: '1337' + steps: '30' + seed: '3407' bias_correction: '1' - - id: '17r' + lr: '2e-3' + - id: '17w' script: probe_17_curve_param.py - steps: '50' - seed: '42' + steps: '30' + seed: '12345' bias_correction: '1' - - id: '17s' + lr: '1e-3' + - id: '17x' script: probe_17_curve_param.py - steps: '100' - seed: '3407' + steps: '30' + seed: '7777' bias_correction: '1' + lr: '1e-3' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -140,6 +149,7 @@ jobs: MLX_SEED: ${{ matrix.seed }} MLX_DTYPE: ${{ matrix.dtype }} MLX_BIAS_CORRECTION: ${{ matrix.bias_correction }} + MLX_LR: ${{ matrix.lr }} run: | cd tests/mlx_parity && python ${{ matrix.script }} diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py index db71d547ce..8c89adc8ea 100644 --- a/tests/mlx_parity/probe_17_curve_param.py +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -68,13 +68,24 @@ def _env_str(name, default): return raw if raw else default +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + return float(raw) + except ValueError: + return default + + def main() -> int: steps = _env_int("MLX_STEPS", 7) seed = _env_int("MLX_SEED", 3407) dtype = _env_str("MLX_DTYPE", "float16") bc = _env_bool("MLX_BIAS_CORRECTION", False) + lr = _env_float("MLX_LR", 1e-3) - banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc}") + banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc} lr={lr}") import random random.seed(seed) @@ -116,7 +127,7 @@ def main() -> int: 
per_device_train_batch_size=2, gradient_accumulation_steps=3, max_steps=steps, - learning_rate=1e-3, + learning_rate=lr, warmup_steps=0, lr_scheduler_type="constant", optim="adamw", @@ -128,7 +139,7 @@ def main() -> int: use_cce=False, compile=False, gradient_checkpointing=False, - output_dir=str(OUT_DIR / f"probe17_outputs_s{steps}_d{seed}_bc{int(bc)}"), + output_dir=str(OUT_DIR / f"probe17_outputs_s{steps}_d{seed}_bc{int(bc)}_lr{lr:g}"), save_steps=0, eval_steps=0, dataset_text_field="text", @@ -172,6 +183,7 @@ def _on_step(*args): "config": { "steps": steps, "seed": seed, "dtype": dtype, "adam_bias_correction": bc, + "learning_rate": lr, "adam_bc_field_supported": "adam_bias_correction" in fields_supported, }, "rows": rows, @@ -179,7 +191,8 @@ def _on_step(*args): "generation": gen, "contains_unsloth": contains, } - fname = f"probe_17__s{steps}_d{seed}_bc{int(bc)}.json" + lr_tag = f"{lr:.0e}".replace("-0", "-").replace("+0", "") + fname = f"probe_17__s{steps}_d{seed}_bc{int(bc)}_lr{lr_tag}.json" (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) section("summary") if rows: From 380255d5f5ff65e5c9fe4cde64f9a129f1eec3c6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 22:49:57 +0000 Subject: [PATCH 17/84] mlx_parity Round F: investigate seed=12345 generation failure Round E exposed that at 30 steps + lr=1e-3 + bc=True, seed=12345 produces post_train_loss=0.0000 (perfect memorization) but greedy decode of PROMPT diverges to "42!" instead of "Unsloth!". Five other seeds (3407, 42, 999, 1337, 7777) all produce Unsloth at the same config. Trainer is healthy; basin geometry is seed-fragile for greedy decode. Round F (5 jobs): 17y : HEAD zoo, 60 steps, seed=12345, bc=1 -> escape via more steps? 17z : PR-663 zoo, 30 steps, seed=12345, bc=0 -> does the pre-#634 bias_correction=False contract rescue the failing seed? 
17aa : PR-663 zoo, 30 steps, seed=12345, bc=1 -> PR-663 + bc=1 (pure steps-horizon test, isolates the bc field exposure from the flag value) 17ab : PR-663 zoo, 30 steps, seed=3407, bc=0 -> sanity: known-good seed still memorizes under PR-663 bc=False 17ac : HEAD zoo, 30 steps, seed=3407, bc=1, lr=1e-3 -> control If 17z generates Unsloth and 17aa does not, that's strong evidence PR-663's bc=False default is the right contract for seed-robust greedy decode, not just an HF-parity nicety. --- .github/workflows/mlx-parity-probe.yml | 49 ++++++++++++++------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 42872fb6df..25205df082 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,42 +58,45 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round E: lock in max_steps=30 as seed-robust + characterize - # LR sensitivity. Earlier rounds showed: - # - bc=True basin is non-monotonic: works 15-40, fails at 50, - # re-enters at 100 (seed 3407) - # - seed=1337 failed at 20 steps but worked at 30 - # Round E asks: does max_steps=30 hold across more seeds, AND - # is the basin LR-sensitive (would a smaller/larger LR escape - # the 50-step pit)? - - id: '17t' + # Round F: investigate seed=12345 generation failure at 30 steps. + # Round E showed trainer drives post_train_loss to ~0 across all + # seeds + LRs, but greedy decode of PROMPT diverged for seed=12345. + # Two hypotheses: + # (a) bc=True basin geometry is seed-fragile -> PR #663 bc=False + # contract recovers generation; + # (b) seed=12345 just needs more steps to escape its pit. + # Also sanity-check PR #663 doesn't itself regress known-good seeds. 
+ - id: '17y' script: probe_17_curve_param.py - steps: '20' - seed: '1337' + steps: '60' + seed: '12345' bias_correction: '1' lr: '1e-3' - - id: '17u' + - id: '17z' script: probe_17_curve_param.py steps: '30' - seed: '3407' - bias_correction: '1' - lr: '5e-4' - - id: '17v' + seed: '12345' + bias_correction: '0' + lr: '1e-3' + zoo_pin: '18596f229b2436c8004ecfd0e6a39e5d0656857a' + - id: '17aa' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '12345' bias_correction: '1' - lr: '2e-3' - - id: '17w' + lr: '1e-3' + zoo_pin: '18596f229b2436c8004ecfd0e6a39e5d0656857a' + - id: '17ab' script: probe_17_curve_param.py steps: '30' - seed: '12345' - bias_correction: '1' + seed: '3407' + bias_correction: '0' lr: '1e-3' - - id: '17x' + zoo_pin: '18596f229b2436c8004ecfd0e6a39e5d0656857a' + - id: '17ac' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '3407' bias_correction: '1' lr: '1e-3' steps: From 6ac263787f93f3e4d634b65c6e6b2facf76bf678 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 23:01:48 +0000 Subject: [PATCH 18/84] mlx_parity Round G: 60-step universality + 12345 basin entry Round F confirmed seed=12345 escapes its 30-step failure when trained for 60 steps. Round G: 17ad : seed=12345 @ 40 steps -- basin entry point? 17ae : seed=12345 @ 50 steps -- basin entry point? 17af : seed=42 @ 60 steps -- 60 still works? 17ag : seed=1337 @ 60 steps -- 60 still works? 17ah : seed=3407 @ 60 steps -- known 50-pit edge; control If 17af/17ag/17ah all generate 'Unsloth' and 17ad/17ae do too, 60 steps is the seed-robust horizon and a stronger default than 30. 
--- .github/workflows/mlx-parity-probe.yml | 47 ++++++++++++-------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 25205df082..dfb9285675 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,44 +58,41 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round F: investigate seed=12345 generation failure at 30 steps. - # Round E showed trainer drives post_train_loss to ~0 across all - # seeds + LRs, but greedy decode of PROMPT diverged for seed=12345. - # Two hypotheses: - # (a) bc=True basin geometry is seed-fragile -> PR #663 bc=False - # contract recovers generation; - # (b) seed=12345 just needs more steps to escape its pit. - # Also sanity-check PR #663 doesn't itself regress known-good seeds. - - id: '17y' + # Round G: characterize whether 60 steps is universally seed-robust + # and locate seed=12345's basin entry point. + # Round F confirmed: 60 steps rescues the seed=12345 30-step + # generation failure ("Unsloth!" emitted). Round G asks: + # (a) does 60-step robustness hold for seeds we previously saw + # work at 30 (42, 1337, 3407)? If 60 is universal, max_steps + # could move from 30 to 60 for higher safety margin. + # (b) where exactly does seed=12345 enter the basin? 40 or 50? 
+ - id: '17ad' script: probe_17_curve_param.py - steps: '60' + steps: '40' seed: '12345' bias_correction: '1' lr: '1e-3' - - id: '17z' + - id: '17ae' script: probe_17_curve_param.py - steps: '30' + steps: '50' seed: '12345' - bias_correction: '0' + bias_correction: '1' lr: '1e-3' - zoo_pin: '18596f229b2436c8004ecfd0e6a39e5d0656857a' - - id: '17aa' + - id: '17af' script: probe_17_curve_param.py - steps: '30' - seed: '12345' + steps: '60' + seed: '42' bias_correction: '1' lr: '1e-3' - zoo_pin: '18596f229b2436c8004ecfd0e6a39e5d0656857a' - - id: '17ab' + - id: '17ag' script: probe_17_curve_param.py - steps: '30' - seed: '3407' - bias_correction: '0' + steps: '60' + seed: '1337' + bias_correction: '1' lr: '1e-3' - zoo_pin: '18596f229b2436c8004ecfd0e6a39e5d0656857a' - - id: '17ac' + - id: '17ah' script: probe_17_curve_param.py - steps: '30' + steps: '60' seed: '3407' bias_correction: '1' lr: '1e-3' From db0407e2eed134e5e9e18d4022f9b90b624d0bd3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 23:13:28 +0000 Subject: [PATCH 19/84] mlx_parity Round H: pivot to mlx-lm reference comparison Rounds A-G established the unsloth-zoo MLX trainer drives post_train_loss to ~0 across all (seed, lr, step-count) configs tested, but greedy-decode of the test prompt is fragile in a non-monotonic way w.r.t. step count and seed: seed=3407: 30 OK, 50 BAD, 60 BAD, 100 OK seed=42: 30 OK, 60 BAD seed=1337: 20 BAD, 30 OK, 60 OK seed=12345: 30 BAD, 40 BAD, 50 OK, 60 OK seed=7777: 30 OK post_train_loss is the load-bearing memorization signal; contains- Unsloth is decode-geometry sensitive. To attribute the fragility, probe_19 runs mlx-lm's NATIVE LoRA on identical fixture across matched (steps, seed) pairs. If mlx-lm shows the same pattern, the geometry issue is in MLX/optimizer math, not the unsloth-zoo wrapper. 
Round H matrix: 19a : 30 steps, seed=3407 (known unsloth-zoo OK) 19b : 30 steps, seed=12345 (known unsloth-zoo BAD) 19c : 60 steps, seed=42 (known unsloth-zoo BAD) 19d : 60 steps, seed=3407 (known unsloth-zoo BAD) 19e : 50 steps, seed=12345 (known unsloth-zoo OK) --- .github/workflows/mlx-parity-probe.yml | 57 ++++---- tests/mlx_parity/probe_19_mlx_lm_param.py | 165 ++++++++++++++++++++++ 2 files changed, 189 insertions(+), 33 deletions(-) create mode 100644 tests/mlx_parity/probe_19_mlx_lm_param.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index dfb9285675..c393f0bb90 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,44 +58,35 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round G: characterize whether 60 steps is universally seed-robust - # and locate seed=12345's basin entry point. - # Round F confirmed: 60 steps rescues the seed=12345 30-step - # generation failure ("Unsloth!" emitted). Round G asks: - # (a) does 60-step robustness hold for seeds we previously saw - # work at 30 (42, 1337, 3407)? If 60 is universal, max_steps - # could move from 30 to 60 for higher safety margin. - # (b) where exactly does seed=12345 enter the basin? 40 or 50? - - id: '17ad' - script: probe_17_curve_param.py - steps: '40' - seed: '12345' - bias_correction: '1' - lr: '1e-3' - - id: '17ae' - script: probe_17_curve_param.py - steps: '50' + # Round H: PIVOT to mlx-lm reference comparison. + # Round G showed the (steps, seed) basin is fundamentally non- + # monotonic: 60 steps fails for seeds 42 and 3407 even though + # 30 steps works for both. seed=12345 fails at 30 + 40 but works + # at 50. The post_train_loss reaches 0 across all configs, so + # the fragility is in greedy-decode geometry, not training. + # Run mlx-lm's NATIVE LoRA on the SAME fixture at the same + # (steps, seed) combos. 
If mlx-lm shows the same fragility, + # it's an MLX/optimizer geometry issue, not an unsloth-zoo bug. + - id: '19a' + script: probe_19_mlx_lm_param.py + steps: '30' + seed: '3407' + - id: '19b' + script: probe_19_mlx_lm_param.py + steps: '30' seed: '12345' - bias_correction: '1' - lr: '1e-3' - - id: '17af' - script: probe_17_curve_param.py + - id: '19c' + script: probe_19_mlx_lm_param.py steps: '60' seed: '42' - bias_correction: '1' - lr: '1e-3' - - id: '17ag' - script: probe_17_curve_param.py - steps: '60' - seed: '1337' - bias_correction: '1' - lr: '1e-3' - - id: '17ah' - script: probe_17_curve_param.py + - id: '19d' + script: probe_19_mlx_lm_param.py steps: '60' seed: '3407' - bias_correction: '1' - lr: '1e-3' + - id: '19e' + script: probe_19_mlx_lm_param.py + steps: '50' + seed: '12345' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_19_mlx_lm_param.py b/tests/mlx_parity/probe_19_mlx_lm_param.py new file mode 100644 index 0000000000..25b06779f0 --- /dev/null +++ b/tests/mlx_parity/probe_19_mlx_lm_param.py @@ -0,0 +1,165 @@ +"""Probe 19 — parameterized mlx-lm NATIVE LoRA training. + +Same shape as probe_17 (env-vars + per-config JSON output) but uses +the canonical `python -m mlx_lm lora --train` instead of unsloth-zoo's +MLXTrainer. Lets us run the SAME (steps, seed) matrix Round G ran +against MLXTrainer, with the only difference being the trainer +itself, so we can isolate: + + * fragile (steps, seed) basins that show up in BOTH trainers + -> MLX/optimizer geometry is the cause, not unsloth-zoo + * fragile (steps, seed) basins that show up only in MLXTrainer + -> unsloth-zoo wrapper has a real bug + +Env vars (matches probe_17 naming so the workflow's env block is reused): + MLX_STEPS --iters value (default 7) + MLX_SEED --seed value (default 3407) + +Writes per-config JSON to .out/probe_19__s{S}_d{D}.json. + +Always exits 0 -- data dump. 
+""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + OUT_DIR, + banner, + section, + report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + return int(raw) + except ValueError: + return default + + +def main() -> int: + iters = _env_int("MLX_STEPS", 7) + seed = _env_int("MLX_SEED", 3407) + banner(f"Probe 19: mlx-lm NATIVE LoRA, iters={iters}, seed={seed}") + + import random + import numpy as np + random.seed(seed) + np.random.seed(seed) + try: + import mlx.core as mx + mx.random.seed(seed) + except Exception: + pass + + workdir = Path(tempfile.mkdtemp(prefix=f"probe19_s{iters}_d{seed}_")) + data_dir = workdir / "data" + adapter_dir = workdir / "adapters" + data_dir.mkdir(parents=True, exist_ok=True) + adapter_dir.mkdir(parents=True, exist_ok=True) + + train_rows = [{"text": TRAIN_TEXT} for _ in range(64)] + valid_rows = [{"text": TRAIN_TEXT} for _ in range(4)] + (data_dir / "train.jsonl").write_text( + "\n".join(json.dumps(r) for r in train_rows) + "\n" + ) + (data_dir / "valid.jsonl").write_text( + "\n".join(json.dumps(r) for r in valid_rows) + "\n" + ) + report("data dir", str(data_dir)) + report("adapter dir", str(adapter_dir)) + + cmd = [ + sys.executable, "-m", "mlx_lm", "lora", + "--train", + "--model", MODEL_NAME, + "--data", str(data_dir), + "--adapter-path", str(adapter_dir), + "--iters", str(iters), + "--batch-size", "2", + "--learning-rate", "1e-3", + "--num-layers", "-1", + "--steps-per-report", "1", + "--steps-per-eval", str(max(iters + 1, 1000)), + "--seed", str(seed), + ] + section("invoke mlx_lm.lora trainer") + report("cmd", " ".join(cmd)) + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=900) + report("returncode", proc.returncode) + if proc.returncode != 0: + print("--- mlx_lm.lora stderr (tail) ---") + 
print(proc.stderr[-2000:]) + + losses_per_step = [] + for line in (proc.stdout + "\n" + proc.stderr).splitlines(): + if "Iter " in line and "Train loss" in line: + try: + num = float( + line.split("Train loss")[1].strip().split(",")[0].strip() + ) + losses_per_step.append(num) + except Exception: + pass + + report("parsed losses (count)", len(losses_per_step)) + if losses_per_step: + report("first loss", losses_per_step[0]) + report("last loss", losses_per_step[-1]) + + section("load + generate") + from mlx_lm import load as mlx_load, generate + try: + model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir)) + except TypeError: + model, tokenizer = mlx_load(MODEL_NAME) + try: + from mlx_lm.tuner.utils import load_adapters + load_adapters(model, str(adapter_dir)) + except Exception as e: + report("adapter load fallback failed", str(e)) + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "config": {"iters": iters, "seed": seed, "batch_size": 2, + "learning_rate": 1e-3, "num_layers": -1}, + "returncode": proc.returncode, + "losses": losses_per_step, + "generation": gen, + "contains_unsloth": contains, + "stdout_tail": proc.stdout[-1500:], + "stderr_tail": proc.stderr[-1500:], + } + fname = f"probe_19__s{iters}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + + section("summary") + report("iters", iters) + report("seed", seed) + report("contains 'Unsloth'", contains) + + try: + shutil.rmtree(workdir, ignore_errors=True) + except Exception: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 347d0fbdf9a8309d3d2150bec11829dd567fd78b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 23:28:14 +0000 Subject: [PATCH 20/84] mlx_parity Round I: mlx-lm at higher iter counts Round H showed mlx-lm's NATIVE LoRA at 30-60 iters never drops loss below ~3 on this 
fixture, vs unsloth-zoo's MLXTrainer which hits 0.0 by step 10. Reason: mlx-lm default targets fewer modules + effective batch 2; unsloth-zoo targets all 7 modules + effective batch 6. Round I bumps mlx-lm iters to 200/500 to (a) confirm mlx-lm CAN memorize this fixture given enough budget, and (b) check whether its post-memorization greedy decode also shows non-monotonic seed fragility. If yes, that's strong evidence the fragility is in MLX/optimizer geometry, not unsloth-zoo's wrapper. --- .github/workflows/mlx-parity-probe.yml | 40 ++++++++++++-------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index c393f0bb90..6cdca57cdf 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,34 +58,32 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round H: PIVOT to mlx-lm reference comparison. - # Round G showed the (steps, seed) basin is fundamentally non- - # monotonic: 60 steps fails for seeds 42 and 3407 even though - # 30 steps works for both. seed=12345 fails at 30 + 40 but works - # at 50. The post_train_loss reaches 0 across all configs, so - # the fragility is in greedy-decode geometry, not training. - # Run mlx-lm's NATIVE LoRA on the SAME fixture at the same - # (steps, seed) combos. If mlx-lm shows the same fragility, - # it's an MLX/optimizer geometry issue, not an unsloth-zoo bug. - - id: '19a' + # Round I: mlx-lm at higher iter counts to confirm it CAN + # memorize this fixture. Round H showed mlx-lm at 30-60 iters + # never drops loss below ~3 (vs unsloth-zoo's 0.0 by step 10). + # mlx-lm's defaults target fewer modules + smaller effective + # batch -> needs more iters. Check whether (a) mlx-lm does + # eventually memorize, and (b) if so, does its greedy decode + # ALSO exhibit the non-monotonic seed fragility? 
+ - id: '19f' script: probe_19_mlx_lm_param.py - steps: '30' + steps: '200' seed: '3407' - - id: '19b' + - id: '19g' script: probe_19_mlx_lm_param.py - steps: '30' + steps: '500' + seed: '3407' + - id: '19h' + script: probe_19_mlx_lm_param.py + steps: '200' seed: '12345' - - id: '19c' + - id: '19i' script: probe_19_mlx_lm_param.py - steps: '60' + steps: '200' seed: '42' - - id: '19d' - script: probe_19_mlx_lm_param.py - steps: '60' - seed: '3407' - - id: '19e' + - id: '19j' script: probe_19_mlx_lm_param.py - steps: '50' + steps: '500' seed: '12345' steps: - name: Harden runner (audit) From 9b98c68ebb9b552daa099bb5a7aa248fb2a26c16 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 23:32:37 +0000 Subject: [PATCH 21/84] mlx_parity Round J: bc=False vs bc=True end-to-end on PR-663 head Re-exposed adam_bias_correction in unsloth-zoo PR #663 (SHA 7312862). Round J pins to that SHA so MLX_BIAS_CORRECTION=0 actually takes effect and tests bc=False at the same diagnostic (steps, seed) pairs Round G found bc=True-fragile: 17ai : 30 steps + seed=3407 + bc=0 (control vs known-good bc=1) 17aj : 30 steps + seed=12345 + bc=0 (Round-E failing seed) 17ak : 60 steps + seed=3407 + bc=0 (Round-G failing combo) 17al : 60 steps + seed=42 + bc=0 (Round-G failing combo) 17am : 30 steps + seed=3407 + bc=1 (PR-663 head + bc=1 sanity) If bc=False generates 'Unsloth' on the bc=True-failing combos, that's the empirical justification for defaulting bc=False in PR #663. --- .github/workflows/mlx-parity-probe.yml | 64 ++++++++++++++++---------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 6cdca57cdf..6302454ab5 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,33 +58,49 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. 
- # Round I: mlx-lm at higher iter counts to confirm it CAN - # memorize this fixture. Round H showed mlx-lm at 30-60 iters - # never drops loss below ~3 (vs unsloth-zoo's 0.0 by step 10). - # mlx-lm's defaults target fewer modules + smaller effective - # batch -> needs more iters. Check whether (a) mlx-lm does - # eventually memorize, and (b) if so, does its greedy decode - # ALSO exhibit the non-monotonic seed fragility? - - id: '19f' - script: probe_19_mlx_lm_param.py - steps: '200' + # Round J: bc=False vs bc=True end-to-end. + # PR #663 head re-exposes adam_bias_correction (SHA 7312862). + # Pin to that SHA so the env var actually takes effect, then + # run unsloth-zoo MLXTrainer at the same diagnostic (steps, + # seed) pairs Round G exposed as bc=True-fragile, with bc=0 + # vs bc=1 each. If bc=False is robust where bc=True wasn't, + # default False is justified beyond just MLX-ecosystem parity. + # Includes the seed=3407+30 control on both bc values. + - id: '17ai' + script: probe_17_curve_param.py + steps: '30' seed: '3407' - - id: '19g' - script: probe_19_mlx_lm_param.py - steps: '500' - seed: '3407' - - id: '19h' - script: probe_19_mlx_lm_param.py - steps: '200' + bias_correction: '0' + lr: '1e-3' + zoo_pin: '7312862' + - id: '17aj' + script: probe_17_curve_param.py + steps: '30' seed: '12345' - - id: '19i' - script: probe_19_mlx_lm_param.py - steps: '200' + bias_correction: '0' + lr: '1e-3' + zoo_pin: '7312862' + - id: '17ak' + script: probe_17_curve_param.py + steps: '60' + seed: '3407' + bias_correction: '0' + lr: '1e-3' + zoo_pin: '7312862' + - id: '17al' + script: probe_17_curve_param.py + steps: '60' seed: '42' - - id: '19j' - script: probe_19_mlx_lm_param.py - steps: '500' - seed: '12345' + bias_correction: '0' + lr: '1e-3' + zoo_pin: '7312862' + - id: '17am' + script: probe_17_curve_param.py + steps: '30' + seed: '3407' + bias_correction: '1' + lr: '1e-3' + zoo_pin: '7312862' steps: - name: Harden runner (audit) uses: 
step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 From 9e10b2897d91a0c99190688f5a20f0901192767b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 23:45:15 +0000 Subject: [PATCH 22/84] mlx_parity Round K: confirm new default + bc=False long-horizon Round J showed bc=False on PR-663 head doesn't memorize within 30-60 step budget. Pushed follow-up to PR #663 (SHA ef003aa) that flips the default to True. Round K (new SHA pinned): 17an : default + seed=3407 + 30 steps -> matches HEAD bc=True? 17ao : bc=0 + seed=3407 + 200 steps -> does bc=False reach loss<1? 17ap : bc=0 + seed=3407 + 500 steps -> upper end of opt-out usefulness 17aq : default + seed=12345 + 30 steps -> new default basin OK? 17ar : default + seed=42 + 30 steps -> new default basin OK? Also make probe_17 tri-state on MLX_BIAS_CORRECTION: empty env means "trainer default" (don't pass adam_bias_correction kwarg) so the test records whatever the trainer actually defaults to. "0"/"1" still forces an explicit value. --- .github/workflows/mlx-parity-probe.yml | 63 ++++++++++++++---------- tests/mlx_parity/probe_17_curve_param.py | 30 +++++++---- 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 6302454ab5..e83cee05dc 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,49 +58,58 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round J: bc=False vs bc=True end-to-end. - # PR #663 head re-exposes adam_bias_correction (SHA 7312862). - # Pin to that SHA so the env var actually takes effect, then - # run unsloth-zoo MLXTrainer at the same diagnostic (steps, - # seed) pairs Round G exposed as bc=True-fragile, with bc=0 - # vs bc=1 each. If bc=False is robust where bc=True wasn't, - # default False is justified beyond just MLX-ecosystem parity. 
- # Includes the seed=3407+30 control on both bc values. - - id: '17ai' + # Round K: verify PR-663 head (SHA ef003aa) with default + # adam_bias_correction=True keeps the smoke green AND that + # bc=False is still reachable via explicit opt-out. + # Round J empirically established bc=True is required to + # memorize within a 30-step budget. Round K is the final + # confirmation pass before merging PR #663: + # 17an : PR-663 head + DEFAULT (no env) + seed=3407 -> + # should match HEAD bc=True behavior (loss=0, ok) + # 17ao : PR-663 head + bc=0 explicit + seed=3407 + 200 steps + # (does bc=False eventually memorize given enough + # budget? - validates the "opt-out for long fine- + # tunes" rationale) + # 17ap : PR-663 head + bc=0 explicit + seed=3407 + 500 steps + # (same question, larger budget) + # 17aq : PR-663 head + DEFAULT + seed=12345 -> does the + # new default still hit the basin? + # 17ar : PR-663 head + DEFAULT + seed=42, 30 steps control + - id: '17an' script: probe_17_curve_param.py steps: '30' seed: '3407' - bias_correction: '0' + bias_correction: '' lr: '1e-3' - zoo_pin: '7312862' - - id: '17aj' + zoo_pin: 'ef003aa' + - id: '17ao' script: probe_17_curve_param.py - steps: '30' - seed: '12345' + steps: '200' + seed: '3407' bias_correction: '0' lr: '1e-3' - zoo_pin: '7312862' - - id: '17ak' + zoo_pin: 'ef003aa' + - id: '17ap' script: probe_17_curve_param.py - steps: '60' + steps: '500' seed: '3407' bias_correction: '0' lr: '1e-3' - zoo_pin: '7312862' - - id: '17al' + zoo_pin: 'ef003aa' + - id: '17aq' script: probe_17_curve_param.py - steps: '60' - seed: '42' - bias_correction: '0' + steps: '30' + seed: '12345' + bias_correction: '' lr: '1e-3' - zoo_pin: '7312862' - - id: '17am' + zoo_pin: 'ef003aa' + - id: '17ar' script: probe_17_curve_param.py steps: '30' - seed: '3407' - bias_correction: '1' + seed: '42' + bias_correction: '' lr: '1e-3' - zoo_pin: '7312862' + zoo_pin: 'ef003aa' steps: - name: Harden runner (audit) uses: 
step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py index 8c89adc8ea..07f0d9accd 100644 --- a/tests/mlx_parity/probe_17_curve_param.py +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -82,10 +82,16 @@ def main() -> int: steps = _env_int("MLX_STEPS", 7) seed = _env_int("MLX_SEED", 3407) dtype = _env_str("MLX_DTYPE", "float16") - bc = _env_bool("MLX_BIAS_CORRECTION", False) + # Tri-state: empty/unset env var means "use trainer default" (don't + # pass adam_bias_correction at all); "0"/"1" forces explicit value. + bc_raw = (os.environ.get("MLX_BIAS_CORRECTION") or "").strip().lower() + if not bc_raw: + bc = None + else: + bc = bc_raw in ("1", "true", "yes", "y") lr = _env_float("MLX_LR", 1e-3) - banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc} lr={lr}") + banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc!r} lr={lr}") import random random.seed(seed) @@ -114,13 +120,13 @@ def main() -> int: finetune_mlp_modules=True, ) - # Only set adam_bias_correction if the field exists on this version - # of unsloth-zoo. HEAD (pre-PR-663) does not have it -- it forces - # True unconditionally, so MLX_BIAS_CORRECTION=0 on HEAD has no - # effect and the run characterizes the upstream broken default. + # Only set adam_bias_correction if (a) the field exists on this + # version of unsloth-zoo AND (b) the env var asked for an explicit + # value (bc is not None). bc=None means "use the trainer default" + # so the artifact records whatever the default actually is. 
fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} extra = {} - if "adam_bias_correction" in fields_supported: + if "adam_bias_correction" in fields_supported and bc is not None: extra["adam_bias_correction"] = bc config = MLXTrainingConfig( @@ -139,7 +145,7 @@ def main() -> int: use_cce=False, compile=False, gradient_checkpointing=False, - output_dir=str(OUT_DIR / f"probe17_outputs_s{steps}_d{seed}_bc{int(bc)}_lr{lr:g}"), + output_dir=str(OUT_DIR / f"probe17_outputs_s{steps}_d{seed}_bc{('d' if bc is None else int(bc))}_lr{lr:g}"), save_steps=0, eval_steps=0, dataset_text_field="text", @@ -179,10 +185,15 @@ def _on_step(*args): report("generation", repr(gen[:160])) report("contains 'Unsloth'", contains) + # Record what the trainer actually used (post-construction) so the + # artifact reflects the trainer default when bc was None at probe- + # invocation time. + effective_bc = getattr(config, "adam_bias_correction", None) out = { "config": { "steps": steps, "seed": seed, "dtype": dtype, "adam_bias_correction": bc, + "effective_adam_bias_correction": effective_bc, "learning_rate": lr, "adam_bc_field_supported": "adam_bias_correction" in fields_supported, }, @@ -192,7 +203,8 @@ def _on_step(*args): "contains_unsloth": contains, } lr_tag = f"{lr:.0e}".replace("-0", "-").replace("+0", "") - fname = f"probe_17__s{steps}_d{seed}_bc{int(bc)}_lr{lr_tag}.json" + bc_tag = "d" if bc is None else int(bc) + fname = f"probe_17__s{steps}_d{seed}_bc{bc_tag}_lr{lr_tag}.json" (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) section("summary") if rows: From ad4e9cf97a6f9c3db46c94019b22a8662237a845 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 17 May 2026 23:57:47 +0000 Subject: [PATCH 23/84] mlx_parity Round L: bc=False divergence boundary + default universality Round K showed PR-663 bc=False at 200/500 steps diverges to NaN on this fixture. That's not "slow", that's broken at long horizons. 
Round L: 17as : default + seed=1337 + 30 (reproduce known-good seed) 17at : default + seed=7777 + 30 (reproduce known-good seed) 17au : bc=0 + seed=3407 + 50 (does bc=False already diverge at 50?) 17av : bc=0 + seed=3407 + 100 (or between 100 and 200?) 17aw : default + seed=3407 + 30 (control) If 17au/17av both NaN, bc=False is dangerous past the smoke horizon on this fixture and the PR-663 docstring should warn about that. --- .github/workflows/mlx-parity-probe.yml | 52 +++++++++++--------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index e83cee05dc..41cbd0f7c4 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,55 +58,45 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round K: verify PR-663 head (SHA ef003aa) with default - # adam_bias_correction=True keeps the smoke green AND that - # bc=False is still reachable via explicit opt-out. - # Round J empirically established bc=True is required to - # memorize within a 30-step budget. Round K is the final - # confirmation pass before merging PR #663: - # 17an : PR-663 head + DEFAULT (no env) + seed=3407 -> - # should match HEAD bc=True behavior (loss=0, ok) - # 17ao : PR-663 head + bc=0 explicit + seed=3407 + 200 steps - # (does bc=False eventually memorize given enough - # budget? - validates the "opt-out for long fine- - # tunes" rationale) - # 17ap : PR-663 head + bc=0 explicit + seed=3407 + 500 steps - # (same question, larger budget) - # 17aq : PR-663 head + DEFAULT + seed=12345 -> does the - # new default still hit the basin? - # 17ar : PR-663 head + DEFAULT + seed=42, 30 steps control - - id: '17an' + # Round L: locate bc=False divergence boundary and verify + # PR-663 default reproduces known-good seeds (1337, 7777). + # Round K showed bc=False at 200/500 steps NaN-diverges + # (not just slow). 
The "opt-out for long fine-tunes" + # rationale in the commit message overstates bc=False's + # safety. Locate where divergence kicks in so the docstring + # can be honest. + - id: '17as' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '1337' bias_correction: '' lr: '1e-3' zoo_pin: 'ef003aa' - - id: '17ao' + - id: '17at' script: probe_17_curve_param.py - steps: '200' - seed: '3407' - bias_correction: '0' + steps: '30' + seed: '7777' + bias_correction: '' lr: '1e-3' zoo_pin: 'ef003aa' - - id: '17ap' + - id: '17au' script: probe_17_curve_param.py - steps: '500' + steps: '50' seed: '3407' bias_correction: '0' lr: '1e-3' zoo_pin: 'ef003aa' - - id: '17aq' + - id: '17av' script: probe_17_curve_param.py - steps: '30' - seed: '12345' - bias_correction: '' + steps: '100' + seed: '3407' + bias_correction: '0' lr: '1e-3' zoo_pin: 'ef003aa' - - id: '17ar' + - id: '17aw' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '3407' bias_correction: '' lr: '1e-3' zoo_pin: 'ef003aa' From d01f6dd71fdd08389302154940afc31c087d9f0c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 00:13:49 +0000 Subject: [PATCH 24/84] mlx_parity Round M: narrow bc=False NaN boundary + plumbing equivalence Round L: bc=False went loss=5.06 at 50 -> NaN at 100. Round M narrows to 70/80 and also asks whether the basin shift Rounds K + L saw for seeds 42, 7777 between HEAD-bc=True (hardcoded) and PR-663-default (field-plumbed, defaults True) is from explicit-vs-default plumbing or just HEAD-vs-PR-663 codepath. 17ax : bc=0, seed=3407, 70 steps -> finite or NaN? 17ay : bc=0, seed=3407, 80 steps -> finite or NaN? 17az : bc=1 EXPLICIT, seed=42, 30 -> should match 17ar (default) 17ba : bc=1 EXPLICIT, seed=7777, 30 -> should match 17at (default) 17bb : default, seed=3407, 30 -> control Pin to PR-663 head 669a792 (docstring-only follow-up to ef003aa, code identical). 
--- .github/workflows/mlx-parity-probe.yml | 58 +++++++++++++------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 41cbd0f7c4..4826b1e794 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,48 +58,48 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round L: locate bc=False divergence boundary and verify - # PR-663 default reproduces known-good seeds (1337, 7777). - # Round K showed bc=False at 200/500 steps NaN-diverges - # (not just slow). The "opt-out for long fine-tunes" - # rationale in the commit message overstates bc=False's - # safety. Locate where divergence kicks in so the docstring - # can be honest. - - id: '17as' + # Round M: narrow bc=False NaN boundary (Round L: 50 OK-ish, + # 100 NaN) AND verify default(bc=True implicit) is exactly + # equivalent to bc=1 explicit on PR-663 head (Round K + L + # showed basin shifts for seeds 42, 7777 vs HEAD bc=True; + # is that explicit-vs-default plumbing or HEAD-vs-PR-663 + # codepath?). Pin to PR-663 head SHA 669a792 (docstring-only + # follow-up to ef003aa). 
+ - id: '17ax' script: probe_17_curve_param.py - steps: '30' - seed: '1337' - bias_correction: '' - lr: '1e-3' - zoo_pin: 'ef003aa' - - id: '17at' - script: probe_17_curve_param.py - steps: '30' - seed: '7777' - bias_correction: '' - lr: '1e-3' - zoo_pin: 'ef003aa' - - id: '17au' - script: probe_17_curve_param.py - steps: '50' + steps: '70' seed: '3407' bias_correction: '0' lr: '1e-3' - zoo_pin: 'ef003aa' - - id: '17av' + zoo_pin: '669a792' + - id: '17ay' script: probe_17_curve_param.py - steps: '100' + steps: '80' seed: '3407' bias_correction: '0' lr: '1e-3' - zoo_pin: 'ef003aa' - - id: '17aw' + zoo_pin: '669a792' + - id: '17az' + script: probe_17_curve_param.py + steps: '30' + seed: '42' + bias_correction: '1' + lr: '1e-3' + zoo_pin: '669a792' + - id: '17ba' + script: probe_17_curve_param.py + steps: '30' + seed: '7777' + bias_correction: '1' + lr: '1e-3' + zoo_pin: '669a792' + - id: '17bb' script: probe_17_curve_param.py steps: '30' seed: '3407' bias_correction: '' lr: '1e-3' - zoo_pin: 'ef003aa' + zoo_pin: '669a792' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 From deeef596dfe7af1c4ccd7935932736ad62980b6e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 00:27:42 +0000 Subject: [PATCH 25/84] mlx_parity Round N: final sanity pass on PR-663 head Rounds A-M converged: PR #663's two-commit fix (max_grad_value=None + adam_bias_correction=True opt-out field) is empirically correct. Round N is a tight 5-job sanity pass on the smoke-critical configs: 17bc : default + seed=3407 + 30 -> smoke seed, must match HEAD 17bd : bc=1 explicit + seed=3407 + 30 -> explicit == default? 
17be : bc=1 explicit + seed=1337 + 30 -> known-good seed parity 17bf : bc=1 explicit + seed=999 + 30 -> additional seed parity 17bg : bc=0 explicit + seed=3407 + 30 -> short-horizon bc=False bouncy If 17bc/17bd hit loss=0 + 'Unsloth' and 17bg hits loss > 1 + no NaN (Round M showed loss 2-5 at this step count is normal), the contract documented in the PR-663 docstring holds. --- .github/workflows/mlx-parity-probe.yml | 43 ++++++++++++++------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 4826b1e794..695dc80c15 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,46 +58,51 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round M: narrow bc=False NaN boundary (Round L: 50 OK-ish, - # 100 NaN) AND verify default(bc=True implicit) is exactly - # equivalent to bc=1 explicit on PR-663 head (Round K + L - # showed basin shifts for seeds 42, 7777 vs HEAD bc=True; - # is that explicit-vs-default plumbing or HEAD-vs-PR-663 - # codepath?). Pin to PR-663 head SHA 669a792 (docstring-only - # follow-up to ef003aa). - - id: '17ax' + # Round N: final sanity pass on PR-663 head (SHA 669a792). + # 13 rounds in, conclusions are stable: + # - PR #634 introduced both max_grad_value=5.0 and the + # hidden bc=True default; PR #663 fixes the first + # (->None, HF parity) and exposes the second as a + # proper field with default True; + # - bc=False on this fixture: loss 2-5 at 30-80 steps, + # NaN at 100. Documented as a warning; + # - greedy decode is fragile across (steps, seed); the + # load-bearing smoke gate is post_train_loss < 0.1. 
+ # Round N verifies the smoke-critical config is stable + # across the smoke seed and the explicit/default knob: + - id: '17bc' script: probe_17_curve_param.py - steps: '70' + steps: '30' seed: '3407' - bias_correction: '0' + bias_correction: '' lr: '1e-3' zoo_pin: '669a792' - - id: '17ay' + - id: '17bd' script: probe_17_curve_param.py - steps: '80' + steps: '30' seed: '3407' - bias_correction: '0' + bias_correction: '1' lr: '1e-3' zoo_pin: '669a792' - - id: '17az' + - id: '17be' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '1337' bias_correction: '1' lr: '1e-3' zoo_pin: '669a792' - - id: '17ba' + - id: '17bf' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '999' bias_correction: '1' lr: '1e-3' zoo_pin: '669a792' - - id: '17bb' + - id: '17bg' script: probe_17_curve_param.py steps: '30' seed: '3407' - bias_correction: '' + bias_correction: '0' lr: '1e-3' zoo_pin: '669a792' steps: From 69b37fb8816375367a828a03365f1fbf2c1bacff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 00:43:26 +0000 Subject: [PATCH 26/84] mlx_parity Round O: counterfactual on bc=True at long horizons Round L: bc=False NaN-diverges at 100 steps on this fixture. Round M: NaN boundary tightens to 80-100. Round O asks: does bc=True ALSO diverge at long horizons, or is the NaN specific to bc=False? If bc=True stays healthy at 200/500 the divergence is a bc=False geometry property; if both diverge it's fixture/LR-level. 17bh : default + 200 + seed=3407 -> finite or NaN? 17bi : default + 500 + seed=3407 -> finite or NaN? 17bj : default + 100 + seed=3407 -> matches Round B "100 OK"? 17bk : bc=0 + 90 + seed=3407 -> NaN already at 90? 
17bl : default + 30 + seed=3407 -> control --- .github/workflows/mlx-parity-probe.yml | 48 +++++++++++--------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 695dc80c15..ec5e19cfad 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,51 +58,45 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round N: final sanity pass on PR-663 head (SHA 669a792). - # 13 rounds in, conclusions are stable: - # - PR #634 introduced both max_grad_value=5.0 and the - # hidden bc=True default; PR #663 fixes the first - # (->None, HF parity) and exposes the second as a - # proper field with default True; - # - bc=False on this fixture: loss 2-5 at 30-80 steps, - # NaN at 100. Documented as a warning; - # - greedy decode is fragile across (steps, seed); the - # load-bearing smoke gate is post_train_loss < 0.1. - # Round N verifies the smoke-critical config is stable - # across the smoke seed and the explicit/default knob: - - id: '17bc' + # Round O: counterfactual on bc=True at long horizons. + # Round L pinned bc=False NaN-divergence to ~80-100 steps + # on this fixture. The natural follow-up is: is the + # divergence specific to bc=False, or does bc=True also + # blow up at the same horizons (would mean the fixture + + # LR are the issue, not the bc value)? 
+ - id: '17bh' script: probe_17_curve_param.py - steps: '30' + steps: '200' seed: '3407' bias_correction: '' lr: '1e-3' zoo_pin: '669a792' - - id: '17bd' + - id: '17bi' script: probe_17_curve_param.py - steps: '30' + steps: '500' seed: '3407' - bias_correction: '1' + bias_correction: '' lr: '1e-3' zoo_pin: '669a792' - - id: '17be' + - id: '17bj' script: probe_17_curve_param.py - steps: '30' - seed: '1337' - bias_correction: '1' + steps: '100' + seed: '3407' + bias_correction: '' lr: '1e-3' zoo_pin: '669a792' - - id: '17bf' + - id: '17bk' script: probe_17_curve_param.py - steps: '30' - seed: '999' - bias_correction: '1' + steps: '90' + seed: '3407' + bias_correction: '0' lr: '1e-3' zoo_pin: '669a792' - - id: '17bg' + - id: '17bl' script: probe_17_curve_param.py steps: '30' seed: '3407' - bias_correction: '0' + bias_correction: '' lr: '1e-3' zoo_pin: '669a792' steps: From a2907a0da8e0f3c895f665988e65eb307d20ff90 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 00:57:28 +0000 Subject: [PATCH 27/84] mlx_parity Round P: bc=False NaN to single-step + 1000-step bc=True Round O: bc=True at 100/200/500 stays loss=0 finite; bc=False NaN at 90. Round P narrows boundary single-step (85, 88) and adds bc=True at 1000 steps to extend the finite-stays-finite claim well past the smoke horizon. 
17bm : bc=0 + 85 steps (Round M: 80 OK, Round O: 90 NaN) 17bn : bc=0 + 88 steps 17bo : default + 1000 steps (bc=True very long horizon) 17bp : default + 50 steps (gap fill: never tested on PR-663 default) 17bq : default + 30 steps (control) --- .github/workflows/mlx-parity-probe.yml | 38 ++++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index ec5e19cfad..0c829714b2 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,41 +58,43 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round O: counterfactual on bc=True at long horizons. - # Round L pinned bc=False NaN-divergence to ~80-100 steps - # on this fixture. The natural follow-up is: is the - # divergence specific to bc=False, or does bc=True also - # blow up at the same horizons (would mean the fixture + - # LR are the issue, not the bc value)? - - id: '17bh' + # Round P: tighten bc=False NaN boundary + 1000-step bc=True + # stability. Round O established: + # * bc=True at 100/200/500 stays finite (loss=0); + # * bc=False NaN at 90 (Round M: 80 finite at loss 2.64). + # Round P narrows the boundary to single-step precision so + # the PR-663 docstring "past ~80 steps" can be tightened if + # needed, and adds a 1000-step bc=True run to extend the + # finite-stays-finite claim. 
+ - id: '17bm' script: probe_17_curve_param.py - steps: '200' + steps: '85' seed: '3407' - bias_correction: '' + bias_correction: '0' lr: '1e-3' zoo_pin: '669a792' - - id: '17bi' + - id: '17bn' script: probe_17_curve_param.py - steps: '500' + steps: '88' seed: '3407' - bias_correction: '' + bias_correction: '0' lr: '1e-3' zoo_pin: '669a792' - - id: '17bj' + - id: '17bo' script: probe_17_curve_param.py - steps: '100' + steps: '1000' seed: '3407' bias_correction: '' lr: '1e-3' zoo_pin: '669a792' - - id: '17bk' + - id: '17bp' script: probe_17_curve_param.py - steps: '90' + steps: '50' seed: '3407' - bias_correction: '0' + bias_correction: '' lr: '1e-3' zoo_pin: '669a792' - - id: '17bl' + - id: '17bq' script: probe_17_curve_param.py steps: '30' seed: '3407' From a765616135e4967480498e7a94ce1eacf7b8e543 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 01:12:21 +0000 Subject: [PATCH 28/84] mlx_parity Round Q: LR axis at long horizons Rounds A-P established the (steps, seed, bc) safety envelope at lr=1e-3. Round Q tests the LR axis to tighten the PR-663 docstring: 17br : bc=0 + 100 steps + lr=1e-4 -> does smaller LR avoid NaN? 17bs : bc=0 + 200 steps + lr=1e-4 -> same question, longer 17bt : default + 100 steps + lr=5e-3 -> does larger LR break bc=1? 17bu : default + 500 steps + lr=5e-3 -> long horizon, big LR 17bv : default + 30 steps + lr=1e-3 -> control --- .github/workflows/mlx-parity-probe.yml | 40 ++++++++++++-------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 0c829714b2..9b5e49ce47 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,43 +58,41 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round P: tighten bc=False NaN boundary + 1000-step bc=True - # stability. 
Round O established: - # * bc=True at 100/200/500 stays finite (loss=0); - # * bc=False NaN at 90 (Round M: 80 finite at loss 2.64). - # Round P narrows the boundary to single-step precision so - # the PR-663 docstring "past ~80 steps" can be tightened if - # needed, and adds a 1000-step bc=True run to extend the - # finite-stays-finite claim. - - id: '17bm' + # Round Q: LR axis at long horizons. + # Rounds A-P characterized (steps, seed, bc) thoroughly at + # lr=1e-3. The remaining question for the PR-663 docstring: + # does smaller LR rescue bc=False from NaN, and does larger + # LR push bc=True into instability? Tightens the safety + # envelope around the recommended defaults. + - id: '17br' script: probe_17_curve_param.py - steps: '85' + steps: '100' seed: '3407' bias_correction: '0' - lr: '1e-3' + lr: '1e-4' zoo_pin: '669a792' - - id: '17bn' + - id: '17bs' script: probe_17_curve_param.py - steps: '88' + steps: '200' seed: '3407' bias_correction: '0' - lr: '1e-3' + lr: '1e-4' zoo_pin: '669a792' - - id: '17bo' + - id: '17bt' script: probe_17_curve_param.py - steps: '1000' + steps: '100' seed: '3407' bias_correction: '' - lr: '1e-3' + lr: '5e-3' zoo_pin: '669a792' - - id: '17bp' + - id: '17bu' script: probe_17_curve_param.py - steps: '50' + steps: '500' seed: '3407' bias_correction: '' - lr: '1e-3' + lr: '5e-3' zoo_pin: '669a792' - - id: '17bq' + - id: '17bv' script: probe_17_curve_param.py steps: '30' seed: '3407' From 598518c43fa1be61083e9bbf7fc78b48724c299f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 01:28:02 +0000 Subject: [PATCH 29/84] mlx_parity Round R: extend safe-envelope claims from Round Q Round Q established the bc x lr safety grid at single horizons. Round R extends to confirm the claims hold at the docstring's recommended bounds: 17bw : bc=0 + lr=1e-4 + 500 steps -> finite at 5x prior horizon? 17bx : bc=0 + lr=1e-4 + 1000 steps -> finite at 10x? 17by : default + lr=5e-3 + 30 steps -> does 5e-3 break smoke? 
17bz : bc=0 + lr=5e-4 + 100 steps -> 5e-4 crossover safe/unsafe? 17ca : default + lr=1e-3 + 30 steps -> control --- .github/workflows/mlx-parity-probe.yml | 48 ++++++++++++++------------ 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 9b5e49ce47..b8833dfe62 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,47 +58,51 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round Q: LR axis at long horizons. - # Rounds A-P characterized (steps, seed, bc) thoroughly at - # lr=1e-3. The remaining question for the PR-663 docstring: - # does smaller LR rescue bc=False from NaN, and does larger - # LR push bc=True into instability? Tightens the safety - # envelope around the recommended defaults. - - id: '17br' + # Round R: extend the safe-envelope claims from Round Q so + # the trainer.py docstring is empirically defensible: + # 17bw : bc=0 + 500 steps + lr=1e-4 (extend safe past 200) + # 17bx : bc=0 + 1000 steps + lr=1e-4 (does small-LR bc=False + # eventually diverge or stay finite indefinitely?) + # 17by : default + 30 steps + lr=5e-3 (does 5e-3 bc=True + # break even at smoke horizon?) + # 17bz : bc=0 + 100 steps + lr=5e-4 (mid-LR bc=False: + # is 5e-4 the crossover safe/unsafe LR?) 
+ # 17ca : default + 30 + lr=1e-3 (control, pin SHA c1821e4) + - id: '17bw' script: probe_17_curve_param.py - steps: '100' + steps: '500' seed: '3407' bias_correction: '0' lr: '1e-4' - zoo_pin: '669a792' - - id: '17bs' + zoo_pin: 'c1821e4' + - id: '17bx' script: probe_17_curve_param.py - steps: '200' + steps: '1000' seed: '3407' bias_correction: '0' lr: '1e-4' - zoo_pin: '669a792' - - id: '17bt' + zoo_pin: 'c1821e4' + - id: '17by' script: probe_17_curve_param.py - steps: '100' + steps: '30' seed: '3407' bias_correction: '' lr: '5e-3' - zoo_pin: '669a792' - - id: '17bu' + zoo_pin: 'c1821e4' + - id: '17bz' script: probe_17_curve_param.py - steps: '500' + steps: '100' seed: '3407' - bias_correction: '' - lr: '5e-3' - zoo_pin: '669a792' - - id: '17bv' + bias_correction: '0' + lr: '5e-4' + zoo_pin: 'c1821e4' + - id: '17ca' script: probe_17_curve_param.py steps: '30' seed: '3407' bias_correction: '' lr: '1e-3' - zoo_pin: '669a792' + zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 From aa122fcb3056e171f171a5f0ed6e1171268a4318 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 01:42:44 +0000 Subject: [PATCH 30/84] mlx_parity Round S: fresh-seed pass-rate sampling on PR-663 head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quantify the contains-Unsloth pass rate for the recommended config (30 steps, lr=1e-3, default bc=True) across random fresh seeds on PR-663 head. Adds 4 unseeded points to the 2 we already have: 17cb : 42 (known: PR-663 head shifted ✗) 17cc : 11111 (fresh) 17cd : 22222 (fresh) 17ce : 2024 (fresh) 17cf : 3407 (smoke seed, control) Combined with prior rounds for seeds 1337 ✓, 7777 ✗, 999 ✗, 12345 ✗ this gives a 9-seed denominator for the docstring statistic. 
--- .github/workflows/mlx-parity-probe.yml | 54 ++++++++++++-------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index b8833dfe62..440764a844 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,45 +58,41 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round R: extend the safe-envelope claims from Round Q so - # the trainer.py docstring is empirically defensible: - # 17bw : bc=0 + 500 steps + lr=1e-4 (extend safe past 200) - # 17bx : bc=0 + 1000 steps + lr=1e-4 (does small-LR bc=False - # eventually diverge or stay finite indefinitely?) - # 17by : default + 30 steps + lr=5e-3 (does 5e-3 bc=True - # break even at smoke horizon?) - # 17bz : bc=0 + 100 steps + lr=5e-4 (mid-LR bc=False: - # is 5e-4 the crossover safe/unsafe LR?) - # 17ca : default + 30 + lr=1e-3 (control, pin SHA c1821e4) - - id: '17bw' + # Round S: fresh-seed pass-rate sampling for the recommended + # config (30 steps, lr=1e-3, default bc=True, PR-663 head + # SHA c1821e4). Earlier rounds saw basin shifts for seeds + # 42, 999, 7777 between HEAD and PR-663 head. 
Quantify how + # many of N random seeds produce "Unsloth" in greedy decode + # on PR-663 head: + - id: '17cb' script: probe_17_curve_param.py - steps: '500' - seed: '3407' - bias_correction: '0' - lr: '1e-4' + steps: '30' + seed: '42' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17bx' + - id: '17cc' script: probe_17_curve_param.py - steps: '1000' - seed: '3407' - bias_correction: '0' - lr: '1e-4' + steps: '30' + seed: '11111' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17by' + - id: '17cd' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '22222' bias_correction: '' - lr: '5e-3' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17bz' + - id: '17ce' script: probe_17_curve_param.py - steps: '100' - seed: '3407' - bias_correction: '0' - lr: '5e-4' + steps: '30' + seed: '2024' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17ca' + - id: '17cf' script: probe_17_curve_param.py steps: '30' seed: '3407' From d8ca629d1207f10283e9b08bcc7f5d1c9bc4127f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 01:57:24 +0000 Subject: [PATCH 31/84] mlx_parity Round T: failing-seed basin recurrence at long horizons Round S: 3/9 random seeds pass contains-Unsloth at 30 steps on PR-663 head. Round B previously found seed=3407 had a non- monotonic basin (30 OK, 50/60 BAD, 100/500 mixed). 
Round T tests whether the failing seeds (42, 22222) re-enter the "Unsloth" basin at longer horizons: 17cg : seed=42 + 100 steps 17ch : seed=42 + 500 steps 17ci : seed=22222 + 500 steps 17cj : seed=22222 + 100 steps 17ck : seed=3407 + 30 control --- .github/workflows/mlx-parity-probe.yml | 32 ++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 440764a844..1ecd91d3f3 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,41 +58,39 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round S: fresh-seed pass-rate sampling for the recommended - # config (30 steps, lr=1e-3, default bc=True, PR-663 head - # SHA c1821e4). Earlier rounds saw basin shifts for seeds - # 42, 999, 7777 between HEAD and PR-663 head. Quantify how - # many of N random seeds produce "Unsloth" in greedy decode - # on PR-663 head: - - id: '17cb' + # Round T: do failing seeds re-enter the "Unsloth" basin + # at longer step counts? Round B showed seed=3407 went + # OK->BAD->OK across 30/60/100/500. Test the recurrence + # for failing seeds 42, 22222 across step counts. 
+ - id: '17cg' script: probe_17_curve_param.py - steps: '30' + steps: '100' seed: '42' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cc' + - id: '17ch' script: probe_17_curve_param.py - steps: '30' - seed: '11111' + steps: '500' + seed: '42' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cd' + - id: '17ci' script: probe_17_curve_param.py - steps: '30' + steps: '500' seed: '22222' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17ce' + - id: '17cj' script: probe_17_curve_param.py - steps: '30' - seed: '2024' + steps: '100' + seed: '22222' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cf' + - id: '17ck' script: probe_17_curve_param.py steps: '30' seed: '3407' From c7e7618de553aebc3654dbd4ed941790f3fbc370 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 02:12:18 +0000 Subject: [PATCH 32/84] mlx_parity Round U: confirm 500-step basin recovery generalizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round T: seeds 42 + 22222 ✗ at 30 -> ✓ at 500. Confirm 999, 7777, 12345, 2024 also recover at 500 steps. If so, the "smoke fails on seed X" signal can be diagnosed by simply running more steps, making the empirical case that PR-663's training math is healthy regardless of which random seed lands which 30-step decode basin. --- .github/workflows/mlx-parity-probe.yml | 32 ++++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 1ecd91d3f3..109f3dda0f 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,39 +58,41 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round T: do failing seeds re-enter the "Unsloth" basin - # at longer step counts? Round B showed seed=3407 went - # OK->BAD->OK across 30/60/100/500. 
Test the recurrence - # for failing seeds 42, 22222 across step counts. - - id: '17cg' + # Round U: confirm 500-step basin recovery generalizes. + # Round T showed 42 + 22222 ✗ at 30 -> ✓ at 500. If 999, + # 7777, 12345, 2024 also recover at 500, the basin + # recurrence is universal across failing seeds and + # PR-663's training math is empirically sound; only + # greedy-decode geometry is fragile at the smoke horizon. + - id: '17cl' script: probe_17_curve_param.py - steps: '100' - seed: '42' + steps: '500' + seed: '999' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17ch' + - id: '17cm' script: probe_17_curve_param.py steps: '500' - seed: '42' + seed: '7777' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17ci' + - id: '17cn' script: probe_17_curve_param.py steps: '500' - seed: '22222' + seed: '12345' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cj' + - id: '17co' script: probe_17_curve_param.py - steps: '100' - seed: '22222' + steps: '500' + seed: '2024' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17ck' + - id: '17cp' script: probe_17_curve_param.py steps: '30' seed: '3407' From 7efd10396adcbf9e9cc386d3f17628a442da85a4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 02:27:21 +0000 Subject: [PATCH 33/84] mlx_parity Round V: do empty-output seeds (999, 2024) recover at 200/1000? Round U found 5/7 failing seeds recover at 500 steps but 999 and 2024 land in an "empty-output basin" (greedy decodes EOS first token). Round V tests step counts on either side: 17cq : 999 + 200 17cr : 999 + 1000 17cs : 2024 + 200 17ct : 2024 + 1000 17cu : 3407 + 30 control If any combination recovers, basin oscillation is universal and step count alone determines greedy-decode landing. 
--- .github/workflows/mlx-parity-probe.yml | 35 +++++++++++++------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 109f3dda0f..b935a320f0 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,41 +58,42 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round U: confirm 500-step basin recovery generalizes. - # Round T showed 42 + 22222 ✗ at 30 -> ✓ at 500. If 999, - # 7777, 12345, 2024 also recover at 500, the basin - # recurrence is universal across failing seeds and - # PR-663's training math is empirically sound; only - # greedy-decode geometry is fragile at the smoke horizon. - - id: '17cl' + # Round V: do "empty-output basin" seeds (999, 2024) recover + # at 200 or 1000 steps? Round U found 5/7 failing seeds + # transition ✗->✓ at 500 steps, but seeds 999, 2024 stayed + # ✗ generating empty strings (greedy decoded EOS immediately). + # If they recover at 200 or 1000, the basin landscape is + # truly oscillatory and step count alone determines decode + # outcome. 
+ - id: '17cq' script: probe_17_curve_param.py - steps: '500' + steps: '200' seed: '999' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cm' + - id: '17cr' script: probe_17_curve_param.py - steps: '500' - seed: '7777' + steps: '1000' + seed: '999' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cn' + - id: '17cs' script: probe_17_curve_param.py - steps: '500' - seed: '12345' + steps: '200' + seed: '2024' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17co' + - id: '17ct' script: probe_17_curve_param.py - steps: '500' + steps: '1000' seed: '2024' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17cp' + - id: '17cu' script: probe_17_curve_param.py steps: '30' seed: '3407' From 5ea0ff33c3bcf8720ba5fb0304ee02216fba90f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 02:42:28 +0000 Subject: [PATCH 34/84] mlx_parity Round W: LR x bc on persistent-failure seeds Round V: seeds 999, 2024 fail at every step count tested at (default bc=True, lr=1e-3). Round W asks whether a different (bc, lr) point within the safe envelope rescues those seeds: 17cv : 999 + 1000 + bc=0 + lr=1e-4 (small-LR, no bias) 17cw : 2024 + 1000 + bc=0 + lr=1e-4 (same) 17cx : 999 + 500 + bc=0 + lr=1e-4 (mid horizon) 17cy : 999 + 500 + default + 5e-4 (mid LR, default bc) 17cz : 3407 + 30 + default + 1e-3 (control) --- .github/workflows/mlx-parity-probe.yml | 49 +++++++++++++------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index b935a320f0..c5b9378031 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,42 +58,41 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round V: do "empty-output basin" seeds (999, 2024) recover - # at 200 or 1000 steps? 
Round U found 5/7 failing seeds - # transition ✗->✓ at 500 steps, but seeds 999, 2024 stayed - # ✗ generating empty strings (greedy decoded EOS immediately). - # If they recover at 200 or 1000, the basin landscape is - # truly oscillatory and step count alone determines decode - # outcome. - - id: '17cq' + # Round W: try LR x bc on the persistent-failure seeds. + # Round V: 999 + 2024 fail at every step count we tested + # at (default bc=True, lr=1e-3). Round W asks whether a + # different (bc, lr) interior to the safe envelope rescues + # those seeds at long horizons, or whether the basin shape + # is fundamentally seed-determined. + - id: '17cv' script: probe_17_curve_param.py - steps: '200' + steps: '1000' seed: '999' - bias_correction: '' - lr: '1e-3' + bias_correction: '0' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17cr' + - id: '17cw' script: probe_17_curve_param.py steps: '1000' - seed: '999' - bias_correction: '' - lr: '1e-3' + seed: '2024' + bias_correction: '0' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17cs' + - id: '17cx' script: probe_17_curve_param.py - steps: '200' - seed: '2024' - bias_correction: '' - lr: '1e-3' + steps: '500' + seed: '999' + bias_correction: '0' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17ct' + - id: '17cy' script: probe_17_curve_param.py - steps: '1000' - seed: '2024' + steps: '500' + seed: '999' bias_correction: '' - lr: '1e-3' + lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17cu' + - id: '17cz' script: probe_17_curve_param.py steps: '30' seed: '3407' From b35ff580af3ce6d2e5e3ad4dcba3b53e62b2814a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 02:57:28 +0000 Subject: [PATCH 35/84] mlx_parity Round X: hunt for universal (bc, lr, steps) config Round W lifted seed=999 out of its empty-output basin with bc=False + lr=1e-4 + 500/1000 steps. Round X tests the same config against the other smoke-default-failing seeds (42, 22222, 2024, 12345). 
If 4/4 land in the Unsloth basin, this is a true universal-robust config for this fixture, and the PR-663 docstring could include it as an alternative recommendation for smoke tests. 17da : 42 + 500 + bc=0 + lr=1e-4 17db : 22222 + 500 + bc=0 + lr=1e-4 17dc : 2024 + 500 + bc=0 + lr=1e-4 17dd : 12345 + 500 + bc=0 + lr=1e-4 17de : 3407 + 30 + default + lr=1e-3 (control) --- .github/workflows/mlx-parity-probe.yml | 39 +++++++++++++------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index c5b9378031..a3afb4a2f1 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,41 +58,42 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round W: try LR x bc on the persistent-failure seeds. - # Round V: 999 + 2024 fail at every step count we tested - # at (default bc=True, lr=1e-3). Round W asks whether a - # different (bc, lr) interior to the safe envelope rescues - # those seeds at long horizons, or whether the basin shape - # is fundamentally seed-determined. - - id: '17cv' + # Round X: hunt for a universal (bc, lr, steps) tuple. + # Round W found bc=False + lr=1e-4 + 500/1000 steps lifts + # seed=999 out of its empty-output basin into "Unsloth!". + # Test the same config for seeds that were failing at the + # smoke default (bc=True, lr=1e-3, 30 steps). If 4/4 hit + # Unsloth, this is a true "universal-robust" config for + # this fixture (modulo seed=2024 which Round W kept ✗). 
+ - id: '17da' script: probe_17_curve_param.py - steps: '1000' - seed: '999' + steps: '500' + seed: '42' bias_correction: '0' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17cw' + - id: '17db' script: probe_17_curve_param.py - steps: '1000' - seed: '2024' + steps: '500' + seed: '22222' bias_correction: '0' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17cx' + - id: '17dc' script: probe_17_curve_param.py steps: '500' - seed: '999' + seed: '2024' bias_correction: '0' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17cy' + - id: '17dd' script: probe_17_curve_param.py steps: '500' - seed: '999' - bias_correction: '' - lr: '5e-4' + seed: '12345' + bias_correction: '0' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17cz' + - id: '17de' script: probe_17_curve_param.py steps: '30' seed: '3407' From eee1393487dc93d43e8d5f97626ec37d38024a9b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 03:12:23 +0000 Subject: [PATCH 36/84] mlx_parity Round Y: expand seed cube at smoke config Round X: bc=False+lr=1e-4+500 has 2/6 pass rate -- no better than the smoke default. The basin landscape is fundamentally seed-determined. Round Y adds 4 more fresh seeds to the smoke-config pass-rate sample (0, 5555, 8888, 33333). Combined with Round S's stats, gives a 13-seed denominator for the docstring's pass-rate claim. --- .github/workflows/mlx-parity-probe.yml | 54 +++++++++++++------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index a3afb4a2f1..e42ef7f8e7 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,42 +58,40 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round X: hunt for a universal (bc, lr, steps) tuple. - # Round W found bc=False + lr=1e-4 + 500/1000 steps lifts - # seed=999 out of its empty-output basin into "Unsloth!". 
- # Test the same config for seeds that were failing at the - # smoke default (bc=True, lr=1e-3, 30 steps). If 4/4 hit - # Unsloth, this is a true "universal-robust" config for - # this fixture (modulo seed=2024 which Round W kept ✗). - - id: '17da' + # Round Y: expand seed cube at the smoke config to tighten + # the pass-rate statistic. Round S got 3/9 random seeds + # producing "Unsloth" at (30, lr=1e-3, default bc=True). + # Round Y adds 3 more fresh seeds + replays the two known + # outcomes for sanity. + - id: '17df' script: probe_17_curve_param.py - steps: '500' - seed: '42' - bias_correction: '0' - lr: '1e-4' + steps: '30' + seed: '5555' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17db' + - id: '17dg' script: probe_17_curve_param.py - steps: '500' - seed: '22222' - bias_correction: '0' - lr: '1e-4' + steps: '30' + seed: '8888' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17dc' + - id: '17dh' script: probe_17_curve_param.py - steps: '500' - seed: '2024' - bias_correction: '0' - lr: '1e-4' + steps: '30' + seed: '33333' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17dd' + - id: '17di' script: probe_17_curve_param.py - steps: '500' - seed: '12345' - bias_correction: '0' - lr: '1e-4' + steps: '30' + seed: '0' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17de' + - id: '17dj' script: probe_17_curve_param.py steps: '30' seed: '3407' From 93885ef94c3228537fce54654379ab7327b279d3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 03:27:00 +0000 Subject: [PATCH 37/84] mlx_parity Round Z: determinism check Across rounds the same (seed, config) tuple has produced identical-looking generations on different commits. Round Z runs the SAME tuple in parallel matrix entries to verify bit-exact reproducibility WITHIN a single workflow run. 
17dk : 3407 + 30 + default (run A) 17dl : 3407 + 30 + default (run B) 17dm : 42 + 30 + default (run A) 17dn : 42 + 30 + default (run B) 17do : 3407 + 30 + default (control / run C) If all three 3407 outputs are byte-identical, MLX seeding + probe RNG threading is fully deterministic on Apple Silicon CI. Any drift would indicate a residual non-deterministic source (e.g. metal kernel reductions, parallel reductions that don't honor seed). --- .github/workflows/mlx-parity-probe.yml | 31 ++++++++++++++------------ 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index e42ef7f8e7..e76d16de12 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,40 +58,43 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round Y: expand seed cube at the smoke config to tighten - # the pass-rate statistic. Round S got 3/9 random seeds - # producing "Unsloth" at (30, lr=1e-3, default bc=True). - # Round Y adds 3 more fresh seeds + replays the two known - # outcomes for sanity. - - id: '17df' + # Round Z: determinism check. Across rounds we've seen the + # same (seed, config) tuple produce identical-looking + # generations on different commits (e.g. 17am vs 17ck vs + # 17de vs 17dj for 3407+30+default all printed "_______ + # My name is Unsloth!"). Round Z runs the SAME tuple in + # parallel matrix entries to verify bit-exact reproducibility + # within a single workflow run, then a different seed + # twice to triangulate. 
+ - id: '17dk' script: probe_17_curve_param.py steps: '30' - seed: '5555' + seed: '3407' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17dg' + - id: '17dl' script: probe_17_curve_param.py steps: '30' - seed: '8888' + seed: '3407' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17dh' + - id: '17dm' script: probe_17_curve_param.py steps: '30' - seed: '33333' + seed: '42' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17di' + - id: '17dn' script: probe_17_curve_param.py steps: '30' - seed: '0' + seed: '42' bias_correction: '' lr: '1e-3' zoo_pin: 'c1821e4' - - id: '17dj' + - id: '17do' script: probe_17_curve_param.py steps: '30' seed: '3407' From 67be132676b143a890abf33db8392a738d9a945c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 03:33:03 +0000 Subject: [PATCH 38/84] mlx_parity Round AA: bc=True at lr=1e-4 + 500 head-to-head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User asked: does adam_bias_correction make accuracy across seeds better or worse? Rounds W/X gave us bc=False + lr=1e-4 + 500 across 6 seeds = 2/6 contains-Unsloth. Need bc=True at the same (lr, steps) to compare head-to-head: 17dp : bc=1 + 500 + 3407 17dq : bc=1 + 500 + 42 17dr : bc=1 + 500 + 999 17ds : bc=1 + 500 + 12345 17dt : control (30, default, lr=1e-3) If bc=True ≥ bc=False on this matrix, the new default is the best choice for the "memorize this fixture and emit Unsloth" objective regardless of (lr, steps). --- .github/workflows/mlx-parity-probe.yml | 54 +++++++++++++------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index e76d16de12..0788c0697c 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,43 +58,41 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round Z: determinism check. 
Across rounds we've seen the - # same (seed, config) tuple produce identical-looking - # generations on different commits (e.g. 17am vs 17ck vs - # 17de vs 17dj for 3407+30+default all printed "_______ - # My name is Unsloth!"). Round Z runs the SAME tuple in - # parallel matrix entries to verify bit-exact reproducibility - # within a single workflow run, then a different seed - # twice to triangulate. - - id: '17dk' + # Round AA: Q3 missing piece -- bc=True at lr=1e-4 + 500 + # steps across seeds, to compare head-to-head with the + # bc=False data from Rounds W/X at the SAME (lr, steps). + # Existing: bc=False + lr=1e-4 + 500 across {3407, 42, + # 22222, 2024, 12345, 999} = 2/6 contains-Unsloth. + # Round AA collects bc=True at the matching seeds: + - id: '17dp' script: probe_17_curve_param.py - steps: '30' + steps: '500' seed: '3407' - bias_correction: '' - lr: '1e-3' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17dl' + - id: '17dq' script: probe_17_curve_param.py - steps: '30' - seed: '3407' - bias_correction: '' - lr: '1e-3' + steps: '500' + seed: '42' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17dm' + - id: '17dr' script: probe_17_curve_param.py - steps: '30' - seed: '42' - bias_correction: '' - lr: '1e-3' + steps: '500' + seed: '999' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17dn' + - id: '17ds' script: probe_17_curve_param.py - steps: '30' - seed: '42' - bias_correction: '' - lr: '1e-3' + steps: '500' + seed: '12345' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17do' + - id: '17dt' script: probe_17_curve_param.py steps: '30' seed: '3407' From 0f997e1e900493b9d191c673dd5f4b54e3a40853 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 03:36:22 +0000 Subject: [PATCH 39/84] mlx_parity Round AB: expose grad-clip knobs + start Q1 sweep probe_17 now reads MLX_MAX_GRAD_VALUE and MLX_MAX_GRAD_NORM env vars ("default" / "none" / "off" / float). 
Workflow exposes them via matrix.max_grad_norm / matrix.max_grad_value. Round AB: 17du : norm=0 + value=0.5 + seed=3407 + 30 steps 17dv : norm=0 + value=1.0 + seed=3407 + 30 steps 17dw : norm=0 + value=5.0 + seed=3407 + 30 steps (PR-634 broken default!) 17dx : norm=0 + value=1.0 + seed=42 + 30 steps (cross-seed check) 17dy : smoke default (norm=1, value=None) + 30 + 3407 (control) This isolates Q1: at the smoke horizon, do elementwise-clip values hit Unsloth in greedy decode? Multi-seed expansion follows in Round AC depending on these results. --- .github/workflows/mlx-parity-probe.yml | 60 ++++++++++++++---------- tests/mlx_parity/probe_17_curve_param.py | 27 +++++++++-- 2 files changed, 58 insertions(+), 29 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 0788c0697c..38fc0c2e51 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,41 +58,47 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AA: Q3 missing piece -- bc=True at lr=1e-4 + 500 - # steps across seeds, to compare head-to-head with the - # bc=False data from Rounds W/X at the SAME (lr, steps). - # Existing: bc=False + lr=1e-4 + 500 across {3407, 42, - # 22222, 2024, 12345, 999} = 2/6 contains-Unsloth. - # Round AA collects bc=True at the matching seeds: - - id: '17dp' + # Round AB: Q1 MLX side -- max_grad_value vs max_grad_norm. + # Sweep MLX_MAX_GRAD_VALUE at 0.5, 1.0, 5.0 with norm=0 + # (elementwise-only) and the smoke default (norm=1.0, + # value=None) at seed=3407 + 30 steps as control. 
+ - id: '17du' script: probe_17_curve_param.py - steps: '500' + steps: '30' seed: '3407' - bias_correction: '1' - lr: '1e-4' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17dq' + - id: '17dv' script: probe_17_curve_param.py - steps: '500' - seed: '42' - bias_correction: '1' - lr: '1e-4' + steps: '30' + seed: '3407' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17dr' + - id: '17dw' script: probe_17_curve_param.py - steps: '500' - seed: '999' - bias_correction: '1' - lr: '1e-4' + steps: '30' + seed: '3407' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17ds' + - id: '17dx' script: probe_17_curve_param.py - steps: '500' - seed: '12345' - bias_correction: '1' - lr: '1e-4' + steps: '30' + seed: '42' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17dt' + - id: '17dy' script: probe_17_curve_param.py steps: '30' seed: '3407' @@ -153,6 +159,8 @@ jobs: MLX_DTYPE: ${{ matrix.dtype }} MLX_BIAS_CORRECTION: ${{ matrix.bias_correction }} MLX_LR: ${{ matrix.lr }} + MLX_MAX_GRAD_NORM: ${{ matrix.max_grad_norm }} + MLX_MAX_GRAD_VALUE: ${{ matrix.max_grad_value }} run: | cd tests/mlx_parity && python ${{ matrix.script }} diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py index 07f0d9accd..49092d5404 100644 --- a/tests/mlx_parity/probe_17_curve_param.py +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -90,8 +90,25 @@ def main() -> int: else: bc = bc_raw in ("1", "true", "yes", "y") lr = _env_float("MLX_LR", 1e-3) - - banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc!r} lr={lr}") + # Grad clip knobs: + # MLX_MAX_GRAD_NORM= empty -> trainer default (1.0 in this probe) + # MLX_MAX_GRAD_VALUE= empty -> trainer default (None on PR-663 head) + # Use "off"/"0"/explicit floats to override; 
"none" maps to None. + def _env_grad(name): + raw = (os.environ.get(name) or "").strip().lower() + if not raw: + return "default" + if raw in ("none", "off"): + return None + try: + return float(raw) + except ValueError: + return "default" + grad_norm_override = _env_grad("MLX_MAX_GRAD_NORM") + grad_value_override = _env_grad("MLX_MAX_GRAD_VALUE") + + banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc!r} lr={lr} " + f"max_grad_norm={grad_norm_override!r} max_grad_value={grad_value_override!r}") import random random.seed(seed) @@ -128,6 +145,10 @@ def main() -> int: extra = {} if "adam_bias_correction" in fields_supported and bc is not None: extra["adam_bias_correction"] = bc + if grad_value_override != "default" and "max_grad_value" in fields_supported: + extra["max_grad_value"] = grad_value_override + + cfg_grad_norm = 1.0 if grad_norm_override == "default" else (grad_norm_override or 0.0) config = MLXTrainingConfig( per_device_train_batch_size=2, @@ -138,7 +159,7 @@ def main() -> int: lr_scheduler_type="constant", optim="adamw", weight_decay=0.0, - max_grad_norm=1.0, + max_grad_norm=cfg_grad_norm, logging_steps=1, max_seq_length=MAX_SEQ_LEN, seed=seed, From 994fef66565af29abdcfd78c73ca312e8ed4759f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 03:43:07 +0000 Subject: [PATCH 40/84] mlx_parity Round AC: value=1.0 multi-seed pass-rate vs norm=1.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round AB found value=1.0 rescued seed=42 (✗ at smoke default -> ✓ at norm=0/value=1.0). 
Round AC checks whether value=1.0 has a better multi-seed Unsloth-decode pass-rate than the current smoke default (norm=1.0, value=None) by sweeping the four smoke-default-failing seeds at the value=1.0 config: 17dz : value=1.0 + seed=999 17ea : value=1.0 + seed=12345 17eb : value=1.0 + seed=22222 17ec : value=1.0 + seed=2024 17ed : smoke default + seed=3407 control Existing data at smoke default: 999: ✗, 12345: ✗, 22222: ✗, 2024: ✗ If 3/4 or 4/4 ✓ at value=1.0, elementwise clip is the better smoke default. If 1/4 or 0/4 ✓, the value=1.0 seed=42 rescue was a coincidence. --- .github/workflows/mlx-parity-probe.yml | 31 +++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 38fc0c2e51..4717d2e76b 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,47 +58,48 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AB: Q1 MLX side -- max_grad_value vs max_grad_norm. - # Sweep MLX_MAX_GRAD_VALUE at 0.5, 1.0, 5.0 with norm=0 - # (elementwise-only) and the smoke default (norm=1.0, - # value=None) at seed=3407 + 30 steps as control. - - id: '17du' + # Round AC: does value=1.0 elementwise clip have BETTER + # multi-seed Unsloth-decode pass-rate than the smoke + # default (norm=1.0, value=None)? Round AB showed + # value=1.0 rescued seed=42 (✗ at default -> ✓). + # Sweep the smoke-default-failing seeds at value=1.0. 
+ - id: '17dz' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '999' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17dv' + - id: '17ea' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '12345' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17dw' + - id: '17eb' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '22222' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17dx' + - id: '17ec' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '2024' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17dy' + - id: '17ed' script: probe_17_curve_param.py steps: '30' seed: '3407' From a87b0c7a82c066061f1bfd42b6d1950db94c6e8d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 03:57:27 +0000 Subject: [PATCH 41/84] mlx_parity Round AD: finish value=1.0 13-seed sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round AC: value=1.0 (norm=0) at 4 seeds rescues 2/4 ✗-at-default. Combined Round AB+AC: 4/6 = 67% pass rate. Round AD finishes the sample with the remaining 5 seeds (1337, 11111, 5555, 7777, 8888) to get a direct 13-seed comparison vs norm=1.0 smoke default (currently 6/13 = 46%). If value=1.0 hits 9-10/13 the docstring should recommend it as the smoke clip mode (still respecting the historical max_grad_norm contract for user-facing semantics). 
--- .github/workflows/mlx-parity-probe.yml | 31 +++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 4717d2e76b..43520e4df6 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,53 +58,54 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AC: does value=1.0 elementwise clip have BETTER - # multi-seed Unsloth-decode pass-rate than the smoke - # default (norm=1.0, value=None)? Round AB showed - # value=1.0 rescued seed=42 (✗ at default -> ✓). - # Sweep the smoke-default-failing seeds at value=1.0. - - id: '17dz' + # Round AD: complete value=1.0 across remaining seeds to + # finish the 13-seed sample for an apples-to-apples + # comparison vs norm=1.0 smoke default. Round AB+AC gave + # us 4/6 ✓ for value=1.0; need 7 more seeds for full set. + - id: '17ee' script: probe_17_curve_param.py steps: '30' - seed: '999' + seed: '1337' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17ea' + - id: '17ef' script: probe_17_curve_param.py steps: '30' - seed: '12345' + seed: '11111' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17eb' + - id: '17eg' script: probe_17_curve_param.py steps: '30' - seed: '22222' + seed: '5555' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17ec' + - id: '17eh' script: probe_17_curve_param.py steps: '30' - seed: '2024' + seed: '7777' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '1.0' zoo_pin: 'c1821e4' - - id: '17ed' + - id: '17ei' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '8888' bias_correction: '' lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 
7bd5de39e91197280c4085581cd7600f4d0363b5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 04:12:23 +0000 Subject: [PATCH 42/84] mlx_parity Round AE: value=5.0 multi-seed (PR #634 old default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round AD finishes value=1.0 sample: 8/13 ✓ (vs 6/13 ✓ for smoke default norm=1.0). Elementwise@1.0 has higher pass rate. Round AE samples value=5.0 (PR #634's broken default) across the 5 seeds that matter for the elementwise-vs-norm comparison: 17ej : 42, 17ek : 999, 17el : 12345, 17em : 22222, 17en : 3407 (control / smoke default). If value=5.0 hits similar 60-70% pass rate, the elementwise mode itself isn't fragile -- just the (clip-too-tight) end. --- .github/workflows/mlx-parity-probe.yml | 40 +++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 43520e4df6..1605f7d334 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,54 +58,54 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AD: complete value=1.0 across remaining seeds to - # finish the 13-seed sample for an apples-to-apples - # comparison vs norm=1.0 smoke default. Round AB+AC gave - # us 4/6 ✓ for value=1.0; need 7 more seeds for full set. - - id: '17ee' + # Round AE: value=5.0 (PR #634's old default) multi-seed + # on MLX. CUDA at value=5.0 with same seeds will follow + # from the CUDA sweep. If MLX value=5.0 is comparable or + # worse than value=1.0, that confirms there's no large + # "magic" elementwise value — they're all in the same + # basin-geometry-fragile regime. 
+ - id: '17ej' script: probe_17_curve_param.py steps: '30' - seed: '1337' + seed: '42' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '1.0' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17ef' + - id: '17ek' script: probe_17_curve_param.py steps: '30' - seed: '11111' + seed: '999' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '1.0' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17eg' + - id: '17el' script: probe_17_curve_param.py steps: '30' - seed: '5555' + seed: '12345' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '1.0' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17eh' + - id: '17em' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '22222' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '1.0' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17ei' + - id: '17en' script: probe_17_curve_param.py steps: '30' - seed: '8888' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '1.0' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 9d8de6257a3bf9a437d3fe9b0e5b2d6e5ab3a42e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 04:27:39 +0000 Subject: [PATCH 43/84] mlx_parity Round AF: value=0.5 multi-seed (does MLX over-clip?) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDA at value=0.5 hits 13/13 ✓ across all seeds. MLX at value=0.5 seed=3407 generated " " (over-clipped). Round AF samples 4 more seeds at value=0.5 + 3407 control. If MLX is universally bad at value=0.5 across seeds, the over-clip failure is MLX-specific (small fp16 + small clip starves updates). 17eo : 42, 17ep : 999, 17eq : 1337, 17er : 11111, 17es : 3407 control. 
--- .github/workflows/mlx-parity-probe.yml | 33 +++++++++++++------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 1605f7d334..e225871eb0 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,49 +58,48 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AE: value=5.0 (PR #634's old default) multi-seed - # on MLX. CUDA at value=5.0 with same seeds will follow - # from the CUDA sweep. If MLX value=5.0 is comparable or - # worse than value=1.0, that confirms there's no large - # "magic" elementwise value — they're all in the same - # basin-geometry-fragile regime. - - id: '17ej' + # Round AF: MLX value=0.5 multi-seed. CUDA at value=0.5 + # hits 13/13 ✓ on the same fixture+seeds, but MLX seed=3407 + # at value=0.5 fails (over-clipped, generates " "). If MLX + # at value=0.5 is universally bad across seeds, the + # over-clip failure is MLX-specific. 
+ - id: '17eo' script: probe_17_curve_param.py steps: '30' seed: '42' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17ek' + - id: '17ep' script: probe_17_curve_param.py steps: '30' seed: '999' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17el' + - id: '17eq' script: probe_17_curve_param.py steps: '30' - seed: '12345' + seed: '1337' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17em' + - id: '17er' script: probe_17_curve_param.py steps: '30' - seed: '22222' + seed: '11111' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17en' + - id: '17es' script: probe_17_curve_param.py steps: '30' seed: '3407' From fb5b6679716105f49abe9745a76e0eb524476c9f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 04:42:32 +0000 Subject: [PATCH 44/84] mlx_parity Round AG: complete value=5.0 13-seed sample 5 more seeds at value=5.0 (norm=0): 1337, 11111, 5555, 7777, 8888. Combined with the 5 earlier value=5.0 seeds (3407 from Round AB; 42, 999, 12345, 22222 from Round AE) gives 10/13 of the value=5.0 sample. Remaining 3 (2024, 33333, 0) in a follow-up if pattern isn't clear. --- .github/workflows/mlx-parity-probe.yml | 40 ++++++++++++++------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index e225871eb0..41c3d6328d 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,53 +58,55 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AF: MLX value=0.5 multi-seed. CUDA at value=0.5 - # hits 13/13 ✓ on the same fixture+seeds, but MLX seed=3407 - # at value=0.5 fails (over-clipped, generates " "). 
If MLX - # at value=0.5 is universally bad across seeds, the - # over-clip failure is MLX-specific. - - id: '17eo' + # Round AG: complete value=5.0 13-seed sample. Round AE + # gave us 4 seeds (42, 999, 12345, 22222) + 3407 control; + # Round AG adds the remaining 8 (1337, 11111, 5555, 7777, + # 8888, 2024, 33333, 0). Stay within 5 jobs per round so + # pick the 5 most informative remaining seeds. + - id: '17et' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '1337' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17ep' + - id: '17eu' script: probe_17_curve_param.py steps: '30' - seed: '999' + seed: '11111' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17eq' + - id: '17ev' script: probe_17_curve_param.py steps: '30' - seed: '1337' + seed: '5555' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17er' + - id: '17ew' script: probe_17_curve_param.py steps: '30' - seed: '11111' + seed: '7777' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17es' + - id: '17ex' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '8888' bias_correction: '' lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '5.0' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From ae6b0852e786522522c29b1e444efa3f58feb211 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 04:57:20 +0000 Subject: [PATCH 45/84] mlx_parity Round AH: finish value=0.5 + value=5.0 samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round AG: value=5.0 only 4/10 ✓ -- worse than smoke default (46%). 
Round AH adds the remaining seeds to lock the Q1 comparison table: 17ey : 2024 + value=5.0 17ez : 33333 + value=5.0 17fa : 12345 + value=0.5 (matches existing failing-at-norm seeds) 17fb : 22222 + value=0.5 17fc : 8888 + value=0.5 --- .github/workflows/mlx-parity-probe.yml | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 41c3d6328d..adbddc795b 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,55 +58,55 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AG: complete value=5.0 13-seed sample. Round AE - # gave us 4 seeds (42, 999, 12345, 22222) + 3407 control; - # Round AG adds the remaining 8 (1337, 11111, 5555, 7777, - # 8888, 2024, 33333, 0). Stay within 5 jobs per round so - # pick the 5 most informative remaining seeds. - - id: '17et' + # Round AH: finish value=5.0 + value=0.5 13-seed samples + # so the docstring table is empirically defensible. + # value=5.0 needs 3 more (2024, 33333, 0); value=0.5 + # needs 5 more (7777, 8888, 22222, 12345, 2024). Pick + # the 5 most diagnostic. 
+ - id: '17ey' script: probe_17_curve_param.py steps: '30' - seed: '1337' + seed: '2024' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17eu' + - id: '17ez' script: probe_17_curve_param.py steps: '30' - seed: '11111' + seed: '33333' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '5.0' zoo_pin: 'c1821e4' - - id: '17ev' + - id: '17fa' script: probe_17_curve_param.py steps: '30' - seed: '5555' + seed: '12345' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17ew' + - id: '17fb' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '22222' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17ex' + - id: '17fc' script: probe_17_curve_param.py steps: '30' seed: '8888' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '5.0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 0240307847ebfa6028c4bfe708e40f7152ca758e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 05:12:31 +0000 Subject: [PATCH 46/84] mlx_parity Round AI: fp32 dtype hypothesis test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDA hits 100% on this fixture across all clip configs; MLX (default fp16) hits 46-62%. Round AI tests whether MLX with fp32 dtype also hits ~100%. If yes, the CUDA-vs-MLX gap is dtype-driven (bf16/fp32 dynamic range > fp16) and the smoke fragility is fundamentally about fp16 representation, not anything trainer-specific. 
17fd : 42 + fp32 (was ✗ at fp16) 17fe : 999 + fp32 (was ✗) 17ff : 12345 + fp32 (was ✗) 17fg : 22222 + fp32 (was ✗) 17fh : 3407 + fp32 (control, was ✓ at fp16) --- .github/workflows/mlx-parity-probe.yml | 43 ++++++++++++-------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index adbddc795b..bf78c83b77 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,55 +58,52 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AH: finish value=5.0 + value=0.5 13-seed samples - # so the docstring table is empirically defensible. - # value=5.0 needs 3 more (2024, 33333, 0); value=0.5 - # needs 5 more (7777, 8888, 22222, 12345, 2024). Pick - # the 5 most diagnostic. - - id: '17ey' + # Round AI: fp32 dtype hypothesis test. CUDA hits 100% + # on this fixture across all clip configs. MLX (default + # fp16) hits 46-62%. If MLX with fp32 ALSO hits ~100%, + # the CUDA-vs-MLX gap is dtype-driven (bf16/fp32 dynamic + # range > fp16) and the smoke fragility is fundamentally + # about the fp16 representation, not anything trainer- + # specific. Use 5 seeds known to fail at fp16 smoke default. 
+ - id: '17fd' script: probe_17_curve_param.py steps: '30' - seed: '2024' + seed: '42' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '5.0' + dtype: 'float32' zoo_pin: 'c1821e4' - - id: '17ez' + - id: '17fe' script: probe_17_curve_param.py steps: '30' - seed: '33333' + seed: '999' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '5.0' + dtype: 'float32' zoo_pin: 'c1821e4' - - id: '17fa' + - id: '17ff' script: probe_17_curve_param.py steps: '30' seed: '12345' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + dtype: 'float32' zoo_pin: 'c1821e4' - - id: '17fb' + - id: '17fg' script: probe_17_curve_param.py steps: '30' seed: '22222' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + dtype: 'float32' zoo_pin: 'c1821e4' - - id: '17fc' + - id: '17fh' script: probe_17_curve_param.py steps: '30' - seed: '8888' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + dtype: 'float32' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 57e503f00fdd4947578065a38a2ff3b798d191bf Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 05:27:18 +0000 Subject: [PATCH 47/84] mlx_parity Round AJ: bc=True at lr=1e-4 + 500 (Q3 head-to-head) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-run of cancelled Round AA. Round W/X gave bc=False at the same (lr, steps) = 2/6 ✓. Round AJ collects bc=True at the same 5 seeds: 17fi : 3407, 17fj : 42, 17fk : 999, 17fl : 12345, 17fm : 22222. Answers Q3 at the bc=False-safe envelope: does enabling bias correction help, hurt, or not matter when bc=False isn't diverging? 
--- .github/workflows/mlx-parity-probe.yml | 64 ++++++++++++-------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index bf78c83b77..cc340d90a1 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,52 +58,46 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AI: fp32 dtype hypothesis test. CUDA hits 100% - # on this fixture across all clip configs. MLX (default - # fp16) hits 46-62%. If MLX with fp32 ALSO hits ~100%, - # the CUDA-vs-MLX gap is dtype-driven (bf16/fp32 dynamic - # range > fp16) and the smoke fragility is fundamentally - # about the fp16 representation, not anything trainer- - # specific. Use 5 seeds known to fail at fp16 smoke default. - - id: '17fd' + # Round AJ: re-run Round AA (cancelled by AB) -- bc=True + # at lr=1e-4 + 500 steps multi-seed. Round W/X gave us + # bc=False at the same (lr, steps) = 2/6 ✓. Need bc=True + # to finish Q3's head-to-head: does adam_bias_correction + # make accuracy better or worse at the bc=False-safe + # envelope? 
+ - id: '17fi' script: probe_17_curve_param.py - steps: '30' + steps: '500' + seed: '3407' + bias_correction: '1' + lr: '1e-4' + zoo_pin: 'c1821e4' + - id: '17fj' + script: probe_17_curve_param.py + steps: '500' seed: '42' - bias_correction: '' - lr: '1e-3' - dtype: 'float32' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17fe' + - id: '17fk' script: probe_17_curve_param.py - steps: '30' + steps: '500' seed: '999' - bias_correction: '' - lr: '1e-3' - dtype: 'float32' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17ff' + - id: '17fl' script: probe_17_curve_param.py - steps: '30' + steps: '500' seed: '12345' - bias_correction: '' - lr: '1e-3' - dtype: 'float32' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17fg' + - id: '17fm' script: probe_17_curve_param.py - steps: '30' + steps: '500' seed: '22222' - bias_correction: '' - lr: '1e-3' - dtype: 'float32' - zoo_pin: 'c1821e4' - - id: '17fh' - script: probe_17_curve_param.py - steps: '30' - seed: '3407' - bias_correction: '' - lr: '1e-3' - dtype: 'float32' + bias_correction: '1' + lr: '1e-4' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 9411d0a6d428a1621c111f49a3175cf7360db4e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 05:42:25 +0000 Subject: [PATCH 48/84] mlx_parity Round AK: confirm bc=True at lr=1e-4+500 underperforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round AJ: bc=True at lr=1e-4+500 hit 0/5 ✓ -- worse than bc=False at the same envelope (2/6 ✓). Round AK adds 4 more seeds (2024, 1337, 11111, 7777) + a smoke-default seed=3407 control to confirm the pattern. 
--- .github/workflows/mlx-parity-probe.yml | 34 ++++++++++++-------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index cc340d90a1..3b33658e76 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,46 +58,42 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AJ: re-run Round AA (cancelled by AB) -- bc=True - # at lr=1e-4 + 500 steps multi-seed. Round W/X gave us - # bc=False at the same (lr, steps) = 2/6 ✓. Need bc=True - # to finish Q3's head-to-head: does adam_bias_correction - # make accuracy better or worse at the bc=False-safe - # envelope? - - id: '17fi' + # Round AK: extend bc=True at lr=1e-4 + 500 to confirm + # the 0/5 ✓ pattern from Round AJ. 5 more seeds: + - id: '17fn' script: probe_17_curve_param.py steps: '500' - seed: '3407' + seed: '2024' bias_correction: '1' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17fj' + - id: '17fo' script: probe_17_curve_param.py steps: '500' - seed: '42' + seed: '1337' bias_correction: '1' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17fk' + - id: '17fp' script: probe_17_curve_param.py steps: '500' - seed: '999' + seed: '11111' bias_correction: '1' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17fl' + - id: '17fq' script: probe_17_curve_param.py steps: '500' - seed: '12345' + seed: '7777' bias_correction: '1' lr: '1e-4' zoo_pin: 'c1821e4' - - id: '17fm' + - id: '17fr' script: probe_17_curve_param.py - steps: '500' - seed: '22222' - bias_correction: '1' - lr: '1e-4' + steps: '30' + seed: '3407' + bias_correction: '' + lr: '1e-3' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From cee07e0926f85695489a459a455dc01f54fc3f27 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 05:57:17 +0000 Subject: [PATCH 49/84] mlx_parity Round AL: bc=True Goldilocks zone search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit bc=True at smoke (lr=1e-3, 30 steps) = 46% ✓. bc=True at slow (lr=1e-4, 500 steps) = 0% ✓. Round AL tests bc=True at intermediate (lr=5e-4, 100 steps) to see if there's a middle sweet spot. --- .github/workflows/mlx-parity-probe.yml | 48 ++++++++++++++------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 3b33658e76..321273ccf8 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,42 +58,44 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AK: extend bc=True at lr=1e-4 + 500 to confirm - # the 0/5 ✓ pattern from Round AJ. 5 more seeds: - - id: '17fn' + # Round AL: does bc=True have a Goldilocks intermediate + # (lr, steps) zone between smoke (lr=1e-3, 30 = 46% ✓) + # and slow-LR (lr=1e-4, 500 = 0% ✓)? Test lr=5e-4 + 100 + # steps across 5 seeds. + - id: '17fs' script: probe_17_curve_param.py - steps: '500' - seed: '2024' + steps: '100' + seed: '3407' bias_correction: '1' - lr: '1e-4' + lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fo' + - id: '17ft' script: probe_17_curve_param.py - steps: '500' - seed: '1337' + steps: '100' + seed: '42' bias_correction: '1' - lr: '1e-4' + lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fp' + - id: '17fu' script: probe_17_curve_param.py - steps: '500' - seed: '11111' + steps: '100' + seed: '999' bias_correction: '1' - lr: '1e-4' + lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fq' + - id: '17fv' script: probe_17_curve_param.py - steps: '500' - seed: '7777' + steps: '100' + seed: '12345' bias_correction: '1' - lr: '1e-4' + lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fr' + - id: '17fw' script: probe_17_curve_param.py - steps: '30' - seed: '3407' - bias_correction: '' - lr: '1e-3' + steps: '100' + seed: '22222' + bias_correction: '1' + lr: '5e-4' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 
5816ea025d0195806a6c6b553a0c9fa123181c83 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 06:12:21 +0000 Subject: [PATCH 50/84] mlx_parity Round AM: bc=False at intermediate envelope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete the bc head-to-head at (lr=5e-4, 100 steps). Round AL already gave bc=True 1/5 ✓ at this point. --- .github/workflows/mlx-parity-probe.yml | 30 ++++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 321273ccf8..d3cff310ee 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,43 +58,45 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AL: does bc=True have a Goldilocks intermediate - # (lr, steps) zone between smoke (lr=1e-3, 30 = 46% ✓) - # and slow-LR (lr=1e-4, 500 = 0% ✓)? Test lr=5e-4 + 100 - # steps across 5 seeds. - - id: '17fs' + # Round AM: bc=False at intermediate (lr=5e-4, 100) to + # complete the bc head-to-head at 3 envelopes: + # smoke (lr=1e-3, 30): bc=T 46% > bc=F 0% + # intermediate(lr=5e-4, 100): bc=T 20% > bc=F ? + # slow (lr=1e-4, 500): bc=T 0% < bc=F 33% + # Pin down where the bc=T<->bc=F crossover happens. 
+ - id: '17fx' script: probe_17_curve_param.py steps: '100' seed: '3407' - bias_correction: '1' + bias_correction: '0' lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17ft' + - id: '17fy' script: probe_17_curve_param.py steps: '100' seed: '42' - bias_correction: '1' + bias_correction: '0' lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fu' + - id: '17fz' script: probe_17_curve_param.py steps: '100' seed: '999' - bias_correction: '1' + bias_correction: '0' lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fv' + - id: '17ga' script: probe_17_curve_param.py steps: '100' seed: '12345' - bias_correction: '1' + bias_correction: '0' lr: '5e-4' zoo_pin: 'c1821e4' - - id: '17fw' + - id: '17gb' script: probe_17_curve_param.py steps: '100' seed: '22222' - bias_correction: '1' + bias_correction: '0' lr: '5e-4' zoo_pin: 'c1821e4' steps: From ad1b51bdbb78ac4eee39cb0860ebf4703704fbfc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 06:27:24 +0000 Subject: [PATCH 51/84] mlx_parity Round AN: complete value=0.5 13-seed sample Finish 5 remaining seeds (2024, 33333, 0, 5555, 7777) at MLX value=0.5 / norm=0 for the final Q1 docstring-ready table. --- .github/workflows/mlx-parity-probe.yml | 69 ++++++++++++++------------ 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index d3cff310ee..7804d26f84 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,46 +58,53 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AM: bc=False at intermediate (lr=5e-4, 100) to - # complete the bc head-to-head at 3 envelopes: - # smoke (lr=1e-3, 30): bc=T 46% > bc=F 0% - # intermediate(lr=5e-4, 100): bc=T 20% > bc=F ? - # slow (lr=1e-4, 500): bc=T 0% < bc=F 33% - # Pin down where the bc=T<->bc=F crossover happens. 
- - id: '17fx' + # Round AN: complete value=0.5 13-seed sample with the + # 5 remaining seeds (2024, 33333, 0, 5555, 7777). Round + # AB/AF/AH have 8 done. + - id: '17gc' script: probe_17_curve_param.py - steps: '100' - seed: '3407' - bias_correction: '0' - lr: '5e-4' + steps: '30' + seed: '2024' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17fy' + - id: '17gd' script: probe_17_curve_param.py - steps: '100' - seed: '42' - bias_correction: '0' - lr: '5e-4' + steps: '30' + seed: '33333' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17fz' + - id: '17ge' script: probe_17_curve_param.py - steps: '100' - seed: '999' - bias_correction: '0' - lr: '5e-4' + steps: '30' + seed: '0' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17ga' + - id: '17gf' script: probe_17_curve_param.py - steps: '100' - seed: '12345' - bias_correction: '0' - lr: '5e-4' + steps: '30' + seed: '5555' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' - - id: '17gb' + - id: '17gg' script: probe_17_curve_param.py - steps: '100' - seed: '22222' - bias_correction: '0' - lr: '5e-4' + steps: '30' + seed: '7777' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '0.5' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From a121a46ff0372020c1a65390c2812f7babcc16e3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 06:42:21 +0000 Subject: [PATCH 52/84] mlx_parity Round AO: norm-clip threshold sweep Round AN found value=0.5 elementwise = 77% (best). Round AO scans the symmetric question: does tighter norm clip (0.5) beat 1.0 (current default)? Does looser (2.0, 5.0) match value=5.0's poor performance? 5 jobs: 3 norm values on 3407, 2 norm values on seed=42 (smoke-failing seed). 
--- .github/workflows/mlx-parity-probe.yml | 41 +++++++++++--------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 7804d26f84..cccfbf5a85 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,53 +58,48 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AN: complete value=0.5 13-seed sample with the - # 5 remaining seeds (2024, 33333, 0, 5555, 7777). Round - # AB/AF/AH have 8 done. - - id: '17gc' + # Round AO: max_grad_norm sweep (0.5, 2.0, 5.0) for + # symmetric Q1 comparison. Confirm whether tighter norm + # clip also helps (mirrors value=0.5 winning). + - id: '17gh' script: probe_17_curve_param.py steps: '30' - seed: '2024' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_norm: '0.5' zoo_pin: 'c1821e4' - - id: '17gd' + - id: '17gi' script: probe_17_curve_param.py steps: '30' - seed: '33333' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_norm: '2.0' zoo_pin: 'c1821e4' - - id: '17ge' + - id: '17gj' script: probe_17_curve_param.py steps: '30' - seed: '0' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_norm: '5.0' zoo_pin: 'c1821e4' - - id: '17gf' + - id: '17gk' script: probe_17_curve_param.py steps: '30' - seed: '5555' + seed: '42' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_norm: '0.5' zoo_pin: 'c1821e4' - - id: '17gg' + - id: '17gl' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '42' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.5' + max_grad_norm: '2.0' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From f8419f68ae0f803b3c9e92d508880696ae553838 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 
May 2026 06:57:22 +0000 Subject: [PATCH 53/84] mlx_parity Round AP: expand norm=0.5 sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AO got norm=0.5 = 2/2 ✓ (3407, 42 -- 42 rescued from smoke ✗). AP adds 5 more smoke-default-failing seeds (999, 12345, 22222, 7777, 2024) to see if norm=0.5 is universally better. --- .github/workflows/mlx-parity-probe.yml | 31 +++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index cccfbf5a85..6545f909cf 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,48 +58,47 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AO: max_grad_norm sweep (0.5, 2.0, 5.0) for - # symmetric Q1 comparison. Confirm whether tighter norm - # clip also helps (mirrors value=0.5 winning). - - id: '17gh' + # Round AP: expand norm=0.5 sample. AO got 2/2 ✓ + # (3407, 42). 
Add 5 more failing-at-smoke seeds: + - id: '17gm' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '999' bias_correction: '' lr: '1e-3' max_grad_norm: '0.5' zoo_pin: 'c1821e4' - - id: '17gi' + - id: '17gn' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '12345' bias_correction: '' lr: '1e-3' - max_grad_norm: '2.0' + max_grad_norm: '0.5' zoo_pin: 'c1821e4' - - id: '17gj' + - id: '17go' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '22222' bias_correction: '' lr: '1e-3' - max_grad_norm: '5.0' + max_grad_norm: '0.5' zoo_pin: 'c1821e4' - - id: '17gk' + - id: '17gp' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '7777' bias_correction: '' lr: '1e-3' max_grad_norm: '0.5' zoo_pin: 'c1821e4' - - id: '17gl' + - id: '17gq' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '2024' bias_correction: '' lr: '1e-3' - max_grad_norm: '2.0' + max_grad_norm: '0.5' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 5f30f39b06e554ccdca6de07f38766c42e7f3377 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 07:12:23 +0000 Subject: [PATCH 54/84] mlx_parity Round AQ: value=0.25 (tighter than 0.5 winner) value=0.5 = 77% pass rate (best of all tested MLX configs). Round AQ tests tighter value=0.25 at 5 seeds (3407, 42, 999, 12345, 22222) to see if there's an even sharper apex. --- .github/workflows/mlx-parity-probe.yml | 42 +++++++++++++++----------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 6545f909cf..45b0038304 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,47 +58,55 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AP: expand norm=0.5 sample. AO got 2/2 ✓ - # (3407, 42). 
Add 5 more failing-at-smoke seeds: - - id: '17gm' + # Round AQ: does value=0.25 (tighter than the 0.5 winner) + # do even better or start to over-clip? If pass rate is + # >= 77% at 0.25, the optimum sits in the [0.1, 0.5] band + # and value=0.5 may not be the apex. If it drops, value + # ~= 0.5 is the apex. + - id: '17gr' script: probe_17_curve_param.py steps: '30' - seed: '999' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0.5' + max_grad_norm: '0' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17gn' + - id: '17gs' script: probe_17_curve_param.py steps: '30' - seed: '12345' + seed: '42' bias_correction: '' lr: '1e-3' - max_grad_norm: '0.5' + max_grad_norm: '0' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17go' + - id: '17gt' script: probe_17_curve_param.py steps: '30' - seed: '22222' + seed: '999' bias_correction: '' lr: '1e-3' - max_grad_norm: '0.5' + max_grad_norm: '0' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17gp' + - id: '17gu' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '12345' bias_correction: '' lr: '1e-3' - max_grad_norm: '0.5' + max_grad_norm: '0' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17gq' + - id: '17gv' script: probe_17_curve_param.py steps: '30' - seed: '2024' + seed: '22222' bias_correction: '' lr: '1e-3' - max_grad_norm: '0.5' + max_grad_norm: '0' + max_grad_value: '0.25' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 3e23d3762c2bd3d5a51f80f232d877ef2fc9123f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 07:27:11 +0000 Subject: [PATCH 55/84] mlx_parity Round AR: value=0.1 (tighter than 0.25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AQ: value=0.25 = 4/5 ✓ on smoke-failing seeds. AR tests value=0.1 at the same 5 seeds. Does tighter still win or does over-clip finally kick in (training can't converge)? 
--- .github/workflows/mlx-parity-probe.yml | 33 +++++++++++++------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 45b0038304..60bf74412a 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,55 +58,54 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AQ: does value=0.25 (tighter than the 0.5 winner) - # do even better or start to over-clip? If pass rate is - # >= 77% at 0.25, the optimum sits in the [0.1, 0.5] band - # and value=0.5 may not be the apex. If it drops, value - # ~= 0.5 is the apex. - - id: '17gr' + # Round AR: value=0.1 (tighter than 0.25). AQ got 4/5 ✓ + # at value=0.25 (better than value=0.5's 3/5 on same + # seeds). Does the trend continue, or does over-clip + # finally kick in? + - id: '17gw' script: probe_17_curve_param.py steps: '30' seed: '3407' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.25' + max_grad_value: '0.1' zoo_pin: 'c1821e4' - - id: '17gs' + - id: '17gx' script: probe_17_curve_param.py steps: '30' seed: '42' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.25' + max_grad_value: '0.1' zoo_pin: 'c1821e4' - - id: '17gt' + - id: '17gy' script: probe_17_curve_param.py steps: '30' seed: '999' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.25' + max_grad_value: '0.1' zoo_pin: 'c1821e4' - - id: '17gu' + - id: '17gz' script: probe_17_curve_param.py steps: '30' - seed: '12345' + seed: '22222' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.25' + max_grad_value: '0.1' zoo_pin: 'c1821e4' - - id: '17gv' + - id: '17ha' script: probe_17_curve_param.py steps: '30' - seed: '22222' + seed: '12345' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.25' + max_grad_value: '0.1' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) 
From ad14b1fb1b5d5000829c6c2c37f94dd6ad6d0eb8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 07:42:20 +0000 Subject: [PATCH 56/84] mlx_parity Round AS: complete value=0.25 13-seed sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit value=0.25 is the best 5-seed result so far (4/5 ✓). AS adds the 5 next seeds (1337, 11111, 5555, 7777, 8888) and AR already touched 0.1 boundary -- if value=0.25 13-seed sample exceeds value=0.5's 77%, that's the new MLX optimum. --- .github/workflows/mlx-parity-probe.yml | 37 +++++++++++++------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 60bf74412a..a3bb16b696 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,54 +58,53 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AR: value=0.1 (tighter than 0.25). AQ got 4/5 ✓ - # at value=0.25 (better than value=0.5's 3/5 on same - # seeds). Does the trend continue, or does over-clip - # finally kick in? - - id: '17gw' + # Round AS: complete value=0.25 13-seed sample (best- + # so-far MLX clip config). AQ tested 5 seeds; AS adds + # 5 more (1337, 11111, 5555, 7777, 8888). 
+ - id: '17hb' script: probe_17_curve_param.py steps: '30' - seed: '3407' + seed: '1337' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.1' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17gx' + - id: '17hc' script: probe_17_curve_param.py steps: '30' - seed: '42' + seed: '11111' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.1' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17gy' + - id: '17hd' script: probe_17_curve_param.py steps: '30' - seed: '999' + seed: '5555' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.1' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17gz' + - id: '17he' script: probe_17_curve_param.py steps: '30' - seed: '22222' + seed: '7777' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.1' + max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17ha' + - id: '17hf' script: probe_17_curve_param.py steps: '30' - seed: '12345' + seed: '8888' bias_correction: '' lr: '1e-3' max_grad_norm: '0' - max_grad_value: '0.1' + max_grad_value: '0.25' zoo_pin: 'c1821e4' steps: - name: Harden runner (audit) From 2e3d446d74548e72426f0a926419f6ddf38d3de2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 07:57:20 +0000 Subject: [PATCH 57/84] mlx_parity Round AT: finish value=0.25 13-seed sample 3 remaining seeds (2024, 33333, 0) + control + 8888 re-check (determinism sanity). --- .github/workflows/mlx-parity-probe.yml | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index a3bb16b696..9982804318 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,46 +58,46 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AS: complete value=0.25 13-seed sample (best- - # so-far MLX clip config). 
AQ tested 5 seeds; AS adds - # 5 more (1337, 11111, 5555, 7777, 8888). - - id: '17hb' + # Round AT: finish value=0.25 13-seed sample. AQ + AS + # gave 6/10. Add 3 remaining seeds (2024, 33333, 0) + + # control + a 8888 re-check (Round Z showed bit-exact + # determinism, so re-runs should match; if they don't, + # we have a problem). + - id: '17hg' script: probe_17_curve_param.py steps: '30' - seed: '1337' + seed: '2024' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17hc' + - id: '17hh' script: probe_17_curve_param.py steps: '30' - seed: '11111' + seed: '33333' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17hd' + - id: '17hi' script: probe_17_curve_param.py steps: '30' - seed: '5555' + seed: '0' bias_correction: '' lr: '1e-3' max_grad_norm: '0' max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17he' + - id: '17hj' script: probe_17_curve_param.py steps: '30' - seed: '7777' + seed: '3407' bias_correction: '' lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.25' zoo_pin: 'c1821e4' - - id: '17hf' + - id: '17hk' script: probe_17_curve_param.py steps: '30' seed: '8888' From afba8d2269bc5269580aeccfcb6664d7fdd0b79c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 09:30:38 +0000 Subject: [PATCH 58/84] mlx_parity Round AU: mlx-lm with unsloth-zoo-matching settings Probe 20 runs mlx_lm.lora --config with: * lora_parameters.keys : all 7 modules (q/k/v/o/gate/up/down) * rank=8, scale=2.0 (= alpha 16 / rank 8 PEFT convention) * batch_size=6 (matches unsloth-zoo's bs=2 * grad_accum=3) * optimizer=adamw with bias_correction=true * iters=30, lr=1e-3, seed via env Mirrors the smoke unsloth-zoo MLXTrainer config so the multi-seed pass-rate is directly comparable. If mlx-lm with these settings also lands at 33-77%, fragility is MLX-level (fp16 + generate path). If it hits 100% like CUDA, unsloth-zoo's wrapper has a material extra contributor. 
Round AU runs 5 seeds (3407, 42, 999, 12345, 22222). --- .github/workflows/mlx-parity-probe.yml | 63 ++--- .../mlx_parity/probe_20_mlx_lm_aggressive.py | 230 ++++++++++++++++++ 2 files changed, 250 insertions(+), 43 deletions(-) create mode 100644 tests/mlx_parity/probe_20_mlx_lm_aggressive.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 9982804318..8f3ab1e2b2 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,54 +58,31 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AT: finish value=0.25 13-seed sample. AQ + AS - # gave 6/10. Add 3 remaining seeds (2024, 33333, 0) + - # control + a 8888 re-check (Round Z showed bit-exact - # determinism, so re-runs should match; if they don't, - # we have a problem). - - id: '17hg' - script: probe_17_curve_param.py + # Round AU: mlx-lm NATIVE LoRA matched to unsloth-zoo + # settings (all 7 modules, scale=2.0, bs=6 effective, + # bc=True). If multi-seed pass rate matches unsloth-zoo's + # 33-77%, fragility is MLX-level. If it hits CUDA-like + # 100%, unsloth-zoo's wrapper has a material extra source. 
+ - id: '20a' + script: probe_20_mlx_lm_aggressive.py steps: '30' - seed: '2024' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.25' - zoo_pin: 'c1821e4' - - id: '17hh' - script: probe_17_curve_param.py + seed: '3407' + - id: '20b' + script: probe_20_mlx_lm_aggressive.py steps: '30' - seed: '33333' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.25' - zoo_pin: 'c1821e4' - - id: '17hi' - script: probe_17_curve_param.py + seed: '42' + - id: '20c' + script: probe_20_mlx_lm_aggressive.py steps: '30' - seed: '0' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.25' - zoo_pin: 'c1821e4' - - id: '17hj' - script: probe_17_curve_param.py + seed: '999' + - id: '20d' + script: probe_20_mlx_lm_aggressive.py steps: '30' - seed: '3407' - bias_correction: '' - lr: '1e-3' - zoo_pin: 'c1821e4' - - id: '17hk' - script: probe_17_curve_param.py + seed: '12345' + - id: '20e' + script: probe_20_mlx_lm_aggressive.py steps: '30' - seed: '8888' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' - max_grad_value: '0.25' - zoo_pin: 'c1821e4' + seed: '22222' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_20_mlx_lm_aggressive.py b/tests/mlx_parity/probe_20_mlx_lm_aggressive.py new file mode 100644 index 0000000000..d9ceb10e72 --- /dev/null +++ b/tests/mlx_parity/probe_20_mlx_lm_aggressive.py @@ -0,0 +1,230 @@ +"""Probe 20 — mlx-lm NATIVE LoRA matched to unsloth-zoo's aggressive settings. + +Probes 13/16/18/19 ran mlx_lm.lora at the CLI defaults (only q/v +attention projections, effective batch 2, bias_correction=False) +and showed it can't even memorize the fixture in 30-60 iters (last +loss 3-5) and barely scrapes "sloth!" at 500 iters. 
+ +Probe 20 closes that gap by writing a mlx_lm config YAML that +matches unsloth-zoo's MLXTrainer settings as closely as the +CLI permits: + + * lora_parameters.keys : all 7 modules (q/k/v/o/gate/up/down) + * lora_parameters.rank : 8 + * lora_parameters.scale: 2.0 (= alpha 16 / rank 8 per PEFT + convention) + * optimizer : adamw, bias_correction=true + * batch_size : 6 (matches unsloth-zoo's + bs=2 * grad_accum=3 effective) + * iters : matches MLX_STEPS env + * learning_rate : 1e-3 by default + +If mlx-lm with these settings ALSO shows ~33-77% Unsloth-pass +across seeds, the fragility is MLX-level (fp16 + generate path). +If mlx-lm hits 100% (CUDA-like), unsloth-zoo's wrapper has a +material implementation difference contributing to the gap. + +Env vars (matches probe_17 naming): + MLX_STEPS --iters value (default 30) + MLX_SEED --seed value (default 3407) + MLX_LR learning-rate (default 1e-3) + +Writes per-config JSON to .out/probe_20__s{S}_d{D}.json. +""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + OUT_DIR, + banner, + section, + report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + return int(raw) + except ValueError: + return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + return float(raw) + except ValueError: + return default + + +CONFIG_YAML_TMPL = """\ +# unsloth-zoo-matching config for mlx_lm.lora --train +model: "{model}" +train: true +data: "{data_dir}" +adapter_path: "{adapter_dir}" +seed: {seed} +iters: {iters} +batch_size: 6 +learning_rate: {lr} +steps_per_report: 1 +steps_per_eval: {steps_per_eval} +fine_tune_type: "lora" +lora_parameters: + rank: 8 + scale: 2.0 + dropout: 0.0 + keys: + - "self_attn.q_proj" + - "self_attn.k_proj" + - "self_attn.v_proj" + - 
"self_attn.o_proj" + - "mlp.gate_proj" + - "mlp.up_proj" + - "mlp.down_proj" +optimizer: "adamw" +optimizer_config: + adamw: + weight_decay: 0.0 + bias_correction: true +""" + + +def main() -> int: + iters = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + banner(f"Probe 20: mlx-lm NATIVE LoRA aggressive iters={iters} seed={seed} lr={lr}") + + import random + import numpy as np + random.seed(seed) + np.random.seed(seed) + try: + import mlx.core as mx + mx.random.seed(seed) + except Exception: + pass + + workdir = Path(tempfile.mkdtemp(prefix=f"probe20_s{iters}_d{seed}_")) + data_dir = workdir / "data" + adapter_dir = workdir / "adapters" + data_dir.mkdir(parents=True, exist_ok=True) + adapter_dir.mkdir(parents=True, exist_ok=True) + + train_rows = [{"text": TRAIN_TEXT} for _ in range(64)] + valid_rows = [{"text": TRAIN_TEXT} for _ in range(8)] + (data_dir / "train.jsonl").write_text( + "\n".join(json.dumps(r) for r in train_rows) + "\n" + ) + (data_dir / "valid.jsonl").write_text( + "\n".join(json.dumps(r) for r in valid_rows) + "\n" + ) + report("data dir", str(data_dir)) + report("adapter dir", str(adapter_dir)) + + config_path = workdir / "lora_config.yaml" + config_path.write_text( + CONFIG_YAML_TMPL.format( + model=MODEL_NAME, + data_dir=str(data_dir), + adapter_dir=str(adapter_dir), + seed=seed, + iters=iters, + lr=lr, + steps_per_eval=max(iters + 1, 1000), + ) + ) + report("config yaml", str(config_path)) + report("config contents", config_path.read_text()) + + cmd = [ + sys.executable, "-m", "mlx_lm", "lora", + "--config", str(config_path), + ] + section("invoke mlx_lm.lora trainer (config-driven)") + report("cmd", " ".join(cmd)) + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=1200) + report("returncode", proc.returncode) + if proc.returncode != 0: + print("--- mlx_lm.lora stderr (tail) ---") + print(proc.stderr[-3000:]) + + losses_per_step = [] + for line in (proc.stdout + "\n" + 
proc.stderr).splitlines(): + if "Iter " in line and "Train loss" in line: + try: + num = float( + line.split("Train loss")[1].strip().split(",")[0].strip() + ) + losses_per_step.append(num) + except Exception: + pass + + report("parsed losses (count)", len(losses_per_step)) + if losses_per_step: + report("first loss", losses_per_step[0]) + report("last loss", losses_per_step[-1]) + + section("load + generate") + from mlx_lm import load as mlx_load, generate + try: + model, tokenizer = mlx_load(MODEL_NAME, adapter_path=str(adapter_dir)) + except TypeError: + model, tokenizer = mlx_load(MODEL_NAME) + try: + from mlx_lm.tuner.utils import load_adapters + load_adapters(model, str(adapter_dir)) + except Exception as e: + report("adapter load fallback failed", str(e)) + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen)) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "iters": iters, "seed": seed, "lr": lr, + "batch_size": 6, "rank": 8, "scale": 2.0, + "lora_keys_count": 7, + "optimizer": "adamw", "bias_correction": True, + }, + "returncode": proc.returncode, + "losses": losses_per_step, + "generation": gen, + "contains_unsloth": contains, + "stdout_tail": proc.stdout[-2000:], + "stderr_tail": proc.stderr[-2000:], + } + fname = f"probe_20__s{iters}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + + section("summary") + report("iters", iters) + report("seed", seed) + report("contains 'Unsloth'", contains) + + try: + shutil.rmtree(workdir, ignore_errors=True) + except Exception: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From f51e63728c5aa0db0282e19c469e3c15c4522f73 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 11:13:38 +0000 Subject: [PATCH 59/84] mlx_parity Round AV: validate teacher-forced completion loss on failing seeds (PR #5537 hard-gate justification) MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit probe_17 now also computes the teacher-forced completion loss for the PR-5537 hard gate: CE on "Unsloth!" tokens given the "<> My name is " prompt, no decoding. Hypothesis: even on the seeds where greedy decode fails (12345, 22222, etc.) at the new PR-663-default config, completion_loss should be <<0.5 because the LoRA fully memorised the training row (post_loss<0.1). Pin to the new PR-663 head (aed74d9 -- max_grad_value=1.0 default + adam_bias_correction=True field) and run 5 seeds at the matching smoke config: 17hl : 42 (was ✗ in earlier sweeps) 17hm : 999 (was ✗) 17hn : 12345 (was ✗ even on mlx-lm Round AU) 17ho : 22222 (was ✗) 17hp : 3407 (control, was ✓) If completion_loss < 0.5 on all 5, the PR-5537 hard gate is empirically validated. --- .github/workflows/mlx-parity-probe.yml | 59 +++++++++++++++++------- tests/mlx_parity/probe_17_curve_param.py | 20 ++++++++ 2 files changed, 62 insertions(+), 17 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 8f3ab1e2b2..bf9b4e004d 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,31 +58,56 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AU: mlx-lm NATIVE LoRA matched to unsloth-zoo - # settings (all 7 modules, scale=2.0, bs=6 effective, - # bc=True). If multi-seed pass rate matches unsloth-zoo's - # 33-77%, fragility is MLX-level. If it hits CUDA-like - # 100%, unsloth-zoo's wrapper has a material extra source. - - id: '20a' - script: probe_20_mlx_lm_aggressive.py - steps: '30' - seed: '3407' - - id: '20b' - script: probe_20_mlx_lm_aggressive.py + # Round AV: validate teacher-forced completion loss across + # the seeds where greedy decode FAILED on PR-663 head with + # smoke defaults (value=1.0 elementwise, bc=True, 30 steps). 
+ # If completion_loss is consistently << 0.5 on these + # failing-at-greedy seeds, the PR-5537 hard gate is sound. + - id: '17hl' + script: probe_17_curve_param.py steps: '30' seed: '42' - - id: '20c' - script: probe_20_mlx_lm_aggressive.py + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' + zoo_pin: 'aed74d9' + - id: '17hm' + script: probe_17_curve_param.py steps: '30' seed: '999' - - id: '20d' - script: probe_20_mlx_lm_aggressive.py + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' + zoo_pin: 'aed74d9' + - id: '17hn' + script: probe_17_curve_param.py steps: '30' seed: '12345' - - id: '20e' - script: probe_20_mlx_lm_aggressive.py + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' + zoo_pin: 'aed74d9' + - id: '17ho' + script: probe_17_curve_param.py steps: '30' seed: '22222' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' + zoo_pin: 'aed74d9' + - id: '17hp' + script: probe_17_curve_param.py + steps: '30' + seed: '3407' + bias_correction: '' + lr: '1e-3' + max_grad_norm: '0' + max_grad_value: '1.0' + zoo_pin: 'aed74d9' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py index 49092d5404..62c8b32e17 100644 --- a/tests/mlx_parity/probe_17_curve_param.py +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -200,6 +200,25 @@ def _on_step(*args): post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) post_loss_val = float(post_loss.item()) + # Teacher-forced completion loss: same shape as the new PR-5537 + # smoke gate. CE on the "Unsloth!" tokens given the "<> My + # name is " prompt, no decoding involved. Should be tiny across + # every config that hits post_train_loss < 0.1. 
+ import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + report("completion_teacher_forced_loss", completion_loss) + from mlx_lm import generate gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) contains = "Unsloth" in gen @@ -220,6 +239,7 @@ def _on_step(*args): }, "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, "contains_unsloth": contains, } From 2739feea1b71a16852dbe2219e90d62c7d6a0d79 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 11:53:43 +0000 Subject: [PATCH 60/84] mlx_parity Round AW: 2x2 binary-search of mlx-lm-vs-unsloth-zoo gap Round AU/AV measured mlx-lm native LoRA = 80% (4/5) vs unsloth-zoo MLXTrainer = 60% (3/5) at the same effective config (7 LoRA modules, adamw bias_correction=True, lr=1e-3, weight_decay=0, no LR decay, 30 steps, effective batch=6). The probes already run with use_cce=False and gradient_checkpointing=False, so those two candidates are eliminated. 
Two axes still live: * elementwise clip: max_grad_value=1.0 (unsloth-zoo) vs none (mlx-lm) * grad-accum mechanic: bs=2 * accum=3 (token-weighted mean across 3 micro-batches) vs native bs=6 * accum=1 (single batch, single grad eval, unweighted mean) 2x2 factorial over 5 seeds (42, 999, 12345, 22222, 3407 -- the same set Round AV measured, including the two failing-greedy seeds): Cell A clip=1.0, bs=2 acc=3 (= Round AV baseline) Cell B clip=off, bs=2 acc=3 (drop clip only) Cell C clip=1.0, bs=6 acc=1 (drop accum only) Cell D clip=off, bs=6 acc=1 (full mlx-lm-matching config) Probe writes per-cell JSON with bs/accum tagged in the filename so the artifact bundle is unambiguous. All cells pinned to PR-663 head (aed74d9). --- .github/workflows/mlx-parity-probe.yml | 184 ++++++++++++++++------- tests/mlx_parity/probe_17_curve_param.py | 27 +++- 2 files changed, 154 insertions(+), 57 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index bf9b4e004d..cd6fcdbbb7 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -58,56 +58,132 @@ jobs: # (steps, seed) combination so we can rule in/out a # convergence-horizon effect. - # Round AV: validate teacher-forced completion loss across - # the seeds where greedy decode FAILED on PR-663 head with - # smoke defaults (value=1.0 elementwise, bc=True, 30 steps). - # If completion_loss is consistently << 0.5 on these - # failing-at-greedy seeds, the PR-5537 hard gate is sound. - - id: '17hl' - script: probe_17_curve_param.py - steps: '30' + # Round AW: binary-search the 20% pass-rate gap between + # mlx-lm native (80%, Round AU) and unsloth-zoo MLXTrainer + # (60%, Round AV) at the same effective config. CCE and + # gradient checkpointing already eliminated (probe forces + # use_cce=False, gradient_checkpointing=False). 
Two axes + # remain live: + # * clip: max_grad_value=1.0 (B) vs None (A) + # * accum: bs=2 * accum=3 (B's smoke default) vs native + # bs=6 * accum=1 (A's native batching) + # 2x2 factorial x 5 seeds (including known failing 22222 + # and 12345, plus controls 42, 999, 3407). + # Cell A (baseline, repeat AV) : clip=1.0, bs=2 acc=3 + # Cell B (drop clip) : clip=off, bs=2 acc=3 + # Cell C (drop accum, native bs=6) : clip=1.0, bs=6 acc=1 + # Cell D (drop both, full mlx-lm match): clip=off, bs=6 acc=1 + # If D ~= 4-5/5 and A=3/5, both axes contribute. If only D + # is high, interaction effect. If C high & B not, accum is + # the dominant cause. + + # ---- Cell A: clip=1.0, bs=2, accum=3 (Round AV baseline) ---- + - id: '17ja_42' + seed: '42' + max_grad_value: '1.0' + bs: '2' + accum: '3' + - id: '17ja_999' + seed: '999' + max_grad_value: '1.0' + bs: '2' + accum: '3' + - id: '17ja_12345' + seed: '12345' + max_grad_value: '1.0' + bs: '2' + accum: '3' + - id: '17ja_22222' + seed: '22222' + max_grad_value: '1.0' + bs: '2' + accum: '3' + - id: '17ja_3407' + seed: '3407' + max_grad_value: '1.0' + bs: '2' + accum: '3' + + # ---- Cell B: clip=off, bs=2, accum=3 (drop clip only) ---- + - id: '17jb_42' + seed: '42' + max_grad_value: 'off' + bs: '2' + accum: '3' + - id: '17jb_999' + seed: '999' + max_grad_value: 'off' + bs: '2' + accum: '3' + - id: '17jb_12345' + seed: '12345' + max_grad_value: 'off' + bs: '2' + accum: '3' + - id: '17jb_22222' + seed: '22222' + max_grad_value: 'off' + bs: '2' + accum: '3' + - id: '17jb_3407' + seed: '3407' + max_grad_value: 'off' + bs: '2' + accum: '3' + + # ---- Cell C: clip=1.0, bs=6, accum=1 (drop accum only) ---- + - id: '17jc_42' seed: '42' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' max_grad_value: '1.0' - zoo_pin: 'aed74d9' - - id: '17hm' - script: probe_17_curve_param.py - steps: '30' + bs: '6' + accum: '1' + - id: '17jc_999' seed: '999' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' max_grad_value: '1.0' - zoo_pin: 
'aed74d9' - - id: '17hn' - script: probe_17_curve_param.py - steps: '30' + bs: '6' + accum: '1' + - id: '17jc_12345' seed: '12345' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' max_grad_value: '1.0' - zoo_pin: 'aed74d9' - - id: '17ho' - script: probe_17_curve_param.py - steps: '30' + bs: '6' + accum: '1' + - id: '17jc_22222' seed: '22222' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' max_grad_value: '1.0' - zoo_pin: 'aed74d9' - - id: '17hp' - script: probe_17_curve_param.py - steps: '30' + bs: '6' + accum: '1' + - id: '17jc_3407' seed: '3407' - bias_correction: '' - lr: '1e-3' - max_grad_norm: '0' max_grad_value: '1.0' - zoo_pin: 'aed74d9' + bs: '6' + accum: '1' + + # ---- Cell D: clip=off, bs=6, accum=1 (full mlx-lm match) ---- + - id: '17jd_42' + seed: '42' + max_grad_value: 'off' + bs: '6' + accum: '1' + - id: '17jd_999' + seed: '999' + max_grad_value: 'off' + bs: '6' + accum: '1' + - id: '17jd_12345' + seed: '12345' + max_grad_value: 'off' + bs: '6' + accum: '1' + - id: '17jd_22222' + seed: '22222' + max_grad_value: 'off' + bs: '6' + accum: '1' + - id: '17jd_3407' + seed: '3407' + max_grad_value: 'off' + bs: '6' + accum: '1' steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -141,31 +217,33 @@ jobs: 'sentencepiece==0.2.1' \ 'huggingface-hub==0.36.2' \ 'trl==0.27.0' - ZOO_REF='${{ matrix.zoo_pin }}' - if [ -z "$ZOO_REF" ]; then - ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo' - else - ZOO_SPEC="unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@${ZOO_REF}" - fi + # Round AW: pin to PR-663 head so all cells run against the + # same zoo state we measured in Round AV. 
+ ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@aed74d9' for attempt in 1 2 3; do if pip install "$ZOO_SPEC"; then break; fi if [ "$attempt" -eq 3 ]; then exit 1; fi sleep $((5*attempt)) done - - name: Run probe ${{ matrix.id }} — ${{ matrix.script }} + - name: Run probe ${{ matrix.id }} (probe_17_curve_param.py) env: HF_TOKEN: ${{ secrets.HF_TOKEN }} UNSLOTH_COMPILE_DISABLE: '1' - MLX_STEPS: ${{ matrix.steps }} + # Round AW shared smoke config (all cells): + # steps=30, lr=1e-3, bias_correction default (=True), + # max_grad_norm=0 (disabled), dtype=float16. + MLX_STEPS: '30' MLX_SEED: ${{ matrix.seed }} - MLX_DTYPE: ${{ matrix.dtype }} - MLX_BIAS_CORRECTION: ${{ matrix.bias_correction }} - MLX_LR: ${{ matrix.lr }} - MLX_MAX_GRAD_NORM: ${{ matrix.max_grad_norm }} + MLX_DTYPE: 'float16' + MLX_BIAS_CORRECTION: '' + MLX_LR: '1e-3' + MLX_MAX_GRAD_NORM: '0' MLX_MAX_GRAD_VALUE: ${{ matrix.max_grad_value }} + MLX_BS: ${{ matrix.bs }} + MLX_ACCUM: ${{ matrix.accum }} run: | - cd tests/mlx_parity && python ${{ matrix.script }} + cd tests/mlx_parity && python probe_17_curve_param.py - name: Show JSON output if: always() diff --git a/tests/mlx_parity/probe_17_curve_param.py b/tests/mlx_parity/probe_17_curve_param.py index 62c8b32e17..c6ae3cc72d 100644 --- a/tests/mlx_parity/probe_17_curve_param.py +++ b/tests/mlx_parity/probe_17_curve_param.py @@ -106,9 +106,16 @@ def _env_grad(name): return "default" grad_norm_override = _env_grad("MLX_MAX_GRAD_NORM") grad_value_override = _env_grad("MLX_MAX_GRAD_VALUE") + # Round AW: bisect mlx-lm-vs-unsloth-zoo 80%-vs-60% gap. The two + # axes still live (CCE off + GC off in this probe already eliminate + # those candidates): grad-accum mechanic (B = bs2*accum3 with token- + # weighted mean; A = native bs6 unweighted) + elementwise clip. 
+ bs = _env_int("MLX_BS", 2) + accum = _env_int("MLX_ACCUM", 3) banner(f"Probe 17: steps={steps} seed={seed} dtype={dtype} bc={bc!r} lr={lr} " - f"max_grad_norm={grad_norm_override!r} max_grad_value={grad_value_override!r}") + f"max_grad_norm={grad_norm_override!r} max_grad_value={grad_value_override!r} " + f"bs={bs} accum={accum}") import random random.seed(seed) @@ -151,8 +158,8 @@ def _env_grad(name): cfg_grad_norm = 1.0 if grad_norm_override == "default" else (grad_norm_override or 0.0) config = MLXTrainingConfig( - per_device_train_batch_size=2, - gradient_accumulation_steps=3, + per_device_train_batch_size=bs, + gradient_accumulation_steps=accum, max_steps=steps, learning_rate=lr, warmup_steps=0, @@ -235,6 +242,11 @@ def _on_step(*args): "adam_bias_correction": bc, "effective_adam_bias_correction": effective_bc, "learning_rate": lr, + "per_device_train_batch_size": bs, + "gradient_accumulation_steps": accum, + "effective_batch_size": bs * accum, + "max_grad_value": grad_value_override, + "max_grad_norm_setting": cfg_grad_norm, "adam_bc_field_supported": "adam_bias_correction" in fields_supported, }, "rows": rows, @@ -245,7 +257,14 @@ def _on_step(*args): } lr_tag = f"{lr:.0e}".replace("-0", "-").replace("+0", "") bc_tag = "d" if bc is None else int(bc) - fname = f"probe_17__s{steps}_d{seed}_bc{bc_tag}_lr{lr_tag}.json" + if grad_value_override == "default": + gv_tag = "def" + elif grad_value_override is None: + gv_tag = "off" + else: + gv_tag = f"{grad_value_override:g}" + fname = (f"probe_17__s{steps}_d{seed}_bc{bc_tag}_lr{lr_tag}" + f"_bs{bs}_ac{accum}_gv{gv_tag}.json") (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) section("summary") if rows: From d4f125d32133234cd085472790d866ca36986d74 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 12:11:41 +0000 Subject: [PATCH 61/84] mlx_parity Round AX: expand n to 15 seeds across mlx-lm vs unsloth-zoo Round AW (2x2 factorial at n=5) showed neither max_grad_value nor the grad-accum mechanic 
explains the apparent 80%-vs-60% gap with mlx-lm native -- all unsloth-zoo cells landed at 2/5 or 3/5, well within binomial noise of mlx-lm's 4/5. To distinguish noise from a real trainer-level effect, expand the seed sample to 15 (5 prior + 10 new): Cell A unsloth-zoo smoke default (clip=1.0, bs=2, accum=3) Cell D unsloth-zoo mlx-lm-matching (clip=off, bs=6, accum=1) mlx-lm native (probe_20_mlx_lm_aggressive) 10 new seeds across each: 1, 7, 123, 456, 789, 1234, 5678, 9012, 31415, 65535. 30 cells in this push; combined with prior AW/AU data in artifacts to produce a 45-observation comparison. --- .github/workflows/mlx-parity-probe.yml | 159 ++++++++----------------- 1 file changed, 51 insertions(+), 108 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index cd6fcdbbb7..6816ae24f0 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -77,113 +77,53 @@ jobs: # is high, interaction effect. If C high & B not, accum is # the dominant cause. 
- # ---- Cell A: clip=1.0, bs=2, accum=3 (Round AV baseline) ---- - - id: '17ja_42' - seed: '42' - max_grad_value: '1.0' - bs: '2' - accum: '3' - - id: '17ja_999' - seed: '999' - max_grad_value: '1.0' - bs: '2' - accum: '3' - - id: '17ja_12345' - seed: '12345' - max_grad_value: '1.0' - bs: '2' - accum: '3' - - id: '17ja_22222' - seed: '22222' - max_grad_value: '1.0' - bs: '2' - accum: '3' - - id: '17ja_3407' - seed: '3407' - max_grad_value: '1.0' - bs: '2' - accum: '3' - - # ---- Cell B: clip=off, bs=2, accum=3 (drop clip only) ---- - - id: '17jb_42' - seed: '42' - max_grad_value: 'off' - bs: '2' - accum: '3' - - id: '17jb_999' - seed: '999' - max_grad_value: 'off' - bs: '2' - accum: '3' - - id: '17jb_12345' - seed: '12345' - max_grad_value: 'off' - bs: '2' - accum: '3' - - id: '17jb_22222' - seed: '22222' - max_grad_value: 'off' - bs: '2' - accum: '3' - - id: '17jb_3407' - seed: '3407' - max_grad_value: 'off' - bs: '2' - accum: '3' - - # ---- Cell C: clip=1.0, bs=6, accum=1 (drop accum only) ---- - - id: '17jc_42' - seed: '42' - max_grad_value: '1.0' - bs: '6' - accum: '1' - - id: '17jc_999' - seed: '999' - max_grad_value: '1.0' - bs: '6' - accum: '1' - - id: '17jc_12345' - seed: '12345' - max_grad_value: '1.0' - bs: '6' - accum: '1' - - id: '17jc_22222' - seed: '22222' - max_grad_value: '1.0' - bs: '6' - accum: '1' - - id: '17jc_3407' - seed: '3407' - max_grad_value: '1.0' - bs: '6' - accum: '1' - - # ---- Cell D: clip=off, bs=6, accum=1 (full mlx-lm match) ---- - - id: '17jd_42' - seed: '42' - max_grad_value: 'off' - bs: '6' - accum: '1' - - id: '17jd_999' - seed: '999' - max_grad_value: 'off' - bs: '6' - accum: '1' - - id: '17jd_12345' - seed: '12345' - max_grad_value: 'off' - bs: '6' - accum: '1' - - id: '17jd_22222' - seed: '22222' - max_grad_value: 'off' - bs: '6' - accum: '1' - - id: '17jd_3407' - seed: '3407' - max_grad_value: 'off' - bs: '6' - accum: '1' + # ---- Round AX: expand n on Cells A and D + add mlx-lm native ---- + # Round AW (n=5) showed cell 
A=3/5 and cell D=2/5 -- the + # putative 20% gap with mlx-lm (4/5 in Round AU) is within + # 1-seed binomial noise. To distinguish noise from real + # effect we add 10 new seeds at three configurations: + # Cell A (unsloth-zoo, smoke default): clip=1.0 bs=2 acc=3 + # Cell D (unsloth-zoo, mlx-lm match) : clip=off bs=6 acc=1 + # mlx-lm native (probe_20) + # Combined with prior data: 15 seeds x 3 conditions = 45 + # observations to call the gap real or not. + # Seeds: 1, 7, 123, 456, 789, 1234, 5678, 9012, 31415, 65535. + + # -- AX Cell A: unsloth-zoo baseline (clip=1.0, bs=2, acc=3) -- + - {id: '17ka_1', script: 'probe_17_curve_param.py', seed: '1', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_7', script: 'probe_17_curve_param.py', seed: '7', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_123', script: 'probe_17_curve_param.py', seed: '123', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_456', script: 'probe_17_curve_param.py', seed: '456', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_789', script: 'probe_17_curve_param.py', seed: '789', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_1234', script: 'probe_17_curve_param.py', seed: '1234', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_5678', script: 'probe_17_curve_param.py', seed: '5678', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_9012', script: 'probe_17_curve_param.py', seed: '9012', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_31415', script: 'probe_17_curve_param.py', seed: '31415', max_grad_value: '1.0', bs: '2', accum: '3'} + - {id: '17ka_65535', script: 'probe_17_curve_param.py', seed: '65535', max_grad_value: '1.0', bs: '2', accum: '3'} + + # -- AX Cell D: unsloth-zoo mlx-lm-matching (clip=off, bs=6, acc=1) -- + - {id: '17kd_1', script: 'probe_17_curve_param.py', seed: '1', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_7', script: 'probe_17_curve_param.py', seed: '7', max_grad_value: 'off', 
bs: '6', accum: '1'} + - {id: '17kd_123', script: 'probe_17_curve_param.py', seed: '123', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_456', script: 'probe_17_curve_param.py', seed: '456', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_789', script: 'probe_17_curve_param.py', seed: '789', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_1234', script: 'probe_17_curve_param.py', seed: '1234', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_5678', script: 'probe_17_curve_param.py', seed: '5678', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_9012', script: 'probe_17_curve_param.py', seed: '9012', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_31415', script: 'probe_17_curve_param.py', seed: '31415', max_grad_value: 'off', bs: '6', accum: '1'} + - {id: '17kd_65535', script: 'probe_17_curve_param.py', seed: '65535', max_grad_value: 'off', bs: '6', accum: '1'} + + # -- AX mlx-lm native (probe_20, aggressive config matches unsloth-zoo) -- + - {id: '20k_1', script: 'probe_20_mlx_lm_aggressive.py', seed: '1', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_7', script: 'probe_20_mlx_lm_aggressive.py', seed: '7', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_123', script: 'probe_20_mlx_lm_aggressive.py', seed: '123', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_456', script: 'probe_20_mlx_lm_aggressive.py', seed: '456', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_789', script: 'probe_20_mlx_lm_aggressive.py', seed: '789', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_1234', script: 'probe_20_mlx_lm_aggressive.py', seed: '1234', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_5678', script: 'probe_20_mlx_lm_aggressive.py', seed: '5678', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_9012', script: 'probe_20_mlx_lm_aggressive.py', seed: '9012', max_grad_value: '', bs: '', accum: ''} + - {id: '20k_31415', script: 'probe_20_mlx_lm_aggressive.py', seed: '31415', 
max_grad_value: '', bs: '', accum: ''} + - {id: '20k_65535', script: 'probe_20_mlx_lm_aggressive.py', seed: '65535', max_grad_value: '', bs: '', accum: ''} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -243,7 +183,10 @@ jobs: MLX_BS: ${{ matrix.bs }} MLX_ACCUM: ${{ matrix.accum }} run: | - cd tests/mlx_parity && python probe_17_curve_param.py + # script is per-cell; AW Cells A/B/C omit it -> default to probe_17. + SCRIPT="${{ matrix.script }}" + if [ -z "$SCRIPT" ]; then SCRIPT="probe_17_curve_param.py"; fi + cd tests/mlx_parity && python "$SCRIPT" - name: Show JSON output if: always() From 99def42ca5ed086ab3430e56217d09da13bb7065 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 12:30:37 +0000 Subject: [PATCH 62/84] mlx_parity Round AY: hybrid loader-vs-trainer disambiguation (probe_21) Round AX (n=15) confirmed mlx-lm native LoRA (67%) strictly dominates unsloth-zoo MLXTrainer (47% baseline, 40% mlx-lm-matching config) on the smoke fixture across paired seeds. Round AW already eliminated max_grad_value and grad-accum mechanic. Two buckets of remaining candidates: LOADER side -- FastMLXModel.from_pretrained adds _convert_mlx_dtype (astype + mx.eval) before LoRA wiring; get_peft_model inverts freeze/linear_to_lora_layers order; sets mx.set_memory_limit/_cache_limit/_wired_limit TRAINER side -- data sampler RNG state (no np.random.seed at train entry), extra mx.eval(grad_norm), callback dispatch Probe 21 builds a HYBRID: mlx-lm's load() + linear_to_lora_layers() constructs the model (path A), then unsloth-zoo's MLXTrainer drives training (path B) with the closest possible mlx-lm-matching config (clip=off, bs=6, acc=1, lr=1e-3, bc=True). Same 15 seeds as AX for paired comparison. 
Reading: pass_rate ~67% (matches mlx-lm) -> gap is in the LOADER pass_rate ~40% (matches zoo) -> gap is in the TRAINER --- .github/workflows/mlx-parity-probe.yml | 84 +++---- .../probe_21_hybrid_loader_trainer.py | 232 ++++++++++++++++++ 2 files changed, 269 insertions(+), 47 deletions(-) create mode 100644 tests/mlx_parity/probe_21_hybrid_loader_trainer.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 6816ae24f0..daa3e3a821 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -77,53 +77,43 @@ jobs: # is high, interaction effect. If C high & B not, accum is # the dominant cause. - # ---- Round AX: expand n on Cells A and D + add mlx-lm native ---- - # Round AW (n=5) showed cell A=3/5 and cell D=2/5 -- the - # putative 20% gap with mlx-lm (4/5 in Round AU) is within - # 1-seed binomial noise. To distinguish noise from real - # effect we add 10 new seeds at three configurations: - # Cell A (unsloth-zoo, smoke default): clip=1.0 bs=2 acc=3 - # Cell D (unsloth-zoo, mlx-lm match) : clip=off bs=6 acc=1 - # mlx-lm native (probe_20) - # Combined with prior data: 15 seeds x 3 conditions = 45 - # observations to call the gap real or not. - # Seeds: 1, 7, 123, 456, 789, 1234, 5678, 9012, 31415, 65535. 
- - # -- AX Cell A: unsloth-zoo baseline (clip=1.0, bs=2, acc=3) -- - - {id: '17ka_1', script: 'probe_17_curve_param.py', seed: '1', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_7', script: 'probe_17_curve_param.py', seed: '7', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_123', script: 'probe_17_curve_param.py', seed: '123', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_456', script: 'probe_17_curve_param.py', seed: '456', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_789', script: 'probe_17_curve_param.py', seed: '789', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_1234', script: 'probe_17_curve_param.py', seed: '1234', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_5678', script: 'probe_17_curve_param.py', seed: '5678', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_9012', script: 'probe_17_curve_param.py', seed: '9012', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_31415', script: 'probe_17_curve_param.py', seed: '31415', max_grad_value: '1.0', bs: '2', accum: '3'} - - {id: '17ka_65535', script: 'probe_17_curve_param.py', seed: '65535', max_grad_value: '1.0', bs: '2', accum: '3'} - - # -- AX Cell D: unsloth-zoo mlx-lm-matching (clip=off, bs=6, acc=1) -- - - {id: '17kd_1', script: 'probe_17_curve_param.py', seed: '1', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_7', script: 'probe_17_curve_param.py', seed: '7', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_123', script: 'probe_17_curve_param.py', seed: '123', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_456', script: 'probe_17_curve_param.py', seed: '456', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_789', script: 'probe_17_curve_param.py', seed: '789', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_1234', script: 'probe_17_curve_param.py', seed: '1234', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_5678', script: 
'probe_17_curve_param.py', seed: '5678', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_9012', script: 'probe_17_curve_param.py', seed: '9012', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_31415', script: 'probe_17_curve_param.py', seed: '31415', max_grad_value: 'off', bs: '6', accum: '1'} - - {id: '17kd_65535', script: 'probe_17_curve_param.py', seed: '65535', max_grad_value: 'off', bs: '6', accum: '1'} - - # -- AX mlx-lm native (probe_20, aggressive config matches unsloth-zoo) -- - - {id: '20k_1', script: 'probe_20_mlx_lm_aggressive.py', seed: '1', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_7', script: 'probe_20_mlx_lm_aggressive.py', seed: '7', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_123', script: 'probe_20_mlx_lm_aggressive.py', seed: '123', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_456', script: 'probe_20_mlx_lm_aggressive.py', seed: '456', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_789', script: 'probe_20_mlx_lm_aggressive.py', seed: '789', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_1234', script: 'probe_20_mlx_lm_aggressive.py', seed: '1234', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_5678', script: 'probe_20_mlx_lm_aggressive.py', seed: '5678', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_9012', script: 'probe_20_mlx_lm_aggressive.py', seed: '9012', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_31415', script: 'probe_20_mlx_lm_aggressive.py', seed: '31415', max_grad_value: '', bs: '', accum: ''} - - {id: '20k_65535', script: 'probe_20_mlx_lm_aggressive.py', seed: '65535', max_grad_value: '', bs: '', accum: ''} + # ---- Round AY: disambiguate LOADER vs TRAINER as source of the gap ---- + # Round AX (n=15) confirmed the gap is real: + # mlx-lm native 10/15 = 67% + # zoo Cell A (smoke) 7/15 = 47% + # zoo Cell D (match) 6/15 = 40% + # mlx-lm strictly dominates zoo across paired seeds. The gap is + # NOT from clip or grad-accum (Round AW). 
Remaining candidates + # split into two buckets: + # - LOADER: FastMLXModel.from_pretrained (dtype cast + + # mx.eval(params)) + get_peft_model (different + # freeze/unfreeze order, sets memory limits) + # - TRAINER: data sampler RNG, extra mx.eval(grad_norm), + # optimizer wiring, callback overhead + # + # Probe 21 builds a HYBRID: mlx-lm's load() + linear_to_lora_layers() + # then drives training via unsloth-zoo MLXTrainer (clip=off, + # bs=6, acc=1 -- matching the closest possible config). + # Reading: + # pass_rate ~67% -> gap is in unsloth-zoo's LOADER + # pass_rate ~40% -> gap is in unsloth-zoo's TRAINER + # Same 15 seeds used in AX for direct paired comparison. + + - {id: '21_1', script: 'probe_21_hybrid_loader_trainer.py', seed: '1'} + - {id: '21_7', script: 'probe_21_hybrid_loader_trainer.py', seed: '7'} + - {id: '21_42', script: 'probe_21_hybrid_loader_trainer.py', seed: '42'} + - {id: '21_123', script: 'probe_21_hybrid_loader_trainer.py', seed: '123'} + - {id: '21_456', script: 'probe_21_hybrid_loader_trainer.py', seed: '456'} + - {id: '21_789', script: 'probe_21_hybrid_loader_trainer.py', seed: '789'} + - {id: '21_999', script: 'probe_21_hybrid_loader_trainer.py', seed: '999'} + - {id: '21_1234', script: 'probe_21_hybrid_loader_trainer.py', seed: '1234'} + - {id: '21_3407', script: 'probe_21_hybrid_loader_trainer.py', seed: '3407'} + - {id: '21_5678', script: 'probe_21_hybrid_loader_trainer.py', seed: '5678'} + - {id: '21_9012', script: 'probe_21_hybrid_loader_trainer.py', seed: '9012'} + - {id: '21_12345', script: 'probe_21_hybrid_loader_trainer.py', seed: '12345'} + - {id: '21_22222', script: 'probe_21_hybrid_loader_trainer.py', seed: '22222'} + - {id: '21_31415', script: 'probe_21_hybrid_loader_trainer.py', seed: '31415'} + - {id: '21_65535', script: 'probe_21_hybrid_loader_trainer.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git 
a/tests/mlx_parity/probe_21_hybrid_loader_trainer.py b/tests/mlx_parity/probe_21_hybrid_loader_trainer.py new file mode 100644 index 0000000000..8f93bdf72d --- /dev/null +++ b/tests/mlx_parity/probe_21_hybrid_loader_trainer.py @@ -0,0 +1,232 @@ +"""Probe 21 — disambiguate LOADER vs TRAINER as the source of the +~20pp pass-rate gap between mlx-lm native LoRA (~67%) and +unsloth-zoo MLXTrainer (~40-47%) on the smoke fixture. + +Round AX (n=15) confirmed the gap is real: mlx-lm strictly dominates +unsloth-zoo at every seed (paired comparison). Round AW eliminated +max_grad_value and the grad-accum mechanic as causes. + +This probe builds a HYBRID: + * model construction & LoRA wiring via mlx-lm's load() + + linear_to_lora_layers() (path A from the audit) + * training via unsloth-zoo's MLXTrainer (path B from the audit), + configured to mirror mlx-lm's defaults as closely as the + MLXTrainingConfig surface allows: + max_grad_value=None # mlx-lm has no clip + max_grad_norm=0 # ditto + gradient_checkpointing=False + use_cce=False + compile=False + bs=6, accum=1 + lr=1e-3, weight_decay=0, adamw, bias_correction=True + +Reading: + pass_rate ≈ 67% (mlx-lm) -> gap is in FastMLXModel / + get_peft_model (loader side) + pass_rate ≈ 40-47% (unsloth-zoo) -> gap is in MLXTrainer / its + data sampler / optimizer wiring + +Env vars: MLX_SEED (default 3407), MLX_STEPS (default 30), MLX_LR +(default 1e-3). Writes per-config JSON to +.out/probe_21__s{S}_d{D}.json.
+""" + +import json +import os +import sys +import dataclasses +import random +from pathlib import Path + +import numpy as np + +from _common import ( + MODEL_NAME, + TRAIN_TEXT, + PROMPT, + MAX_SEQ_LEN, + OUT_DIR, + banner, + section, + report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + return int(raw) + except ValueError: + return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + return float(raw) + except ValueError: + return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + banner(f"Probe 21: mlx-lm loader + unsloth-zoo trainer " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + # ---- LOADER: exactly mlx-lm's path. ---- + from mlx_lm import load as mlx_load + section("mlx-lm load + LoRA wire (path A)") + model, tokenizer = mlx_load(MODEL_NAME) + report("loaded model class", type(model).__name__) + + # Mirror mlx-lm/lora.py: freeze BEFORE linear_to_lora_layers. + model.freeze() + from mlx_lm.tuner.utils import linear_to_lora_layers + lora_config = { + "rank": 8, + "scale": 2.0, + "dropout": 0.0, + "keys": [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + "mlp.gate_proj", + "mlp.up_proj", + "mlp.down_proj", + ], + } + try: + num_layers = len(model.layers) + except AttributeError: + num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, lora_config) + report("LoRA modules wired via mlx-lm path", "OK") + + # Sanity: count trainable params + from mlx.utils import tree_flatten + trainable = [(k, v) for k, v in tree_flatten(model.trainable_parameters())] + report("trainable param leaves", len(trainable)) + + # ---- TRAINER: unsloth-zoo MLXTrainer (path B). 
---- + section("unsloth-zoo MLXTrainer (path B)") + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + extra["max_grad_value"] = None # match mlx-lm: no elementwise clip + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe21_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: + return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + # ---- POST-TRAIN: same eval signal as probe 17. 
---- + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + report("completion_teacher_forced_loss", completion_loss) + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, + "learning_rate": lr, + "loader": "mlx-lm (path A)", + "trainer": "unsloth-zoo (path B)", + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, + "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_21__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + + section("summary") + if rows: + report("step-1 loss", rows[0]["loss"]) + report(f"step-{len(rows)} loss", rows[-1]["loss"]) + report("post_train_loss", post_loss_val) + return 0 + + +if __name__ == "__main__": + 
sys.exit(main()) From 84c2aca4eb3e03d60a494f83156a949d9e430cf4 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 12:42:56 +0000 Subject: [PATCH 63/84] mlx_parity Round AZ: numpy-RNG hypothesis + triple-confirm mlx-lm Round AY showed the ~20pp gap is in zoo's MLXTrainer, not in the loader (hybrid path matched zoo at 47%, not mlx-lm at 67%). Probe 22 is the same hybrid (mlx-lm loader + zoo trainer) plus a np.random.seed(seed) reset right before trainer.train(), mirroring what mlx-lm does at lora.py:320. If 22 closes the gap, numpy RNG divergence is the cause; if not, something else inside MLXTrainer. Separately we already triple-confirmed the noise-ceiling outside MLX: CUDA PyTorch fp32 LoRA on the same 15 seeds hits 67% pass rate, identical to mlx-lm. That confirms 67% is the basin-selection ceiling for this fixture across frameworks; zoo's 40-47% is a true trainer-side defect, not framework variance. Also re-runs probe_20 (mlx-lm native) on the same 15 seeds with id '20z_*' for a CI-side triple-confirm of the mlx-lm number itself (prior runs were across AU/AX, mixed install layers). 30 cells total. --- .github/workflows/mlx-parity-probe.yml | 59 +++++-- tests/mlx_parity/probe_22_hybrid_reseed.py | 184 +++++++++++++++++++++ 2 files changed, 228 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_22_hybrid_reseed.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index daa3e3a821..6739e84f5e 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -99,21 +99,50 @@ jobs: # pass_rate ~40% -> gap is in unsloth-zoo's TRAINER # Same 15 seeds used in AX for direct paired comparison. 
- - {id: '21_1', script: 'probe_21_hybrid_loader_trainer.py', seed: '1'} - - {id: '21_7', script: 'probe_21_hybrid_loader_trainer.py', seed: '7'} - - {id: '21_42', script: 'probe_21_hybrid_loader_trainer.py', seed: '42'} - - {id: '21_123', script: 'probe_21_hybrid_loader_trainer.py', seed: '123'} - - {id: '21_456', script: 'probe_21_hybrid_loader_trainer.py', seed: '456'} - - {id: '21_789', script: 'probe_21_hybrid_loader_trainer.py', seed: '789'} - - {id: '21_999', script: 'probe_21_hybrid_loader_trainer.py', seed: '999'} - - {id: '21_1234', script: 'probe_21_hybrid_loader_trainer.py', seed: '1234'} - - {id: '21_3407', script: 'probe_21_hybrid_loader_trainer.py', seed: '3407'} - - {id: '21_5678', script: 'probe_21_hybrid_loader_trainer.py', seed: '5678'} - - {id: '21_9012', script: 'probe_21_hybrid_loader_trainer.py', seed: '9012'} - - {id: '21_12345', script: 'probe_21_hybrid_loader_trainer.py', seed: '12345'} - - {id: '21_22222', script: 'probe_21_hybrid_loader_trainer.py', seed: '22222'} - - {id: '21_31415', script: 'probe_21_hybrid_loader_trainer.py', seed: '31415'} - - {id: '21_65535', script: 'probe_21_hybrid_loader_trainer.py', seed: '65535'} + # ---- Round AZ: numpy-reset hypothesis + triple-confirm mlx-lm ---- + # Round AY proved gap is in TRAINER (probe_21 hybrid 47% = + # zoo 47%, not mlx-lm 67%). Leading suspect: numpy RNG state. + # mlx-lm calls np.random.seed(args.seed) at lora.py:320 + # immediately before the training loop. Probe 22 = same hybrid + # as 21 but with np.random.seed reset right before train(). + # If 22 ~= 67%, numpy RNG is the cause. If ~= 47%, RNG isn't. + # Also re-runs probe_20 (mlx-lm native) on the same 15 seeds + # to triple-confirm the original mlx-lm 67% number. + # Same 15 seeds as AX/AY. 
+ + # -- probe 22: hybrid + np.seed reset just before train() -- + - {id: '22_1', script: 'probe_22_hybrid_reseed.py', seed: '1'} + - {id: '22_7', script: 'probe_22_hybrid_reseed.py', seed: '7'} + - {id: '22_42', script: 'probe_22_hybrid_reseed.py', seed: '42'} + - {id: '22_123', script: 'probe_22_hybrid_reseed.py', seed: '123'} + - {id: '22_456', script: 'probe_22_hybrid_reseed.py', seed: '456'} + - {id: '22_789', script: 'probe_22_hybrid_reseed.py', seed: '789'} + - {id: '22_999', script: 'probe_22_hybrid_reseed.py', seed: '999'} + - {id: '22_1234', script: 'probe_22_hybrid_reseed.py', seed: '1234'} + - {id: '22_3407', script: 'probe_22_hybrid_reseed.py', seed: '3407'} + - {id: '22_5678', script: 'probe_22_hybrid_reseed.py', seed: '5678'} + - {id: '22_9012', script: 'probe_22_hybrid_reseed.py', seed: '9012'} + - {id: '22_12345', script: 'probe_22_hybrid_reseed.py', seed: '12345'} + - {id: '22_22222', script: 'probe_22_hybrid_reseed.py', seed: '22222'} + - {id: '22_31415', script: 'probe_22_hybrid_reseed.py', seed: '31415'} + - {id: '22_65535', script: 'probe_22_hybrid_reseed.py', seed: '65535'} + + # -- triple-confirm: re-run probe 20 (mlx-lm native) on same seeds -- + - {id: '20z_1', script: 'probe_20_mlx_lm_aggressive.py', seed: '1'} + - {id: '20z_7', script: 'probe_20_mlx_lm_aggressive.py', seed: '7'} + - {id: '20z_42', script: 'probe_20_mlx_lm_aggressive.py', seed: '42'} + - {id: '20z_123', script: 'probe_20_mlx_lm_aggressive.py', seed: '123'} + - {id: '20z_456', script: 'probe_20_mlx_lm_aggressive.py', seed: '456'} + - {id: '20z_789', script: 'probe_20_mlx_lm_aggressive.py', seed: '789'} + - {id: '20z_999', script: 'probe_20_mlx_lm_aggressive.py', seed: '999'} + - {id: '20z_1234', script: 'probe_20_mlx_lm_aggressive.py', seed: '1234'} + - {id: '20z_3407', script: 'probe_20_mlx_lm_aggressive.py', seed: '3407'} + - {id: '20z_5678', script: 'probe_20_mlx_lm_aggressive.py', seed: '5678'} + - {id: '20z_9012', script: 'probe_20_mlx_lm_aggressive.py', seed: '9012'} 
+ - {id: '20z_12345', script: 'probe_20_mlx_lm_aggressive.py', seed: '12345'} + - {id: '20z_22222', script: 'probe_20_mlx_lm_aggressive.py', seed: '22222'} + - {id: '20z_31415', script: 'probe_20_mlx_lm_aggressive.py', seed: '31415'} + - {id: '20z_65535', script: 'probe_20_mlx_lm_aggressive.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_22_hybrid_reseed.py b/tests/mlx_parity/probe_22_hybrid_reseed.py new file mode 100644 index 0000000000..cabe8c71a2 --- /dev/null +++ b/tests/mlx_parity/probe_22_hybrid_reseed.py @@ -0,0 +1,184 @@ +"""Probe 22 — same hybrid as probe 21 (mlx-lm loader + unsloth-zoo +trainer) but with the numpy RNG reset RIGHT BEFORE training, mirroring +what mlx-lm does at lora.py:320 (np.random.seed(args.seed)). + +Round AY (probe 21) confirmed the gap is in the TRAINER, not the +loader: hybrid path matched zoo (47%) not mlx-lm (67%). The leading +remaining suspect in the trainer is numpy RNG state divergence: +mlx-lm explicitly re-seeds numpy at training-loop entry; unsloth-zoo +never re-seeds numpy, so the data sampler reads whatever state the +LoRA-init + dtype-cast + freeze-flip ops left behind. + +If pass_rate ~67% (matches mlx-lm) -> numpy RNG reset is the cause +If pass_rate ~47% (matches probe 21) -> RNG isn't it; investigate + other trainer-internal axes + (extra mx.eval(grad_norm), + compile graph, etc.) 
+""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path + +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + banner(f"Probe 22: mlx-lm loader + zoo trainer + np.seed reset " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + # ---- LOADER: mlx-lm path A. ---- + from mlx_lm import load as mlx_load + section("mlx-lm load + LoRA wire (path A)") + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + from mlx_lm.tuner.utils import linear_to_lora_layers + lora_config = { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": [ + "self_attn.q_proj", "self_attn.k_proj", + "self_attn.v_proj", "self_attn.o_proj", + "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", + ], + } + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, lora_config) + + # ---- TRAINER: zoo MLXTrainer (path B). 
---- + section("zoo MLXTrainer + np.random reset") + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe22_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + # ---- KEY DIFFERENCE FROM PROBE 21: mirror mlx-lm/lora.py:320. ---- + # mlx-lm re-seeds numpy RIGHT BEFORE the training loop so the data + # sampler's RNG state is independent of LoRA-init / dtype-cast ops. + np.random.seed(seed) + mx.random.seed(seed) + + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + # ---- POST-TRAIN: same eval as probes 17/21. 
---- + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx-lm (path A)", + "trainer": "unsloth-zoo (path B) + np.seed reset", + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_22__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From e1756a28f8471ec8846e02fa45429ef24ba55c97 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 13:29:08 +0000 Subject: [PATCH 64/84] mlx_parity Round BA: compile-mode hypothesis (probe 23) Round AZ 
rejected numpy-RNG (probe 22 hybrid+reseed = 47%, same as probe 21 hybrid = 47%, with identical per-seed pass pattern). The biggest remaining structural difference between zoo's MLXTrainer and mlx-lm's trainer is compile mode: mlx-lm always wraps step_fn with @partial(mx.compile, inputs=state, outputs=state); zoo only does so when args.compile=True (and our probes set compile=False). In fp16, op fusion and reordering from mx.compile can produce different rounding patterns than eager execution. After 30 steps those tiny differences could move the model into different basins of attraction in the first-token argmax. The teacher-forced loss is 0.0 everywhere so memorization works in both; only greedy decode differs. Probe 23 = probe 22 + compile=True. If pass rate matches mlx-lm's 67%, compile-mode is the cause and the fix is to flip the default. --- .github/workflows/mlx-parity-probe.yml | 39 +++-- tests/mlx_parity/probe_23_hybrid_compile.py | 180 ++++++++++++++++++++ 2 files changed, 203 insertions(+), 16 deletions(-) create mode 100644 tests/mlx_parity/probe_23_hybrid_compile.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 6739e84f5e..65b9cd74db 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -127,22 +127,29 @@ jobs: - {id: '22_31415', script: 'probe_22_hybrid_reseed.py', seed: '31415'} - {id: '22_65535', script: 'probe_22_hybrid_reseed.py', seed: '65535'} - # -- triple-confirm: re-run probe 20 (mlx-lm native) on same seeds -- - - {id: '20z_1', script: 'probe_20_mlx_lm_aggressive.py', seed: '1'} - - {id: '20z_7', script: 'probe_20_mlx_lm_aggressive.py', seed: '7'} - - {id: '20z_42', script: 'probe_20_mlx_lm_aggressive.py', seed: '42'} - - {id: '20z_123', script: 'probe_20_mlx_lm_aggressive.py', seed: '123'} - - {id: '20z_456', script: 'probe_20_mlx_lm_aggressive.py', seed: '456'} - - {id: '20z_789', script: 'probe_20_mlx_lm_aggressive.py', seed: '789'} - - {id: 
'20z_999', script: 'probe_20_mlx_lm_aggressive.py', seed: '999'} - - {id: '20z_1234', script: 'probe_20_mlx_lm_aggressive.py', seed: '1234'} - - {id: '20z_3407', script: 'probe_20_mlx_lm_aggressive.py', seed: '3407'} - - {id: '20z_5678', script: 'probe_20_mlx_lm_aggressive.py', seed: '5678'} - - {id: '20z_9012', script: 'probe_20_mlx_lm_aggressive.py', seed: '9012'} - - {id: '20z_12345', script: 'probe_20_mlx_lm_aggressive.py', seed: '12345'} - - {id: '20z_22222', script: 'probe_20_mlx_lm_aggressive.py', seed: '22222'} - - {id: '20z_31415', script: 'probe_20_mlx_lm_aggressive.py', seed: '31415'} - - {id: '20z_65535', script: 'probe_20_mlx_lm_aggressive.py', seed: '65535'} + # ---- Round BA: compile-mode hypothesis ---- + # Round AZ rejected numpy-RNG (probe 22 = probe 21). The + # biggest remaining structural diff inside the trainer: + # mlx-lm always wraps step with @mx.compile (trainer.py:248); + # zoo only does so when args.compile=True (and we set it + # False in prior probes). Probe 23 = probe 22 + compile=True + # to test if compile-mode is the missing piece. 
+ + - {id: '23_1', script: 'probe_23_hybrid_compile.py', seed: '1'} + - {id: '23_7', script: 'probe_23_hybrid_compile.py', seed: '7'} + - {id: '23_42', script: 'probe_23_hybrid_compile.py', seed: '42'} + - {id: '23_123', script: 'probe_23_hybrid_compile.py', seed: '123'} + - {id: '23_456', script: 'probe_23_hybrid_compile.py', seed: '456'} + - {id: '23_789', script: 'probe_23_hybrid_compile.py', seed: '789'} + - {id: '23_999', script: 'probe_23_hybrid_compile.py', seed: '999'} + - {id: '23_1234', script: 'probe_23_hybrid_compile.py', seed: '1234'} + - {id: '23_3407', script: 'probe_23_hybrid_compile.py', seed: '3407'} + - {id: '23_5678', script: 'probe_23_hybrid_compile.py', seed: '5678'} + - {id: '23_9012', script: 'probe_23_hybrid_compile.py', seed: '9012'} + - {id: '23_12345', script: 'probe_23_hybrid_compile.py', seed: '12345'} + - {id: '23_22222', script: 'probe_23_hybrid_compile.py', seed: '22222'} + - {id: '23_31415', script: 'probe_23_hybrid_compile.py', seed: '31415'} + - {id: '23_65535', script: 'probe_23_hybrid_compile.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_23_hybrid_compile.py b/tests/mlx_parity/probe_23_hybrid_compile.py new file mode 100644 index 0000000000..2fad647c21 --- /dev/null +++ b/tests/mlx_parity/probe_23_hybrid_compile.py @@ -0,0 +1,180 @@ +"""Probe 23 — same hybrid as probe 21 (mlx-lm loader + zoo trainer), +but with mx.compile ENABLED (compile=True) in the trainer config to +match mlx-lm's training-loop wrapping at trainer.py:248. + +Round AY proved gap is in the trainer; Round AZ rejected the numpy- +RNG hypothesis. The biggest remaining structural difference is: + + * mlx-lm wraps the step function with @partial(mx.compile, inputs= + state, outputs=state) UNCONDITIONALLY (trainer.py:248) + * zoo wraps step_fn with mx.compile only when args.compile=True + (trainer.py:921-968). 
Our probes set compile=False, so the step + runs eagerly. mlx-lm runs compiled. + +In fp16, op fusion + reordering from mx.compile can change rounding, +which after 30 steps can shift the model into a different basin +(memorization works, but greedy-decode first-token argmax differs). + +If pass rate ~= 67% (matches mlx-lm) -> compile-mode is the cause +If pass rate ~= 47% (matches probe 21/22) -> compile isn't it +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path + +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + banner(f"Probe 23: mlx-lm loader + zoo trainer + mx.compile=True " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + from mlx_lm.tuner.utils import linear_to_lora_layers + lora_config = { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": [ + "self_attn.q_proj", "self_attn.k_proj", + "self_attn.v_proj", "self_attn.o_proj", + "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", + ], + } + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, lora_config) + + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + 
extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=True, # <-- THE ONLY CHANGE FROM PROBE 22 + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe23_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + np.random.seed(seed) + mx.random.seed(seed) + + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + 
).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx-lm (path A)", + "trainer": "unsloth-zoo (path B) + compile=True", + "compile": True, + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_23__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 1614105d61a9500ba5f61aa8257ba3a884fecf7e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 13:43:48 +0000 Subject: [PATCH 65/84] mlx_parity Round BB: loss-fn dtype propagation (probe 24) Round BA rejected compile-mode (probe 23 hybrid+compile=True hit 43% = zoo, not 67% = mlx-lm). Next live suspect is dtype propagation through the loss function's backward. mlx-lm trainer.py:86 keeps mask as bool: ce = nn.losses.cross_entropy(...) * mask # fp16 * bool -> fp16 ce = ce.astype(mx.float32).sum() / ntoks zoo utils.py:417 casts mask to fp32: mask = length_mask.astype(mx.float32) ce = nn.losses.cross_entropy(...) * mask # fp16 * fp32 -> fp32 loss = ce.astype(mx.float32).sum() / _safe_denom(ntoks) The backward through `* fp32_mask` carries gradient leaves in fp32 all the way down to the LoRA params; mlx-lm's bool variant keeps them in fp16. Different gradient dtypes through 30 Adam updates can shift weights into different fp16-rounding basins, producing divergent first-token argmax outputs. 
Probe 24 monkey-patches make_baseline_loss_fn with mlx-lm's verbatim default_loss before constructing MLXTrainer. --- .github/workflows/mlx-parity-probe.yml | 39 ++-- .../mlx_parity/probe_24_hybrid_mlxlm_loss.py | 221 ++++++++++++++++++ 2 files changed, 245 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 65b9cd74db..4c59518f70 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -135,21 +135,30 @@ jobs: # False in prior probes). Probe 23 = probe 22 + compile=True # to test if compile-mode is the missing piece. - - {id: '23_1', script: 'probe_23_hybrid_compile.py', seed: '1'} - - {id: '23_7', script: 'probe_23_hybrid_compile.py', seed: '7'} - - {id: '23_42', script: 'probe_23_hybrid_compile.py', seed: '42'} - - {id: '23_123', script: 'probe_23_hybrid_compile.py', seed: '123'} - - {id: '23_456', script: 'probe_23_hybrid_compile.py', seed: '456'} - - {id: '23_789', script: 'probe_23_hybrid_compile.py', seed: '789'} - - {id: '23_999', script: 'probe_23_hybrid_compile.py', seed: '999'} - - {id: '23_1234', script: 'probe_23_hybrid_compile.py', seed: '1234'} - - {id: '23_3407', script: 'probe_23_hybrid_compile.py', seed: '3407'} - - {id: '23_5678', script: 'probe_23_hybrid_compile.py', seed: '5678'} - - {id: '23_9012', script: 'probe_23_hybrid_compile.py', seed: '9012'} - - {id: '23_12345', script: 'probe_23_hybrid_compile.py', seed: '12345'} - - {id: '23_22222', script: 'probe_23_hybrid_compile.py', seed: '22222'} - - {id: '23_31415', script: 'probe_23_hybrid_compile.py', seed: '31415'} - - {id: '23_65535', script: 'probe_23_hybrid_compile.py', seed: '65535'} + # ---- Round BB: loss-fn dtype-propagation hypothesis ---- + # Round BA rejected compile (probe 23 = 43% = zoo, not 67%). 
+ # Next live suspect: backward through zoo's + # `mask.astype(float32) * ce_fp16` carries gradients in fp32, + # while mlx-lm's `bool_mask * ce_fp16` keeps them in fp16. + # Probe 24 monkey-patches make_baseline_loss_fn with a + # verbatim copy of mlx-lm's default_loss (bool mask, no + # astype(fp32) on the mask). + + - {id: '24_1', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '1'} + - {id: '24_7', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '7'} + - {id: '24_42', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '42'} + - {id: '24_123', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '123'} + - {id: '24_456', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '456'} + - {id: '24_789', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '789'} + - {id: '24_999', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '999'} + - {id: '24_1234', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '1234'} + - {id: '24_3407', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '3407'} + - {id: '24_5678', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '5678'} + - {id: '24_9012', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '9012'} + - {id: '24_12345', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '12345'} + - {id: '24_22222', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '22222'} + - {id: '24_31415', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '31415'} + - {id: '24_65535', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py b/tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py new file mode 100644 index 0000000000..6733b0c0e6 --- /dev/null +++ b/tests/mlx_parity/probe_24_hybrid_mlxlm_loss.py @@ -0,0 +1,221 @@ +"""Probe 24 — hybrid (mlx-lm loader + zoo trainer) but with zoo's +loss function REPLACED by mlx-lm's verbatim default_loss. + +Round AY: gap is in trainer (not loader). 
+Round AZ: numpy-RNG hypothesis rejected. +Round BA: compile-mode hypothesis rejected. + +Remaining live suspect from the audit: dtype propagation in the +loss function. The two differ: + + mlx-lm (trainer.py:86): + mask = mx.logical_and(...) # bool + ce = nn.losses.cross_entropy(logits, targets) * mask # fp16 * bool -> fp16 + ce = ce.astype(mx.float32).sum() / ntoks + + zoo (utils.py:417): + mask = length_mask.astype(mx.float32) # bool -> fp32 + ce = nn.losses.cross_entropy(logits, safe_targets) * mask # fp16 * fp32 -> fp32 + loss = ce.astype(mx.float32).sum() / _safe_token_denominator(ntoks) + +The backward through `ce_fp16 * bool` carries gradients in fp16; the +backward through `ce_fp16 * fp32` carries gradients in fp32. After +30 steps these rounding differences could move the model into +different basins. + +If pass rate ~= 67% (matches mlx-lm) -> loss dtype propagation is + the cause +If pass rate ~= 47% (matches zoo) -> not it; investigate further +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path + +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + banner(f"Probe 24: hybrid + mlx-lm's verbatim loss fn " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + import mlx.nn as nn + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + from 
mlx_lm.tuner.utils import linear_to_lora_layers + lora_config = { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": [ + "self_attn.q_proj", "self_attn.k_proj", + "self_attn.v_proj", "self_attn.o_proj", + "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", + ], + } + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, lora_config) + + # ---- KEY DIFFERENCE: monkey-patch zoo's make_baseline_loss_fn ---- + # Replace with a verbatim copy of mlx-lm's default_loss + # (mlx-lm-src/mlx_lm/tuner/trainer.py:86-99). The signature must + # accept (model, batch, lengths, labels=None) since zoo's trainer + # calls loss_and_grad_fn(model, batch_data[0], batch_data[1], + # batch_data[2]) and batch_data[2] is always None for text models. + import unsloth_zoo.mlx.utils as zoo_utils + + def _mlxlm_default_loss_factory(): + def loss_fn(model, batch, lengths, labels=None): + # Verbatim from mlx-lm trainer.py:86-99 (with labels + # silently ignored -- our smoke never passes them). + inputs = batch[:, :-1] + targets = batch[:, 1:] + logits = model(inputs) + steps_ = mx.arange(1, targets.shape[1] + 1) + mask = mx.logical_and(steps_ >= lengths[:, 0:1], steps_ <= lengths[:, 1:]) + ce = nn.losses.cross_entropy(logits, targets) * mask + ntoks = mask.sum() + ce = ce.astype(mx.float32).sum() / ntoks + return ce, ntoks + return loss_fn + + _original = zoo_utils.make_baseline_loss_fn + zoo_utils.make_baseline_loss_fn = _mlxlm_default_loss_factory + # Also patch via direct import path (trainer imports it locally). 
+ import unsloth_zoo.mlx.trainer as zoo_trainer + zoo_trainer.make_baseline_loss_fn = _mlxlm_default_loss_factory + report("monkey-patched make_baseline_loss_fn", "OK") + + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe24_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + + np.random.seed(seed) + mx.random.seed(seed) + + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + # Eval — use ORIGINAL zoo loss for the post-train measurement so + # we're measuring the trained weights, not the patched fn. 
+ zoo_utils.make_baseline_loss_fn = _original + eval_loss_fn = _original() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = eval_loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx-lm (path A)", + "trainer": "unsloth-zoo (path B) with mlx-lm's verbatim loss", + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, "max_grad_norm": 0.0, + "adam_bias_correction": True, + "compile": False, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_24__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 0eaa960ca648c3c8e00863ae997c3e24157e6954 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 14:04:24 +0000 Subject: [PATCH 66/84] mlx_parity Round 
BC: bracket trainer-loop vs loss as gap source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probe 24 (mlx-lm loss in zoo loop): 50% — at most marginally above zoo's 47%. Probe 25 inverts: manual mlx-lm-verbatim training loop using ZOO's make_baseline_loss_fn. If 25 hits 67%, the loss is irrelevant; the gap is the LOOP. If 47%, the loss IS the cause. Together with probe 24 this brackets the boundary and isolates the gap source unambiguously. --- .github/workflows/mlx-parity-probe.yml | 37 +++-- tests/mlx_parity/probe_25_mlxlm_loop.py | 196 ++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_25_mlxlm_loop.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 4c59518f70..37f57960f7 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -144,21 +144,28 @@ jobs: # verbatim copy of mlx-lm's default_loss (bool mask, no # astype(fp32) on the mask). 
- - {id: '24_1', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '1'} - - {id: '24_7', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '7'} - - {id: '24_42', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '42'} - - {id: '24_123', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '123'} - - {id: '24_456', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '456'} - - {id: '24_789', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '789'} - - {id: '24_999', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '999'} - - {id: '24_1234', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '1234'} - - {id: '24_3407', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '3407'} - - {id: '24_5678', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '5678'} - - {id: '24_9012', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '9012'} - - {id: '24_12345', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '12345'} - - {id: '24_22222', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '22222'} - - {id: '24_31415', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '31415'} - - {id: '24_65535', script: 'probe_24_hybrid_mlxlm_loss.py', seed: '65535'} + # ---- Round BC: complement of probe 24 to isolate loop vs loss ---- + # Probe 24 (mlx-lm loss in zoo trainer): 50% — barely above zoo. + # Probe 25 inverts: mlx-lm verbatim training loop using ZOO's + # make_baseline_loss_fn. If 67% — loss is irrelevant, loop is + # the cause. If 47% — loss IS the cause. Combined with 24, this + # bracket-tests both sides of the trainer/loss boundary. 
+ + - {id: '25_1', script: 'probe_25_mlxlm_loop.py', seed: '1'} + - {id: '25_7', script: 'probe_25_mlxlm_loop.py', seed: '7'} + - {id: '25_42', script: 'probe_25_mlxlm_loop.py', seed: '42'} + - {id: '25_123', script: 'probe_25_mlxlm_loop.py', seed: '123'} + - {id: '25_456', script: 'probe_25_mlxlm_loop.py', seed: '456'} + - {id: '25_789', script: 'probe_25_mlxlm_loop.py', seed: '789'} + - {id: '25_999', script: 'probe_25_mlxlm_loop.py', seed: '999'} + - {id: '25_1234', script: 'probe_25_mlxlm_loop.py', seed: '1234'} + - {id: '25_3407', script: 'probe_25_mlxlm_loop.py', seed: '3407'} + - {id: '25_5678', script: 'probe_25_mlxlm_loop.py', seed: '5678'} + - {id: '25_9012', script: 'probe_25_mlxlm_loop.py', seed: '9012'} + - {id: '25_12345', script: 'probe_25_mlxlm_loop.py', seed: '12345'} + - {id: '25_22222', script: 'probe_25_mlxlm_loop.py', seed: '22222'} + - {id: '25_31415', script: 'probe_25_mlxlm_loop.py', seed: '31415'} + - {id: '25_65535', script: 'probe_25_mlxlm_loop.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_25_mlxlm_loop.py b/tests/mlx_parity/probe_25_mlxlm_loop.py new file mode 100644 index 0000000000..03b4323206 --- /dev/null +++ b/tests/mlx_parity/probe_25_mlxlm_loop.py @@ -0,0 +1,196 @@ +"""Probe 25 — definitive test of TRAINER vs LOSS as gap source. + +Round AY proved gap is in MLXTrainer.train(). Probes 21-24 tried +patching individual axes (loader, numpy RNG, compile, loss) — none +closed the gap to 67%. + +Probe 25 inverts the test: use mlx-lm's verbatim training-loop logic +(NO MLXTrainer at all) but with zoo's make_baseline_loss_fn as the +loss function. If 67% — zoo's loss is irrelevant; the gap is purely +the training loop. If 47% — zoo's loss is the cause. + +This is the COMPLEMENT of probe 24 (which used mlx-lm loss in zoo +trainer). Together they isolate which side of the boundary owns +the gap. 
+""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path + +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + banner(f"Probe 25: manual mlx-lm-style loop + zoo's loss " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + from mlx_lm.tuner.trainer import iterate_batches + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + from mlx.utils import tree_flatten + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + lora_config = { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": [ + "self_attn.q_proj", "self_attn.k_proj", + "self_attn.v_proj", "self_attn.o_proj", + "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", + ], + } + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, lora_config) + + # Use ZOO's make_baseline_loss_fn (this is the key swap) + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + loss_fn = make_baseline_loss_fn() + # Adapt zoo's 4-arg signature to mlx-lm's 3-arg call (no labels). 
+ def _loss_3arg(model, batch, lengths): + # zoo's loss accepts labels=None default + return loss_fn(model, batch, lengths, None) + + # Optimizer — match probe 22 / mlx-lm CLI: adamw, bc=True, wd=0 + optimizer = optim.AdamW( + learning_rate=lr, weight_decay=0.0, bias_correction=True + ) + + # Prepare dataset — same as zoo (TextDataset + CacheDataset) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + # ---- mlx-lm training loop, verbatim ---- + from functools import partial + from mlx.nn.utils import average_gradients + + grad_accum_steps = 1 # match probe 22 / mlx-lm + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, _loss_3arg) + + # mlx-lm uses @partial(mx.compile, inputs=state, outputs=state) + # but our compile=False precedent is to leave the step function + # eager; verbatim probe 25 follows mlx-lm and DOES compile. + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + from mlx.utils import tree_map + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + if grad_accum_steps > 1: + from mlx.utils import tree_map + grad = tree_map(lambda x: x / grad_accum_steps, grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + losses = mx.array(0.0) + n_tokens = mx.array(0) + grad_accum = None + + rows = [] + np.random.seed(seed) # mirror lora.py:320 + for it, batch in zip( + range(1, steps * grad_accum_steps + 1), + iterate_batches( + dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, + loop=True, + ), + ): + do_update = (it % grad_accum_steps == 0) + lvalue, toks, grad_accum = step(batch, grad_accum, do_update) + losses += lvalue + n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + 
rows.append({"step": it, "loss": float(lvalue.item())}) + + # Post-train eval (match probe 22's eval block) + from unsloth_zoo.mlx.utils import make_baseline_loss_fn as _zoo_loss_factory + eval_loss_fn = _zoo_loss_factory() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = eval_loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx-lm (path A)", + "trainer": "manual mlx-lm-style loop + zoo's make_baseline_loss_fn", + "batch_size": 6, "grad_accum_steps": 1, + "adam_bias_correction": True, "weight_decay": 0.0, + "compile": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_25__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From a39f7df1850e2280a10b83f3580c8beda3bfe8fc Mon Sep 17 00:00:00 2001 
From: Daniel Han Date: Mon, 18 May 2026 14:21:59 +0000 Subject: [PATCH 67/84] mlx_parity Round BD: pure-mlx-lm control (probe 26) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probe 25 (mlx-lm-style loop + zoo loss): 47%. Identical per-seed pass pattern as probes 22 (47%), 23 (43%), 24 (50%). All five probes that import unsloth_zoo land in the same basin pattern. Only probe 20 (mlx-lm CLI subprocess, zero unsloth_zoo imports) hits 67%. Probe 26 runs identical mlx-lm-verbatim training INLINE with zero unsloth_zoo imports. If 67% — the unsloth_zoo import has a global side effect on MLX runtime state that breaks parity. If 47% — subprocess isolation was the relevant factor and probe 20's 67% was an artifact rather than a true ceiling. --- .github/workflows/mlx-parity-probe.yml | 38 +++-- tests/mlx_parity/probe_26_pure_mlxlm.py | 175 ++++++++++++++++++++++++ 2 files changed, 198 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_26_pure_mlxlm.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 37f57960f7..b65327f4bb 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -151,21 +151,29 @@ jobs: # the cause. If 47% — loss IS the cause. Combined with 24, this # bracket-tests both sides of the trainer/loss boundary. 
- - {id: '25_1', script: 'probe_25_mlxlm_loop.py', seed: '1'} - - {id: '25_7', script: 'probe_25_mlxlm_loop.py', seed: '7'} - - {id: '25_42', script: 'probe_25_mlxlm_loop.py', seed: '42'} - - {id: '25_123', script: 'probe_25_mlxlm_loop.py', seed: '123'} - - {id: '25_456', script: 'probe_25_mlxlm_loop.py', seed: '456'} - - {id: '25_789', script: 'probe_25_mlxlm_loop.py', seed: '789'} - - {id: '25_999', script: 'probe_25_mlxlm_loop.py', seed: '999'} - - {id: '25_1234', script: 'probe_25_mlxlm_loop.py', seed: '1234'} - - {id: '25_3407', script: 'probe_25_mlxlm_loop.py', seed: '3407'} - - {id: '25_5678', script: 'probe_25_mlxlm_loop.py', seed: '5678'} - - {id: '25_9012', script: 'probe_25_mlxlm_loop.py', seed: '9012'} - - {id: '25_12345', script: 'probe_25_mlxlm_loop.py', seed: '12345'} - - {id: '25_22222', script: 'probe_25_mlxlm_loop.py', seed: '22222'} - - {id: '25_31415', script: 'probe_25_mlxlm_loop.py', seed: '31415'} - - {id: '25_65535', script: 'probe_25_mlxlm_loop.py', seed: '65535'} + # ---- Round BD: control with no unsloth_zoo imports ---- + # Probes 22-25 all import unsloth_zoo and all hit 40-50%. + # Probe 20 (mlx-lm CLI subprocess) hits 67%. Probe 26 runs + # identical mlx-lm-style training INLINE (no subprocess) but + # imports NO unsloth_zoo modules. If 67% — the unsloth_zoo + # import side effect is the cause. If 47% — subprocess + # isolation in probe 20 was the actual cause. 
+ + - {id: '26_1', script: 'probe_26_pure_mlxlm.py', seed: '1'} + - {id: '26_7', script: 'probe_26_pure_mlxlm.py', seed: '7'} + - {id: '26_42', script: 'probe_26_pure_mlxlm.py', seed: '42'} + - {id: '26_123', script: 'probe_26_pure_mlxlm.py', seed: '123'} + - {id: '26_456', script: 'probe_26_pure_mlxlm.py', seed: '456'} + - {id: '26_789', script: 'probe_26_pure_mlxlm.py', seed: '789'} + - {id: '26_999', script: 'probe_26_pure_mlxlm.py', seed: '999'} + - {id: '26_1234', script: 'probe_26_pure_mlxlm.py', seed: '1234'} + - {id: '26_3407', script: 'probe_26_pure_mlxlm.py', seed: '3407'} + - {id: '26_5678', script: 'probe_26_pure_mlxlm.py', seed: '5678'} + - {id: '26_9012', script: 'probe_26_pure_mlxlm.py', seed: '9012'} + - {id: '26_12345', script: 'probe_26_pure_mlxlm.py', seed: '12345'} + - {id: '26_22222', script: 'probe_26_pure_mlxlm.py', seed: '22222'} + - {id: '26_31415', script: 'probe_26_pure_mlxlm.py', seed: '31415'} + - {id: '26_65535', script: 'probe_26_pure_mlxlm.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_26_pure_mlxlm.py b/tests/mlx_parity/probe_26_pure_mlxlm.py new file mode 100644 index 0000000000..ac26a9f048 --- /dev/null +++ b/tests/mlx_parity/probe_26_pure_mlxlm.py @@ -0,0 +1,175 @@ +"""Probe 26 — control: NO unsloth_zoo imports at all. + +Probes 22, 23, 24, 25 ALL imported from unsloth_zoo.mlx.* and ALL +hit 40-50% on this fixture. Probe 20 (mlx-lm CLI subprocess, no +unsloth_zoo) hits 67%. The hypothesis: just IMPORTING unsloth_zoo +in-process shifts MLX state enough to land in a different basin. + +Probe 26 runs identical mlx-lm-style training in-process but with +ZERO unsloth_zoo imports. If 67% — the unsloth_zoo import itself +is the cause. If 47% — something else about the probe environment +matters and probe 20's 67% was an artifact of subprocess isolation. 
+""" +import json +import os +import sys +import random +from functools import partial +from pathlib import Path + +import numpy as np + +# Replicate _common.py's constants WITHOUT importing it (which would +# pull in unsloth_zoo if any are added there in the future). +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" +PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + print(f"=== Probe 26: pure mlx-lm, NO unsloth_zoo imports " + f"steps={steps} seed={seed} lr={lr} ===", flush=True) + + random.seed(seed) + np.random.seed(seed) + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_flatten, tree_map + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + lora_config = { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": [ + "self_attn.q_proj", "self_attn.k_proj", + "self_attn.v_proj", "self_attn.o_proj", + "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj", + ], + } + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, lora_config) + + optimizer = optim.AdamW( 
+ learning_rate=lr, weight_decay=0.0, bias_correction=True + ) + + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + grad_accum_steps = 1 + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + if grad_accum_steps > 1: + grad = tree_map(lambda x: x / grad_accum_steps, grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + losses = mx.array(0.0) + n_tokens = mx.array(0) + grad_accum = None + + rows = [] + np.random.seed(seed) + for it, batch in zip( + range(1, steps * grad_accum_steps + 1), + iterate_batches( + dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True, + ), + ): + do_update = (it % grad_accum_steps == 0) + lvalue, toks, grad_accum = step(batch, grad_accum, do_update) + losses += lvalue + n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + rows.append({"step": it, "loss": float(lvalue.item())}) + + # Post-train: use a fresh mlx-lm default_loss for eval too. 
+ ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + post_loss, _ = default_loss(model, batch, lengths) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + print(f" generation: {gen[:160]!r}", flush=True) + print(f" contains 'Unsloth': {contains}", flush=True) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx-lm (pure)", + "trainer": "manual mlx-lm verbatim + default_loss + NO unsloth_zoo", + "batch_size": 6, "grad_accum_steps": 1, + "adam_bias_correction": True, "weight_decay": 0.0, + "compile": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_26__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 7d0875bd66a96158c752e4608622464944988df2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 14:44:46 +0000 Subject: [PATCH 68/84] mlx_parity Round BE: isolate subprocess vs inline mlx-lm at 67% vs 47% Probe 26 (pure mlx-lm inline) hits 47% with same per-seed pattern as every other inline probe 
(22-26). Probe 20 (mlx-lm CLI via subprocess.run) hits 67%. Three candidate isolations: Probe 27: probe 26's training in subprocess.run(['python','-c',...]) -- tests if the extra subprocess boundary alone matters Probe 28: probe 26 + mx.set_wired_limit(...) at startup -- mlx-lm's train() sets this hint; my inline doesn't Probe 29: probe 26 but call mlx_lm.tuner.trainer.train() directly -- if train() does something at function entry I missed 15 seeds x 3 probes = 45 cells. If any cell hits 67%, that isolation IS the variable. --- .github/workflows/mlx-parity-probe.yml | 70 +++++++--- tests/mlx_parity/probe_27_subprocess_wrap.py | 134 ++++++++++++++++++ tests/mlx_parity/probe_28_set_wired_limit.py | 135 +++++++++++++++++++ tests/mlx_parity/probe_29_call_train_fn.py | 126 +++++++++++++++++ 4 files changed, 450 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_27_subprocess_wrap.py create mode 100644 tests/mlx_parity/probe_28_set_wired_limit.py create mode 100644 tests/mlx_parity/probe_29_call_train_fn.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index b65327f4bb..ce5443a028 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -159,21 +159,61 @@ jobs: # import side effect is the cause. If 47% — subprocess # isolation in probe 20 was the actual cause. 
- - {id: '26_1', script: 'probe_26_pure_mlxlm.py', seed: '1'} - - {id: '26_7', script: 'probe_26_pure_mlxlm.py', seed: '7'} - - {id: '26_42', script: 'probe_26_pure_mlxlm.py', seed: '42'} - - {id: '26_123', script: 'probe_26_pure_mlxlm.py', seed: '123'} - - {id: '26_456', script: 'probe_26_pure_mlxlm.py', seed: '456'} - - {id: '26_789', script: 'probe_26_pure_mlxlm.py', seed: '789'} - - {id: '26_999', script: 'probe_26_pure_mlxlm.py', seed: '999'} - - {id: '26_1234', script: 'probe_26_pure_mlxlm.py', seed: '1234'} - - {id: '26_3407', script: 'probe_26_pure_mlxlm.py', seed: '3407'} - - {id: '26_5678', script: 'probe_26_pure_mlxlm.py', seed: '5678'} - - {id: '26_9012', script: 'probe_26_pure_mlxlm.py', seed: '9012'} - - {id: '26_12345', script: 'probe_26_pure_mlxlm.py', seed: '12345'} - - {id: '26_22222', script: 'probe_26_pure_mlxlm.py', seed: '22222'} - - {id: '26_31415', script: 'probe_26_pure_mlxlm.py', seed: '31415'} - - {id: '26_65535', script: 'probe_26_pure_mlxlm.py', seed: '65535'} + # ---- Round BE: subprocess boundary, set_wired_limit, train() ---- + # Probe 26 (pure mlx-lm inline) hits 47%. Probe 20 (mlx-lm CLI + # via subprocess.run) hits 67%. Three candidate isolations: + # 27 = probe 26 + subprocess.run wrap (subprocess boundary) + # 28 = probe 26 + mx.set_wired_limit (mlx-lm train() side effect) + # 29 = probe 26 but call train() directly (uses train()'s actual setup) + # If any hits 67%, that isolation IS the cause. 
+ + - {id: '27_1', script: 'probe_27_subprocess_wrap.py', seed: '1'} + - {id: '27_7', script: 'probe_27_subprocess_wrap.py', seed: '7'} + - {id: '27_42', script: 'probe_27_subprocess_wrap.py', seed: '42'} + - {id: '27_123', script: 'probe_27_subprocess_wrap.py', seed: '123'} + - {id: '27_456', script: 'probe_27_subprocess_wrap.py', seed: '456'} + - {id: '27_789', script: 'probe_27_subprocess_wrap.py', seed: '789'} + - {id: '27_999', script: 'probe_27_subprocess_wrap.py', seed: '999'} + - {id: '27_1234', script: 'probe_27_subprocess_wrap.py', seed: '1234'} + - {id: '27_3407', script: 'probe_27_subprocess_wrap.py', seed: '3407'} + - {id: '27_5678', script: 'probe_27_subprocess_wrap.py', seed: '5678'} + - {id: '27_9012', script: 'probe_27_subprocess_wrap.py', seed: '9012'} + - {id: '27_12345', script: 'probe_27_subprocess_wrap.py', seed: '12345'} + - {id: '27_22222', script: 'probe_27_subprocess_wrap.py', seed: '22222'} + - {id: '27_31415', script: 'probe_27_subprocess_wrap.py', seed: '31415'} + - {id: '27_65535', script: 'probe_27_subprocess_wrap.py', seed: '65535'} + + - {id: '28_1', script: 'probe_28_set_wired_limit.py', seed: '1'} + - {id: '28_7', script: 'probe_28_set_wired_limit.py', seed: '7'} + - {id: '28_42', script: 'probe_28_set_wired_limit.py', seed: '42'} + - {id: '28_123', script: 'probe_28_set_wired_limit.py', seed: '123'} + - {id: '28_456', script: 'probe_28_set_wired_limit.py', seed: '456'} + - {id: '28_789', script: 'probe_28_set_wired_limit.py', seed: '789'} + - {id: '28_999', script: 'probe_28_set_wired_limit.py', seed: '999'} + - {id: '28_1234', script: 'probe_28_set_wired_limit.py', seed: '1234'} + - {id: '28_3407', script: 'probe_28_set_wired_limit.py', seed: '3407'} + - {id: '28_5678', script: 'probe_28_set_wired_limit.py', seed: '5678'} + - {id: '28_9012', script: 'probe_28_set_wired_limit.py', seed: '9012'} + - {id: '28_12345', script: 'probe_28_set_wired_limit.py', seed: '12345'} + - {id: '28_22222', script: 'probe_28_set_wired_limit.py', 
seed: '22222'} + - {id: '28_31415', script: 'probe_28_set_wired_limit.py', seed: '31415'} + - {id: '28_65535', script: 'probe_28_set_wired_limit.py', seed: '65535'} + + - {id: '29_1', script: 'probe_29_call_train_fn.py', seed: '1'} + - {id: '29_7', script: 'probe_29_call_train_fn.py', seed: '7'} + - {id: '29_42', script: 'probe_29_call_train_fn.py', seed: '42'} + - {id: '29_123', script: 'probe_29_call_train_fn.py', seed: '123'} + - {id: '29_456', script: 'probe_29_call_train_fn.py', seed: '456'} + - {id: '29_789', script: 'probe_29_call_train_fn.py', seed: '789'} + - {id: '29_999', script: 'probe_29_call_train_fn.py', seed: '999'} + - {id: '29_1234', script: 'probe_29_call_train_fn.py', seed: '1234'} + - {id: '29_3407', script: 'probe_29_call_train_fn.py', seed: '3407'} + - {id: '29_5678', script: 'probe_29_call_train_fn.py', seed: '5678'} + - {id: '29_9012', script: 'probe_29_call_train_fn.py', seed: '9012'} + - {id: '29_12345', script: 'probe_29_call_train_fn.py', seed: '12345'} + - {id: '29_22222', script: 'probe_29_call_train_fn.py', seed: '22222'} + - {id: '29_31415', script: 'probe_29_call_train_fn.py', seed: '31415'} + - {id: '29_65535', script: 'probe_29_call_train_fn.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_27_subprocess_wrap.py b/tests/mlx_parity/probe_27_subprocess_wrap.py new file mode 100644 index 0000000000..91d975faa1 --- /dev/null +++ b/tests/mlx_parity/probe_27_subprocess_wrap.py @@ -0,0 +1,134 @@ +"""Probe 27 — subprocess wrap of probe 26's code. + +Probe 20 (mlx-lm CLI via subprocess.run) hits 67%; probe 26 (identical +mlx-lm-style code inline) hits 47%. The only differences are: + (a) extra subprocess boundary + (b) mlx-lm's CLI sets mx.set_wired_limit inside its train() function + +Probe 27 tests (a) directly: identical code as probe 26 but executed +via subprocess.run([sys.executable, '-c', ...]). 
If 67%, the extra +subprocess boundary IS the variable. +""" +import json +import os +import subprocess +import sys +from pathlib import Path + +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + +SEED = int(os.environ.get("MLX_SEED", "3407")) +STEPS = int(os.environ.get("MLX_STEPS", "30")) +LR = float(os.environ.get("MLX_LR", "1e-3")) + +# Inner script: same training as probe 26, but writes results to a JSON +# file path provided via env. +INNER = r''' +import json, os, random, sys +from pathlib import Path +from functools import partial +import numpy as np + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" +PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 + +seed = int(os.environ["MLX_SEED"]) +steps = int(os.environ["MLX_STEPS"]) +lr = float(os.environ["MLX_LR"]) +out_path = os.environ["INNER_OUT_PATH"] + +random.seed(seed); np.random.seed(seed) +import mlx.core as mx +import mlx.nn as nn +import mlx.optimizers as optim +from mlx.nn.utils import average_gradients +from mlx.utils import tree_map +mx.random.seed(seed) + +from mlx_lm import load as mlx_load, generate +from mlx_lm.tuner.utils import linear_to_lora_layers +from mlx_lm.tuner.trainer import iterate_batches, default_loss +from mlx_lm.tuner.datasets import TextDataset, CacheDataset + +model, tokenizer = mlx_load(MODEL_NAME) +model.freeze() +linear_to_lora_layers(model, len(model.model.layers if not hasattr(model, "layers") else model.layers), { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], +}) + +optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) +formatted = [{"text": TRAIN_TEXT} for _ in range(64)] +ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + +state = [model.state, optimizer.state, mx.random.state] +loss_value_and_grad = 
nn.value_and_grad(model, default_loss) + +@partial(mx.compile, inputs=state, outputs=state) +def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + +model.train() +losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None +rows = [] +np.random.seed(seed) +for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)): + lvalue, toks, grad_accum = step(batch, grad_accum, True) + losses += lvalue; n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + rows.append({"step": it, "loss": float(lvalue.item())}) + +ids = tokenizer.encode(TRAIN_TEXT) +if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) +L = len(ids) +post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]])) +post_loss_val = float(post_loss.item()) + +prompt_ids = list(tokenizer.encode(PROMPT)) +full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) +if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item()) +else: + completion_loss = float("nan") + +gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) +out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, "wrap": "subprocess"}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": "Unsloth" in gen, +} +Path(out_path).write_text(json.dumps(out, indent=2)) 
+''' + +out_file = OUT_DIR / f"probe_27__s{STEPS}_d{SEED}.json" +env = dict(os.environ) +env["INNER_OUT_PATH"] = str(out_file) +env["MLX_SEED"] = str(SEED) +env["MLX_STEPS"] = str(STEPS) +env["MLX_LR"] = str(LR) +proc = subprocess.run([sys.executable, "-c", INNER], env=env, capture_output=True, text=True, timeout=1200) +if proc.returncode != 0: + print("--- inner stderr ---", flush=True) + print(proc.stderr[-3000:]) + sys.exit(proc.returncode) +print(proc.stdout[-1000:], flush=True) +data = json.loads(out_file.read_text()) +print(f"seed={SEED} contains={data['contains_unsloth']} post={data['post_train_loss']:.4f} cf={data['completion_teacher_forced_loss']:.4f}") +print(f"gen={data['generation'][:80]!r}") diff --git a/tests/mlx_parity/probe_28_set_wired_limit.py b/tests/mlx_parity/probe_28_set_wired_limit.py new file mode 100644 index 0000000000..cea43fad96 --- /dev/null +++ b/tests/mlx_parity/probe_28_set_wired_limit.py @@ -0,0 +1,135 @@ +"""Probe 28 — probe 26 + mx.set_wired_limit (mlx-lm's train() does this). + +mlx-lm's `train()` at trainer.py:228-229 calls +mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"]) +right at the start. probe 26 doesn't. If this single allocator hint +changes basin selection (via Metal kernel JIT path), probe 28 hits 67%. +""" +import json +import os +import sys +import random +from functools import partial +from pathlib import Path +import numpy as np + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" 
+PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + print(f"=== Probe 28: probe26 + mx.set_wired_limit steps={steps} seed={seed} lr={lr} ===", flush=True) + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_map + mx.random.seed(seed) + + # >>> THE ONLY DIFFERENCE FROM PROBE 26 <<< + if mx.metal.is_available(): + wired = mx.device_info()["max_recommended_working_set_size"] + mx.set_wired_limit(wired) + print(f" set_wired_limit({wired})", flush=True) + + from mlx_lm import load as mlx_load, generate + from mlx_lm.tuner.utils import linear_to_lora_layers + from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + 
state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None + rows = [] + np.random.seed(seed) + for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)): + lvalue, toks, grad_accum = step(batch, grad_accum, True) + losses += lvalue; n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + rows.append({"step": it, "loss": float(lvalue.item())}) + + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]])) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item()) + else: + completion_loss = float("nan") + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + print(f" contains 'Unsloth': {contains} gen={gen[:80]!r}", flush=True) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, "extra": "set_wired_limit"}, + "rows": 
rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_28__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_29_call_train_fn.py b/tests/mlx_parity/probe_29_call_train_fn.py new file mode 100644 index 0000000000..b2e6c835c5 --- /dev/null +++ b/tests/mlx_parity/probe_29_call_train_fn.py @@ -0,0 +1,126 @@ +"""Probe 29 — probe 26 but call mlx-lm's train() function directly, +not inline its loop. + +If probe 26 (manual inline of mlx-lm train()) hits 47% but probe 29 +(actual call to mlx_lm.tuner.trainer.train()) hits 67%, then either: + - my inline replication has a subtle math difference, OR + - train() does something at function-entry that the inline missed + (e.g. mx.distributed.init, set_wired_limit, etc.) +""" +import json +import os +import sys +import random +from pathlib import Path +import numpy as np + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" 
+PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + print(f"=== Probe 29: call mlx-lm train() directly steps={steps} seed={seed} lr={lr} ===", flush=True) + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + mx.random.seed(seed) + + from mlx_lm import load as mlx_load, generate + from mlx_lm.tuner.utils import linear_to_lora_layers + from mlx_lm.tuner.trainer import train, TrainingArgs, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + training_args = TrainingArgs( + batch_size=6, + iters=steps, + max_seq_length=MAX_SEQ_LEN, + grad_accumulation_steps=1, + steps_per_report=1, + steps_per_eval=steps + 1, # disable eval + steps_per_save=steps + 1, # disable save + grad_checkpoint=False, + ) + + train( + model=model, + 
args=training_args, + optimizer=optimizer, + train_dataset=ds, + val_dataset=None, + loss=default_loss, + training_callback=None, + ) + + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]])) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item()) + else: + completion_loss = float("nan") + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + print(f" contains 'Unsloth': {contains} gen={gen[:80]!r}", flush=True) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, "via": "mlx_lm.tuner.trainer.train()"}, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_29__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 7e18b7d7b3c7578f3a94bb8cf1ee2df3e21b3519 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 14:48:29 +0000 Subject: [PATCH 69/84] =?UTF-8?q?mlx=5Fparity=20Round=20BF:=20THE=20FIX=20?= =?UTF-8?q?TEST=20=E2=80=94=20seed=20mx.random=20AFTER=20model=20load?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROOT CAUSE FOUND (pending probe 30 confirmation): nn.Linear.__init__ at mlx-src/python/mlx/nn/layers/linear.py:51 
calls mx.random.uniform every time a Linear is constructed. Each Linear in a transformer (q/k/v/o/gate/up/down per layer, plus the output head and embeddings) consumes some mx.random state. For gemma-3-270m there are dozens of Linear modules. mlx-lm CLI (probe 20) calls mx.random.seed(args.seed) at mlx_lm/lora.py:223 -- AFTER load(model_path) and BEFORE linear_to_lora_layers. The seed is therefore "fresh" right when lora_a init draws happen. My inline probes (22-26) seed mx.random BEFORE mlx_load() at the top of main(). Loading the model consumes a substantial amount of mx.random state via Linear constructors. By the time linear_to_lora_layers runs, mx.random is at a different position than mlx-lm CLI sees -> different lora_a init -> different basin. Probe 30 mirrors mlx-lm CLI: seed AFTER load, before LoRA wiring. If 67%, the bug is "where you seed", not "what you train". If probe 30 passes, the FIX in unsloth-zoo is to call mx.random.seed(args.random_state) inside get_peft_model right before linear_to_lora_layers (loader.py). --- .github/workflows/mlx-parity-probe.yml | 40 ++++-- tests/mlx_parity/probe_30_seed_after_load.py | 141 +++++++++++++++++++ 2 files changed, 166 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_30_seed_after_load.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index ce5443a028..b5505daa27 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -111,21 +111,31 @@ jobs: # Same 15 seeds as AX/AY. 
# -- probe 22: hybrid + np.seed reset just before train() -- - - {id: '22_1', script: 'probe_22_hybrid_reseed.py', seed: '1'} - - {id: '22_7', script: 'probe_22_hybrid_reseed.py', seed: '7'} - - {id: '22_42', script: 'probe_22_hybrid_reseed.py', seed: '42'} - - {id: '22_123', script: 'probe_22_hybrid_reseed.py', seed: '123'} - - {id: '22_456', script: 'probe_22_hybrid_reseed.py', seed: '456'} - - {id: '22_789', script: 'probe_22_hybrid_reseed.py', seed: '789'} - - {id: '22_999', script: 'probe_22_hybrid_reseed.py', seed: '999'} - - {id: '22_1234', script: 'probe_22_hybrid_reseed.py', seed: '1234'} - - {id: '22_3407', script: 'probe_22_hybrid_reseed.py', seed: '3407'} - - {id: '22_5678', script: 'probe_22_hybrid_reseed.py', seed: '5678'} - - {id: '22_9012', script: 'probe_22_hybrid_reseed.py', seed: '9012'} - - {id: '22_12345', script: 'probe_22_hybrid_reseed.py', seed: '12345'} - - {id: '22_22222', script: 'probe_22_hybrid_reseed.py', seed: '22222'} - - {id: '22_31415', script: 'probe_22_hybrid_reseed.py', seed: '31415'} - - {id: '22_65535', script: 'probe_22_hybrid_reseed.py', seed: '65535'} + # ---- Round BF: THE FIX — seed mx.random AFTER model load ---- + # CRITICAL DISCOVERY: nn.Linear.__init__ (mlx-src/python/mlx/nn/ + # layers/linear.py:51) calls mx.random.uniform. So every Linear + # module constructed during model load consumes mx.random state. + # mlx-lm CLI seeds AFTER load (lora.py:223); my inline probes + # seeded BEFORE load. Result: lora_a init from different RNG + # positions, leading to different basins. + # Probe 30 reseeds AFTER load + adds set_wired_limit. If 67%, + # the seed-order is the bug; the FIX is to seed AFTER load. 
+ + - {id: '30_1', script: 'probe_30_seed_after_load.py', seed: '1'} + - {id: '30_7', script: 'probe_30_seed_after_load.py', seed: '7'} + - {id: '30_42', script: 'probe_30_seed_after_load.py', seed: '42'} + - {id: '30_123', script: 'probe_30_seed_after_load.py', seed: '123'} + - {id: '30_456', script: 'probe_30_seed_after_load.py', seed: '456'} + - {id: '30_789', script: 'probe_30_seed_after_load.py', seed: '789'} + - {id: '30_999', script: 'probe_30_seed_after_load.py', seed: '999'} + - {id: '30_1234', script: 'probe_30_seed_after_load.py', seed: '1234'} + - {id: '30_3407', script: 'probe_30_seed_after_load.py', seed: '3407'} + - {id: '30_5678', script: 'probe_30_seed_after_load.py', seed: '5678'} + - {id: '30_9012', script: 'probe_30_seed_after_load.py', seed: '9012'} + - {id: '30_12345', script: 'probe_30_seed_after_load.py', seed: '12345'} + - {id: '30_22222', script: 'probe_30_seed_after_load.py', seed: '22222'} + - {id: '30_31415', script: 'probe_30_seed_after_load.py', seed: '31415'} + - {id: '30_65535', script: 'probe_30_seed_after_load.py', seed: '65535'} # ---- Round BA: compile-mode hypothesis ---- # Round AZ rejected numpy-RNG (probe 22 = probe 21). The diff --git a/tests/mlx_parity/probe_30_seed_after_load.py b/tests/mlx_parity/probe_30_seed_after_load.py new file mode 100644 index 0000000000..cfcb061fff --- /dev/null +++ b/tests/mlx_parity/probe_30_seed_after_load.py @@ -0,0 +1,141 @@ +"""Probe 30 — probe 26 but seed mx.random AFTER model load (matching +mlx-lm CLI's lora.py:223 order). + +If model loading consumes any mx.random state, the lora_a init +values differ between probe 26 (seed before load) and probe 20 +(seed after load via lora.py:223). probe 30 reorders to match +mlx-lm CLI exactly. If 67% — seed order IS the cause. +""" +import json +import os +import sys +import random +from functools import partial +from pathlib import Path +import numpy as np + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" 
+PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + print(f"=== Probe 30: seed mx.random AFTER model load steps={steps} seed={seed} lr={lr} ===", flush=True) + + # NOTE: do NOT seed mx.random here. Seed it AFTER load() (line below). + random.seed(seed); np.random.seed(seed) + + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_map + + from mlx_lm import load as mlx_load, generate + from mlx_lm.tuner.utils import linear_to_lora_layers + from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + model, tokenizer = mlx_load(MODEL_NAME) + + # Seed AFTER load -- mlx-lm CLI lora.py:223 does this. + mx.random.seed(seed) + + model.freeze() + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + # mlx-lm's train() also sets wired_limit. 
Include that too so probe + # 30 is identical to mlx-lm CLI's setup as far as I can replicate. + if mx.metal.is_available(): + mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"]) + + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None + rows = [] + np.random.seed(seed) + for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)): + lvalue, toks, grad_accum = step(batch, grad_accum, True) + losses += lvalue; n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + rows.append({"step": it, "loss": float(lvalue.item())}) + + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]])) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item()) + else: + completion_loss = float("nan") + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = 
"Unsloth" in gen + print(f" contains 'Unsloth': {contains} gen={gen[:80]!r}", flush=True) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, + "delta": "mx.random.seed AFTER model load + set_wired_limit"}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_30__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From ad7d4db28a2f0afeddccc0d6d9a1c3f197989c6e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 15:24:04 +0000 Subject: [PATCH 70/84] =?UTF-8?q?mlx=5Fparity=20Round=20BG:=20THE=20FIX=20?= =?UTF-8?q?=E2=80=94=20num=5Flayers=3D16=20(mlx-lm=20CLI=20default)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CRITICAL DISCOVERY (Round BF result + huggingface config): gemma-3-270m-it has 18 hidden layers (per HF config). mlx-lm CLI CONFIG_DEFAULTS['num_layers']=16 (lora.py:56). Probe 20 (subprocess mlx-lm CLI) trained LoRA on the LAST 16 layers only. Inline probes 22-26+30 used len(model.layers)=18, training all 18 layers. The 2 extra layers x 7 modules = 14 extra LoRA modules consume mx.random state during init AND add trainable params, putting the model into a different basin. Round BE/BF results explain this cleanly: - Probe 27 (subprocess wrap): 47% (subprocess boundary irrelevant) - Probe 28 (set_wired_limit): 47% (allocator hint irrelevant) - Probe 29 (call mlx-lm train()): 50% (small noise) - Probe 30 (seed AFTER load): 47% with NEW per-seed pattern (basin shifts) None recovered 67%. Probe 31 = probe 30 + num_layers=16. If 67%, this is the cause and the fix is to set num_layers=16 by default in unsloth-zoo's get_peft_model when called against gemma-3 (or expose the choice). 
--- .github/workflows/mlx-parity-probe.yml | 73 ++++------ tests/mlx_parity/probe_31_num_layers_16.py | 151 +++++++++++++++++++++ 2 files changed, 178 insertions(+), 46 deletions(-) create mode 100644 tests/mlx_parity/probe_31_num_layers_16.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index b5505daa27..932abcc4a4 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -177,53 +177,34 @@ jobs: # 29 = probe 26 but call train() directly (uses train()'s actual setup) # If any hits 67%, that isolation IS the cause. - - {id: '27_1', script: 'probe_27_subprocess_wrap.py', seed: '1'} - - {id: '27_7', script: 'probe_27_subprocess_wrap.py', seed: '7'} - - {id: '27_42', script: 'probe_27_subprocess_wrap.py', seed: '42'} - - {id: '27_123', script: 'probe_27_subprocess_wrap.py', seed: '123'} - - {id: '27_456', script: 'probe_27_subprocess_wrap.py', seed: '456'} - - {id: '27_789', script: 'probe_27_subprocess_wrap.py', seed: '789'} - - {id: '27_999', script: 'probe_27_subprocess_wrap.py', seed: '999'} - - {id: '27_1234', script: 'probe_27_subprocess_wrap.py', seed: '1234'} - - {id: '27_3407', script: 'probe_27_subprocess_wrap.py', seed: '3407'} - - {id: '27_5678', script: 'probe_27_subprocess_wrap.py', seed: '5678'} - - {id: '27_9012', script: 'probe_27_subprocess_wrap.py', seed: '9012'} - - {id: '27_12345', script: 'probe_27_subprocess_wrap.py', seed: '12345'} - - {id: '27_22222', script: 'probe_27_subprocess_wrap.py', seed: '22222'} - - {id: '27_31415', script: 'probe_27_subprocess_wrap.py', seed: '31415'} - - {id: '27_65535', script: 'probe_27_subprocess_wrap.py', seed: '65535'} + # ---- Round BG: THE FIX — num_layers=16 (mlx-lm CLI default) ---- + # CRITICAL DISCOVERY: gemma-3-270m-it has 18 hidden layers. + # mlx-lm CLI's CONFIG_DEFAULTS['num_layers']=16 -> probe 20 + # trains LoRA on the LAST 16 layers only. 
My inline probes + # 22-26+30 used len(model.layers)=18, training all 18 layers. + # The extra 2 layers x 7 modules = 14 extra LoRA modules + # consume mx.random state during init AND add trainable + # parameters that differ from mlx-lm CLI's behavior. + # Probe 31 = probe 30 + num_layers=16. If 67%, this IS the + # fix. The unsloth-zoo fix is to default num_layers to 16 + # (or expose it as an arg with sensible default) in + # get_peft_model / linear_to_lora_layers calls. - - {id: '28_1', script: 'probe_28_set_wired_limit.py', seed: '1'} - - {id: '28_7', script: 'probe_28_set_wired_limit.py', seed: '7'} - - {id: '28_42', script: 'probe_28_set_wired_limit.py', seed: '42'} - - {id: '28_123', script: 'probe_28_set_wired_limit.py', seed: '123'} - - {id: '28_456', script: 'probe_28_set_wired_limit.py', seed: '456'} - - {id: '28_789', script: 'probe_28_set_wired_limit.py', seed: '789'} - - {id: '28_999', script: 'probe_28_set_wired_limit.py', seed: '999'} - - {id: '28_1234', script: 'probe_28_set_wired_limit.py', seed: '1234'} - - {id: '28_3407', script: 'probe_28_set_wired_limit.py', seed: '3407'} - - {id: '28_5678', script: 'probe_28_set_wired_limit.py', seed: '5678'} - - {id: '28_9012', script: 'probe_28_set_wired_limit.py', seed: '9012'} - - {id: '28_12345', script: 'probe_28_set_wired_limit.py', seed: '12345'} - - {id: '28_22222', script: 'probe_28_set_wired_limit.py', seed: '22222'} - - {id: '28_31415', script: 'probe_28_set_wired_limit.py', seed: '31415'} - - {id: '28_65535', script: 'probe_28_set_wired_limit.py', seed: '65535'} - - - {id: '29_1', script: 'probe_29_call_train_fn.py', seed: '1'} - - {id: '29_7', script: 'probe_29_call_train_fn.py', seed: '7'} - - {id: '29_42', script: 'probe_29_call_train_fn.py', seed: '42'} - - {id: '29_123', script: 'probe_29_call_train_fn.py', seed: '123'} - - {id: '29_456', script: 'probe_29_call_train_fn.py', seed: '456'} - - {id: '29_789', script: 'probe_29_call_train_fn.py', seed: '789'} - - {id: '29_999', script: 
'probe_29_call_train_fn.py', seed: '999'} - - {id: '29_1234', script: 'probe_29_call_train_fn.py', seed: '1234'} - - {id: '29_3407', script: 'probe_29_call_train_fn.py', seed: '3407'} - - {id: '29_5678', script: 'probe_29_call_train_fn.py', seed: '5678'} - - {id: '29_9012', script: 'probe_29_call_train_fn.py', seed: '9012'} - - {id: '29_12345', script: 'probe_29_call_train_fn.py', seed: '12345'} - - {id: '29_22222', script: 'probe_29_call_train_fn.py', seed: '22222'} - - {id: '29_31415', script: 'probe_29_call_train_fn.py', seed: '31415'} - - {id: '29_65535', script: 'probe_29_call_train_fn.py', seed: '65535'} + - {id: '31_1', script: 'probe_31_num_layers_16.py', seed: '1'} + - {id: '31_7', script: 'probe_31_num_layers_16.py', seed: '7'} + - {id: '31_42', script: 'probe_31_num_layers_16.py', seed: '42'} + - {id: '31_123', script: 'probe_31_num_layers_16.py', seed: '123'} + - {id: '31_456', script: 'probe_31_num_layers_16.py', seed: '456'} + - {id: '31_789', script: 'probe_31_num_layers_16.py', seed: '789'} + - {id: '31_999', script: 'probe_31_num_layers_16.py', seed: '999'} + - {id: '31_1234', script: 'probe_31_num_layers_16.py', seed: '1234'} + - {id: '31_3407', script: 'probe_31_num_layers_16.py', seed: '3407'} + - {id: '31_5678', script: 'probe_31_num_layers_16.py', seed: '5678'} + - {id: '31_9012', script: 'probe_31_num_layers_16.py', seed: '9012'} + - {id: '31_12345', script: 'probe_31_num_layers_16.py', seed: '12345'} + - {id: '31_22222', script: 'probe_31_num_layers_16.py', seed: '22222'} + - {id: '31_31415', script: 'probe_31_num_layers_16.py', seed: '31415'} + - {id: '31_65535', script: 'probe_31_num_layers_16.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_31_num_layers_16.py b/tests/mlx_parity/probe_31_num_layers_16.py new file mode 100644 index 0000000000..846db925d4 --- /dev/null +++ 
b/tests/mlx_parity/probe_31_num_layers_16.py @@ -0,0 +1,151 @@ +"""Probe 31 — THE FIX: probe 30 + num_layers=16 (mlx-lm CLI default). + +CRITICAL DISCOVERY: + Gemma-3-270m-it has 18 hidden layers. + mlx-lm CLI's CONFIG_DEFAULTS['num_layers'] = 16 (lora.py:56). + So probe 20 trains LoRA on the LAST 16 layers only. + My probes 22-26+30 used len(model.layers)=18, training all 18. + +That's 14 extra LoRA modules (2 layers x 7 modules) consuming mx.random +state during init and adding trainable parameters. Different lora_a +init values AND a different trainable-param set = different basin. + +Probe 31 = probe 30 with num_layers=16 (matching mlx-lm CLI default). +If 67%, THIS is the cause of the 20pp gap. +""" +import json +import os +import sys +import random +from functools import partial +from pathlib import Path +import numpy as np + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" +PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + num_layers = _env_int("MLX_NUM_LAYERS", 16) + print(f"=== Probe 31: probe 30 + num_layers={num_layers} steps={steps} seed={seed} lr={lr} ===", flush=True) + + random.seed(seed); np.random.seed(seed) + + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_map + + from mlx_lm import load as mlx_load, generate + from mlx_lm.tuner.utils import linear_to_lora_layers + 
from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + model, tokenizer = mlx_load(MODEL_NAME) + + # Seed AFTER load (mlx-lm CLI lora.py:223) + mx.random.seed(seed) + + model.freeze() + + actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers) + if num_layers > actual_layers: + num_layers = actual_layers + print(f" model has {actual_layers} layers, training LoRA on last {num_layers}", flush=True) + + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + if mx.metal.is_available(): + mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"]) + + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None + rows = [] + np.random.seed(seed) + for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)): + lvalue, toks, grad_accum = step(batch, grad_accum, True) + losses += lvalue; n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + rows.append({"step": it, "loss": float(lvalue.item())}) + + 
ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]])) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item()) + else: + completion_loss = float("nan") + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + print(f" contains 'Unsloth': {contains} gen={gen[:80]!r}", flush=True) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, + "num_layers": num_layers, "actual_layers": actual_layers, + "delta": f"num_layers={num_layers} (mlx-lm CLI default)"}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_31__s{steps}_d{seed}_nl{num_layers}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From b1be7e3e14f22afc8be7137a960854fb5469885c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 15:49:51 +0000 Subject: [PATCH 71/84] mlx_parity Round BH: end-to-end fix verification (probe 32) PR 669 in unslothai/unsloth-zoo adds the finetune_last_n_layers parameter to FastMLXModel.get_peft_model. Probe 32 exercises the full zoo public API end-to-end (FastMLXModel.from_pretrained -> get_peft_model(finetune_last_n_layers=16) -> MLXTrainer.train) on the same 15-seed fixture. 
CI pin updated to commit b137b40 on unslothai/unsloth-zoo@fix-mlx-num-layers-parity so the new parameter is available. If probe 32 hits 10/15 = 67% with the same per-seed pattern as probe 20 (mlx-lm CLI), the PR works end-to-end through zoo's public surface. Probe 30 retained for comparison; probe 31 (manual loop + num_layers=16) was the original isolation that found the cause; probe 32 verifies the productionized fix. --- .github/workflows/mlx-parity-probe.yml | 42 ++--- tests/mlx_parity/probe_32_zoo_with_fix.py | 180 ++++++++++++++++++++++ 2 files changed, 204 insertions(+), 18 deletions(-) create mode 100644 tests/mlx_parity/probe_32_zoo_with_fix.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 932abcc4a4..a05ffe1a99 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -190,21 +190,27 @@ jobs: # (or expose it as an arg with sensible default) in # get_peft_model / linear_to_lora_layers calls. 
- - {id: '31_1', script: 'probe_31_num_layers_16.py', seed: '1'} - - {id: '31_7', script: 'probe_31_num_layers_16.py', seed: '7'} - - {id: '31_42', script: 'probe_31_num_layers_16.py', seed: '42'} - - {id: '31_123', script: 'probe_31_num_layers_16.py', seed: '123'} - - {id: '31_456', script: 'probe_31_num_layers_16.py', seed: '456'} - - {id: '31_789', script: 'probe_31_num_layers_16.py', seed: '789'} - - {id: '31_999', script: 'probe_31_num_layers_16.py', seed: '999'} - - {id: '31_1234', script: 'probe_31_num_layers_16.py', seed: '1234'} - - {id: '31_3407', script: 'probe_31_num_layers_16.py', seed: '3407'} - - {id: '31_5678', script: 'probe_31_num_layers_16.py', seed: '5678'} - - {id: '31_9012', script: 'probe_31_num_layers_16.py', seed: '9012'} - - {id: '31_12345', script: 'probe_31_num_layers_16.py', seed: '12345'} - - {id: '31_22222', script: 'probe_31_num_layers_16.py', seed: '22222'} - - {id: '31_31415', script: 'probe_31_num_layers_16.py', seed: '31415'} - - {id: '31_65535', script: 'probe_31_num_layers_16.py', seed: '65535'} + # ---- Round BH: end-to-end fix verification via zoo's API ---- + # Probe 32 = FastMLXModel.from_pretrained + get_peft_model( + # finetune_last_n_layers=16) + MLXTrainer. If 67% with the + # same per-seed pattern as probe 20 (mlx-lm CLI), the PR + # works end-to-end through zoo's public API. 
+ + - {id: '32_1', script: 'probe_32_zoo_with_fix.py', seed: '1'} + - {id: '32_7', script: 'probe_32_zoo_with_fix.py', seed: '7'} + - {id: '32_42', script: 'probe_32_zoo_with_fix.py', seed: '42'} + - {id: '32_123', script: 'probe_32_zoo_with_fix.py', seed: '123'} + - {id: '32_456', script: 'probe_32_zoo_with_fix.py', seed: '456'} + - {id: '32_789', script: 'probe_32_zoo_with_fix.py', seed: '789'} + - {id: '32_999', script: 'probe_32_zoo_with_fix.py', seed: '999'} + - {id: '32_1234', script: 'probe_32_zoo_with_fix.py', seed: '1234'} + - {id: '32_3407', script: 'probe_32_zoo_with_fix.py', seed: '3407'} + - {id: '32_5678', script: 'probe_32_zoo_with_fix.py', seed: '5678'} + - {id: '32_9012', script: 'probe_32_zoo_with_fix.py', seed: '9012'} + - {id: '32_12345', script: 'probe_32_zoo_with_fix.py', seed: '12345'} + - {id: '32_22222', script: 'probe_32_zoo_with_fix.py', seed: '22222'} + - {id: '32_31415', script: 'probe_32_zoo_with_fix.py', seed: '31415'} + - {id: '32_65535', script: 'probe_32_zoo_with_fix.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -238,9 +244,9 @@ jobs: 'sentencepiece==0.2.1' \ 'huggingface-hub==0.36.2' \ 'trl==0.27.0' - # Round AW: pin to PR-663 head so all cells run against the - # same zoo state we measured in Round AV. - ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@aed74d9' + # Round BH: pin to the finetune_last_n_layers fix branch + # (b137b40) so probe 32 sees the new parameter. 
+ ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@b137b4058eabf20a3122b67a2c9f04b63fb59568' for attempt in 1 2 3; do if pip install "$ZOO_SPEC"; then break; fi if [ "$attempt" -eq 3 ]; then exit 1; fi diff --git a/tests/mlx_parity/probe_32_zoo_with_fix.py b/tests/mlx_parity/probe_32_zoo_with_fix.py new file mode 100644 index 0000000000..346b112b3d --- /dev/null +++ b/tests/mlx_parity/probe_32_zoo_with_fix.py @@ -0,0 +1,180 @@ +"""Probe 32 — end-to-end test of the unsloth-zoo fix. + +Uses unsloth_zoo.mlx.loader.FastMLXModel.from_pretrained + +get_peft_model(finetune_last_n_layers=16) + MLXTrainer with the +mlx-lm-matching config (clip=off, bs=6, accum=1, lr=1e-3, bc=True). + +If 67% with the same per-seed pattern as probe 20, the FIX works +through zoo's public API end-to-end. The probe pins zoo to the +PR branch via the workflow's pip install (see workflow YAML). +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 32: zoo FastMLXModel + finetune_last_n_layers={last_n} " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import 
make_baseline_loss_fn + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype="float16", + text_only=True, max_seq_length=128, random_state=seed, + ) + + # Verify the new parameter is on get_peft_model. If it's missing + # (e.g. installed zoo doesn't have the fix yet), fail with a clear + # error so the matrix surfaces the install drift. + import inspect + sig = inspect.signature(FastMLXModel.get_peft_model) + if "finetune_last_n_layers" not in sig.parameters: + raise RuntimeError( + "Installed unsloth_zoo lacks finetune_last_n_layers parameter. " + "This probe must run against the fix branch." + ) + + model = FastMLXModel.get_peft_model( + model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=[ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj", + ], + random_state=seed, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + finetune_last_n_layers=last_n, + use_gradient_checkpointing=False, + ) + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe32_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return +
rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "unsloth_zoo FastMLXModel", + "trainer": "unsloth_zoo MLXTrainer", + "finetune_last_n_layers": last_n, + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_32__s{steps}_d{seed}_nl{last_n}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 8ac40aed4aeee359a3f3d5f54e72f4f223b531a6 Mon Sep 17 
00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 16:04:27 +0000 Subject: [PATCH 72/84] mlx_parity Round BI: bisect zoo loader vs trainer at num_layers=16 Probe 31 (mlx_lm.load + manual loop + 16): 67% (matches mlx-lm CLI) Probe 32 (FastMLXModel + MLXTrainer + 16): 15% (additional loss on top of just the num_layers change) Probe 33 = mlx_lm.load + zoo MLXTrainer + num_layers=16. Bisects: - if 33 = 67%, zoo's LOADER side adds the extra basin instability - if 33 ~= 15%, zoo's TRAINER side does Either way, post_train_loss=0 and cf_loss=0 everywhere -- the model memorizes. The greedy-decode pass rate is the canary, not the gate. PR #5537's cf_loss gate is bulletproof regardless. --- .github/workflows/mlx-parity-probe.yml | 38 ++-- .../probe_33_mlxlm_loader_zoo_trainer_nl16.py | 163 ++++++++++++++++++ 2 files changed, 186 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index a05ffe1a99..c6c182f523 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -196,21 +196,29 @@ jobs: # same per-seed pattern as probe 20 (mlx-lm CLI), the PR # works end-to-end through zoo's public API. 
- - {id: '32_1', script: 'probe_32_zoo_with_fix.py', seed: '1'} - - {id: '32_7', script: 'probe_32_zoo_with_fix.py', seed: '7'} - - {id: '32_42', script: 'probe_32_zoo_with_fix.py', seed: '42'} - - {id: '32_123', script: 'probe_32_zoo_with_fix.py', seed: '123'} - - {id: '32_456', script: 'probe_32_zoo_with_fix.py', seed: '456'} - - {id: '32_789', script: 'probe_32_zoo_with_fix.py', seed: '789'} - - {id: '32_999', script: 'probe_32_zoo_with_fix.py', seed: '999'} - - {id: '32_1234', script: 'probe_32_zoo_with_fix.py', seed: '1234'} - - {id: '32_3407', script: 'probe_32_zoo_with_fix.py', seed: '3407'} - - {id: '32_5678', script: 'probe_32_zoo_with_fix.py', seed: '5678'} - - {id: '32_9012', script: 'probe_32_zoo_with_fix.py', seed: '9012'} - - {id: '32_12345', script: 'probe_32_zoo_with_fix.py', seed: '12345'} - - {id: '32_22222', script: 'probe_32_zoo_with_fix.py', seed: '22222'} - - {id: '32_31415', script: 'probe_32_zoo_with_fix.py', seed: '31415'} - - {id: '32_65535', script: 'probe_32_zoo_with_fix.py', seed: '65535'} + # ---- Round BI: bisect zoo loader vs zoo trainer at num_layers=16 ---- + # Probe 31 (mlx_lm.load + manual loop + 16): 67% + # Probe 32 (FastMLXModel + MLXTrainer + 16): 15% + # Probe 33 (mlx_lm.load + MLXTrainer + 16): ? — bisects. + # If 33 = 67%, zoo's LOADER (FastMLXModel.from_pretrained + # + get_peft_model) adds the extra basin instability. + # If 33 ~= 15%, zoo's TRAINER (MLXTrainer.train) does. 
+ + - {id: '33_1', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '1'} + - {id: '33_7', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '7'} + - {id: '33_42', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '42'} + - {id: '33_123', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '123'} + - {id: '33_456', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '456'} + - {id: '33_789', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '789'} + - {id: '33_999', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '999'} + - {id: '33_1234', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '1234'} + - {id: '33_3407', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '3407'} + - {id: '33_5678', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '5678'} + - {id: '33_9012', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '9012'} + - {id: '33_12345', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '12345'} + - {id: '33_22222', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '22222'} + - {id: '33_31415', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '31415'} + - {id: '33_65535', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py b/tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py new file mode 100644 index 0000000000..1ff5593ac4 --- /dev/null +++ b/tests/mlx_parity/probe_33_mlxlm_loader_zoo_trainer_nl16.py @@ -0,0 +1,163 @@ +"""Probe 33 — mlx_lm loader + zoo MLXTrainer + num_layers=16. + +Bisects whether zoo's LOADER or its TRAINER adds the additional +basin instability seen in probe 32 (which used zoo's full stack +with num_layers=16 and hit only 15%). 
+ +Probe 31 (mlx_lm.load + manual loop + 16): 67% +Probe 32 (FastMLXModel + MLXTrainer + 16): 15% +Probe 33 (mlx_lm.load + MLXTrainer + 16): ? +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 33: mlx_lm loader + zoo MLXTrainer + last_n={last_n} " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + num_layers = max(1, min(int(last_n), num_layers)) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + 
extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe33_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + 
"config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx_lm.load", + "trainer": "unsloth_zoo MLXTrainer", + "num_layers": num_layers, + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_33__s{steps}_d{seed}_nl{num_layers}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From b75794757430972b180eed7c63a4eae7f05262c5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 16:18:43 +0000 Subject: [PATCH 73/84] mlx_parity Round BJ: dtype-cast hypothesis (probe 34) Round BI bisect: Probe 31 (mlx_lm.load + manual loop + nl=16): 67% Probe 33 (mlx_lm.load + MLXTrainer + nl=16): 53% (-14pp from trainer) Probe 32 (FastMLXModel(dtype='fp16') + MLXTrainer + nl=16): 15% (-38pp from loader) Gemma-3-270m-it is stored as bf16 on HF. FastMLXModel's _convert_mlx_dtype defaults force fp16, which is a lossy cast (fp16 has 5-bit exponent vs bf16 8-bit). Any param outside fp16's ~6.5e4 range gets clamped. Probe 34 uses FastMLXModel(dtype=None) -- keep storage dtype (bf16). If 34 ~= 53%, the dtype cast is the loader's offender. The fix is to default dtype to None on Gemma3 or to use bf16 explicitly. cf_loss = 0 in every probe, so memorization works -- only greedy decode varies. The smoke gate (PR #5537) is robust regardless. 
--- .github/workflows/mlx-parity-probe.yml | 39 +++-- .../probe_34_zoo_loader_no_dtype.py | 159 ++++++++++++++++++ 2 files changed, 183 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_34_zoo_loader_no_dtype.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index c6c182f523..cf5d7ea182 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -204,21 +204,30 @@ jobs: # + get_peft_model) adds the extra basin instability. # If 33 ~= 15%, zoo's TRAINER (MLXTrainer.train) does. - - {id: '33_1', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '1'} - - {id: '33_7', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '7'} - - {id: '33_42', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '42'} - - {id: '33_123', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '123'} - - {id: '33_456', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '456'} - - {id: '33_789', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '789'} - - {id: '33_999', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '999'} - - {id: '33_1234', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '1234'} - - {id: '33_3407', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '3407'} - - {id: '33_5678', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '5678'} - - {id: '33_9012', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '9012'} - - {id: '33_12345', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '12345'} - - {id: '33_22222', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '22222'} - - {id: '33_31415', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '31415'} - - {id: '33_65535', script: 'probe_33_mlxlm_loader_zoo_trainer_nl16.py', seed: '65535'} + # ---- Round BJ: dtype hypothesis (FastMLXModel cast bf16->fp16) ---- + # Probe 32 (FastMLXModel(dtype='float16') + 
MLXTrainer + nl=16): 15%. + # Probe 33 (mlx_lm.load + MLXTrainer + nl=16): 53%. + # Hypothesis: gemma-3-270m-it is bf16 on HF. FastMLXModel forces + # fp16 cast via _convert_mlx_dtype. fp16 has 5-bit exponent vs + # bf16's 8-bit, so any param outside fp16 range gets clamped. + # Probe 34 uses FastMLXModel(dtype=None) -- keeps storage dtype. + # If 34 ~= 53%, the dtype cast is the offender. + + - {id: '34_1', script: 'probe_34_zoo_loader_no_dtype.py', seed: '1'} + - {id: '34_7', script: 'probe_34_zoo_loader_no_dtype.py', seed: '7'} + - {id: '34_42', script: 'probe_34_zoo_loader_no_dtype.py', seed: '42'} + - {id: '34_123', script: 'probe_34_zoo_loader_no_dtype.py', seed: '123'} + - {id: '34_456', script: 'probe_34_zoo_loader_no_dtype.py', seed: '456'} + - {id: '34_789', script: 'probe_34_zoo_loader_no_dtype.py', seed: '789'} + - {id: '34_999', script: 'probe_34_zoo_loader_no_dtype.py', seed: '999'} + - {id: '34_1234', script: 'probe_34_zoo_loader_no_dtype.py', seed: '1234'} + - {id: '34_3407', script: 'probe_34_zoo_loader_no_dtype.py', seed: '3407'} + - {id: '34_5678', script: 'probe_34_zoo_loader_no_dtype.py', seed: '5678'} + - {id: '34_9012', script: 'probe_34_zoo_loader_no_dtype.py', seed: '9012'} + - {id: '34_12345', script: 'probe_34_zoo_loader_no_dtype.py', seed: '12345'} + - {id: '34_22222', script: 'probe_34_zoo_loader_no_dtype.py', seed: '22222'} + - {id: '34_31415', script: 'probe_34_zoo_loader_no_dtype.py', seed: '31415'} + - {id: '34_65535', script: 'probe_34_zoo_loader_no_dtype.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_34_zoo_loader_no_dtype.py b/tests/mlx_parity/probe_34_zoo_loader_no_dtype.py new file mode 100644 index 0000000000..6ac8dda3c7 --- /dev/null +++ b/tests/mlx_parity/probe_34_zoo_loader_no_dtype.py @@ -0,0 +1,159 @@ +"""Probe 34 — same as probe 32 but with dtype=None (skip FastMLXModel's +bf16->fp16 cast on 
Gemma3). + +Probe 32 (FastMLXModel(dtype='float16') + MLXTrainer + nl=16): 15%. +Probe 33 (mlx_lm.load + MLXTrainer + nl=16): 53%. + +Hypothesis: zoo's _convert_mlx_dtype casts gemma3-270m from its +native bf16 to fp16, which is a lossy cast (fp16 max ~6.5e4 vs +bf16 max ~3.4e38). If True, probe 34 (no cast) should recover +toward 53%. +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 34: zoo FastMLXModel(dtype=None) + finetune_last_n_layers={last_n}") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + # dtype=None tells FastMLXModel to keep the storage dtype. 
+ model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype=None, + text_only=True, max_seq_length=128, random_state=seed, + ) + + import inspect + sig = inspect.signature(FastMLXModel.get_peft_model) + assert "finetune_last_n_layers" in sig.parameters, "zoo build missing the fix" + + model = FastMLXModel.get_peft_model( + model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], + random_state=seed, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + finetune_last_n_layers=last_n, + use_gradient_checkpointing=False, + ) + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe34_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = 
mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("contains 'Unsloth'", contains) + report("generation", repr(gen[:60])) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, + "loader": "FastMLXModel(dtype=None)", "finetune_last_n_layers": last_n}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_34__s{steps}_d{seed}_nl{last_n}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From c73f7963b33510ce6a1b8d650d89887dae3cdc5f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 16:44:12 +0000 Subject: [PATCH 74/84] mlx_parity Round BK: MLXTrainer compile-flag hypothesis (probe 35) Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False): 53% Probe 31 (mlx_lm.load + manual loop + nl=16 + @mx.compile): 67% Hypothesis: the -14pp gap between zoo's MLXTrainer and the manual loop at the same loader/layer count is purely the `compile` flag. Probe 33 disabled compile via `compile=False`; probe 31's manual loop always uses `@mx.compile`. 
If probe 35 (= probe 33 verbatim, only `compile=True`) recovers to ~67%, the -14pp is a probe-configuration artifact, not a MLXTrainer defect. 15 seeds + matrix entry. Same ZOO_SPEC pin (b137b40) as Round BJ. --- .github/workflows/mlx-parity-probe.yml | 27 +++ .../probe_35_zoo_trainer_compile_on.py | 166 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 tests/mlx_parity/probe_35_zoo_trainer_compile_on.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index cf5d7ea182..fc6c994fa9 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -228,6 +228,33 @@ jobs: - {id: '34_22222', script: 'probe_34_zoo_loader_no_dtype.py', seed: '22222'} - {id: '34_31415', script: 'probe_34_zoo_loader_no_dtype.py', seed: '31415'} - {id: '34_65535', script: 'probe_34_zoo_loader_no_dtype.py', seed: '65535'} + + # ---- Round BK: MLXTrainer compile-flag hypothesis ---- + # Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False): 53% + # Probe 31 (mlx_lm.load + manual loop + nl=16 + @mx.compile): 67% + # Hypothesis: the -14pp gap between zoo MLXTrainer and the + # manual loop at the same loader / layer count is purely the + # compile flag. Probe 33 disabled compile via `compile=False` + # while probe 31's manual loop always uses `@mx.compile`. If + # probe 35 (= probe 33 verbatim + compile=True) recovers to + # ~67%, the -14pp is a probe-configuration artifact, not a + # MLXTrainer defect. 
+ + - {id: '35_1', script: 'probe_35_zoo_trainer_compile_on.py', seed: '1'} + - {id: '35_7', script: 'probe_35_zoo_trainer_compile_on.py', seed: '7'} + - {id: '35_42', script: 'probe_35_zoo_trainer_compile_on.py', seed: '42'} + - {id: '35_123', script: 'probe_35_zoo_trainer_compile_on.py', seed: '123'} + - {id: '35_456', script: 'probe_35_zoo_trainer_compile_on.py', seed: '456'} + - {id: '35_789', script: 'probe_35_zoo_trainer_compile_on.py', seed: '789'} + - {id: '35_999', script: 'probe_35_zoo_trainer_compile_on.py', seed: '999'} + - {id: '35_1234', script: 'probe_35_zoo_trainer_compile_on.py', seed: '1234'} + - {id: '35_3407', script: 'probe_35_zoo_trainer_compile_on.py', seed: '3407'} + - {id: '35_5678', script: 'probe_35_zoo_trainer_compile_on.py', seed: '5678'} + - {id: '35_9012', script: 'probe_35_zoo_trainer_compile_on.py', seed: '9012'} + - {id: '35_12345', script: 'probe_35_zoo_trainer_compile_on.py', seed: '12345'} + - {id: '35_22222', script: 'probe_35_zoo_trainer_compile_on.py', seed: '22222'} + - {id: '35_31415', script: 'probe_35_zoo_trainer_compile_on.py', seed: '31415'} + - {id: '35_65535', script: 'probe_35_zoo_trainer_compile_on.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_35_zoo_trainer_compile_on.py b/tests/mlx_parity/probe_35_zoo_trainer_compile_on.py new file mode 100644 index 0000000000..032d039ef8 --- /dev/null +++ b/tests/mlx_parity/probe_35_zoo_trainer_compile_on.py @@ -0,0 +1,166 @@ +"""Probe 35 — probe 33 but with MLXTrainer's compile knob ON. + +Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False): 53% +Probe 31 (mlx_lm.load + manual loop + nl=16 + @mx.compile): 67% + +Hypothesis: the -14pp gap between manual-loop and zoo MLXTrainer at +the same loader / layer count is purely the compile flag. Probe 33 +disabled compile via `compile=False` while probe 31's manual loop +always uses `@mx.compile`. 
If true, probe 35 should recover to ~67%. + +Probe 35 = probe 33 verbatim except `compile=True`. +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 35: mlx_lm loader + zoo MLXTrainer(compile=True) + last_n={last_n} " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + num_layers = max(1, min(int(last_n), num_layers)) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + extra["max_grad_value"] = None + + config = 
MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=True, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe35_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, 
"learning_rate": lr, + "loader": "mlx_lm.load", + "trainer": "unsloth_zoo MLXTrainer", + "compile": True, + "num_layers": num_layers, + "per_device_train_batch_size": 6, + "gradient_accumulation_steps": 1, + "max_grad_value": None, "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_35__s{steps}_d{seed}_nl{num_layers}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 93ca90ad9fc484071d1bf23519980a701f2d768a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 17:06:24 +0000 Subject: [PATCH 75/84] mlx_parity Round BL: pin trainer-side cause (probes 36 + 37) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round BK probe 35 (mlx_lm.load + MLXTrainer + nl=16 + compile=True) hit 8/15 = 53%, same as probe 33's 53% with compile=False. The compile flag is NOT the trainer-side cause of the 47-53% vs 67% gap. This round adds two probes: probe 36 — FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=True. Isolates the loader-only delta with compile held constant. If 36 ~= 67%, the loader patches add no real basin drift; if 36 ~= 47-53%, the loader contributes its own delta on top of any trainer issue. probe 37 — mlx_lm.load + MLXTrainer + nl=16 + compile=False with EXPLICIT max_grad_value=0.0. Bypasses the documented disable-via-None bug in current MLXTrainer (PR #671 will honor None as disable, but 0.0 has always disabled). If 37 ~= 67%, the silent +/-1.0 elementwise clip on probes 33/35 (both pass None expecting no clip) was the entire trainer-side gap. If 37 ~= 53%, yet another factor remains. Same ZOO_SPEC pin (b137b40 — finetune_last_n_layers fix branch) so the existing probe-32-style scaffolding works. 
15 seeds each, paired with probes 20/30/31/33/35. --- .github/workflows/mlx-parity-probe.yml | 52 ++++++ .../probe_36_zoo_loader_compile_on.py | 164 ++++++++++++++++ .../probe_37_zoo_trainer_clip_off.py | 176 ++++++++++++++++++ 3 files changed, 392 insertions(+) create mode 100644 tests/mlx_parity/probe_36_zoo_loader_compile_on.py create mode 100644 tests/mlx_parity/probe_37_zoo_trainer_clip_off.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index fc6c994fa9..a381fc7f98 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -255,6 +255,58 @@ jobs: - {id: '35_22222', script: 'probe_35_zoo_trainer_compile_on.py', seed: '22222'} - {id: '35_31415', script: 'probe_35_zoo_trainer_compile_on.py', seed: '31415'} - {id: '35_65535', script: 'probe_35_zoo_trainer_compile_on.py', seed: '65535'} + + # ---- Round BK: zoo loader + MLXTrainer(compile=True) ---- + # Probe 34 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=False): ~47% + # Probe 35 (mlx_lm.load + MLXTrainer + nl=16 + compile=True ): ? + # Probe 36 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=True ): ? + # If 35 ~= 67% closing the trainer gap, 36 isolates loader-only delta: + # 36 ~= 67% -> compile alone explains the trainer+loader gap; + # FastMLXModel loader patches add no real basin drift. + # 36 ~= 47% -> compile fixes the trainer half, but loader patches + # independently add a -10pp drift to bisect next. 
+ + - {id: '36_1', script: 'probe_36_zoo_loader_compile_on.py', seed: '1'} + - {id: '36_7', script: 'probe_36_zoo_loader_compile_on.py', seed: '7'} + - {id: '36_42', script: 'probe_36_zoo_loader_compile_on.py', seed: '42'} + - {id: '36_123', script: 'probe_36_zoo_loader_compile_on.py', seed: '123'} + - {id: '36_456', script: 'probe_36_zoo_loader_compile_on.py', seed: '456'} + - {id: '36_789', script: 'probe_36_zoo_loader_compile_on.py', seed: '789'} + - {id: '36_999', script: 'probe_36_zoo_loader_compile_on.py', seed: '999'} + - {id: '36_1234', script: 'probe_36_zoo_loader_compile_on.py', seed: '1234'} + - {id: '36_3407', script: 'probe_36_zoo_loader_compile_on.py', seed: '3407'} + - {id: '36_5678', script: 'probe_36_zoo_loader_compile_on.py', seed: '5678'} + - {id: '36_9012', script: 'probe_36_zoo_loader_compile_on.py', seed: '9012'} + - {id: '36_12345', script: 'probe_36_zoo_loader_compile_on.py', seed: '12345'} + - {id: '36_22222', script: 'probe_36_zoo_loader_compile_on.py', seed: '22222'} + - {id: '36_31415', script: 'probe_36_zoo_loader_compile_on.py', seed: '31415'} + - {id: '36_65535', script: 'probe_36_zoo_loader_compile_on.py', seed: '65535'} + + # ---- Round BL: bypass the max_grad_value=None silent-clip bug ---- + # Probe 35 (compile=True) hit 53% (same as probe 33's 53%) -- so + # compile flag is NOT the trainer-side gap. Next live candidate: + # MLXTrainer's resolver rebinds `max_grad_value=None` to the + # default 1.0 (fixed in PR #671), so the no-clip intent in + # probes 33/35 was silently overridden to clip at +/-1.0. + # Probe 37 = probe 33 but explicit max_grad_value=0.0 (always + # disabled, regardless of PR #671). If 37 ~= 67%, elementwise + # clipping at 1.0 was the entire trainer-side gap. 
+ + - {id: '37_1', script: 'probe_37_zoo_trainer_clip_off.py', seed: '1'} + - {id: '37_7', script: 'probe_37_zoo_trainer_clip_off.py', seed: '7'} + - {id: '37_42', script: 'probe_37_zoo_trainer_clip_off.py', seed: '42'} + - {id: '37_123', script: 'probe_37_zoo_trainer_clip_off.py', seed: '123'} + - {id: '37_456', script: 'probe_37_zoo_trainer_clip_off.py', seed: '456'} + - {id: '37_789', script: 'probe_37_zoo_trainer_clip_off.py', seed: '789'} + - {id: '37_999', script: 'probe_37_zoo_trainer_clip_off.py', seed: '999'} + - {id: '37_1234', script: 'probe_37_zoo_trainer_clip_off.py', seed: '1234'} + - {id: '37_3407', script: 'probe_37_zoo_trainer_clip_off.py', seed: '3407'} + - {id: '37_5678', script: 'probe_37_zoo_trainer_clip_off.py', seed: '5678'} + - {id: '37_9012', script: 'probe_37_zoo_trainer_clip_off.py', seed: '9012'} + - {id: '37_12345', script: 'probe_37_zoo_trainer_clip_off.py', seed: '12345'} + - {id: '37_22222', script: 'probe_37_zoo_trainer_clip_off.py', seed: '22222'} + - {id: '37_31415', script: 'probe_37_zoo_trainer_clip_off.py', seed: '31415'} + - {id: '37_65535', script: 'probe_37_zoo_trainer_clip_off.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_36_zoo_loader_compile_on.py b/tests/mlx_parity/probe_36_zoo_loader_compile_on.py new file mode 100644 index 0000000000..7d902f5f4a --- /dev/null +++ b/tests/mlx_parity/probe_36_zoo_loader_compile_on.py @@ -0,0 +1,164 @@ +"""Probe 36 — probe 34 verbatim but with MLXTrainer's compile knob ON. + +Probe 34 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=False): ~47% +Probe 35 (mlx_lm.load + MLXTrainer + nl=16 + compile=True ): ? +Probe 36 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=True ): ? + +If probe 35 hits ~67% (closing the -14pp trainer gap), probe 36 isolates +the remaining loader-only delta with compile held constant. 
Reads: + 36 ~= 67% -> the loader patches add no real basin drift; compile=False + was the source of the entire end-to-end gap. + 36 ~= 47% -> compile fixes the trainer half, but FastMLXModel's + loader patches independently add a -10pp drift that needs + its own bisection (next: which patch). +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 36: zoo FastMLXModel(dtype=None) + MLXTrainer(compile=True) + last_n={last_n}") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype=None, + text_only=True, max_seq_length=128, random_state=seed, + ) + + import inspect + sig = inspect.signature(FastMLXModel.get_peft_model) + assert "finetune_last_n_layers" in sig.parameters, "zoo build missing the fix" + + model = FastMLXModel.get_peft_model( + model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], + random_state=seed, + finetune_language_layers=True, + finetune_attention_modules=True, + 
finetune_mlp_modules=True, + finetune_last_n_layers=last_n, + use_gradient_checkpointing=False, + ) + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: extra["max_grad_value"] = None + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=True, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe36_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, 
start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("contains 'Unsloth'", contains) + report("generation", repr(gen[:60])) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, + "loader": "FastMLXModel(dtype=None)", + "trainer": "unsloth_zoo MLXTrainer", + "compile": True, + "finetune_last_n_layers": last_n}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_36__s{steps}_d{seed}_nl{last_n}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/mlx_parity/probe_37_zoo_trainer_clip_off.py b/tests/mlx_parity/probe_37_zoo_trainer_clip_off.py new file mode 100644 index 0000000000..627ac0af76 --- /dev/null +++ b/tests/mlx_parity/probe_37_zoo_trainer_clip_off.py @@ -0,0 +1,176 @@ +"""Probe 37 — probe 33 but explicitly set max_grad_value=0.0. + +Probe 31 (mlx_lm.load + manual loop + nl=16 + no clip): 67% +Probe 33 (mlx_lm.load + MLXTrainer + nl=16 + compile=False + max_grad_value=None): 53% +Probe 35 (mlx_lm.load + MLXTrainer + nl=16 + compile=True + max_grad_value=None): 53% + +Round BK ruled compile=True OUT as the trainer-side cause. + +Probe 37 tests the next hypothesis: MLXTrainer's `max_grad_value=None` +silently rebinds to the default 1.0 (fixed in PR #671), so probes that +set max_grad_value=None to mirror mlx-lm CLI's no-clip default were +actually being clipped at +/-1.0 the whole time. Probe 37 bypasses the +bug by passing `max_grad_value=0.0` (which has always disabled clip). + +Reads: + 37 ~= 67% -> elementwise clipping at +/-1.0 was the entire trainer-side + gap. PR #671's None-disables-clip fix is the right closer.
+ 37 ~= 53% -> there is yet another factor inside MLXTrainer that needs + its own bisection. +""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 37: mlx_lm.load + MLXTrainer(compile=False, max_grad_value=0.0) + last_n={last_n} " + f"steps={steps} seed={seed} lr={lr}") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + + model, tokenizer = mlx_load(MODEL_NAME) + model.freeze() + try: num_layers = len(model.layers) + except AttributeError: num_layers = len(model.model.layers) + num_layers = max(1, min(int(last_n), num_layers)) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: + extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: + # KEY DIFFERENCE vs probe 33/35: explicit 
0.0 disables clip even + # on builds where None silently rebinds to 1.0. Once PR #671 + # merges, None will be equivalent. + extra["max_grad_value"] = 0.0 + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe37_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + 
contains = "Unsloth" in gen + report("generation", repr(gen[:160])) + report("contains 'Unsloth'", contains) + + out = { + "config": { + "steps": steps, "seed": seed, "learning_rate": lr, + "loader": "mlx_lm.load", + "trainer": "unsloth_zoo MLXTrainer", + "compile": False, + "num_layers": num_layers, + "max_grad_value": 0.0, + "max_grad_norm": 0.0, + "adam_bias_correction": True, + }, + "rows": rows, + "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, + "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_37__s{steps}_d{seed}_nl{num_layers}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 956fd1c7fdad012af092dc8e04ee4dbdd1318c38 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 17:43:19 +0000 Subject: [PATCH 76/84] mlx_parity Round BM: re-baseline probe 31 to reset confidence Round BL surprises: probe 30 (manual loop + nl=18 + no clip) : 7/15 = 47% probe 34 (FastMLXModel + MLXTrainer + nl=16 + None) : 7/15 = 47% probe 35 (mlx_lm.load + MLXTrainer + nl=16, compile=True, None) : 8/15 = 53% probe 36 (FastMLXModel + MLXTrainer + nl=16, compile=True, None) : 7/15 = 47% probe 37 (mlx_lm.load + MLXTrainer + nl=16, compile=False, 0.0) : 6/15 = 40% Earlier rounds claimed probe 31 (manual loop + nl=16 + no clip) hit 67%, which made the 47-53% MLXTrainer results look like a real trainer-side gap. With probe 37 now lower than the None-clipped probes 33/35, the entire trainer-side delta is suspect and could be seed-pattern noise at n=15. This round re-adds probe 31 to the matrix on the same run so we get a paired fresh number against probes 30/34/35/36/37 on the same 15 seeds: probe 31 ~= 67% -> trainer DOES add a real ~20pp gap; keep digging. probe 31 ~= 47% -> the 'gap' is within seed noise; no defect. Same ZOO_SPEC pin (b137b40) and 15 seeds as the rest of the matrix. 
--- .github/workflows/mlx-parity-probe.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index a381fc7f98..745536efae 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -307,6 +307,32 @@ jobs: - {id: '37_22222', script: 'probe_37_zoo_trainer_clip_off.py', seed: '22222'} - {id: '37_31415', script: 'probe_37_zoo_trainer_clip_off.py', seed: '31415'} - {id: '37_65535', script: 'probe_37_zoo_trainer_clip_off.py', seed: '65535'} + + # ---- Round BM: re-baseline probe 31 to reset confidence ---- + # Round BL data (probe 37 at 40%, probe 30 at 47%, probe 34/36 at 47%, + # probe 35 at 53%) suggests probe 31's earlier 67% may have been + # within seed noise. Re-run probe 31 (manual loop + nl=16 + no clip) + # on the SAME 15 seeds and the same run as probes 30/34/35/36/37 to + # get a paired fresh number. If probe 31 reproduces ~67%, the + # trainer DOES add a real -20pp gap (MLXTrainer 47-53% << manual + # loop 67%). If probe 31 lands at 47-53%, the entire 'gap' is + # within noise and there is no trainer defect to chase. 
+ + - {id: '31_1', script: 'probe_31_num_layers_16.py', seed: '1'} + - {id: '31_7', script: 'probe_31_num_layers_16.py', seed: '7'} + - {id: '31_42', script: 'probe_31_num_layers_16.py', seed: '42'} + - {id: '31_123', script: 'probe_31_num_layers_16.py', seed: '123'} + - {id: '31_456', script: 'probe_31_num_layers_16.py', seed: '456'} + - {id: '31_789', script: 'probe_31_num_layers_16.py', seed: '789'} + - {id: '31_999', script: 'probe_31_num_layers_16.py', seed: '999'} + - {id: '31_1234', script: 'probe_31_num_layers_16.py', seed: '1234'} + - {id: '31_3407', script: 'probe_31_num_layers_16.py', seed: '3407'} + - {id: '31_5678', script: 'probe_31_num_layers_16.py', seed: '5678'} + - {id: '31_9012', script: 'probe_31_num_layers_16.py', seed: '9012'} + - {id: '31_12345', script: 'probe_31_num_layers_16.py', seed: '12345'} + - {id: '31_22222', script: 'probe_31_num_layers_16.py', seed: '22222'} + - {id: '31_31415', script: 'probe_31_num_layers_16.py', seed: '31415'} + - {id: '31_65535', script: 'probe_31_num_layers_16.py', seed: '65535'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 From b27d4f1e4726d721b55cd3f1d064b5057c5ee5d7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 18:38:12 +0000 Subject: [PATCH 77/84] mlx_parity Round BO: pin probe matrix to pad-fix branch (b265d99) Pin ZOO_SPEC to the unsloth-zoo pad-fix branch (b265d99 from fix-mlx-pad-multiple) so probes 30/31/34/35/36/37 measure whether the create_text_batches +1 padding fix closes the basin gap that Round BM identified. 
Expected results: probe 30 (manual, nl=18): unchanged ~47% (manual loop uses mlx-lm's iterate_batches, not zoo's create_text_batches) probe 31 (manual, nl=16): unchanged ~67% (same reason) probe 34 (zoo, nl=16, dtype=None, compile=False): rises toward 67% probe 35 (zoo, nl=16, compile=True): rises toward 67% probe 36 (zoo loader+trainer, compile=True): rises toward 67% probe 37 (zoo, nl=16, compile=False, explicit clip=0): rises toward 67% Same 15-seed list as Round BM for paired comparison. --- .github/workflows/mlx-parity-probe.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 745536efae..bffc33951b 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -366,9 +366,14 @@ jobs: 'sentencepiece==0.2.1' \ 'huggingface-hub==0.36.2' \ 'trl==0.27.0' - # Round BH: pin to the finetune_last_n_layers fix branch - # (b137b40) so probe 32 sees the new parameter. - ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@b137b4058eabf20a3122b67a2c9f04b63fb59568' + # Round BO: pin to the create_text_batches pad-fix branch + # (b265d99) which both (a) contains the finetune_last_n_layers + # parameter from b137b40 and (b) adds the +1 to MLXTrainer's + # padding so it matches mlx-lm's iterate_batches value-for-value. + # Re-runs probes 30/31/34/37 against this build to check whether + # the pad fix closes the basin gap (expected: zoo MLXTrainer + # probes 34/37 should rise from 47%/40% toward probe 31's 67%). 
+ ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@b265d99710d7c61f704c4c740e7917cb0a543c7b' for attempt in 1 2 3; do if pip install "$ZOO_SPEC"; then break; fi if [ "$attempt" -eq 3 ]; then exit 1; fi From e0156007c7a10eedbab0fc02155d5a18a3d60082 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 01:14:40 +0000 Subject: [PATCH 78/84] mlx_parity Round BP: strict step-by-step parity probe (probe 38) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round BO per-step loss data showed: probe 31 (manual) step 2: 5.254807 probe 35 (zoo, compile=True) step 2: 5.276443 (diff = -0.021635) probe 37 (zoo, compile=False) step 2: 5.276443 (diff = -0.021635) Probes 35 vs 37 (both zoo, just different compile/clip) match exactly for the first 3 steps. probe 31 vs zoo diverges from step 2. Step 1 loss is identical across all three, so the divergence is in the gradient applied at step 1 -- a numerical / autodiff-graph difference between mlx-lm CLI's default_loss and zoo's make_baseline_loss_fn (different mask dtype, different safe_targets where, different denominator division). Probe 38 runs both paths back-to-back in one process and captures per-step loss AND per-step grad_norm so the diff is explicit and the step where it first appears is unambiguous. Output JSON has rows_mlxlm, rows_zoo, and diffs arrays. 5 seeds (1, 42, 999, 3407, 22222) — enough for a deterministic diagnostic on the bf16-native + last-16-layers + no-clip config. ZOO_SPEC stays pinned to b265d99 (pad-fix branch). 
--- .github/workflows/mlx-parity-probe.yml | 15 ++ tests/mlx_parity/probe_38_strict_parity.py | 240 +++++++++++++++++++++ 2 files changed, 255 insertions(+) create mode 100644 tests/mlx_parity/probe_38_strict_parity.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index bffc33951b..da9354a585 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -333,6 +333,21 @@ jobs: - {id: '31_22222', script: 'probe_31_num_layers_16.py', seed: '22222'} - {id: '31_31415', script: 'probe_31_num_layers_16.py', seed: '31415'} - {id: '31_65535', script: 'probe_31_num_layers_16.py', seed: '65535'} + + # ---- Round BP: strict step-by-step parity diagnostic ---- + # Round BO per-step loss data showed probe 31 (manual loop) and + # zoo probes 35/37 diverge from step 2 onwards by 0.01-0.06, + # even though step 1's forward loss is identical. The gradient + # applied at step 1 differs. Probe 38 runs both paths back-to-back + # in a single process and captures per-step loss AND per-step + # grad_norm so we can pin where the numerical divergence starts. + # Only 5 seeds needed for a value-for-value diagnostic. 
+ + - {id: '38_1', script: 'probe_38_strict_parity.py', seed: '1'} + - {id: '38_42', script: 'probe_38_strict_parity.py', seed: '42'} + - {id: '38_999', script: 'probe_38_strict_parity.py', seed: '999'} + - {id: '38_3407', script: 'probe_38_strict_parity.py', seed: '3407'} + - {id: '38_22222', script: 'probe_38_strict_parity.py', seed: '22222'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_38_strict_parity.py b/tests/mlx_parity/probe_38_strict_parity.py new file mode 100644 index 0000000000..a6f5ddc504 --- /dev/null +++ b/tests/mlx_parity/probe_38_strict_parity.py @@ -0,0 +1,240 @@ +"""Probe 38 — strict numerical parity between mlx-lm manual loop and +zoo MLXTrainer on the same seed, capturing per-step loss AND per-step +grad_norm so we can diff value-for-value. + +Existing probes only compared endpoint loss (all hit 0) and greedy-decode +pass rate (varies 40-67% across configs). Per-step loss data from +Round BO showed that probe 31 (manual) vs probe 35/37 (zoo) diverges +from step 2 onward by ~0.01-0.06 — the gradient applied at step 1 +differs even though step 1's forward loss is identical. This probe +isolates that to a single run with paired per-step diagnostics. + +Output: a JSON with two parallel rows arrays (`rows_mlxlm`, +`rows_zoo`) plus computed per-step diffs. If grad_norm differs at +step 1, the loss-function graph or autodiff path is the cause. If +grad_norm matches at step 1 but loss diverges at step 2, the +optimizer update step is the cause. 
+""" +import json +import os +import sys +import dataclasses +import random +from functools import partial +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def _run_mlxlm_manual(seed, steps, lr, last_n): + """Reproduce probe 31's manual loop and capture per-step loss + grad_norm.""" + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_map, tree_flatten + + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + model, tokenizer = mlx_load(MODEL_NAME) + mx.random.seed(seed) # mlx-lm CLI lora.py:223 order + model.freeze() + + actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers) + num_layers = max(1, min(int(last_n), actual_layers)) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + if mx.metal.is_available(): + 
mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"]) + + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + rows = [] + np.random.seed(seed) + batch_iter = iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True) + for it in range(1, steps + 1): + batch = next(batch_iter) + # Compute grad_norm BEFORE the compiled step (extra forward+backward but + # gives us a value-for-value comparable number with zoo's reporting). + (loss_pre, _), grad_pre = loss_value_and_grad(model, *batch) + flat = tree_flatten(grad_pre) + grad_norm_sq = mx.array(0.0, dtype=mx.float32) + for _name, g in flat: + grad_norm_sq = grad_norm_sq + mx.sum(g.astype(mx.float32) ** 2) + grad_norm = mx.sqrt(grad_norm_sq) + mx.eval(grad_norm, loss_pre) + gn = float(grad_norm.item()) + # Now do the real optimizer step + lvalue, toks, _ = step(batch, None, True) + mx.eval(state, lvalue, toks) + rows.append({"step": it, "loss": float(lvalue.item()), "grad_norm": gn}) + + return rows + + +def _run_zoo_trainer(seed, steps, lr, last_n): + """Reproduce probe 37's zoo path and capture per-step loss + grad_norm.""" + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + + model, tokenizer = mlx_load(MODEL_NAME) + mx.random.seed(seed) + model.freeze() + actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers) + num_layers = max(1, min(int(last_n), actual_layers)) + 
linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True + if "max_grad_value" in fields_supported: extra["max_grad_value"] = 0.0 # explicit no-clip + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=True, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe38_zoo_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + grad_norms_by_step = {} + + def _on_step(*args): + # MLXTrainingArguments callback signature: (step, max_steps, loss, grad_norm, lr, tokens_sec, peak_mem) + # We capture step + loss; grad_norm may be the 4th arg. 
+ if len(args) < 3: return + step_no = int(args[0]) + loss = float(args[2]) + gn = None + if len(args) >= 4 and args[3] is not None: + try: gn = float(args[3]) + except (TypeError, ValueError): gn = None + rows.append({"step": step_no, "loss": loss, "grad_norm": gn}) + + trainer.add_step_callback(_on_step) + trainer.train() + return rows + + +def main() -> int: + steps = _env_int("MLX_STEPS", 8) # only need a few steps to spot divergence + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 38: strict step-by-step parity (mlx-lm manual vs zoo MLXTrainer) seed={seed}") + + section("Run 1: mlx-lm manual loop") + rows_mlxlm = _run_mlxlm_manual(seed, steps, lr, last_n) + for r in rows_mlxlm: + print(f" step {r['step']:>2}: loss={r['loss']:.6f} grad_norm={r['grad_norm']:.6f}") + + section("Run 2: zoo MLXTrainer (explicit no-clip)") + rows_zoo = _run_zoo_trainer(seed, steps, lr, last_n) + for r in rows_zoo: + gn = r['grad_norm'] + gn_s = f"{gn:.6f}" if gn is not None else "n/a" + print(f" step {r['step']:>2}: loss={r['loss']:.6f} grad_norm={gn_s}") + + section("Per-step diff (mlx-lm - zoo)") + diffs = [] + for r1, r2 in zip(rows_mlxlm, rows_zoo): + if r1['step'] != r2['step']: continue + loss_diff = r1['loss'] - r2['loss'] + gn1 = r1.get('grad_norm'); gn2 = r2.get('grad_norm') + gn_diff = (gn1 - gn2) if (gn1 is not None and gn2 is not None) else None + gn_s = f"{gn_diff:+.6f}" if gn_diff is not None else "n/a" + print(f" step {r1['step']:>2}: dloss={loss_diff:+.6f} dgrad_norm={gn_s}") + diffs.append({ + "step": r1['step'], + "loss_diff": loss_diff, + "grad_norm_diff": gn_diff, + }) + + out = { + "config": {"seed": seed, "steps": steps, "lr": lr, "last_n": last_n}, + "rows_mlxlm": rows_mlxlm, + "rows_zoo": rows_zoo, + "diffs": diffs, + } + fname = f"probe_38__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) 
From 55e0896f2aa3fe7f4d71d16bb5228228c63fd262 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 01:19:04 +0000 Subject: [PATCH 79/84] mlx_parity: re-trigger Round BP after spurious cancellation From 261a66b78dfb5fc3d43f18410fd78a64f62173c3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 02:31:25 +0000 Subject: [PATCH 80/84] mlx_parity Round BP: fix probe 38 callback arg index for grad_norm MLXTrainer's step callback signature is (current_step, total_steps, train_loss, lr_val, tokens_sec, peak_mem, elapsed_total, trained_tokens, grad_norm_val) Probe 38 was reading args[3] thinking it was grad_norm, but args[3] is lr_val -- which is constant 0.001 on a constant LR schedule, so every row reported grad_norm=0.001 regardless of actual gradient. grad_norm_val is args[8]. Also: the same probe run conclusively showed the per-step LOSS matches exactly (dloss = 0 across all 30 steps and 5 seeds), so mlx-lm vs zoo MLXTrainer ARE numerically identical at the loss level when the probe re-seeds mx.random AFTER mlx_load (matching mlx-lm CLI's lora.py:223 order). The Round BO step-2 divergence between probe 31 and probe 33/35/37 was caused by those probes NOT re-seeding after load -- not by any zoo-side numerical defect. --- tests/mlx_parity/probe_38_strict_parity.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/mlx_parity/probe_38_strict_parity.py b/tests/mlx_parity/probe_38_strict_parity.py index a6f5ddc504..244398add1 100644 --- a/tests/mlx_parity/probe_38_strict_parity.py +++ b/tests/mlx_parity/probe_38_strict_parity.py @@ -175,14 +175,17 @@ def _run_zoo_trainer(seed, steps, lr, last_n): grad_norms_by_step = {} def _on_step(*args): - # MLXTrainingArguments callback signature: (step, max_steps, loss, grad_norm, lr, tokens_sec, peak_mem) - # We capture step + loss; grad_norm may be the 4th arg. 
+ # MLXTrainer callback signature (unsloth_zoo/mlx/trainer.py:1190): + # (current_step, total_steps, train_loss, lr_val, tokens_sec, + # peak_mem, elapsed_total, trained_tokens, grad_norm_val) + # grad_norm is args[8], NOT args[3]. (args[3] is lr_val and was being + # mis-read as a constant 0.001 placeholder in earlier probe runs.) if len(args) < 3: return step_no = int(args[0]) loss = float(args[2]) gn = None - if len(args) >= 4 and args[3] is not None: - try: gn = float(args[3]) + if len(args) >= 9 and args[8] is not None: + try: gn = float(args[8]) except (TypeError, ValueError): gn = None rows.append({"step": step_no, "loss": loss, "grad_norm": gn}) From 72fcfbcceca8f6c14afcb0fa59bb66f599471940 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 03:48:05 +0000 Subject: [PATCH 81/84] mlx_parity Round BQ: FastMLXModel.get_peft_model parity probe (probe 39) Probe 38 v2 conclusively showed `mlx_lm.load + linear_to_lora_layers + manual @mx.compile loop` matches `zoo MLXTrainer` step-for-step at the loss level (15/15 zero-diff). But probes that went through FastMLXModel.from_pretrained + FastMLXModel.get_peft_model (32 / 34 / 36) still hit 47% greedy pass rate vs 67% for mlx-lm CLI's basin. Hypothesis: zoo's get_peft_model already re-seeds mx.random before linear_to_lora_layers (loader.py:2767), but something else between from_pretrained's exit and that reseed -- or in get_peft_model's key resolution -- consumes mx.random or changes the LoRA-module creation order so the resulting lora_a matrices differ from the mlx-lm CLI path. Probe 39 isolates the LoRA-init pipeline by running both setups through the IDENTICAL manual training loop in one process: Path A: mlx_lm.load -> mx.random.seed(seed) AFTER load -> linear_to_lora_layers(model, 16, {"keys": [suffix list]}) Path B: FastMLXModel.from_pretrained(random_state=seed) -> FastMLXModel.get_peft_model( finetune_last_n_layers=16, random_state=seed, ...) 
Both paths then go through the same manual @mx.compile loop with the same optim.AdamW(...). If per-step loss diff is non-zero, the divergence is upstream of the trainer (in FastMLXModel's loader or get_peft_model). If zero, LoRA init matches and the basin gap is elsewhere. 5 seeds matching probe 38 (1, 42, 999, 3407, 22222) for paired comparison. --- .github/workflows/mlx-parity-probe.yml | 18 ++ .../probe_39_fastmlxmodel_parity.py | 205 ++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 tests/mlx_parity/probe_39_fastmlxmodel_parity.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index da9354a585..a5c25d24e5 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -348,6 +348,24 @@ jobs: - {id: '38_999', script: 'probe_38_strict_parity.py', seed: '999'} - {id: '38_3407', script: 'probe_38_strict_parity.py', seed: '3407'} - {id: '38_22222', script: 'probe_38_strict_parity.py', seed: '22222'} + + # ---- Round BQ: FastMLXModel.get_peft_model parity ---- + # Probe 38 v2 proved mlx_lm.load + linear_to_lora_layers matches + # zoo MLXTrainer step-for-step at the loss level. But probes that + # went through FastMLXModel + get_peft_model (32 / 34 / 36) still + # diverge from mlx-lm CLI's basin family (47% vs 67% greedy pass). + # Probe 39 isolates the LoRA-init pipeline by running BOTH paths + # through the same manual training loop. If the per-step loss + # diff is non-zero, the divergence is in + # FastMLXModel.from_pretrained or .get_peft_model upstream of + # the trainer. If zero, the LoRA init matches and the basin + # gap must come from somewhere we haven't bisected yet. 
+ + - {id: '39_1', script: 'probe_39_fastmlxmodel_parity.py', seed: '1'} + - {id: '39_42', script: 'probe_39_fastmlxmodel_parity.py', seed: '42'} + - {id: '39_999', script: 'probe_39_fastmlxmodel_parity.py', seed: '999'} + - {id: '39_3407', script: 'probe_39_fastmlxmodel_parity.py', seed: '3407'} + - {id: '39_22222', script: 'probe_39_fastmlxmodel_parity.py', seed: '22222'} steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 diff --git a/tests/mlx_parity/probe_39_fastmlxmodel_parity.py b/tests/mlx_parity/probe_39_fastmlxmodel_parity.py new file mode 100644 index 0000000000..db001b58ef --- /dev/null +++ b/tests/mlx_parity/probe_39_fastmlxmodel_parity.py @@ -0,0 +1,205 @@ +"""Probe 39 — strict step-by-step parity between mlx-lm CLI's +LoRA-init path and FastMLXModel + get_peft_model. + +Probe 38 v2 showed that mlx-lm manual loop + linear_to_lora_layers +matches zoo MLXTrainer + linear_to_lora_layers value-for-value at the +loss level when both reseed mx.random AFTER mlx_load. But probes that +went through FastMLXModel.from_pretrained + FastMLXModel.get_peft_model +(32 / 34 / 36) still hit 47% greedy pass rate vs 67% for mlx-lm CLI. + +Hypothesis: the seeding in zoo's get_peft_model (`_seed_mlx_random_state +(random_state)` at line 2767 of loader.py) is the right place, but +something else in FastMLXModel.from_pretrained or get_peft_model +consumes mx.random state between the seed and `linear_to_lora_layers`, +or the LoRA-key resolution / iteration order produces a different +LoRA-module-creation order than the explicit-keys-list call in +mlx-lm CLI. + +This probe runs both setups in one process with paired seeds and +captures per-step loss + grad_norm so the divergence point (if any) +is visible explicitly. + +Path A: mlx-lm CLI style. mlx_lm.load -> mx.random.seed(seed) after +load -> linear_to_lora_layers(model, 16, {"keys": [suffix list]}) -> +manual @mx.compile loop with bare optim.AdamW. 
+ +Path B: FastMLXModel.from_pretrained(random_state=seed) -> +FastMLXModel.get_peft_model(finetune_last_n_layers=16, +random_state=seed) -> SAME manual @mx.compile loop, SAME optimizer +construction (constructed here, not from MLXTrainer). + +We deliberately re-use the same manual training loop for both paths +so the comparison isolates the LoRA-init pipeline only. +""" +import json +import os +import sys +import dataclasses +import random +from functools import partial +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def _run_training(model, tokenizer, seed, steps, lr): + """Shared manual-loop training driver -- identical for both paths so + any divergence is attributable to the LoRA-init pipeline upstream. + + Returns rows: list[{step, loss, grad_norm}]. 
+ """ + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_map, tree_flatten + from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + if mx.metal.is_available(): + mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"]) + + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + rows = [] + np.random.seed(seed) + batch_iter = iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True) + for it in range(1, steps + 1): + batch = next(batch_iter) + # Compute grad_norm BEFORE the compiled step using the same forward + # path; this gives us a value-for-value comparable number across paths. 
+ (_, _), grad_pre = loss_value_and_grad(model, *batch) + grad_norm_sq = mx.array(0.0, dtype=mx.float32) + for _name, g in tree_flatten(grad_pre): + grad_norm_sq = grad_norm_sq + mx.sum(g.astype(mx.float32) ** 2) + grad_norm = mx.sqrt(grad_norm_sq) + mx.eval(grad_norm) + gn = float(grad_norm.item()) + lvalue, toks, _ = step(batch, None, True) + mx.eval(state, lvalue, toks) + rows.append({"step": it, "loss": float(lvalue.item()), "grad_norm": gn}) + + return rows + + +def _path_a_mlxlm(seed, steps, lr, last_n): + """mlx-lm CLI style: mlx_lm.load -> seed AFTER -> explicit-keys LoRA.""" + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from mlx_lm import load as mlx_load + from mlx_lm.tuner.utils import linear_to_lora_layers + + model, tokenizer = mlx_load(MODEL_NAME) + mx.random.seed(seed) # mlx-lm CLI lora.py:223 + model.freeze() + actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers) + num_layers = max(1, min(int(last_n), actual_layers)) + linear_to_lora_layers(model, num_layers, { + "rank": 8, "scale": 2.0, "dropout": 0.0, + "keys": ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj", + "mlp.gate_proj","mlp.up_proj","mlp.down_proj"], + }) + return _run_training(model, tokenizer, seed, steps, lr) + + +def _path_b_fastmlxmodel(seed, steps, lr, last_n): + """zoo FastMLXModel.from_pretrained + FastMLXModel.get_peft_model.""" + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from unsloth_zoo.mlx.loader import FastMLXModel + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype=None, + text_only=True, max_seq_length=128, random_state=seed, + ) + model = FastMLXModel.get_peft_model( + model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], + random_state=seed, + finetune_language_layers=True, + 
finetune_attention_modules=True, + finetune_mlp_modules=True, + finetune_last_n_layers=last_n, + use_gradient_checkpointing=False, + ) + return _run_training(model, tokenizer, seed, steps, lr) + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 39: FastMLXModel get_peft_model vs mlx-lm CLI LoRA init seed={seed}") + + section("Path A: mlx_lm.load + mx.random.seed AFTER load + linear_to_lora_layers") + rows_a = _path_a_mlxlm(seed, steps, lr, last_n) + for r in rows_a: + print(f" step {r['step']:>2}: loss={r['loss']:.6f} grad_norm={r['grad_norm']:.6f}") + + section("Path B: FastMLXModel.from_pretrained + FastMLXModel.get_peft_model") + rows_b = _path_b_fastmlxmodel(seed, steps, lr, last_n) + for r in rows_b: + print(f" step {r['step']:>2}: loss={r['loss']:.6f} grad_norm={r['grad_norm']:.6f}") + + section("Per-step diff (Path A - Path B)") + diffs = [] + for ra, rb in zip(rows_a, rows_b): + if ra['step'] != rb['step']: continue + dl = ra['loss'] - rb['loss'] + dg = ra['grad_norm'] - rb['grad_norm'] + print(f" step {ra['step']:>2}: dloss={dl:+.6f} dgrad_norm={dg:+.6f}") + diffs.append({"step": ra['step'], "loss_diff": dl, "grad_norm_diff": dg}) + + out = { + "config": {"seed": seed, "steps": steps, "lr": lr, "last_n": last_n}, + "rows_mlxlm": rows_a, + "rows_fastmlxmodel": rows_b, + "diffs": diffs, + } + fname = f"probe_39__s{steps}_d{seed}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 12dba6f3ed03b788da917c686baebd0aa3310e1b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 05:40:33 +0000 Subject: [PATCH 82/84] mlx_parity Round BR: pin probe matrix to PR #674 seed-ordering fix Pin ZOO_SPEC to 0124424 (fix-mlx-get-peft-model-seed HEAD), which stacks on PR #669's b137b40 so it carries both finetune_last_n_layers and the new 
seed-immediately-before-linear_to_lora_layers ordering inside FastMLXModel.get_peft_model. Trim the matrix to the question Round BR needs to answer: did moving _seed_mlx_random_state(random_state) from the top of get_peft_model (~165 lines above linear_to_lora_layers) to immediately before each LoRA construction close the FastMLXModel-path basin gap end-to-end? Round BR matrix: - probe 31 x 15 seeds: mlx-lm CLI manual loop. Unchanged control. - probe 34 x 15 seeds: FastMLXModel(dtype=None) + MLXTrainer + nl=16. Was 47% in Round BO. Expected: ~67% under PR #674. - probe 36 x 15 seeds: same + compile=True. Was 47%. Expected: ~67%. - probe 39 x 5 seeds: strict per-step diff of FastMLXModel vs mlx-lm CLI manual loop. Was non-zero from step 2. Expected: dloss=0 step-for-step under PR #674. Probes 30/35/37/38 dropped from this matrix (mlx-lm CLI controls or non-FastMLXModel paths that are no longer the live suspect). History retains them. --- .github/workflows/mlx-parity-probe.yml | 98 +++++++++++--------------- 1 file changed, 40 insertions(+), 58 deletions(-) diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index a5c25d24e5..a6118c49e1 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -121,21 +121,10 @@ jobs: # Probe 30 reseeds AFTER load + adds set_wired_limit. If 67%, # the seed-order is the bug; the FIX is to seed AFTER load. 
- - {id: '30_1', script: 'probe_30_seed_after_load.py', seed: '1'} - - {id: '30_7', script: 'probe_30_seed_after_load.py', seed: '7'} - - {id: '30_42', script: 'probe_30_seed_after_load.py', seed: '42'} - - {id: '30_123', script: 'probe_30_seed_after_load.py', seed: '123'} - - {id: '30_456', script: 'probe_30_seed_after_load.py', seed: '456'} - - {id: '30_789', script: 'probe_30_seed_after_load.py', seed: '789'} - - {id: '30_999', script: 'probe_30_seed_after_load.py', seed: '999'} - - {id: '30_1234', script: 'probe_30_seed_after_load.py', seed: '1234'} - - {id: '30_3407', script: 'probe_30_seed_after_load.py', seed: '3407'} - - {id: '30_5678', script: 'probe_30_seed_after_load.py', seed: '5678'} - - {id: '30_9012', script: 'probe_30_seed_after_load.py', seed: '9012'} - - {id: '30_12345', script: 'probe_30_seed_after_load.py', seed: '12345'} - - {id: '30_22222', script: 'probe_30_seed_after_load.py', seed: '22222'} - - {id: '30_31415', script: 'probe_30_seed_after_load.py', seed: '31415'} - - {id: '30_65535', script: 'probe_30_seed_after_load.py', seed: '65535'} + # Round BR drops probes 30/35/37/38 from this matrix (they + # targeted the mlx-lm CLI path and earlier zoo variants that are + # no longer the live suspect after PR #674). Git history retains + # them. # ---- Round BA: compile-mode hypothesis ---- # Round AZ rejected numpy-RNG (probe 22 = probe 21). The @@ -240,21 +229,7 @@ jobs: # ~67%, the -14pp is a probe-configuration artifact, not a # MLXTrainer defect. 
- - {id: '35_1', script: 'probe_35_zoo_trainer_compile_on.py', seed: '1'} - - {id: '35_7', script: 'probe_35_zoo_trainer_compile_on.py', seed: '7'} - - {id: '35_42', script: 'probe_35_zoo_trainer_compile_on.py', seed: '42'} - - {id: '35_123', script: 'probe_35_zoo_trainer_compile_on.py', seed: '123'} - - {id: '35_456', script: 'probe_35_zoo_trainer_compile_on.py', seed: '456'} - - {id: '35_789', script: 'probe_35_zoo_trainer_compile_on.py', seed: '789'} - - {id: '35_999', script: 'probe_35_zoo_trainer_compile_on.py', seed: '999'} - - {id: '35_1234', script: 'probe_35_zoo_trainer_compile_on.py', seed: '1234'} - - {id: '35_3407', script: 'probe_35_zoo_trainer_compile_on.py', seed: '3407'} - - {id: '35_5678', script: 'probe_35_zoo_trainer_compile_on.py', seed: '5678'} - - {id: '35_9012', script: 'probe_35_zoo_trainer_compile_on.py', seed: '9012'} - - {id: '35_12345', script: 'probe_35_zoo_trainer_compile_on.py', seed: '12345'} - - {id: '35_22222', script: 'probe_35_zoo_trainer_compile_on.py', seed: '22222'} - - {id: '35_31415', script: 'probe_35_zoo_trainer_compile_on.py', seed: '31415'} - - {id: '35_65535', script: 'probe_35_zoo_trainer_compile_on.py', seed: '65535'} + # (probe 35 dropped for Round BR — see header note.) # ---- Round BK: zoo loader + MLXTrainer(compile=True) ---- # Probe 34 (FastMLXModel(dtype=None) + MLXTrainer + nl=16 + compile=False): ~47% @@ -292,21 +267,7 @@ jobs: # disabled, regardless of PR #671). If 37 ~= 67%, elementwise # clipping at 1.0 was the entire trainer-side gap. 
- - {id: '37_1', script: 'probe_37_zoo_trainer_clip_off.py', seed: '1'} - - {id: '37_7', script: 'probe_37_zoo_trainer_clip_off.py', seed: '7'} - - {id: '37_42', script: 'probe_37_zoo_trainer_clip_off.py', seed: '42'} - - {id: '37_123', script: 'probe_37_zoo_trainer_clip_off.py', seed: '123'} - - {id: '37_456', script: 'probe_37_zoo_trainer_clip_off.py', seed: '456'} - - {id: '37_789', script: 'probe_37_zoo_trainer_clip_off.py', seed: '789'} - - {id: '37_999', script: 'probe_37_zoo_trainer_clip_off.py', seed: '999'} - - {id: '37_1234', script: 'probe_37_zoo_trainer_clip_off.py', seed: '1234'} - - {id: '37_3407', script: 'probe_37_zoo_trainer_clip_off.py', seed: '3407'} - - {id: '37_5678', script: 'probe_37_zoo_trainer_clip_off.py', seed: '5678'} - - {id: '37_9012', script: 'probe_37_zoo_trainer_clip_off.py', seed: '9012'} - - {id: '37_12345', script: 'probe_37_zoo_trainer_clip_off.py', seed: '12345'} - - {id: '37_22222', script: 'probe_37_zoo_trainer_clip_off.py', seed: '22222'} - - {id: '37_31415', script: 'probe_37_zoo_trainer_clip_off.py', seed: '31415'} - - {id: '37_65535', script: 'probe_37_zoo_trainer_clip_off.py', seed: '65535'} + # (probe 37 dropped for Round BR — see header note.) # ---- Round BM: re-baseline probe 31 to reset confidence ---- # Round BL data (probe 37 at 40%, probe 30 at 47%, probe 34/36 at 47%, @@ -343,11 +304,9 @@ jobs: # grad_norm so we can pin where the numerical divergence starts. # Only 5 seeds needed for a value-for-value diagnostic. 
- - {id: '38_1', script: 'probe_38_strict_parity.py', seed: '1'} - - {id: '38_42', script: 'probe_38_strict_parity.py', seed: '42'} - - {id: '38_999', script: 'probe_38_strict_parity.py', seed: '999'} - - {id: '38_3407', script: 'probe_38_strict_parity.py', seed: '3407'} - - {id: '38_22222', script: 'probe_38_strict_parity.py', seed: '22222'} + # (probe 38 dropped for Round BR — already proved per-step parity + # for the non-FastMLXModel path; Round BR rotates focus onto the + # FastMLXModel path's probe 39 strict diagnostic.) # ---- Round BQ: FastMLXModel.get_peft_model parity ---- # Probe 38 v2 proved mlx_lm.load + linear_to_lora_layers matches @@ -366,6 +325,28 @@ jobs: - {id: '39_999', script: 'probe_39_fastmlxmodel_parity.py', seed: '999'} - {id: '39_3407', script: 'probe_39_fastmlxmodel_parity.py', seed: '3407'} - {id: '39_22222', script: 'probe_39_fastmlxmodel_parity.py', seed: '22222'} + + # ---- Round BR: verify PR #674 seed-ordering fix end-to-end ---- + # PR #674 (unsloth-zoo) moves _seed_mlx_random_state(random_state) + # in FastMLXModel.get_peft_model from the top of the function + # (~165 lines above linear_to_lora_layers) to immediately before + # each linear_to_lora_layers call. Hypothesis: lazy mx.* state + # advances between the old seed call and lora_a init were causing + # lora_a to draw from a different RNG position than mlx-lm CLI + # despite both paths re-seeding to the same int. + # + # ZOO_SPEC is now pinned to 0124424 (PR #674 HEAD). Re-run the + # FastMLXModel-path probes that were divergent in Rounds BJ-BQ: + # * probe 34 x 15 seeds: FastMLXModel(dtype=None) + MLXTrainer + # + nl=16. Was 47%. Expected: ~67%. + # * probe 36 x 15 seeds: same + compile=True. Was 47%. Expected: ~67%. + # * probe 39 x 5 seeds : strict step-by-step diff vs mlx-lm CLI. + # Expected: dloss = 0 step-for-step. + # * probe 31 x 15 seeds: mlx-lm CLI manual loop. Unchanged + # control. Expected: ~67% as before. 
+ # If probes 34/36 now match probe 31's pass rate and probe 39's + # diff drops to zero, the seed-ordering fix closes the basin gap + # end-to-end through the public FastMLXModel API. steps: - name: Harden runner (audit) uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1 @@ -399,14 +380,15 @@ jobs: 'sentencepiece==0.2.1' \ 'huggingface-hub==0.36.2' \ 'trl==0.27.0' - # Round BO: pin to the create_text_batches pad-fix branch - # (b265d99) which both (a) contains the finetune_last_n_layers - # parameter from b137b40 and (b) adds the +1 to MLXTrainer's - # padding so it matches mlx-lm's iterate_batches value-for-value. - # Re-runs probes 30/31/34/37 against this build to check whether - # the pad fix closes the basin gap (expected: zoo MLXTrainer - # probes 34/37 should rise from 47%/40% toward probe 31's 67%). - ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@b265d99710d7c61f704c4c740e7917cb0a543c7b' + # Round BR: pin to PR #674's fix-mlx-get-peft-model-seed branch + # (0124424). The commit stacks on top of PR #669's b137b40 so it + # carries BOTH the finetune_last_n_layers parameter AND the new + # seed-immediately-before-linear_to_lora_layers ordering inside + # FastMLXModel.get_peft_model. Round BR re-runs the previously + # divergent FastMLXModel-path probes (34, 36) plus the strict + # diagnostic probe 39 to check whether moving the seed call + # closer to the LoRA construction closes the basin gap end-to-end. 
+ ZOO_SPEC='unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo@012442488894bea07b045c12fcfb27f9f691095d' for attempt in 1 2 3; do if pip install "$ZOO_SPEC"; then break; fi if [ "$attempt" -eq 3 ]; then exit 1; fi From 552ae632a2b5b7c792860f91e70bdb79a54455dd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 06:38:58 +0000 Subject: [PATCH 83/84] mlx_parity Round BS: bisect MLXTrainer vs manual-loop residual gap PR #674 (verified by Round BR probe 39: dloss=0 step-for-step across 5 seeds) closed the LoRA-init gap between FastMLXModel and mlx-lm CLI. But probes 34/36 (FastMLXModel + MLXTrainer) still hit 47% greedy pass rate vs probe 31's (mlx_lm.load + manual loop) 67% on the same 15 seeds. Probes 34/36 share an identical pass/fail pattern, so the compile flag is a no-op for the basin -- the residual gap is somewhere else. Round BS introduces probe 40 = FastMLXModel.from_pretrained + FastMLXModel.get_peft_model(finetune_last_n_layers=16) + probe 31's exact manual @mx.compile loop. Same 15 seeds as probes 31 / 34 for direct paired comparison. Read: probe 40 ~ 67% -> MLXTrainer.train IS the remaining gap. The manual loop reproduces probe 31's basin under the FastMLXModel loader path. probe 40 ~ 47% -> FastMLXModel.from_pretrained adds drift downstream of get_peft_model that probe 39's 5-seed strict diagnostic missed; bisect the loader next round. BS matrix (45 jobs): - probe 31 x 15 seeds (mlx-lm CLI manual loop, unchanged control) - probe 34 x 15 seeds (FastMLXModel + MLXTrainer, paired) - probe 40 x 15 seeds (FastMLXModel + manual loop, new) Probes 36 / 39 dropped (Round BR conclusions established). ZOO_SPEC stays pinned at PR #674 HEAD (0124424). 
--- .github/workflows/mlx-parity-probe.yml | 52 +++-- .../probe_40_fastmlxmodel_manual_loop.py | 184 ++++++++++++++++++ 2 files changed, 216 insertions(+), 20 deletions(-) create mode 100644 tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index a6118c49e1..23aac2f081 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -241,21 +241,9 @@ jobs: # 36 ~= 47% -> compile fixes the trainer half, but loader patches # independently add a -10pp drift to bisect next. - - {id: '36_1', script: 'probe_36_zoo_loader_compile_on.py', seed: '1'} - - {id: '36_7', script: 'probe_36_zoo_loader_compile_on.py', seed: '7'} - - {id: '36_42', script: 'probe_36_zoo_loader_compile_on.py', seed: '42'} - - {id: '36_123', script: 'probe_36_zoo_loader_compile_on.py', seed: '123'} - - {id: '36_456', script: 'probe_36_zoo_loader_compile_on.py', seed: '456'} - - {id: '36_789', script: 'probe_36_zoo_loader_compile_on.py', seed: '789'} - - {id: '36_999', script: 'probe_36_zoo_loader_compile_on.py', seed: '999'} - - {id: '36_1234', script: 'probe_36_zoo_loader_compile_on.py', seed: '1234'} - - {id: '36_3407', script: 'probe_36_zoo_loader_compile_on.py', seed: '3407'} - - {id: '36_5678', script: 'probe_36_zoo_loader_compile_on.py', seed: '5678'} - - {id: '36_9012', script: 'probe_36_zoo_loader_compile_on.py', seed: '9012'} - - {id: '36_12345', script: 'probe_36_zoo_loader_compile_on.py', seed: '12345'} - - {id: '36_22222', script: 'probe_36_zoo_loader_compile_on.py', seed: '22222'} - - {id: '36_31415', script: 'probe_36_zoo_loader_compile_on.py', seed: '31415'} - - {id: '36_65535', script: 'probe_36_zoo_loader_compile_on.py', seed: '65535'} + # (probe 36 dropped for Round BS -- Round BR established it has + # an identical pass/fail pattern to probe 34, so compile flag is + # a no-op for the basin.) 
# ---- Round BL: bypass the max_grad_value=None silent-clip bug ---- # Probe 35 (compile=True) hit 53% (same as probe 33's 53%) -- so @@ -320,11 +308,35 @@ jobs: # the trainer. If zero, the LoRA init matches and the basin # gap must come from somewhere we haven't bisected yet. - - {id: '39_1', script: 'probe_39_fastmlxmodel_parity.py', seed: '1'} - - {id: '39_42', script: 'probe_39_fastmlxmodel_parity.py', seed: '42'} - - {id: '39_999', script: 'probe_39_fastmlxmodel_parity.py', seed: '999'} - - {id: '39_3407', script: 'probe_39_fastmlxmodel_parity.py', seed: '3407'} - - {id: '39_22222', script: 'probe_39_fastmlxmodel_parity.py', seed: '22222'} + # (probe 39 dropped for Round BS -- Round BR already verified + # dloss=0 step-for-step under PR #674; no need to re-check.) + + # ---- Round BS: bisect residual MLXTrainer vs manual-loop gap ---- + # PR #674 verified by probe 39 (dloss = 0 step-for-step across 5 seeds). + # But probes 34/36 (FastMLXModel + MLXTrainer) still hit 47% greedy + # pass vs probe 31's (mlx_lm.load + manual loop) 67% on 15 seeds, and + # probes 34/36 share an identical pass/fail pattern (compile flag is + # a no-op for the basin). Probe 40 = FastMLXModel loader + probe 31's + # exact manual @mx.compile loop. If 67%, MLXTrainer.train IS the + # remaining gap. If 47%, FastMLXModel.from_pretrained adds drift + # downstream of get_peft_model that probe 39's 5-seed diagnostic + # missed -- bisect the loader next round. 
+ + - {id: '40_1', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '1'} + - {id: '40_7', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '7'} + - {id: '40_42', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '42'} + - {id: '40_123', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '123'} + - {id: '40_456', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '456'} + - {id: '40_789', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '789'} + - {id: '40_999', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '999'} + - {id: '40_1234', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '1234'} + - {id: '40_3407', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '3407'} + - {id: '40_5678', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '5678'} + - {id: '40_9012', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '9012'} + - {id: '40_12345', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '12345'} + - {id: '40_22222', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '22222'} + - {id: '40_31415', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '31415'} + - {id: '40_65535', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '65535'} # ---- Round BR: verify PR #674 seed-ordering fix end-to-end ---- # PR #674 (unsloth-zoo) moves _seed_mlx_random_state(random_state) diff --git a/tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py b/tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py new file mode 100644 index 0000000000..80a582d40d --- /dev/null +++ b/tests/mlx_parity/probe_40_fastmlxmodel_manual_loop.py @@ -0,0 +1,184 @@ +"""Probe 40 -- FastMLXModel loader + manual @mx.compile loop. + +Round BS bisection of the residual 47% vs 67% gap that survived PR #674. 
+ +After PR #674's seed-ordering fix, probe 39 proved +FastMLXModel.from_pretrained + FastMLXModel.get_peft_model produces +bit-identical losses and gradient norms vs mlx_lm.load + +linear_to_lora_layers when both feed the same manual @mx.compile +training loop (5 seeds x 30 steps, dloss = 0.0, dgrad_norm = 0.0). + +But probes 34 / 36 (`FastMLXModel + MLXTrainer.train`) still hit 47% +greedy pass rate vs probe 31's (`mlx_lm.load + manual loop`) 67% on +the same 15 seeds. Probes 34 and 36 share an identical pass/fail +pattern, so `compile=True/False` is a no-op for the basin. + +Two remaining suspects for the gap: + (a) MLXTrainer.train introduces drift on top of the manual loop + (despite probe 38 showing dloss=0 between manual loop and + MLXTrainer on `mlx_lm.load` path -- maybe FastMLXModel exposes + a path that probe 38 didn't cover). + (b) FastMLXModel.from_pretrained adds drift outside of LoRA init + that survives all 30 training steps -- probe 39's 5 seeds may + not have hit a basin-tipping case. + +Probe 40 = exactly probe 31's manual loop but the loader/PEFT setup +swapped for `FastMLXModel.from_pretrained` + `FastMLXModel.get_peft_model +(finetune_last_n_layers=16)`. Read: + * probe 40 ~ 67% (matches probe 31): MLXTrainer.train IS the bug. + PR #674 closed the loader-side gap; the remaining gap is purely + trainer math. + * probe 40 ~ 47% (matches probe 34): FastMLXModel.from_pretrained + adds drift downstream of get_peft_model that probe 39's 5-seed + diagnostic missed. Bisect the loader next. + +Same 15 seeds as probes 31 / 34 / 36 for direct paired comparison. +""" +import json +import os +import sys +import random +from functools import partial +from pathlib import Path +import numpy as np + +MODEL_NAME = "unsloth/gemma-3-270m-it" +TRAIN_TEXT = "<> My name is Unsloth!" 
+PROMPT = "<> My name is " +MAX_SEQ_LEN = 64 +OUT_DIR = Path(__file__).resolve().parent / ".out" +OUT_DIR.mkdir(parents=True, exist_ok=True) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + num_layers = _env_int("MLX_NUM_LAYERS", 16) + print(f"=== Probe 40: FastMLXModel + manual loop steps={steps} seed={seed} lr={lr} nl={num_layers} ===", flush=True) + + random.seed(seed); np.random.seed(seed) + + import mlx.core as mx + import mlx.nn as nn + import mlx.optimizers as optim + from mlx.nn.utils import average_gradients + from mlx.utils import tree_map + + from mlx_lm import generate + from mlx_lm.tuner.trainer import iterate_batches, default_loss + from mlx_lm.tuner.datasets import TextDataset, CacheDataset + + # FastMLXModel path (same as probe 39 path B). 
+ mx.random.seed(seed) + from unsloth_zoo.mlx.loader import FastMLXModel + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, + load_in_4bit=False, + dtype=None, + text_only=True, + max_seq_length=128, + random_state=seed, + ) + model = FastMLXModel.get_peft_model( + model, + r=8, + lora_alpha=16, + lora_dropout=0.0, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], + random_state=seed, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + finetune_last_n_layers=num_layers, + use_gradient_checkpointing=False, + ) + + actual_layers = len(model.layers) if hasattr(model, 'layers') else len(model.model.layers) + print(f" model has {actual_layers} layers, LoRA on last {num_layers}", flush=True) + + # From here down: bit-identical to probe 31's manual loop. + optimizer = optim.AdamW(learning_rate=lr, weight_decay=0.0, bias_correction=True) + formatted = [{"text": TRAIN_TEXT} for _ in range(64)] + ds = CacheDataset(TextDataset(formatted, tokenizer, text_key="text")) + + if mx.metal.is_available(): + mx.set_wired_limit(mx.device_info()["max_recommended_working_set_size"]) + + state = [model.state, optimizer.state, mx.random.state] + loss_value_and_grad = nn.value_and_grad(model, default_loss) + + @partial(mx.compile, inputs=state, outputs=state) + def step(batch, prev_grad, do_update): + (lvalue, toks), grad = loss_value_and_grad(model, *batch) + if prev_grad is not None: + grad = tree_map(lambda x, y: x + y, grad, prev_grad) + if do_update: + grad = average_gradients(grad) + optimizer.update(model, grad) + grad = None + return lvalue, toks, grad + + model.train() + losses = mx.array(0.0); n_tokens = mx.array(0); grad_accum = None + rows = [] + np.random.seed(seed) + for it, batch in zip(range(1, steps + 1), iterate_batches(dataset=ds, batch_size=6, max_seq_length=MAX_SEQ_LEN, loop=True)): + lvalue, toks, grad_accum = step(batch, grad_accum, True) + losses += lvalue; 
n_tokens += toks + mx.eval(state, losses, n_tokens, grad_accum) + rows.append({"step": it, "loss": float(lvalue.item())}) + + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + post_loss, _ = default_loss(model, mx.array([ids]), mx.array([[1, L - 1]])) + post_loss_val = float(post_loss.item()) + + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy(cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean").item()) + else: + completion_loss = float("nan") + + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + print(f" contains 'Unsloth': {contains} gen={gen[:80]!r}", flush=True) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, + "num_layers": num_layers, "actual_layers": actual_layers, + "delta": "FastMLXModel loader + manual @mx.compile loop"}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_40__s{steps}_d{seed}_nl{num_layers}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From c63bc2c8b0de3e8490634f27a76c779d1181a872 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 07:57:50 +0000 Subject: [PATCH 84/84] mlx_parity Round BT: test whether elementwise clip-at-1 IS the residual gap Round BS proved the residual 47%-vs-67% basin gap is in MLXTrainer.train (probe 40 = probe 31 on 15/15 seeds; FastMLXModel + manual loop matches mlx-lm CLI 
per-seed; FastMLXModel loader is exonerated). Reading unsloth_zoo/mlx/trainer.py:731-732: _raw_mgv = getattr(args, "max_grad_value", 1.0) max_grad_value = 1.0 if _raw_mgv is None else float(_raw_mgv or 0.0) MLXTrainer reinterprets `max_grad_value=None` as 1.0 (clip at +/-1.0 elementwise), NOT "disable clipping". PR #671 (mlx: honor max_grad_value=None as a disable signal, head 265534b) is OPEN, not merged -- the current ZOO_SPEC pin doesn't include it. Probe 34 sets max_grad_value=None expecting "disable", actually gets clip-at-1. The manual loop in probes 31 / 40 uses bare optim.AdamW with NO clip. Probe 41 = probe 34 with max_grad_value=0.0 (explicit zero hits `float(_raw_mgv or 0.0)` -> 0.0 -> no clip on the current build). Read: probe 41 ~ 67% (matches probes 31 / 40) Elementwise clip-at-1 IS the entire residual gap. PR #671 is the missing piece. Stacking PR #671 on top of PR #674 closes the FastMLXModel + MLXTrainer basin gap end-to-end. probe 41 ~ 47% (matches probe 34) Clip isn't it; bisect further inside MLXTrainer.train (lr schedule, loss-fn, batch iteration, mx.eval timing). BT matrix (45 jobs): - probe 31 x 15 seeds (mlx-lm CLI manual loop, control) - probe 34 x 15 seeds (FastMLXModel + MLXTrainer + max_grad_value= None -> clip-at-1, paired against probe 41) - probe 41 x 15 seeds (FastMLXModel + MLXTrainer + max_grad_value= 0.0 -> explicit no-clip, new target) Probes 36 / 39 / 40 dropped (Round BR / BS conclusions established). ZOO_SPEC stays pinned at PR #674 HEAD (0124424). 
--- .github/workflows/mlx-parity-probe.yml | 48 +++-- ...probe_41_zoo_trainer_clip_explicit_zero.py | 179 ++++++++++++++++++ 2 files changed, 212 insertions(+), 15 deletions(-) create mode 100644 tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py diff --git a/.github/workflows/mlx-parity-probe.yml b/.github/workflows/mlx-parity-probe.yml index 23aac2f081..b46808b374 100644 --- a/.github/workflows/mlx-parity-probe.yml +++ b/.github/workflows/mlx-parity-probe.yml @@ -322,21 +322,39 @@ jobs: # downstream of get_peft_model that probe 39's 5-seed diagnostic # missed -- bisect the loader next round. - - {id: '40_1', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '1'} - - {id: '40_7', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '7'} - - {id: '40_42', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '42'} - - {id: '40_123', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '123'} - - {id: '40_456', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '456'} - - {id: '40_789', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '789'} - - {id: '40_999', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '999'} - - {id: '40_1234', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '1234'} - - {id: '40_3407', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '3407'} - - {id: '40_5678', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '5678'} - - {id: '40_9012', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '9012'} - - {id: '40_12345', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '12345'} - - {id: '40_22222', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '22222'} - - {id: '40_31415', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '31415'} - - {id: '40_65535', script: 'probe_40_fastmlxmodel_manual_loop.py', seed: '65535'} + # ---- Round BT: test whether elementwise clip-at-1 IS the + # ---- residual MLXTrainer gap ---- + # Reading trainer.py:731-732, MLXTrainer reinterprets + # 
`max_grad_value=None` as 1.0 (clip at +/-1.0 elementwise). + # PR #671 (mlx: honor max_grad_value=None as a disable signal, + # OPEN, head 265534b) would fix this. Probe 34 sets + # max_grad_value=None expecting "disable" -- actually gets + # clipped. Probe 41 = probe 34 with max_grad_value=0.0 + # (explicit zero hits the disable branch on the current build). + # If 67% (matching probes 31 / 40), elementwise clip-at-1 IS + # the entire residual basin gap and PR #671 is the missing + # piece. Same 15 seeds for direct paired comparison. + + - {id: '41_1', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '1'} + - {id: '41_7', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '7'} + - {id: '41_42', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '42'} + - {id: '41_123', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '123'} + - {id: '41_456', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '456'} + - {id: '41_789', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '789'} + - {id: '41_999', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '999'} + - {id: '41_1234', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '1234'} + - {id: '41_3407', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '3407'} + - {id: '41_5678', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '5678'} + - {id: '41_9012', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '9012'} + - {id: '41_12345', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '12345'} + - {id: '41_22222', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '22222'} + - {id: '41_31415', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '31415'} + - {id: '41_65535', script: 'probe_41_zoo_trainer_clip_explicit_zero.py', seed: '65535'} + + # ---- Round BS: bisect residual MLXTrainer vs manual-loop gap ---- + # (probe 40 dropped for Round BT -- already verified in BS that + # 
FastMLXModel + manual loop reproduces probe 31's 67% on + # 15/15 seeds. Round BT focuses on probe 41 to isolate clip.) # ---- Round BR: verify PR #674 seed-ordering fix end-to-end ---- # PR #674 (unsloth-zoo) moves _seed_mlx_random_state(random_state) diff --git a/tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py b/tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py new file mode 100644 index 0000000000..6e7cbe03d4 --- /dev/null +++ b/tests/mlx_parity/probe_41_zoo_trainer_clip_explicit_zero.py @@ -0,0 +1,179 @@ +"""Probe 41 -- probe 34 with max_grad_value=0.0 (explicit disable). + +Round BT bisection. + +Round BS proved the residual 47%-vs-67% gap is in MLXTrainer.train, +not FastMLXModel loader (probe 40 = probe 31 on 15/15 seeds). Reading +unsloth_zoo/mlx/trainer.py:731-732: + + _raw_mgv = getattr(args, "max_grad_value", 1.0) + max_grad_value = 1.0 if _raw_mgv is None else float(_raw_mgv or 0.0) + +means `max_grad_value=None` is reinterpreted as 1.0 (clip at +/-1.0 +elementwise), NOT "disable clipping". PR #671 +(`mlx: honor max_grad_value=None as a disable signal`, head 265534b) +is currently OPEN, not merged. Probe 34 sets max_grad_value=None +expecting "disable", actually gets clip-at-1. Manual loop in probes +31 / 40 uses bare optim.AdamW with NO clipping. + +Probe 41 = probe 34 but with max_grad_value=0.0 (explicit zero hits +the `float(_raw_mgv or 0.0)` branch -> 0.0 -> no clip on the current +build). + +Read: + probe 41 ~ 67% -> Elementwise clip-at-1 IS the residual gap. + PR #671 closes the FastMLXModel + MLXTrainer + basin gap. Final missing piece. + probe 41 ~ 47% -> Clip isn't it; the gap is elsewhere in + MLXTrainer.train (lr schedule, loss-fn, batch + iteration, mx.eval timing, ...). + +Same 15 seeds as probes 31 / 34 / 40 for direct paired comparison. 
+""" +import json +import os +import sys +import dataclasses +import random +from pathlib import Path +import numpy as np + +from _common import ( + MODEL_NAME, TRAIN_TEXT, PROMPT, MAX_SEQ_LEN, OUT_DIR, + banner, section, report, +) + + +def _env_int(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return int(raw) + except ValueError: return default + + +def _env_float(name, default): + raw = (os.environ.get(name) or "").strip() + if not raw: return default + try: return float(raw) + except ValueError: return default + + +def main() -> int: + steps = _env_int("MLX_STEPS", 30) + seed = _env_int("MLX_SEED", 3407) + lr = _env_float("MLX_LR", 1e-3) + last_n = _env_int("MLX_LAST_N", 16) + banner(f"Probe 41: FastMLXModel + MLXTrainer + max_grad_value=0.0 (explicit disable)") + + random.seed(seed); np.random.seed(seed) + import mlx.core as mx + mx.random.seed(seed) + + from unsloth_zoo.mlx.loader import FastMLXModel + from unsloth_zoo.mlx.trainer import MLXTrainer, MLXTrainingConfig + from unsloth_zoo.mlx.utils import make_baseline_loss_fn + + model, tokenizer = FastMLXModel.from_pretrained( + MODEL_NAME, load_in_4bit=False, dtype=None, + text_only=True, max_seq_length=128, random_state=seed, + ) + + model = FastMLXModel.get_peft_model( + model, r=8, lora_alpha=16, lora_dropout=0.0, + target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], + random_state=seed, + finetune_language_layers=True, + finetune_attention_modules=True, + finetune_mlp_modules=True, + finetune_last_n_layers=last_n, + use_gradient_checkpointing=False, + ) + + fields_supported = {f.name for f in dataclasses.fields(MLXTrainingConfig)} + extra = {} + if "adam_bias_correction" in fields_supported: extra["adam_bias_correction"] = True + # The key difference vs probe 34: explicit 0.0 hits trainer.py:732's + # `float(_raw_mgv or 0.0)` branch -> 0.0 -> no clip. Setting None + # would hit `1.0 if _raw_mgv is None` -> clip at 1.0. 
+ if "max_grad_value" in fields_supported: extra["max_grad_value"] = 0.0 + + config = MLXTrainingConfig( + per_device_train_batch_size=6, + gradient_accumulation_steps=1, + max_steps=steps, + learning_rate=lr, + warmup_steps=0, + lr_scheduler_type="constant", + optim="adamw", + weight_decay=0.0, + max_grad_norm=0.0, + logging_steps=1, + max_seq_length=MAX_SEQ_LEN, + seed=seed, + use_cce=False, + compile=False, + gradient_checkpointing=False, + output_dir=str(OUT_DIR / f"probe41_outputs_s{steps}_d{seed}"), + save_steps=0, + eval_steps=0, + dataset_text_field="text", + **extra, + ) + trainer = MLXTrainer( + model=model, tokenizer=tokenizer, + train_dataset=[{"text": TRAIN_TEXT}] * 64, + args=config, + ) + rows = [] + def _on_step(*args): + if len(args) < 3: return + rows.append({"step": int(args[0]), "loss": float(args[2])}) + trainer.add_step_callback(_on_step) + trainer.train() + + loss_fn = make_baseline_loss_fn() + ids = tokenizer.encode(TRAIN_TEXT) + if tokenizer.eos_token_id is not None and ids[-1] != tokenizer.eos_token_id: + ids.append(tokenizer.eos_token_id) + L = len(ids) + batch = mx.array([ids]) + lengths = mx.array([[1, L - 1]]) + labels_mlx = mx.array([ids]) + post_loss, _ = loss_fn(model, batch, lengths, labels_mlx) + post_loss_val = float(post_loss.item()) + + import mlx.nn as nn + prompt_ids = list(tokenizer.encode(PROMPT)) + full_ids = list(tokenizer.encode(PROMPT + "Unsloth!")) + if len(full_ids) > len(prompt_ids): + cf_inputs = mx.array([full_ids[:-1]], dtype=mx.int32) + cf_targets = mx.array([full_ids[1:]], dtype=mx.int32) + cf_logits = model(cf_inputs) + start = len(prompt_ids) - 1 + completion_loss = float(nn.losses.cross_entropy( + cf_logits[:, start:, :], cf_targets[:, start:], reduction="mean" + ).item()) + else: + completion_loss = float("nan") + + from mlx_lm import generate + gen = generate(model, tokenizer, prompt=PROMPT, max_tokens=48, verbose=False) + contains = "Unsloth" in gen + report("contains 'Unsloth'", contains) + 
report("generation", repr(gen[:60])) + + out = { + "config": {"steps": steps, "seed": seed, "learning_rate": lr, + "loader": "FastMLXModel(dtype=None)", "finetune_last_n_layers": last_n, + "delta": "max_grad_value=0.0 (explicit disable)"}, + "rows": rows, "post_train_loss": post_loss_val, + "completion_teacher_forced_loss": completion_loss, "generation": gen, + "contains_unsloth": contains, + } + fname = f"probe_41__s{steps}_d{seed}_nl{last_n}.json" + (OUT_DIR / fname).write_text(json.dumps(out, indent=2)) + return 0 + + +if __name__ == "__main__": + sys.exit(main())