From 73f37c6e3f2e3e09b5ee145a4aa2889ddd0d1971 Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Tue, 19 May 2026 03:30:06 -0500
Subject: [PATCH 01/22] Expose MLX grad value clipping in Studio

---
 studio/backend/core/training/training.py             |  1 +
 studio/backend/core/training/worker.py               |  9 ++++++---
 studio/backend/models/training.py                    |  8 ++++++++
 studio/backend/routes/training.py                    |  1 +
 studio/backend/tests/test_training_raw_support.py    | 11 +++++++++++
 studio/frontend/src/features/training/api/mappers.ts |  1 +
 studio/frontend/src/features/training/types/api.ts   |  1 +
 7 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py
index d2c2316d45..fd608381b4 100644
--- a/studio/backend/core/training/training.py
+++ b/studio/backend/core/training/training.py
@@ -218,6 +218,7 @@ def start_training(self, job_id: str, **kwargs) -> bool:
             "save_steps": kwargs.get("save_steps", 0),
             "weight_decay": kwargs.get("weight_decay", 0.001),
             "max_grad_norm": kwargs.get("max_grad_norm", 0.0),
+            "max_grad_value": kwargs.get("max_grad_value"),
             "random_seed": kwargs.get("random_seed", 3407),
             "packing": kwargs.get("packing", False),
             "optim": kwargs.get("optim", "adamw_8bit"),
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index f47a6bd599..2efc1e991f 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1156,6 +1156,8 @@ def _send(event_type, **kwargs):
     is_dataset_image = bool(config.get("is_dataset_image", False))
     training_type = config.get("training_type", "LoRA/QLoRA")
     use_lora = training_type == "LoRA/QLoRA"
+    model_random_state = config.get("model_random_state", 3407)
+    lora_random_state = config.get("lora_random_state", 3407)
     model, tokenizer = FastMLXModel.from_pretrained(
         model_name,
         load_in_4bit = config.get("load_in_4bit", True),
@@ -1163,7 +1165,7 @@ def _send(event_type, **kwargs):
         text_only = None if is_dataset_image else True,
         token = hf_token,
         trust_remote_code = bool(config.get("trust_remote_code", False)),
-        random_state = config.get("random_seed", 3407),
+        random_state = model_random_state,
     )
 
     is_vlm = bool(is_dataset_image and getattr(model, "_is_vlm_model", False))
@@ -1188,7 +1190,7 @@ def _send(event_type, **kwargs):
             lora_dropout = config.get("lora_dropout", 0.0),
             use_rslora = config.get("use_rslora", False),
             init_lora_weights = config.get("init_lora_weights", True),
-            random_state = config.get("random_seed", 3407),
+            random_state = lora_random_state,
             target_modules = config.get("target_modules")
             or [
                 "q_proj",
@@ -1390,7 +1392,8 @@ def _fmt_progress(status_message = "", **_kw):
     # global reduction that breaks MLX's eager pipeline). 1.0 (not 5.0):
     # |g_i| > 5 rarely fires, so the historical 5.0 was effectively no-op.
     max_grad_norm = 0.0
-    max_grad_value = 1.0  # TODO: expose MLX grad-clip in Studio UI for power users
+    max_grad_value = config.get("max_grad_value")
+    max_grad_value = 1.0 if max_grad_value is None else float(max_grad_value)
 
     trainer = MLXTrainer(
         model = model,
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index 7c53b0fee5..e17f376789 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -267,6 +267,14 @@ def _check_lora_dropout(cls, v: float) -> float:
         ge = 0,
         description = "Global gradient norm clipping threshold. Set 0 to disable.",
     )
+    max_grad_value: Optional[float] = Field(
+        None,
+        ge = 0,
+        description = (
+            "MLX-only elementwise gradient value clipping threshold. "
+            "If unset, MLX uses its runtime default."
+        ),
+    )
     random_seed: int = Field(42, description = "Random seed")
     packing: bool = Field(False, description = "Enable sequence packing")
     optim: str = Field("adamw_8bit", description = "Optimizer")
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 6e2413b3e9..6875a13206 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -216,6 +216,7 @@ async def start_training(
             "save_steps": request.save_steps,
             "weight_decay": request.weight_decay,
             "max_grad_norm": request.max_grad_norm,
+            "max_grad_value": request.max_grad_value,
             "random_seed": request.random_seed,
             "packing": request.packing,
             "optim": request.optim,
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 384247a191..1129d07537 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -107,10 +107,21 @@ def start(self):
                 model_name = "unsloth/test",
                 training_type = "LoRA/QLoRA",
                 max_grad_norm = 0.7,
+                max_grad_value = 3.0,
             )
 
         config = mock_process.call_args.kwargs["kwargs"]["config"]
         self.assertEqual(config["max_grad_norm"], 0.7)
+        self.assertEqual(config["max_grad_value"], 3.0)
+
+    def test_mlx_worker_uses_cuda_style_model_and_lora_init_seed(self):
+        source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
+
+        self.assertIn('model_random_state = config.get("model_random_state", 3407)', source)
+        self.assertIn('lora_random_state = config.get("lora_random_state", 3407)', source)
+        self.assertIn("random_state = model_random_state", source)
+        self.assertIn("random_state = lora_random_state", source)
+        self.assertIn('seed = config.get("random_seed", 3407)', source)
 
     def test_training_route_forwards_embedding_learning_rate(self):
         training_route = _load_route_module(
diff --git a/studio/frontend/src/features/training/api/mappers.ts b/studio/frontend/src/features/training/api/mappers.ts
index 5d81f0df9c..5cd21cee25 100644
--- a/studio/frontend/src/features/training/api/mappers.ts
+++ b/studio/frontend/src/features/training/api/mappers.ts
@@ -84,6 +84,7 @@ export function buildTrainingStartPayload(
     eval_steps: config.evalSteps,
     weight_decay: config.weightDecay,
     max_grad_norm: 0.0,
+    max_grad_value: null,
     random_seed: config.randomSeed,
     packing: isEmbedding ? false : config.packing,
     optim: config.optimizerType,
diff --git a/studio/frontend/src/features/training/types/api.ts b/studio/frontend/src/features/training/types/api.ts
index 0cb881e634..0e6fe71d32 100644
--- a/studio/frontend/src/features/training/types/api.ts
+++ b/studio/frontend/src/features/training/types/api.ts
@@ -32,6 +32,7 @@ export interface TrainingStartRequest {
   eval_steps: number;
   weight_decay: number;
   max_grad_norm: number;
+  max_grad_value?: number | null;
   random_seed: number;
   packing: boolean;
   optim: string;

From e36b55e122b65cda12fce2ef8c05720e9b61da93 Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Wed, 20 May 2026 16:44:47 -0500
Subject: [PATCH 02/22] Add test that MLX worker keeps HF-style tokenizer dual-purpose

---
 .../backend/tests/test_mlx_training_worker_config.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/studio/backend/tests/test_mlx_training_worker_config.py b/studio/backend/tests/test_mlx_training_worker_config.py
index 98c7bdaa55..7cc795f280 100644
--- a/studio/backend/tests/test_mlx_training_worker_config.py
+++ b/studio/backend/tests/test_mlx_training_worker_config.py
@@ -82,3 +82,15 @@ def test_mlx_studio_rejects_unknown_optimizer():
 def test_mlx_studio_rejects_unknown_scheduler():
     with pytest.raises(ValueError, match = "Unsupported LR scheduler for MLX training"):
         _normalize_mlx_studio_scheduler("linear_typo")
+
+
+def test_mlx_studio_keeps_hf_style_tokenizer_dual_purpose():
+    source = (
+        Path(__file__).resolve().parents[1]
+        / "core"
+        / "training"
+        / "worker.py"
+    ).read_text()
+
+    assert "tokenizer = tokenizer" in source
+    assert "processor = tokenizer if is_vlm else None" not in source

From 8b79ba42352417eaff45201f13147af2f79fa37c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 20 May 2026 22:01:23 +0000
Subject: [PATCH 03/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_mlx_training_worker_config.py | 5 +----
 studio/backend/tests/test_training_raw_support.py       | 8 ++++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/studio/backend/tests/test_mlx_training_worker_config.py b/studio/backend/tests/test_mlx_training_worker_config.py
index 7cc795f280..1fe2315a8e 100644
--- a/studio/backend/tests/test_mlx_training_worker_config.py
+++ b/studio/backend/tests/test_mlx_training_worker_config.py
@@ -86,10 +86,7 @@ def test_mlx_studio_rejects_unknown_scheduler():
 
 def test_mlx_studio_keeps_hf_style_tokenizer_dual_purpose():
     source = (
-        Path(__file__).resolve().parents[1]
-        / "core"
-        / "training"
-        / "worker.py"
+        Path(__file__).resolve().parents[1] / "core" / "training" / "worker.py"
     ).read_text()
 
     assert "tokenizer = tokenizer" in source
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 1129d07537..c6b76405df 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -117,8 +117,12 @@ def start(self):
     def test_mlx_worker_uses_cuda_style_model_and_lora_init_seed(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
 
-        self.assertIn('model_random_state = config.get("model_random_state", 3407)', source)
-        self.assertIn('lora_random_state = config.get("lora_random_state", 3407)', source)
+        self.assertIn(
+            'model_random_state = config.get("model_random_state", 3407)', source
+        )
+        self.assertIn(
+            'lora_random_state = config.get("lora_random_state", 3407)', source
+        )
         self.assertIn("random_state = model_random_state", source)
         self.assertIn("random_state = lora_random_state", source)
         self.assertIn('seed = config.get("random_seed", 3407)', source)

From e8c944fc0a202677704d85f0f6f5b86c8b25ed59 Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Wed, 20 May 2026 22:54:06 -0500
Subject: [PATCH 04/22] Use torch_randperm dataset order and honor weight_decay=0 in MLX worker

---
 studio/backend/core/training/worker.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 2efc1e991f..e01f560ea0 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1394,6 +1394,8 @@ def _fmt_progress(status_message = "", **_kw):
     max_grad_norm = 0.0
     max_grad_value = config.get("max_grad_value")
     max_grad_value = 1.0 if max_grad_value is None else float(max_grad_value)
+    weight_decay = config.get("weight_decay", 0.001)
+    weight_decay = 0.001 if weight_decay is None else float(weight_decay)
 
     trainer = MLXTrainer(
         model = model,
@@ -1408,7 +1410,7 @@ def _fmt_progress(status_message = "", **_kw):
             warmup_steps = warmup_steps,
             lr_scheduler_type = lr_scheduler_type,
             optim = optim_name,
-            weight_decay = float(config.get("weight_decay", 0.001) or 0.001),
+            weight_decay = weight_decay,
             max_grad_norm = max_grad_norm,
             max_grad_value = max_grad_value,
             logging_steps = 1,
@@ -1418,6 +1420,7 @@ def _fmt_progress(status_message = "", **_kw):
             compile = True,
             gradient_checkpointing = use_grad_checkpoint,
             streaming = is_vlm,
+            dataset_order = "torch_randperm",
             packing = bool(config.get("packing", False)),
             output_dir = output_dir,
             save_steps = int(config.get("save_steps", 0) or 0),

From 377fc67606f42a02142539a54cabc4b7d31da4dc Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Wed, 20 May 2026 23:04:59 -0500
Subject: [PATCH 05/22] fix mlx smoke step expectations

---
 tests/studio/run_real_mlx_smoke.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index 27f682ee4e..e63c79af5e 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -14,10 +14,10 @@
   1. Loads `unsloth/gemma-3-270m-it` via FastMLXModel.from_pretrained.
   2. Applies LoRA r=8 on q/k/v/o.
   3. Computes pre-training loss + grad norm via mx.nn.value_and_grad.
-  4. Trains 7 deterministic steps on a dataset of the SAME row repeated
+  4. Trains 30 deterministic steps on a dataset of the SAME row repeated
      ("<<HELLO!!>> My name is Unsloth!"), with batch_size=2 and
      gradient_accumulation_steps=3 so each step processes 6 sequences
-     and the run sees 42 sequences total.
+     and the run sees 180 sequences total.
   5. Computes post-training loss + grad norm.
   6. Generates from "<<HELLO!!>> My name is " and asserts "Unsloth"
      appears in the in-memory completion.
@@ -162,10 +162,9 @@ def _compute_loss_and_grad_norm(model, tokenizer, text: str) -> tuple[float, flo
     import mlx.nn as nn
     from mlx.utils import tree_flatten
 
+    # Match Studio's text dataset path: Studio passes exactly the formatted
+    # text to the tokenizer and does not append EOS behind the user's back.
     ids = list(tokenizer.encode(text))
-    eos_id = getattr(tokenizer, "eos_token_id", None)
-    if eos_id is not None:
-        ids.append(int(eos_id))
     if len(ids) < 2:
         raise RuntimeError(f"text too short to compute loss: {len(ids)} tokens")
 
@@ -390,7 +389,15 @@ def _on_step(
         )
         if k in train_result
     }
-    assert len(losses_per_step) == 7, f"expected 7 logged steps, got {losses_per_step}"
+    expected_logged_steps = int(config.max_steps)
+    assert len(losses_per_step) == expected_logged_steps, (
+        f"expected {expected_logged_steps} logged steps, got {losses_per_step}"
+    )
+    if "train_steps" in train_result:
+        assert int(train_result["train_steps"]) == expected_logged_steps, (
+            f"expected train_steps={expected_logged_steps}, got "
+            f"{train_result['train_steps']}"
+        )
     for i, l in enumerate(losses_per_step):
         # Allow exact 0.0: fp16 per-step loss underflows to 0.0 after
         # the LoRA reaches loss=0 around step ~10 with this fixture +

From e82926845db0aaa2828dcc2c1a63773062052c42 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 04:05:11 +0000
Subject: [PATCH 06/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/run_real_mlx_smoke.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index e63c79af5e..8862fa1d68 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -390,9 +390,9 @@ def _on_step(
         if k in train_result
     }
     expected_logged_steps = int(config.max_steps)
-    assert len(losses_per_step) == expected_logged_steps, (
-        f"expected {expected_logged_steps} logged steps, got {losses_per_step}"
-    )
+    assert (
+        len(losses_per_step) == expected_logged_steps
+    ), f"expected {expected_logged_steps} logged steps, got {losses_per_step}"
     if "train_steps" in train_result:
         assert int(train_result["train_steps"]) == expected_logged_steps, (
             f"expected train_steps={expected_logged_steps}, got "

From bfb4203400c9e79fde455af2be15de960dfa14d3 Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Thu, 21 May 2026 10:07:38 -0500
Subject: [PATCH 07/22] Add Studio option to cast MLX norm output back to input
 dtype

---
 studio/backend/core/training/training.py | 3 +++
 studio/backend/core/training/worker.py   | 3 +++
 studio/backend/models/training.py        | 7 +++++++
 studio/backend/routes/training.py        | 1 +
 4 files changed, 14 insertions(+)

diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py
index fd608381b4..bf119f5567 100644
--- a/studio/backend/core/training/training.py
+++ b/studio/backend/core/training/training.py
@@ -219,6 +219,9 @@ def start_training(self, job_id: str, **kwargs) -> bool:
             "weight_decay": kwargs.get("weight_decay", 0.001),
             "max_grad_norm": kwargs.get("max_grad_norm", 0.0),
             "max_grad_value": kwargs.get("max_grad_value"),
+            "cast_norm_output_to_input_dtype": kwargs.get(
+                "cast_norm_output_to_input_dtype", True
+            ),
             "random_seed": kwargs.get("random_seed", 3407),
             "packing": kwargs.get("packing", False),
             "optim": kwargs.get("optim", "adamw_8bit"),
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index e01f560ea0..1c369fcc85 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1413,6 +1413,9 @@ def _fmt_progress(status_message = "", **_kw):
             weight_decay = weight_decay,
             max_grad_norm = max_grad_norm,
             max_grad_value = max_grad_value,
+            cast_norm_output_to_input_dtype = bool(
+                config.get("cast_norm_output_to_input_dtype", True)
+            ),
             logging_steps = 1,
             max_seq_length = max_seq_length,
             seed = config.get("random_seed", 3407),
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index e17f376789..4b64454bc1 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -275,6 +275,13 @@ def _check_lora_dropout(cls, v: float) -> float:
             "If unset, MLX uses its runtime default."
         ),
     )
+    cast_norm_output_to_input_dtype: bool = Field(
+        True,
+        description = (
+            "MLX-only: keep norm parameters in fp32 but cast norm outputs "
+            "back to the incoming activation dtype."
+        ),
+    )
     random_seed: int = Field(42, description = "Random seed")
     packing: bool = Field(False, description = "Enable sequence packing")
     optim: str = Field("adamw_8bit", description = "Optimizer")
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 6875a13206..78026a0b49 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -217,6 +217,7 @@ async def start_training(
             "weight_decay": request.weight_decay,
             "max_grad_norm": request.max_grad_norm,
             "max_grad_value": request.max_grad_value,
+            "cast_norm_output_to_input_dtype": request.cast_norm_output_to_input_dtype,
             "random_seed": request.random_seed,
             "packing": request.packing,
             "optim": request.optim,

From a404dfd3334e93230ff99d6e3d07c924645c8bc6 Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Thu, 21 May 2026 12:56:19 -0500
Subject: [PATCH 08/22] address mlx studio review feedback

---
 studio/backend/core/training/worker.py        | 12 ++--
 .../tests/test_training_raw_support.py        | 63 ++++++++++++++++++-
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 1c369fcc85..3a1174a3d3 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1156,8 +1156,9 @@ def _send(event_type, **kwargs):
     is_dataset_image = bool(config.get("is_dataset_image", False))
     training_type = config.get("training_type", "LoRA/QLoRA")
     use_lora = training_type == "LoRA/QLoRA"
-    model_random_state = config.get("model_random_state", 3407)
-    lora_random_state = config.get("lora_random_state", 3407)
+    random_seed = config.get("random_seed", 3407)
+    model_random_state = config.get("model_random_state", random_seed)
+    lora_random_state = config.get("lora_random_state", random_seed)
     model, tokenizer = FastMLXModel.from_pretrained(
         model_name,
         load_in_4bit = config.get("load_in_4bit", True),
@@ -1388,12 +1389,11 @@ def _fmt_progress(status_message = "", **_kw):
     else:
         eval_steps_val = int(eval_steps_val)
 
-    # MLX: per-element clip to [-1, 1]; norm clip disabled (it needs a
-    # global reduction that breaks MLX's eager pipeline). 1.0 (not 5.0):
-    # |g_i| > 5 rarely fires, so the historical 5.0 was effectively no-op.
+    # MLX Studio uses per-element clipping by default and keeps norm clipping
+    # disabled. Preserve None so the MLX trainer owns its runtime default.
     max_grad_norm = 0.0
     max_grad_value = config.get("max_grad_value")
-    max_grad_value = 1.0 if max_grad_value is None else float(max_grad_value)
+    max_grad_value = None if max_grad_value is None else float(max_grad_value)
     weight_decay = config.get("weight_decay", 0.001)
     weight_decay = 0.001 if weight_decay is None else float(weight_decay)
 
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index c6b76405df..6a94da6bc0 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -114,19 +114,76 @@ def start(self):
         self.assertEqual(config["max_grad_norm"], 0.7)
         self.assertEqual(config["max_grad_value"], 3.0)
 
-    def test_mlx_worker_uses_cuda_style_model_and_lora_init_seed(self):
+    def test_training_backend_forwards_random_seed_without_internal_mlx_seed_keys(self):
+        backend = TrainingBackend()
+
+        class DummyProcess:
+            pid = 12345
+
+            def start(self):
+                return None
+
+        class DummyThread:
+            def start(self):
+                return None
+
+        dummy_queue = object()
+
+        with (
+            patch(
+                "core.training.training.prepare_gpu_selection",
+                return_value = ([0], {"selection_mode": "auto"}),
+            ),
+            patch(
+                "core.training.training._CTX.Queue",
+                side_effect = [dummy_queue, dummy_queue],
+            ),
+            patch(
+                "core.training.training._CTX.Process", return_value = DummyProcess()
+            ) as mock_process,
+            patch(
+                "core.training.training.threading.Thread",
+                return_value = DummyThread(),
+            ),
+        ):
+            backend.start_training(
+                job_id = "test-seed",
+                model_name = "unsloth/test",
+                training_type = "LoRA/QLoRA",
+                random_seed = 1234,
+            )
+
+        config = mock_process.call_args.kwargs["kwargs"]["config"]
+        self.assertEqual(config["random_seed"], 1234)
+        self.assertNotIn("model_random_state", config)
+        self.assertNotIn("lora_random_state", config)
+
+    def test_mlx_worker_falls_back_init_seeds_to_random_seed(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
 
+        self.assertIn('random_seed = config.get("random_seed", 3407)', source)
         self.assertIn(
-            'model_random_state = config.get("model_random_state", 3407)', source
+            'model_random_state = config.get("model_random_state", random_seed)', source
         )
         self.assertIn(
-            'lora_random_state = config.get("lora_random_state", 3407)', source
+            'lora_random_state = config.get("lora_random_state", random_seed)', source
         )
         self.assertIn("random_state = model_random_state", source)
         self.assertIn("random_state = lora_random_state", source)
         self.assertIn('seed = config.get("random_seed", 3407)', source)
 
+    def test_mlx_worker_preserves_null_max_grad_value_for_trainer_default(self):
+        source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
+
+        self.assertIn(
+            "max_grad_value = None if max_grad_value is None else float(max_grad_value)",
+            source,
+        )
+        self.assertNotIn(
+            "max_grad_value = 1.0 if max_grad_value is None else float(max_grad_value)",
+            source,
+        )
+
     def test_training_route_forwards_embedding_learning_rate(self):
         training_route = _load_route_module(
             "training_route_module_raw_support",

From bff5b4430a272dc35117835ebea741319fc158e6 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <info@unsloth.ai>
Date: Sun, 24 May 2026 13:56:47 +0000
Subject: [PATCH 09/22] Fix present-but-None seed override for PR #5656

studio/backend/core/training/worker.py
  `config.get("model_random_state", random_seed)` only fills the
  default when the key is absent. When a caller passes
  `config["model_random_state"] = None` explicitly (which happens
  any time a JSON payload sends an explicit `null`), the old code
  forwarded `None` to FastMLXModel and disabled deterministic init
  silently. Same for `lora_random_state`. Treat absent and explicit
  None the same way: fall back to random_seed.

studio/backend/tests/test_training_raw_support.py
  Update the source-string assertions to match the new lines.
---
 studio/backend/core/training/worker.py            | 10 ++++++++--
 studio/backend/tests/test_training_raw_support.py | 12 ++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 3a1174a3d3..332b8e8bac 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1157,8 +1157,14 @@ def _send(event_type, **kwargs):
     training_type = config.get("training_type", "LoRA/QLoRA")
     use_lora = training_type == "LoRA/QLoRA"
     random_seed = config.get("random_seed", 3407)
-    model_random_state = config.get("model_random_state", random_seed)
-    lora_random_state = config.get("lora_random_state", random_seed)
+    # Treat absent OR explicit None the same way: fall back to random_seed.
+    # `config.get(key, default)` only fills the default when the key is
+    # missing; an explicit `None` would otherwise reach FastMLXModel and
+    # disable deterministic init silently.
+    _model_seed = config.get("model_random_state")
+    model_random_state = random_seed if _model_seed is None else _model_seed
+    _lora_seed = config.get("lora_random_state")
+    lora_random_state = random_seed if _lora_seed is None else _lora_seed
     model, tokenizer = FastMLXModel.from_pretrained(
         model_name,
         load_in_4bit = config.get("load_in_4bit", True),
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 6a94da6bc0..3b08248c38 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -162,11 +162,19 @@ def test_mlx_worker_falls_back_init_seeds_to_random_seed(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
 
         self.assertIn('random_seed = config.get("random_seed", 3407)', source)
+        # Both absent and explicit None must fall back to random_seed.
+        # `dict.get(key, default)` only fills the default on absent keys,
+        # so an explicit `None` would otherwise reach FastMLXModel /
+        # get_peft_model and disable deterministic init.
+        self.assertIn('_model_seed = config.get("model_random_state")', source)
         self.assertIn(
-            'model_random_state = config.get("model_random_state", random_seed)', source
+            "model_random_state = random_seed if _model_seed is None else _model_seed",
+            source,
         )
+        self.assertIn('_lora_seed = config.get("lora_random_state")', source)
         self.assertIn(
-            'lora_random_state = config.get("lora_random_state", random_seed)', source
+            "lora_random_state = random_seed if _lora_seed is None else _lora_seed",
+            source,
         )
         self.assertIn("random_state = model_random_state", source)
         self.assertIn("random_state = lora_random_state", source)

From 56e32b75698323a37c8ad34c94584d60ea7df96f Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <info@unsloth.ai>
Date: Sun, 24 May 2026 15:23:33 +0000
Subject: [PATCH 10/22] Guard optional MLXTrainingConfig fields and normalize
 random_seed for PR #5656

The MLX worker now passes `cast_norm_output_to_input_dtype` and
`dataset_order` only when the linked unsloth-zoo dataclass actually
declares them. Released zoo trees that predate the paired PR can still
construct `MLXTrainingConfig` without raising
`TypeError: unexpected keyword argument`. Once the dependency floor is
bumped to a release that contains both fields, the feature-detect
guards become no-ops.

`random_seed = config.get("random_seed", 3407)` was unguarded against
explicit `None` from raw / backend callers. The same value seeded the
trainer and was the fallback target for `model_random_state` /
`lora_random_state`. Normalize once at the top of the function and use
the normalized value everywhere so an explicit `None` cannot reach
FastMLXModel / get_peft_model / MLXTrainingConfig.

Existing seed source-pattern test updated to match the new normalize
helper. New test asserts the feature-detection guards exist and that
the unconditional kwargs do not include the gated fields.
---
 studio/backend/core/training/worker.py        | 74 +++++++++++--------
 .../tests/test_training_raw_support.py        | 35 ++++++++-
 2 files changed, 75 insertions(+), 34 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 332b8e8bac..644b33829d 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1156,15 +1156,19 @@ def _send(event_type, **kwargs):
     is_dataset_image = bool(config.get("is_dataset_image", False))
     training_type = config.get("training_type", "LoRA/QLoRA")
     use_lora = training_type == "LoRA/QLoRA"
-    random_seed = config.get("random_seed", 3407)
+    # Normalize random_seed so an explicit None from a raw/backend caller
+    # does not propagate through the seed chain. Mirrors the override
+    # handling for model/LoRA seeds below.
+    _raw_seed = config.get("random_seed", 3407)
+    random_seed = 3407 if _raw_seed is None else int(_raw_seed)
     # Treat absent OR explicit None the same way: fall back to random_seed.
     # `config.get(key, default)` only fills the default when the key is
     # missing; an explicit `None` would otherwise reach FastMLXModel and
     # disable deterministic init silently.
     _model_seed = config.get("model_random_state")
-    model_random_state = random_seed if _model_seed is None else _model_seed
+    model_random_state = random_seed if _model_seed is None else int(_model_seed)
     _lora_seed = config.get("lora_random_state")
-    lora_random_state = random_seed if _lora_seed is None else _lora_seed
+    lora_random_state = random_seed if _lora_seed is None else int(_lora_seed)
     model, tokenizer = FastMLXModel.from_pretrained(
         model_name,
         load_in_4bit = config.get("load_in_4bit", True),
@@ -1403,38 +1407,48 @@ def _fmt_progress(status_message = "", **_kw):
     weight_decay = config.get("weight_decay", 0.001)
     weight_decay = 0.001 if weight_decay is None else float(weight_decay)
 
+    mlx_config_kwargs = dict(
+        per_device_train_batch_size = batch_size,
+        gradient_accumulation_steps = grad_accum,
+        max_steps = max_steps,
+        learning_rate = lr_value,
+        warmup_steps = warmup_steps,
+        lr_scheduler_type = lr_scheduler_type,
+        optim = optim_name,
+        weight_decay = weight_decay,
+        max_grad_norm = max_grad_norm,
+        max_grad_value = max_grad_value,
+        logging_steps = 1,
+        max_seq_length = max_seq_length,
+        seed = random_seed,
+        use_cce = True,
+        compile = True,
+        gradient_checkpointing = use_grad_checkpoint,
+        streaming = is_vlm,
+        packing = bool(config.get("packing", False)),
+        output_dir = output_dir,
+        save_steps = int(config.get("save_steps", 0) or 0),
+        eval_steps = eval_steps_val,
+    )
+
+    # Feature-detect optional MLXTrainingConfig fields so this PR does
+    # not require the paired unsloth-zoo change to be merged/released
+    # first. Released zoo trees that predate those fields are still
+    # constructable; once the floor is bumped this guard is a no-op.
+    _supported_fields = getattr(MLXTrainingConfig, "__dataclass_fields__", {})
+    if "cast_norm_output_to_input_dtype" in _supported_fields:
+        mlx_config_kwargs["cast_norm_output_to_input_dtype"] = bool(
+            config.get("cast_norm_output_to_input_dtype", True)
+        )
+    if "dataset_order" in _supported_fields:
+        mlx_config_kwargs["dataset_order"] = "torch_randperm"
+
     trainer = MLXTrainer(
         model = model,
         tokenizer = tokenizer,
         train_dataset = dataset,
         eval_dataset = eval_dataset,
-        args = MLXTrainingConfig(
-            per_device_train_batch_size = batch_size,
-            gradient_accumulation_steps = grad_accum,
-            max_steps = max_steps,
-            learning_rate = lr_value,
-            warmup_steps = warmup_steps,
-            lr_scheduler_type = lr_scheduler_type,
-            optim = optim_name,
-            weight_decay = weight_decay,
-            max_grad_norm = max_grad_norm,
-            max_grad_value = max_grad_value,
-            cast_norm_output_to_input_dtype = bool(
-                config.get("cast_norm_output_to_input_dtype", True)
-            ),
-            logging_steps = 1,
-            max_seq_length = max_seq_length,
-            seed = config.get("random_seed", 3407),
-            use_cce = True,
-            compile = True,
-            gradient_checkpointing = use_grad_checkpoint,
-            streaming = is_vlm,
-            dataset_order = "torch_randperm",
-            packing = bool(config.get("packing", False)),
-            output_dir = output_dir,
-            save_steps = int(config.get("save_steps", 0) or 0),
-            eval_steps = eval_steps_val,
-        ),
+        args = MLXTrainingConfig(**mlx_config_kwargs),
     )
 
     # Tell the parent that eval is configured so the frontend shows the eval chart
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 3b08248c38..32dc704907 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -161,24 +161,31 @@ def start(self):
     def test_mlx_worker_falls_back_init_seeds_to_random_seed(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
 
-        self.assertIn('random_seed = config.get("random_seed", 3407)', source)
+        # random_seed itself is normalized first so explicit None coming
+        # from a raw / backend caller does not propagate through the chain.
+        self.assertIn('_raw_seed = config.get("random_seed", 3407)', source)
+        self.assertIn(
+            "random_seed = 3407 if _raw_seed is None else int(_raw_seed)",
+            source,
+        )
         # Both absent and explicit None must fall back to random_seed.
         # `dict.get(key, default)` only fills the default on absent keys,
         # so an explicit `None` would otherwise reach FastMLXModel /
         # get_peft_model and disable deterministic init.
         self.assertIn('_model_seed = config.get("model_random_state")', source)
         self.assertIn(
-            "model_random_state = random_seed if _model_seed is None else _model_seed",
+            "model_random_state = random_seed if _model_seed is None else int(_model_seed)",
             source,
         )
         self.assertIn('_lora_seed = config.get("lora_random_state")', source)
         self.assertIn(
-            "lora_random_state = random_seed if _lora_seed is None else _lora_seed",
+            "lora_random_state = random_seed if _lora_seed is None else int(_lora_seed)",
             source,
         )
         self.assertIn("random_state = model_random_state", source)
         self.assertIn("random_state = lora_random_state", source)
-        self.assertIn('seed = config.get("random_seed", 3407)', source)
+        # MLXTrainingConfig now receives the normalized seed directly.
+        self.assertIn("seed = random_seed,", source)
 
     def test_mlx_worker_preserves_null_max_grad_value_for_trainer_default(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
@@ -192,6 +199,26 @@ def test_mlx_worker_preserves_null_max_grad_value_for_trainer_default(self):
             source,
         )
 
+    def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
+        # `cast_norm_output_to_input_dtype` and `dataset_order` ship in the
+        # paired unsloth-zoo update. Until that floor is in place, the
+        # worker must gate them so releases that predate those fields can
+        # still construct MLXTrainingConfig without TypeError.
+        source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
+
+        self.assertIn(
+            'getattr(MLXTrainingConfig, "__dataclass_fields__", {})',
+            source,
+        )
+        self.assertIn('if "cast_norm_output_to_input_dtype" in _supported_fields:', source)
+        self.assertIn('if "dataset_order" in _supported_fields:', source)
+        # The unconditional kwargs must NOT include either gated field.
+        unconditional_block_start = source.find("mlx_config_kwargs = dict(")
+        unconditional_block_end = source.find(")", unconditional_block_start)
+        unconditional = source[unconditional_block_start:unconditional_block_end]
+        self.assertNotIn("cast_norm_output_to_input_dtype", unconditional)
+        self.assertNotIn("dataset_order", unconditional)
+
     def test_training_route_forwards_embedding_learning_rate(self):
         training_route = _load_route_module(
             "training_route_module_raw_support",

From 29aa91a92048f27e9c73c1e5f665fd16f735099d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 24 May 2026 15:23:48 +0000
Subject: [PATCH 11/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_training_raw_support.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 32dc704907..cddfbae30f 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -210,7 +210,9 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
             'getattr(MLXTrainingConfig, "__dataclass_fields__", {})',
             source,
         )
-        self.assertIn('if "cast_norm_output_to_input_dtype" in _supported_fields:', source)
+        self.assertIn(
+            'if "cast_norm_output_to_input_dtype" in _supported_fields:', source
+        )
         self.assertIn('if "dataset_order" in _supported_fields:', source)
         # The unconditional kwargs must NOT include either gated field.
         unconditional_block_start = source.find("mlx_config_kwargs = dict(")

From 1a026435e6aa428e7240d5523c2e89f2c9d4c8f9 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <info@unsloth.ai>
Date: Sun, 24 May 2026 15:48:47 +0000
Subject: [PATCH 12/22] Normalize seed / cast / max_grad_value at
 TrainingBackend for PR #5656

Round-3 review consensus: the per-field guards that landed in the MLX
worker only protect the MLX path. The same `TrainingBackend.start_training`
config still reaches the CUDA/text trainer at `worker.py:2267`, the
embedding LoRA init at `worker.py:2450`, and embedding TrainingArguments
at `worker.py:2624` with raw `None` values, so an explicit
`random_seed=None` from a raw / backend caller still breaks non-MLX
training even after the previous fix.

Move the normalization into `TrainingBackend.start_training` itself,
where it runs once for every training mode:

- `_coerce_seed(value)`: explicit `None`, absent, or values that `int()`
  cannot convert all become 3407; int-like values such as "123" are
  coerced to int. Every downstream worker now sees an int.
- `_coerce_optional_bool(value, default)`: explicit `None` falls back
  to `default` instead of `bool(None) == False`. Also normalizes the
  common raw-config / YAML string aliases ("true" / "false" / "0" /
  "1"). Used for `cast_norm_output_to_input_dtype`.
- `_coerce_optional_nonneg_float(name, value)`: rejects negative
  numerics from raw / backend callers, matching the Pydantic
  `ge=0` constraint the HTTP route already enforces. Used for
  `max_grad_value`.

worker.py MLX path: the existing `bool(config.get(key, True))` for
`cast_norm_output_to_input_dtype` was changed to also fall back on
explicit `None`, so direct worker callers (bypassing
`TrainingBackend.start_training`) are equally safe. `max_grad_value`
also raises on negative values inside the worker for the same reason.

TrainingStartRequest.random_seed default bumped from 42 to 3407 so
direct REST callers that omit the field receive the same default as
the Studio frontend and the MLX worker.

New regression test exercises the three new helpers across explicit
None, valid values, string aliases, and negative-value rejection.
---
 studio/backend/core/training/training.py      | 66 +++++++++++++++++--
 studio/backend/core/training/worker.py        | 17 ++++-
 studio/backend/models/training.py             |  8 ++-
 .../tests/test_training_raw_support.py        | 37 +++++++++--
 4 files changed, 116 insertions(+), 12 deletions(-)

diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py
index bf119f5567..644b160458 100644
--- a/studio/backend/core/training/training.py
+++ b/studio/backend/core/training/training.py
@@ -41,6 +41,58 @@
 logger = get_logger(__name__)
 
 
+def _coerce_seed(value, default = 3407) -> int:
+    """Treat absent / None / non-int values uniformly as `default`.
+
+    transformers.set_seed(None) raises TypeError, and PEFT init with
+    random_state=None disables determinism. Normalize once here so MLX,
+    CUDA, and embedding workers all receive a usable int seed.
+    """
+    if value is None:
+        return int(default)
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return int(default)
+
+
+def _coerce_optional_bool(value, default: bool) -> bool:
+    """Treat explicit None as `default` instead of `bool(None) == False`."""
+    if value is None:
+        return bool(default)
+    if isinstance(value, str):
+        normalized = value.strip().lower()
+        if normalized in ("true", "1", "yes", "on"):
+            return True
+        if normalized in ("false", "0", "no", "off", ""):
+            return False
+    return bool(value)
+
+
+def _coerce_optional_nonneg_float(name: str, value):
+    """Reject negative numeric values from raw/backend callers.
+
+    The Pydantic route model already enforces `ge=0` on these fields,
+    but `TrainingBackend.start_training(**kwargs)` accepts arbitrary
+    kwargs; without this guard, a negative value would bypass the HTTP
+    validator and reach the worker, where MLX silently disables the
+    clip (treats non-positive as "off") instead of erroring loudly.
+    """
+    if value is None:
+        return None
+    try:
+        coerced = float(value)
+    except (TypeError, ValueError):
+        raise ValueError(
+            f"Unsloth: {name}={value!r} must be a non-negative float or None."
+        )
+    if coerced < 0:
+        raise ValueError(
+            f"Unsloth: {name}={coerced} must be >= 0 (use 0 or None to disable)."
+        )
+    return coerced
+
+
 _HF_TMP_CHECKPOINT_RE = re.compile(r"^tmp-checkpoint-\d+$")
 
 
@@ -218,11 +270,17 @@ def start_training(self, job_id: str, **kwargs) -> bool:
             "save_steps": kwargs.get("save_steps", 0),
             "weight_decay": kwargs.get("weight_decay", 0.001),
             "max_grad_norm": kwargs.get("max_grad_norm", 0.0),
-            "max_grad_value": kwargs.get("max_grad_value"),
-            "cast_norm_output_to_input_dtype": kwargs.get(
-                "cast_norm_output_to_input_dtype", True
+            "max_grad_value": _coerce_optional_nonneg_float(
+                "max_grad_value", kwargs.get("max_grad_value")
+            ),
+            "cast_norm_output_to_input_dtype": _coerce_optional_bool(
+                kwargs.get("cast_norm_output_to_input_dtype"), True
             ),
-            "random_seed": kwargs.get("random_seed", 3407),
+            # Normalize seed once for every training path. An explicit
+            # None from a raw / backend caller is treated the same as
+            # an absent key, so MLX, CUDA, and embedding workers all
+            # see an int (transformers.set_seed(None) raises TypeError).
+            "random_seed": _coerce_seed(kwargs.get("random_seed")),
             "packing": kwargs.get("packing", False),
             "optim": kwargs.get("optim", "adamw_8bit"),
             "lr_scheduler_type": kwargs.get("lr_scheduler_type", "linear"),
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 644b33829d..0488743f97 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1401,9 +1401,17 @@ def _fmt_progress(status_message = "", **_kw):
 
     # MLX Studio uses per-element clipping by default and keeps norm clipping
     # disabled. Preserve None so the MLX trainer owns its runtime default.
+    # `training.py` already normalizes / validates these; double-check
+    # here for direct worker callers and explicit-None robustness.
     max_grad_norm = 0.0
     max_grad_value = config.get("max_grad_value")
-    max_grad_value = None if max_grad_value is None else float(max_grad_value)
+    if max_grad_value is not None:
+        max_grad_value = float(max_grad_value)
+        if max_grad_value < 0:
+            raise ValueError(
+                f"Unsloth MLX: max_grad_value={max_grad_value} must be >= 0 "
+                "(0 or None disables elementwise clipping)."
+            )
     weight_decay = config.get("weight_decay", 0.001)
     weight_decay = 0.001 if weight_decay is None else float(weight_decay)
 
@@ -1437,8 +1445,11 @@ def _fmt_progress(status_message = "", **_kw):
     # constructable; once the floor is bumped this guard is a no-op.
     _supported_fields = getattr(MLXTrainingConfig, "__dataclass_fields__", {})
     if "cast_norm_output_to_input_dtype" in _supported_fields:
-        mlx_config_kwargs["cast_norm_output_to_input_dtype"] = bool(
-            config.get("cast_norm_output_to_input_dtype", True)
+        # Explicit None must fall back to the default True; raw / backend
+        # callers can pass None via `kwargs.get(key, True)` upstream.
+        _raw_cast = config.get("cast_norm_output_to_input_dtype", True)
+        mlx_config_kwargs["cast_norm_output_to_input_dtype"] = (
+            True if _raw_cast is None else bool(_raw_cast)
         )
     if "dataset_order" in _supported_fields:
         mlx_config_kwargs["dataset_order"] = "torch_randperm"
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index 4b64454bc1..99141a601e 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -282,7 +282,13 @@ def _check_lora_dropout(cls, v: float) -> float:
             "back to the incoming activation dtype."
         ),
     )
-    random_seed: int = Field(42, description = "Random seed")
+    random_seed: int = Field(
+        3407,
+        description = (
+            "Random seed; matches the Studio backend / MLX worker default "
+            "and unsloth's historical recommended value."
+        ),
+    )
     packing: bool = Field(False, description = "Enable sequence packing")
     optim: str = Field("adamw_8bit", description = "Optimizer")
     lr_scheduler_type: str = Field("linear", description = "Learning rate scheduler type")
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index cddfbae30f..83a9f7f7ec 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -190,15 +190,44 @@ def test_mlx_worker_falls_back_init_seeds_to_random_seed(self):
     def test_mlx_worker_preserves_null_max_grad_value_for_trainer_default(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
 
-        self.assertIn(
-            "max_grad_value = None if max_grad_value is None else float(max_grad_value)",
-            source,
-        )
+        # None must survive to the MLX trainer so it picks its own runtime
+        # default, and any other value must coerce to float without
+        # rebinding None to 1.0 (which the legacy code did).
+        self.assertIn('max_grad_value = config.get("max_grad_value")', source)
+        self.assertIn("max_grad_value = float(max_grad_value)", source)
         self.assertNotIn(
             "max_grad_value = 1.0 if max_grad_value is None else float(max_grad_value)",
             source,
         )
 
+    def test_training_backend_normalizes_explicit_none_seed_and_dtypes(self):
+        # Raw / backend callers can pass `random_seed=None`,
+        # `cast_norm_output_to_input_dtype=None`, and
+        # `max_grad_value=None` (or omit them) and must NOT leak the
+        # `None` past `TrainingBackend.start_training`. Otherwise
+        # transformers.set_seed(None) raises, PEFT init becomes
+        # nondeterministic, and the MLX norm-output cast silently flips.
+        from core.training.training import (
+            _coerce_seed,
+            _coerce_optional_bool,
+            _coerce_optional_nonneg_float,
+        )
+
+        self.assertEqual(_coerce_seed(None), 3407)
+        self.assertEqual(_coerce_seed("123"), 123)
+        self.assertEqual(_coerce_seed("not-a-number"), 3407)
+
+        self.assertTrue(_coerce_optional_bool(None, True))
+        self.assertFalse(_coerce_optional_bool(None, False))
+        self.assertFalse(_coerce_optional_bool("false", True))
+        self.assertTrue(_coerce_optional_bool("true", False))
+
+        self.assertIsNone(_coerce_optional_nonneg_float("max_grad_value", None))
+        self.assertEqual(_coerce_optional_nonneg_float("max_grad_value", "2.5"), 2.5)
+        self.assertEqual(_coerce_optional_nonneg_float("max_grad_value", 0), 0.0)
+        with self.assertRaises(ValueError):
+            _coerce_optional_nonneg_float("max_grad_value", -1)
+
     def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
         # `cast_norm_output_to_input_dtype` and `dataset_order` ship in the
         # paired unsloth-zoo update. Until that floor is in place, the

From e293af1dbf55e65566a5833aa38f388fd824cfb6 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <info@unsloth.ai>
Date: Sun, 24 May 2026 15:51:18 +0000
Subject: [PATCH 13/22] Tighten feature-detect test paren tracking for PR #5656

The block extraction used `source.find(")", ...)`, which stops at the
first inner closing paren (e.g. `int(config.get("save_steps", 0) or 0)`)
and would silently miss a future unconditional
`cast_norm_output_to_input_dtype` / `dataset_order` added later in the
same dict literal. Switched to proper paren-depth tracking so the
unconditional block is checked end-to-end.
---
 .../tests/test_training_raw_support.py        | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 83a9f7f7ec..3c175f7fc5 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -244,9 +244,26 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
         )
         self.assertIn('if "dataset_order" in _supported_fields:', source)
         # The unconditional kwargs must NOT include either gated field.
+        # Use proper paren tracking; `source.find(")", ...)` would stop at
+        # the first close paren inside the dict body (e.g.
+        # `int(config.get("save_steps", 0) or 0)`) and miss any future
+        # unconditional addition of the gated fields later in the dict.
         unconditional_block_start = source.find("mlx_config_kwargs = dict(")
-        unconditional_block_end = source.find(")", unconditional_block_start)
-        unconditional = source[unconditional_block_start:unconditional_block_end]
+        self.assertNotEqual(unconditional_block_start, -1)
+        depth = 0
+        i = unconditional_block_start + len("mlx_config_kwargs = dict")
+        end = i
+        while i < len(source):
+            ch = source[i]
+            if ch == "(":
+                depth += 1
+            elif ch == ")":
+                depth -= 1
+                if depth == 0:
+                    end = i + 1
+                    break
+            i += 1
+        unconditional = source[unconditional_block_start:end]
         self.assertNotIn("cast_norm_output_to_input_dtype", unconditional)
         self.assertNotIn("dataset_order", unconditional)
 

From 962ca2830ecffc3893155b75c55a001e9ddae7b3 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <info@unsloth.ai>
Date: Mon, 25 May 2026 13:35:07 +0000
Subject: [PATCH 14/22] Shorten verbose comments in MLX Studio backend

---
 studio/backend/core/training/training.py | 21 +++------------------
 studio/backend/core/training/worker.py   | 23 ++++++-----------------
 tests/studio/run_real_mlx_smoke.py       |  2 +-
 3 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py
index 644b160458..bcc32f374d 100644
--- a/studio/backend/core/training/training.py
+++ b/studio/backend/core/training/training.py
@@ -42,12 +42,7 @@
 
 
 def _coerce_seed(value, default = 3407) -> int:
-    """Treat absent / None / non-int values uniformly as `default`.
-
-    transformers.set_seed(None) raises TypeError, and PEFT init with
-    random_state=None disables determinism. Normalize once here so MLX,
-    CUDA, and embedding workers all receive a usable int seed.
-    """
+    """Normalize None / non-int to `default` (transformers.set_seed(None) raises)."""
     if value is None:
         return int(default)
     try:
@@ -70,14 +65,7 @@ def _coerce_optional_bool(value, default: bool) -> bool:
 
 
 def _coerce_optional_nonneg_float(name: str, value):
-    """Reject negative numeric values from raw/backend callers.
-
-    The Pydantic route model already enforces `ge=0` on these fields,
-    but `TrainingBackend.start_training(**kwargs)` accepts arbitrary
-    kwargs; without this guard, a negative value would bypass the HTTP
-    validator and reach the worker, where MLX silently disables the
-    clip (treats non-positive as "off") instead of erroring loudly.
-    """
+    """Reject negatives; HTTP `ge=0` doesn't cover raw `**kwargs` callers."""
     if value is None:
         return None
     try:
@@ -276,10 +264,7 @@ def start_training(self, job_id: str, **kwargs) -> bool:
             "cast_norm_output_to_input_dtype": _coerce_optional_bool(
                 kwargs.get("cast_norm_output_to_input_dtype"), True
             ),
-            # Normalize seed once for every training path. An explicit
-            # None from a raw / backend caller is treated the same as
-            # an absent key, so MLX, CUDA, and embedding workers all
-            # see an int (transformers.set_seed(None) raises TypeError).
+            # MLX/CUDA/embedding workers need an int (transformers.set_seed(None) raises).
             "random_seed": _coerce_seed(kwargs.get("random_seed")),
             "packing": kwargs.get("packing", False),
             "optim": kwargs.get("optim", "adamw_8bit"),
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 0488743f97..45308deb9b 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1156,15 +1156,10 @@ def _send(event_type, **kwargs):
     is_dataset_image = bool(config.get("is_dataset_image", False))
     training_type = config.get("training_type", "LoRA/QLoRA")
     use_lora = training_type == "LoRA/QLoRA"
-    # Normalize random_seed so an explicit None from a raw/backend caller
-    # does not propagate through the seed chain. Mirrors the override
-    # handling for model/LoRA seeds below.
+    # Normalize seed; explicit None must not reach the seed chain.
     _raw_seed = config.get("random_seed", 3407)
     random_seed = 3407 if _raw_seed is None else int(_raw_seed)
-    # Treat absent OR explicit None the same way: fall back to random_seed.
-    # `config.get(key, default)` only fills the default when the key is
-    # missing; an explicit `None` would otherwise reach FastMLXModel and
-    # disable deterministic init silently.
+    # `config.get(k, d)` only fills d when key is missing; handle explicit None too.
     _model_seed = config.get("model_random_state")
     model_random_state = random_seed if _model_seed is None else int(_model_seed)
     _lora_seed = config.get("lora_random_state")
@@ -1399,10 +1394,8 @@ def _fmt_progress(status_message = "", **_kw):
     else:
         eval_steps_val = int(eval_steps_val)
 
-    # MLX Studio uses per-element clipping by default and keeps norm clipping
-    # disabled. Preserve None so the MLX trainer owns its runtime default.
-    # `training.py` already normalizes / validates these; double-check
-    # here for direct worker callers and explicit-None robustness.
+    # Per-element clipping only; trainer owns the None default. Re-validate
+    # for direct worker callers (training.py normalizes the main path).
     max_grad_norm = 0.0
     max_grad_value = config.get("max_grad_value")
     if max_grad_value is not None:
@@ -1439,14 +1432,10 @@ def _fmt_progress(status_message = "", **_kw):
         eval_steps = eval_steps_val,
     )
 
-    # Feature-detect optional MLXTrainingConfig fields so this PR does
-    # not require the paired unsloth-zoo change to be merged/released
-    # first. Released zoo trees that predate those fields are still
-    # constructable; once the floor is bumped this guard is a no-op.
+    # Feature-detect optional fields so this PR works without the paired zoo bump.
     _supported_fields = getattr(MLXTrainingConfig, "__dataclass_fields__", {})
     if "cast_norm_output_to_input_dtype" in _supported_fields:
-        # Explicit None must fall back to the default True; raw / backend
-        # callers can pass None via `kwargs.get(key, True)` upstream.
+        # Explicit None falls back to True (default).
         _raw_cast = config.get("cast_norm_output_to_input_dtype", True)
         mlx_config_kwargs["cast_norm_output_to_input_dtype"] = (
             True if _raw_cast is None else bool(_raw_cast)
diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index 8862fa1d68..f30ebc8a52 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -28,7 +28,7 @@
          clear reason if save raises; e.g. llama.cpp's
          convert_hf_to_gguf currently asserts on Gemma-3-270m's
          tokenizer vocab. Soft-skipped so the LoRA + merged checks
-         continue to gate the PR.)
+         continue to gate the suite.)
   8. Emits `train_metrics.json` with per-phase timing / peak GPU /
      peak RSS / per-step losses / pre+post grad norms / generations
      / gguf_supported flag, for regression detection across CI runs.

From 65cd01954a3ee14eb6f923f730c586124dc76a1a Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Tue, 26 May 2026 12:04:34 -0500
Subject: [PATCH 15/22] Handle MLX Studio EOS appending by mode

---
 studio/backend/core/training/worker.py            |  5 +++++
 studio/backend/tests/test_training_raw_support.py | 10 +++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 45308deb9b..a6d0a3dcb3 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1442,6 +1442,11 @@ def _fmt_progress(status_message = "", **_kw):
         )
     if "dataset_order" in _supported_fields:
         mlx_config_kwargs["dataset_order"] = "torch_randperm"
+    if "append_eos" in _supported_fields:
+        raw_text_mode = training_type == "Continued Pretraining" or format_type == "raw"
+        # Studio SFT formatting owns rendered examples; raw/CPT text still
+        # needs MLX to append EOS like the CUDA raw-text path.
+        mlx_config_kwargs["append_eos"] = bool(raw_text_mode)
 
     trainer = MLXTrainer(
         model = model,
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 3c175f7fc5..9d944ce690 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -229,8 +229,8 @@ def test_training_backend_normalizes_explicit_none_seed_and_dtypes(self):
             _coerce_optional_nonneg_float("max_grad_value", -1)
 
     def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
-        # `cast_norm_output_to_input_dtype` and `dataset_order` ship in the
-        # paired unsloth-zoo update. Until that floor is in place, the
+        # `cast_norm_output_to_input_dtype`, `dataset_order`, and
+        # `append_eos` ship in the paired unsloth-zoo update. Until that floor is in place, the
         # worker must gate them so releases that predate those fields can
         # still construct MLXTrainingConfig without TypeError.
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
@@ -243,7 +243,10 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
             'if "cast_norm_output_to_input_dtype" in _supported_fields:', source
         )
         self.assertIn('if "dataset_order" in _supported_fields:', source)
-        # The unconditional kwargs must NOT include either gated field.
+        self.assertIn('if "append_eos" in _supported_fields:', source)
+        self.assertIn('format_type == "raw"', source)
+        self.assertIn('mlx_config_kwargs["append_eos"] = bool(raw_text_mode)', source)
+        # The unconditional kwargs must NOT include any gated field.
         # Use proper paren tracking; `source.find(")", ...)` would stop at
         # the first close paren inside the dict body (e.g.
         # `int(config.get("save_steps", 0) or 0)`) and miss any future
@@ -266,6 +269,7 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
         unconditional = source[unconditional_block_start:end]
         self.assertNotIn("cast_norm_output_to_input_dtype", unconditional)
         self.assertNotIn("dataset_order", unconditional)
+        self.assertNotIn("append_eos", unconditional)
 
     def test_training_route_forwards_embedding_learning_rate(self):
         training_route = _load_route_module(

From d66f4a71a14c9a3d119f27aea593ad07e2d4926c Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Tue, 26 May 2026 12:20:59 -0500
Subject: [PATCH 16/22] Wire MLX leaf norm clipping through Studio

---
 studio/backend/core/training/training.py      |  3 +++
 studio/backend/core/training/worker.py        | 10 ++++++++
 studio/backend/models/training.py             |  9 +++++++
 .../tests/test_training_raw_support.py        | 24 +++++++++++++++----
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py
index bcc32f374d..38471b5d33 100644
--- a/studio/backend/core/training/training.py
+++ b/studio/backend/core/training/training.py
@@ -261,6 +261,9 @@ def start_training(self, job_id: str, **kwargs) -> bool:
             "max_grad_value": _coerce_optional_nonneg_float(
                 "max_grad_value", kwargs.get("max_grad_value")
             ),
+            "max_grad_leaf_norm": _coerce_optional_nonneg_float(
+                "max_grad_leaf_norm", kwargs.get("max_grad_leaf_norm")
+            ),
             "cast_norm_output_to_input_dtype": _coerce_optional_bool(
                 kwargs.get("cast_norm_output_to_input_dtype"), True
             ),
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index a6d0a3dcb3..1809e6047c 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1405,6 +1405,14 @@ def _fmt_progress(status_message = "", **_kw):
                 f"Unsloth MLX: max_grad_value={max_grad_value} must be >= 0 "
                 "(0 or None disables elementwise clipping)."
             )
+    max_grad_leaf_norm = config.get("max_grad_leaf_norm")
+    if max_grad_leaf_norm is not None:
+        max_grad_leaf_norm = float(max_grad_leaf_norm)
+        if max_grad_leaf_norm < 0:
+            raise ValueError(
+                f"Unsloth MLX: max_grad_leaf_norm={max_grad_leaf_norm} must be >= 0 "
+                "(0 or None disables proportional leaf-norm clipping)."
+            )
     weight_decay = config.get("weight_decay", 0.001)
     weight_decay = 0.001 if weight_decay is None else float(weight_decay)
 
@@ -1442,6 +1450,8 @@ def _fmt_progress(status_message = "", **_kw):
         )
     if "dataset_order" in _supported_fields:
         mlx_config_kwargs["dataset_order"] = "torch_randperm"
+    if "max_grad_leaf_norm" in _supported_fields:
+        mlx_config_kwargs["max_grad_leaf_norm"] = max_grad_leaf_norm
     if "append_eos" in _supported_fields:
         raw_text_mode = training_type == "Continued Pretraining" or format_type == "raw"
         # Studio SFT formatting owns rendered examples; raw/CPT text still
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index 99141a601e..91c7998843 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -275,6 +275,15 @@ def _check_lora_dropout(cls, v: float) -> float:
             "If unset, MLX uses its runtime default."
         ),
     )
+    max_grad_leaf_norm: Optional[float] = Field(
+        None,
+        ge = 0,
+        description = (
+            "MLX-only proportional per-parameter gradient norm cap. "
+            "Preserves each tensor's gradient direction without global norm "
+            "clipping's memory overhead."
+        ),
+    )
     cast_norm_output_to_input_dtype: bool = Field(
         True,
         description = (
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 9d944ce690..664e2e3d13 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -108,11 +108,13 @@ def start(self):
                 training_type = "LoRA/QLoRA",
                 max_grad_norm = 0.7,
                 max_grad_value = 3.0,
+                max_grad_leaf_norm = 1.3,
             )
 
         config = mock_process.call_args.kwargs["kwargs"]["config"]
         self.assertEqual(config["max_grad_norm"], 0.7)
         self.assertEqual(config["max_grad_value"], 3.0)
+        self.assertEqual(config["max_grad_leaf_norm"], 1.3)
 
     def test_training_backend_forwards_random_seed_without_internal_mlx_seed_keys(self):
         backend = TrainingBackend()
@@ -202,8 +204,8 @@ def test_mlx_worker_preserves_null_max_grad_value_for_trainer_default(self):
 
     def test_training_backend_normalizes_explicit_none_seed_and_dtypes(self):
         # Raw / backend callers can pass `random_seed=None`,
-        # `cast_norm_output_to_input_dtype=None`, and
-        # `max_grad_value=None` (or omit them) and must NOT leak the
+        # `cast_norm_output_to_input_dtype=None`, and MLX clip knobs
+        # as None (or omit them) and must NOT leak the
         # `None` past `TrainingBackend.start_training`. Otherwise
         # transformers.set_seed(None) raises, PEFT init becomes
         # nondeterministic, and the MLX norm-output cast silently flips.
@@ -227,10 +229,18 @@ def test_training_backend_normalizes_explicit_none_seed_and_dtypes(self):
         self.assertEqual(_coerce_optional_nonneg_float("max_grad_value", 0), 0.0)
         with self.assertRaises(ValueError):
             _coerce_optional_nonneg_float("max_grad_value", -1)
+        self.assertIsNone(_coerce_optional_nonneg_float("max_grad_leaf_norm", None))
+        self.assertEqual(
+            _coerce_optional_nonneg_float("max_grad_leaf_norm", "1.3"),
+            1.3,
+        )
+        with self.assertRaises(ValueError):
+            _coerce_optional_nonneg_float("max_grad_leaf_norm", -1)
 
     def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
-        # `cast_norm_output_to_input_dtype`, `dataset_order`, and
-        # `append_eos` ship in the paired unsloth-zoo update. Until that floor is in place, the
+        # `cast_norm_output_to_input_dtype`, `dataset_order`,
+        # `max_grad_leaf_norm`, and `append_eos` ship in the paired
+        # unsloth-zoo update. Until that floor is in place, the
         # worker must gate them so releases that predate those fields can
         # still construct MLXTrainingConfig without TypeError.
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
@@ -243,6 +253,11 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
             'if "cast_norm_output_to_input_dtype" in _supported_fields:', source
         )
         self.assertIn('if "dataset_order" in _supported_fields:', source)
+        self.assertIn('if "max_grad_leaf_norm" in _supported_fields:', source)
+        self.assertIn(
+            'mlx_config_kwargs["max_grad_leaf_norm"] = max_grad_leaf_norm',
+            source,
+        )
         self.assertIn('if "append_eos" in _supported_fields:', source)
         self.assertIn('format_type == "raw"', source)
         self.assertIn('mlx_config_kwargs["append_eos"] = bool(raw_text_mode)', source)
@@ -269,6 +284,7 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
         unconditional = source[unconditional_block_start:end]
         self.assertNotIn("cast_norm_output_to_input_dtype", unconditional)
         self.assertNotIn("dataset_order", unconditional)
+        self.assertNotIn("max_grad_leaf_norm", unconditional)
         self.assertNotIn("append_eos", unconditional)
 
     def test_training_route_forwards_embedding_learning_rate(self):

From 6a406cb4910565d3abd501327ffd4f0efbd452f4 Mon Sep 17 00:00:00 2001
From: DoubleMathew <mmathew23@gmail.com>
Date: Tue, 26 May 2026 15:57:35 -0500
Subject: [PATCH 17/22] Respect VLM layer filters for explicit LoRA targets

Rationale / guardrails for the local Studio/vision push:

When callers provide explicit VLM LoRA target_modules together with layer filters, FastVisionModel still needs to route the explicit targets through get_peft_regex. Otherwise the layer filters are ignored and adapters can be attached outside the requested language/vision scope.

Do not revert this to plain list(target_modules) for explicit module lists. The CUDA/Studio-facing contract is that explicit targets and layer filters compose: target_modules selects module names, while finetune_language_layers / finetune_vision_layers / finetune_attention_modules / finetune_mlp_modules constrain where those targets are allowed.

The regression test covers the language-only explicit q_proj case and source-checks that explicit targets are wrapped through get_peft_regex when filters are active.
---
 tests/python/test_vision_lora_targeting.py | 41 ++++++++++++++++++++++
 unsloth/models/vision.py                   | 14 ++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 tests/python/test_vision_lora_targeting.py

diff --git a/tests/python/test_vision_lora_targeting.py b/tests/python/test_vision_lora_targeting.py
new file mode 100644
index 0000000000..9e5a8da626
--- /dev/null
+++ b/tests/python/test_vision_lora_targeting.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+import re
+
+import torch
+
+
+def test_vlm_lora_regex_respects_language_only_with_explicit_targets():
+    from unsloth_zoo.peft_utils import get_peft_regex
+
+    class FakeVLM(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.language_model = torch.nn.Module()
+            self.language_model.layers = torch.nn.ModuleList([torch.nn.Module()])
+            self.language_model.layers[0].self_attn = torch.nn.Module()
+            self.language_model.layers[0].self_attn.q_proj = torch.nn.Linear(4, 4)
+            self.vision_tower = torch.nn.Module()
+            self.vision_tower.vision_model = torch.nn.Module()
+            self.vision_tower.vision_model.encoder = torch.nn.Module()
+            self.vision_tower.vision_model.encoder.layers = torch.nn.ModuleList([torch.nn.Module()])
+            self.vision_tower.vision_model.encoder.layers[0].self_attn = torch.nn.Module()
+            self.vision_tower.vision_model.encoder.layers[0].self_attn.q_proj = torch.nn.Linear(4, 4)
+
+    regex = get_peft_regex(
+        FakeVLM(),
+        finetune_vision_layers=False,
+        finetune_language_layers=True,
+        finetune_attention_modules=True,
+        finetune_mlp_modules=True,
+        target_modules=["q_proj"],
+    )
+
+    assert re.search(regex, "language_model.layers.0.self_attn.q_proj")
+    assert not re.search(regex, "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj")
+
+
+def test_fast_vision_model_wraps_explicit_targets_when_layer_filters_are_used():
+    source = Path("unsloth/models/vision.py").read_text()
+
+    assert "target_modules = get_peft_regex(" in source
+    assert "target_modules = list(target_modules)" in source
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index df371e00c8..53470f2ecc 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -1371,6 +1371,20 @@ def get_peft_model(
                 tuple,
                 str,
             )
+            if type(target_modules) in (list, tuple) and (
+                not finetune_vision_layers
+                or not finetune_language_layers
+                or not finetune_attention_modules
+                or not finetune_mlp_modules
+            ):
+                target_modules = get_peft_regex(
+                    model,
+                    finetune_vision_layers = finetune_vision_layers,
+                    finetune_language_layers = finetune_language_layers,
+                    finetune_attention_modules = finetune_attention_modules,
+                    finetune_mlp_modules = finetune_mlp_modules,
+                    target_modules = list(target_modules),
+                )
 
         if hasattr(model, "vllm_engine"):
             if (

From ad8bf147e160ece09f96c3c7db632ff66ac71fa7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 May 2026 00:52:59 +0000
Subject: [PATCH 18/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/python/test_vision_lora_targeting.py | 26 ++++++++++++++--------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tests/python/test_vision_lora_targeting.py b/tests/python/test_vision_lora_targeting.py
index 9e5a8da626..5f9233827b 100644
--- a/tests/python/test_vision_lora_targeting.py
+++ b/tests/python/test_vision_lora_targeting.py
@@ -17,21 +17,29 @@ def __init__(self):
             self.vision_tower = torch.nn.Module()
             self.vision_tower.vision_model = torch.nn.Module()
             self.vision_tower.vision_model.encoder = torch.nn.Module()
-            self.vision_tower.vision_model.encoder.layers = torch.nn.ModuleList([torch.nn.Module()])
-            self.vision_tower.vision_model.encoder.layers[0].self_attn = torch.nn.Module()
-            self.vision_tower.vision_model.encoder.layers[0].self_attn.q_proj = torch.nn.Linear(4, 4)
+            self.vision_tower.vision_model.encoder.layers = torch.nn.ModuleList(
+                [torch.nn.Module()]
+            )
+            self.vision_tower.vision_model.encoder.layers[
+                0
+            ].self_attn = torch.nn.Module()
+            self.vision_tower.vision_model.encoder.layers[
+                0
+            ].self_attn.q_proj = torch.nn.Linear(4, 4)
 
     regex = get_peft_regex(
         FakeVLM(),
-        finetune_vision_layers=False,
-        finetune_language_layers=True,
-        finetune_attention_modules=True,
-        finetune_mlp_modules=True,
-        target_modules=["q_proj"],
+        finetune_vision_layers = False,
+        finetune_language_layers = True,
+        finetune_attention_modules = True,
+        finetune_mlp_modules = True,
+        target_modules = ["q_proj"],
     )
 
     assert re.search(regex, "language_model.layers.0.self_attn.q_proj")
-    assert not re.search(regex, "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj")
+    assert not re.search(
+        regex, "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj"
+    )
 
 
 def test_fast_vision_model_wraps_explicit_targets_when_layer_filters_are_used():

From 976520c91999254fe9f46abe848a7aca1ddd0cb3 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <info@unsloth.ai>
Date: Wed, 27 May 2026 11:16:02 +0000
Subject: [PATCH 19/22] Refresh MLX smoke clip-config note for leaf_norm
 default

Trim the 11-line comment block to 5 lines and correct the stale claim
that MLXTrainingConfig defaults to max_grad_value=1.0. The new default
is max_grad_leaf_norm=1.0 (same memory profile as elementwise clipping,
but direction-preserving). The smoke test still pins max_grad_value=1.0
explicitly to keep the 13-seed pass-rate fixture stable.
---
 tests/studio/run_real_mlx_smoke.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index f30ebc8a52..d0f16545f5 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -331,17 +331,11 @@ def cmd_train(args) -> int:
             lr_scheduler_type = "constant",
             optim = "adamw",
             weight_decay = 0.0,
-            # max_grad_value (elementwise) is materially cheaper than
-            # max_grad_norm on MLX -- norm clip needs a cross-tree
-            # reduction + materializing all grad tensors at full
-            # precision, value clip is tree_map(mx.clip) per leaf.
-            # MLXTrainingConfig defaults to max_grad_value=1.0 for
-            # exactly this reason; pin both explicitly here so the
-            # configured clip matches what runs (the trainer prints a
-            # notice when both > 0 and value wins, so disable norm).
-            # Empirical 13-seed pass rate at this fixture: value=1.0
-            # 62%, norm=1.0 46%, value=5.0 33%, value=0.5 77% -- the
-            # cheaper default is also the higher-pass-rate default.
+            # Pin the elementwise clip explicitly to match the existing
+            # 13-seed-tested smoke fixture (value=1.0 -> 62% pass; norm=1.0
+            # -> 46%). New MLX default is max_grad_leaf_norm=1.0 (per-leaf
+            # L2 rescale, same memory profile as elementwise, preserves
+            # direction). max_grad_norm pays a cross-tree reduction cost.
             max_grad_norm = 0.0,
             max_grad_value = 1.0,
             logging_steps = 1,

From ae6c2594a21a6714cf0005fa11d7858a7b67d8b8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 May 2026 13:35:36 +0000
Subject: [PATCH 20/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_cpu_threads.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/studio/backend/tests/test_cpu_threads.py b/studio/backend/tests/test_cpu_threads.py
index 0dbcbdb74b..1224941622 100644
--- a/studio/backend/tests/test_cpu_threads.py
+++ b/studio/backend/tests/test_cpu_threads.py
@@ -63,16 +63,18 @@ def test_cpu_thread_cap_is_opt_in(raw):
 
 
 # Anything that is not a positive integer raises a clear ValueError.
-@pytest.mark.parametrize("raw", ["zero", "0", "-3", "1.5", "abc", "8a", "0x4", "1e3", "4 0"])
+@pytest.mark.parametrize(
+    "raw", ["zero", "0", "-3", "1.5", "abc", "8a", "0x4", "1e3", "4 0"]
+)
 def test_cpu_thread_cap_requires_positive_integer(raw):
-    with pytest.raises(ValueError, match="must be a positive integer"):
+    with pytest.raises(ValueError, match = "must be a positive integer"):
         configure_cpu_threads({"UNSLOTH_CPU_THREADS": raw})
 
 
 # env=None path uses real os.environ (production call from run.py / main.py).
 def test_cpu_thread_cap_uses_os_environ_when_env_is_none(monkeypatch):
     for variable in (*_THREAD_POOL_ENV_VARS, "UNSLOTH_CPU_THREADS"):
-        monkeypatch.delenv(variable, raising=False)
+        monkeypatch.delenv(variable, raising = False)
     monkeypatch.setenv("UNSLOTH_CPU_THREADS", "3")
 
     configure_cpu_threads()
@@ -84,7 +86,7 @@ def test_cpu_thread_cap_uses_os_environ_when_env_is_none(monkeypatch):
 # Calling twice must not flip any seeded value.
 def test_cpu_thread_cap_idempotent(monkeypatch):
     for variable in (*_THREAD_POOL_ENV_VARS, "UNSLOTH_CPU_THREADS"):
-        monkeypatch.delenv(variable, raising=False)
+        monkeypatch.delenv(variable, raising = False)
     monkeypatch.setenv("UNSLOTH_CPU_THREADS", "5")
 
     configure_cpu_threads()
@@ -138,9 +140,9 @@ def test_invalid_cpu_thread_cap_exits_without_traceback(entry_point):
 
     result = subprocess.run(
         [sys.executable, str(entry_point)],
-        env=env,
-        capture_output=True,
-        text=True,
+        env = env,
+        capture_output = True,
+        text = True,
     )
 
     assert result.returncode == 1

From 71c363d5aebe68b156834f75f89dbceeddbb4ed2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jun 2026 10:21:35 +0000
Subject: [PATCH 21/22] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/training.py         |  8 ++------
 studio/backend/tests/test_cpu_threads.py         |  4 +---
 .../tests/test_mlx_training_worker_config.py     |  4 +---
 .../backend/tests/test_training_raw_support.py   |  4 +---
 tests/python/test_vision_lora_targeting.py       | 16 +++++-----------
 tests/studio/run_real_mlx_smoke.py               |  3 +--
 unsloth/models/vision.py                         |  6 +-----
 7 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py
index c2d33f499c..289e324b17 100644
--- a/studio/backend/core/training/training.py
+++ b/studio/backend/core/training/training.py
@@ -67,13 +67,9 @@ def _coerce_optional_nonneg_float(name: str, value):
     try:
         coerced = float(value)
     except (TypeError, ValueError):
-        raise ValueError(
-            f"Unsloth: {name}={value!r} must be a non-negative float or None."
-        )
+        raise ValueError(f"Unsloth: {name}={value!r} must be a non-negative float or None.")
     if coerced < 0:
-        raise ValueError(
-            f"Unsloth: {name}={coerced} must be >= 0 (use 0 or None to disable)."
-        )
+        raise ValueError(f"Unsloth: {name}={coerced} must be >= 0 (use 0 or None to disable).")
     return coerced
 
 
diff --git a/studio/backend/tests/test_cpu_threads.py b/studio/backend/tests/test_cpu_threads.py
index 28f6902004..2930c9f081 100644
--- a/studio/backend/tests/test_cpu_threads.py
+++ b/studio/backend/tests/test_cpu_threads.py
@@ -63,9 +63,7 @@ def test_cpu_thread_cap_is_opt_in(raw):
 
 
 # Anything that is not a positive integer raises a clear ValueError.
-@pytest.mark.parametrize(
-    "raw", ["zero", "0", "-3", "1.5", "abc", "8a", "0x4", "1e3", "4 0"]
-)
+@pytest.mark.parametrize("raw", ["zero", "0", "-3", "1.5", "abc", "8a", "0x4", "1e3", "4 0"])
 def test_cpu_thread_cap_requires_positive_integer(raw):
     with pytest.raises(ValueError, match = "must be a positive integer"):
         configure_cpu_threads({"UNSLOTH_CPU_THREADS": raw})
diff --git a/studio/backend/tests/test_mlx_training_worker_config.py b/studio/backend/tests/test_mlx_training_worker_config.py
index a914d4f980..44ef9045b4 100644
--- a/studio/backend/tests/test_mlx_training_worker_config.py
+++ b/studio/backend/tests/test_mlx_training_worker_config.py
@@ -86,9 +86,7 @@ def test_mlx_studio_rejects_unknown_scheduler():
 
 
 def test_mlx_studio_keeps_hf_style_tokenizer_dual_purpose():
-    source = (
-        Path(__file__).resolve().parents[1] / "core" / "training" / "worker.py"
-    ).read_text()
+    source = (Path(__file__).resolve().parents[1] / "core" / "training" / "worker.py").read_text()
 
     assert "tokenizer = tokenizer" in source
     assert "processor = tokenizer if is_vlm else None" not in source
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 977428a35b..9c66cda702 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -249,9 +249,7 @@ def test_mlx_worker_feature_detects_optional_mlx_config_fields(self):
             'getattr(MLXTrainingConfig, "__dataclass_fields__", {})',
             source,
         )
-        self.assertIn(
-            'if "cast_norm_output_to_input_dtype" in _supported_fields:', source
-        )
+        self.assertIn('if "cast_norm_output_to_input_dtype" in _supported_fields:', source)
         self.assertIn('if "dataset_order" in _supported_fields:', source)
         self.assertIn('if "max_grad_leaf_norm" in _supported_fields:', source)
         self.assertIn(
diff --git a/tests/python/test_vision_lora_targeting.py b/tests/python/test_vision_lora_targeting.py
index 5f9233827b..0a27569efd 100644
--- a/tests/python/test_vision_lora_targeting.py
+++ b/tests/python/test_vision_lora_targeting.py
@@ -17,15 +17,11 @@ def __init__(self):
             self.vision_tower = torch.nn.Module()
             self.vision_tower.vision_model = torch.nn.Module()
             self.vision_tower.vision_model.encoder = torch.nn.Module()
-            self.vision_tower.vision_model.encoder.layers = torch.nn.ModuleList(
-                [torch.nn.Module()]
+            self.vision_tower.vision_model.encoder.layers = torch.nn.ModuleList([torch.nn.Module()])
+            self.vision_tower.vision_model.encoder.layers[0].self_attn = torch.nn.Module()
+            self.vision_tower.vision_model.encoder.layers[0].self_attn.q_proj = torch.nn.Linear(
+                4, 4
             )
-            self.vision_tower.vision_model.encoder.layers[
-                0
-            ].self_attn = torch.nn.Module()
-            self.vision_tower.vision_model.encoder.layers[
-                0
-            ].self_attn.q_proj = torch.nn.Linear(4, 4)
 
     regex = get_peft_regex(
         FakeVLM(),
@@ -37,9 +33,7 @@ def __init__(self):
     )
 
     assert re.search(regex, "language_model.layers.0.self_attn.q_proj")
-    assert not re.search(
-        regex, "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj"
-    )
+    assert not re.search(regex, "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj")
 
 
 def test_fast_vision_model_wraps_explicit_targets_when_layer_filters_are_used():
diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index 0225dd7d7f..e58b2098d8 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -334,8 +334,7 @@ def _on_step(
     ), f"expected {expected_logged_steps} logged steps, got {losses_per_step}"
     if "train_steps" in train_result:
         assert int(train_result["train_steps"]) == expected_logged_steps, (
-            f"expected train_steps={expected_logged_steps}, got "
-            f"{train_result['train_steps']}"
+            f"expected train_steps={expected_logged_steps}, got " f"{train_result['train_steps']}"
         )
     for i, l in enumerate(losses_per_step):
         # Allow exact 0.0: fp16 per-step loss underflows to 0.0 after
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index c3f6cf92a7..346ec0b8b3 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -1398,11 +1398,7 @@ def get_peft_model(
                 finetune_mlp_modules = finetune_mlp_modules,
             )
         else:
-            assert type(target_modules) in (
-                list,
-                tuple,
-                str,
-            )
+            assert type(target_modules) in (list, tuple, str)
             if type(target_modules) in (list, tuple) and (
                 not finetune_vision_layers
                 or not finetune_language_layers

From d142420536eba16c3adb2ea6cd5101cea423d30f Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 12 Jun 2026 10:25:00 +0000
Subject: [PATCH 22/22] Forward max_grad_leaf_norm through the training route
 and warn when layer filters constrain explicit target_modules for PR #5656

---
 studio/backend/routes/training.py                 | 1 +
 studio/backend/tests/test_training_raw_support.py | 8 ++++++++
 unsloth/models/vision.py                          | 5 +++++
 3 files changed, 14 insertions(+)

diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 554c50acd4..154ba9492c 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -205,6 +205,7 @@ async def start_training(
             "weight_decay": request.weight_decay,
             "max_grad_norm": request.max_grad_norm,
             "max_grad_value": request.max_grad_value,
+            "max_grad_leaf_norm": request.max_grad_leaf_norm,
             "cast_norm_output_to_input_dtype": request.cast_norm_output_to_input_dtype,
             "random_seed": request.random_seed,
             "packing": request.packing,
diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py
index 9c66cda702..fb3cffc91e 100644
--- a/studio/backend/tests/test_training_raw_support.py
+++ b/studio/backend/tests/test_training_raw_support.py
@@ -160,6 +160,14 @@ def start(self):
         self.assertNotIn("model_random_state", config)
         self.assertNotIn("lora_random_state", config)
 
+    def test_route_forwards_all_grad_clipping_fields(self):
+        # The HTTP route builds the config dict by hand; a schema field that
+        # is not forwarded here is silently dropped for REST callers.
+        source = (_BACKEND_ROOT / "routes" / "training.py").read_text()
+        self.assertIn('"max_grad_norm": request.max_grad_norm', source)
+        self.assertIn('"max_grad_value": request.max_grad_value', source)
+        self.assertIn('"max_grad_leaf_norm": request.max_grad_leaf_norm', source)
+
     def test_mlx_worker_falls_back_init_seeds_to_random_seed(self):
         source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text()
 
diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py
index 346ec0b8b3..66f9cf3d1b 100644
--- a/unsloth/models/vision.py
+++ b/unsloth/models/vision.py
@@ -1405,6 +1405,11 @@ def get_peft_model(
                 or not finetune_attention_modules
                 or not finetune_mlp_modules
             ):
+                print(
+                    "Unsloth: Explicit target_modules are constrained by the "
+                    "finetune_(vision|language|attention|mlp) filters; adapters "
+                    "attach only where both select."
+                )
                 target_modules = get_peft_regex(
                     model,
                     finetune_vision_layers = finetune_vision_layers,