diff --git a/studio/backend/core/training/training.py b/studio/backend/core/training/training.py index 6dd42976c7..d69246f584 100644 --- a/studio/backend/core/training/training.py +++ b/studio/backend/core/training/training.py @@ -37,6 +37,42 @@ logger = get_logger(__name__) +def _coerce_seed(value, default = 3407) -> int: + """Normalize None / non-int to `default` (transformers.set_seed(None) raises).""" + if value is None: + return int(default) + try: + return int(value) + except (TypeError, ValueError): + return int(default) + + +def _coerce_optional_bool(value, default: bool) -> bool: + """Treat explicit None as `default` instead of `bool(None) == False`.""" + if value is None: + return bool(default) + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in ("true", "1", "yes", "on"): + return True + if normalized in ("false", "0", "no", "off", ""): + return False + return bool(value) + + +def _coerce_optional_nonneg_float(name: str, value): + """Reject negatives; HTTP `ge=0` doesn't cover raw `**kwargs` callers.""" + if value is None: + return None + try: + coerced = float(value) + except (TypeError, ValueError): + raise ValueError(f"Unsloth: {name}={value!r} must be a non-negative float or None.") + if coerced < 0: + raise ValueError(f"Unsloth: {name}={coerced} must be >= 0 (use 0 or None to disable).") + return coerced + + _HF_TMP_CHECKPOINT_RE = re.compile(r"^tmp-checkpoint-\d+$") @@ -239,7 +275,17 @@ def start_training(self, job_id: str, **kwargs) -> bool: "save_steps": kwargs.get("save_steps", 0), "weight_decay": kwargs.get("weight_decay", 0.001), "max_grad_norm": kwargs.get("max_grad_norm", 0.0), - "random_seed": kwargs.get("random_seed", 3407), + "max_grad_value": _coerce_optional_nonneg_float( + "max_grad_value", kwargs.get("max_grad_value") + ), + "max_grad_leaf_norm": _coerce_optional_nonneg_float( + "max_grad_leaf_norm", kwargs.get("max_grad_leaf_norm") + ), + "cast_norm_output_to_input_dtype": _coerce_optional_bool( + kwargs.get("cast_norm_output_to_input_dtype"), True + ), + # MLX/CUDA/embedding workers need an int (transformers.set_seed(None) raises). + "random_seed": _coerce_seed(kwargs.get("random_seed")), "packing": kwargs.get("packing", False), "optim": kwargs.get("optim", "adamw_8bit"), "lr_scheduler_type": kwargs.get("lr_scheduler_type", "linear"), diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 1120744a2d..696e9a07ff 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -1424,6 +1424,14 @@ def _poll_stop(): is_dataset_image = bool(config.get("is_dataset_image", False)) training_type = config.get("training_type", "LoRA/QLoRA") use_lora = training_type == "LoRA/QLoRA" + # Normalize seed; explicit None must not reach the seed chain. + _raw_seed = config.get("random_seed", 3407) + random_seed = 3407 if _raw_seed is None else int(_raw_seed) + # `config.get(k, d)` only fills d when key is missing; handle explicit None too. + _model_seed = config.get("model_random_state") + model_random_state = random_seed if _model_seed is None else int(_model_seed) + _lora_seed = config.get("lora_random_state") + lora_random_state = random_seed if _lora_seed is None else int(_lora_seed) model, tokenizer = FastMLXModel.from_pretrained( model_name, load_in_4bit = config.get("load_in_4bit", True), @@ -1431,7 +1439,7 @@ def _poll_stop(): text_only = None if is_dataset_image else True, token = hf_token, trust_remote_code = bool(config.get("trust_remote_code", False)), - random_state = config.get("random_seed", 3407), + random_state = model_random_state, ) is_vlm = bool(is_dataset_image and getattr(model, "_is_vlm_model", False)) @@ -1473,7 +1481,7 @@ def _poll_stop(): lora_dropout = config.get("lora_dropout", 0.0), use_rslora = config.get("use_rslora", False), init_lora_weights = config.get("init_lora_weights", True), - random_state = config.get("random_seed", 3407), + random_state = lora_random_state, target_modules = config.get("target_modules") or [ "q_proj", @@ -1704,40 +1712,76 @@ def _fmt_progress(status_message = "", **_kw): else: eval_steps_val = int(eval_steps_val) - # MLX: per-element clip to [-1, 1]; norm clip disabled (its global reduction - # breaks MLX's eager pipeline). 1.0 not 5.0: |g_i| > 5 rarely fires, so the - # historical 5.0 was effectively a no-op. + # Per-element clipping only; trainer owns the None default. Re-validate + # for direct worker callers (training.py normalizes the main path). max_grad_norm = 0.0 - max_grad_value = 1.0 # TODO: expose MLX grad-clip in Studio UI for power users + max_grad_value = config.get("max_grad_value") + if max_grad_value is not None: + max_grad_value = float(max_grad_value) + if max_grad_value < 0: + raise ValueError( + f"Unsloth MLX: max_grad_value={max_grad_value} must be >= 0 " + "(0 or None disables elementwise clipping)." + ) + max_grad_leaf_norm = config.get("max_grad_leaf_norm") + if max_grad_leaf_norm is not None: + max_grad_leaf_norm = float(max_grad_leaf_norm) + if max_grad_leaf_norm < 0: + raise ValueError( + f"Unsloth MLX: max_grad_leaf_norm={max_grad_leaf_norm} must be >= 0 " + "(0 or None disables proportional leaf-norm clipping)." + ) + weight_decay = config.get("weight_decay", 0.001) + weight_decay = 0.001 if weight_decay is None else float(weight_decay) + + mlx_config_kwargs = dict( + per_device_train_batch_size = batch_size, + gradient_accumulation_steps = grad_accum, + max_steps = max_steps, + learning_rate = lr_value, + warmup_steps = warmup_steps, + lr_scheduler_type = lr_scheduler_type, + optim = optim_name, + weight_decay = weight_decay, + max_grad_norm = max_grad_norm, + max_grad_value = max_grad_value, + logging_steps = 1, + max_seq_length = max_seq_length, + seed = random_seed, + use_cce = True, + compile = True, + gradient_checkpointing = use_grad_checkpoint, + streaming = is_vlm, + packing = bool(config.get("packing", False)), + output_dir = output_dir, + save_steps = int(config.get("save_steps", 0) or 0), + eval_steps = eval_steps_val, + ) + + # Feature-detect optional fields so this PR works without the paired zoo bump. + _supported_fields = getattr(MLXTrainingConfig, "__dataclass_fields__", {}) + if "cast_norm_output_to_input_dtype" in _supported_fields: + # Explicit None falls back to True (default). + _raw_cast = config.get("cast_norm_output_to_input_dtype", True) + mlx_config_kwargs["cast_norm_output_to_input_dtype"] = ( + True if _raw_cast is None else bool(_raw_cast) + ) + if "dataset_order" in _supported_fields: + mlx_config_kwargs["dataset_order"] = "torch_randperm" + if "max_grad_leaf_norm" in _supported_fields: + mlx_config_kwargs["max_grad_leaf_norm"] = max_grad_leaf_norm + if "append_eos" in _supported_fields: + raw_text_mode = training_type == "Continued Pretraining" or format_type == "raw" + # Studio SFT formatting owns rendered examples; raw/CPT text still + # needs MLX to append EOS like the CUDA raw-text path. + mlx_config_kwargs["append_eos"] = bool(raw_text_mode) trainer = MLXTrainer( model = model, tokenizer = tokenizer, train_dataset = dataset, eval_dataset = eval_dataset, - args = MLXTrainingConfig( - per_device_train_batch_size = batch_size, - gradient_accumulation_steps = grad_accum, - max_steps = max_steps, - learning_rate = lr_value, - warmup_steps = warmup_steps, - lr_scheduler_type = lr_scheduler_type, - optim = optim_name, - weight_decay = float(config.get("weight_decay", 0.001) or 0.001), - max_grad_norm = max_grad_norm, - max_grad_value = max_grad_value, - logging_steps = 1, - max_seq_length = max_seq_length, - seed = config.get("random_seed", 3407), - use_cce = True, - compile = True, - gradient_checkpointing = use_grad_checkpoint, - streaming = is_vlm, - packing = bool(config.get("packing", False)), - output_dir = output_dir, - save_steps = int(config.get("save_steps", 0) or 0), - eval_steps = eval_steps_val, - ), + args = MLXTrainingConfig(**mlx_config_kwargs), ) _trainer_ref[0] = trainer if _stop_requested[0]: diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py index ca04591178..6cf8907b84 100644 --- a/studio/backend/models/training.py +++ b/studio/backend/models/training.py @@ -325,7 +325,37 @@ def _check_lora_dropout(cls, v: float) -> float: ge = 0, description = "Global gradient norm clipping threshold. Set 0 to disable.", ) - random_seed: int = Field(42, description = "Random seed") + max_grad_value: Optional[float] = Field( + None, + ge = 0, + description = ( + "MLX-only elementwise gradient value clipping threshold. " + "If unset, MLX uses its runtime default." + ), + ) + max_grad_leaf_norm: Optional[float] = Field( + None, + ge = 0, + description = ( + "MLX-only proportional per-parameter gradient norm cap. " + "Preserves each tensor's gradient direction without global norm " + "clipping's memory overhead." + ), + ) + cast_norm_output_to_input_dtype: bool = Field( + True, + description = ( + "MLX-only: keep norm parameters in fp32 but cast norm outputs " + "back to the incoming activation dtype." + ), + ) + random_seed: int = Field( + 3407, + description = ( + "Random seed; matches the Studio backend / MLX worker default " + "and unsloth's historical recommended value." + ), + ) packing: bool = Field(False, description = "Enable sequence packing") optim: str = Field("adamw_8bit", description = "Optimizer") lr_scheduler_type: str = Field("linear", description = "Learning rate scheduler type") diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py index 281f03bcaf..09a3e06c91 100644 --- a/studio/backend/routes/training.py +++ b/studio/backend/routes/training.py @@ -215,6 +215,9 @@ async def start_training( "save_steps": request.save_steps, "weight_decay": request.weight_decay, "max_grad_norm": request.max_grad_norm, + "max_grad_value": request.max_grad_value, + "max_grad_leaf_norm": request.max_grad_leaf_norm, + "cast_norm_output_to_input_dtype": request.cast_norm_output_to_input_dtype, "random_seed": request.random_seed, "packing": request.packing, "optim": request.optim, diff --git a/studio/backend/tests/test_mlx_training_worker_config.py b/studio/backend/tests/test_mlx_training_worker_config.py index 4402031467..44ef9045b4 100644 --- a/studio/backend/tests/test_mlx_training_worker_config.py +++ b/studio/backend/tests/test_mlx_training_worker_config.py @@ -85,6 +85,13 @@ def test_mlx_studio_rejects_unknown_scheduler(): _normalize_mlx_studio_scheduler("linear_typo") +def test_mlx_studio_keeps_hf_style_tokenizer_dual_purpose(): + source = (Path(__file__).resolve().parents[1] / "core" / "training" / "worker.py").read_text() + + assert "tokenizer = tokenizer" in source + assert "processor = tokenizer if is_vlm else None" not in source + + def test_mlx_vlm_resize_uses_max_dimension_like_torch_trainer(): assert _mlx_vlm_max_resized_size(1000, 500, 512) == (512, 256) assert _mlx_vlm_max_resized_size(500, 1000, 512) == (256, 512) diff --git a/studio/backend/tests/test_training_raw_support.py b/studio/backend/tests/test_training_raw_support.py index 2b1299de5c..fb3cffc91e 100644 --- a/studio/backend/tests/test_training_raw_support.py +++ b/studio/backend/tests/test_training_raw_support.py @@ -107,10 +107,191 @@ def start(self): model_name = "unsloth/test", training_type = "LoRA/QLoRA", max_grad_norm = 0.7, + max_grad_value = 3.0, + max_grad_leaf_norm = 1.3, ) config = mock_process.call_args.kwargs["kwargs"]["config"] self.assertEqual(config["max_grad_norm"], 0.7) + self.assertEqual(config["max_grad_value"], 3.0) + self.assertEqual(config["max_grad_leaf_norm"], 1.3) + + def test_training_backend_forwards_random_seed_without_internal_mlx_seed_keys(self): + backend = TrainingBackend() + + class DummyProcess: + pid = 12345 + + def start(self): + return None + + class DummyThread: + def start(self): + return None + + dummy_queue = object() + + with ( + patch( + "core.training.training.prepare_gpu_selection", + return_value = ([0], {"selection_mode": "auto"}), + ), + patch( + "core.training.training._CTX.Queue", + side_effect = [dummy_queue, dummy_queue], + ), + patch( + "core.training.training._CTX.Process", return_value = DummyProcess() + ) as mock_process, + patch( + "core.training.training.threading.Thread", + return_value = DummyThread(), + ), + ): + backend.start_training( + job_id = "test-seed", + model_name = "unsloth/test", + training_type = "LoRA/QLoRA", + random_seed = 1234, + ) + + config = mock_process.call_args.kwargs["kwargs"]["config"] + self.assertEqual(config["random_seed"], 1234) + self.assertNotIn("model_random_state", config) + self.assertNotIn("lora_random_state", config) + + def test_route_forwards_all_grad_clipping_fields(self): + # The HTTP route builds the config dict by hand; a schema field that + # is not forwarded here is silently dropped for REST callers. + source = (_BACKEND_ROOT / "routes" / "training.py").read_text() + self.assertIn('"max_grad_norm": request.max_grad_norm', source) + self.assertIn('"max_grad_value": request.max_grad_value', source) + self.assertIn('"max_grad_leaf_norm": request.max_grad_leaf_norm', source) + + def test_mlx_worker_falls_back_init_seeds_to_random_seed(self): + source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text() + + # random_seed itself is normalized first so explicit None coming + # from a raw / backend caller does not propagate through the chain. + self.assertIn('_raw_seed = config.get("random_seed", 3407)', source) + self.assertIn( + "random_seed = 3407 if _raw_seed is None else int(_raw_seed)", + source, + ) + # Both absent and explicit None must fall back to random_seed. + # `dict.get(key, default)` only fills the default on absent keys, + # so an explicit `None` would otherwise reach FastMLXModel / + # get_peft_model and disable deterministic init. + self.assertIn('_model_seed = config.get("model_random_state")', source) + self.assertIn( + "model_random_state = random_seed if _model_seed is None else int(_model_seed)", + source, + ) + self.assertIn('_lora_seed = config.get("lora_random_state")', source) + self.assertIn( + "lora_random_state = random_seed if _lora_seed is None else int(_lora_seed)", + source, + ) + self.assertIn("random_state = model_random_state", source) + self.assertIn("random_state = lora_random_state", source) + # MLXTrainingConfig now receives the normalized seed directly. + self.assertIn("seed = random_seed,", source) + + def test_mlx_worker_preserves_null_max_grad_value_for_trainer_default(self): + source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text() + + # None must survive to the MLX trainer so it picks its own runtime + # default, and any other value must coerce to float without + # rebinding None to 1.0 (which the legacy code did). + self.assertIn('max_grad_value = config.get("max_grad_value")', source) + self.assertIn("max_grad_value = float(max_grad_value)", source) + self.assertNotIn( + "max_grad_value = 1.0 if max_grad_value is None else float(max_grad_value)", + source, + ) + + def test_training_backend_normalizes_explicit_none_seed_and_dtypes(self): + # Raw / backend callers can pass `random_seed=None`, + # `cast_norm_output_to_input_dtype=None`, and MLX clip knobs + # as None (or omit them) and must NOT leak the + # `None` past `TrainingBackend.start_training`. Otherwise + # transformers.set_seed(None) raises, PEFT init becomes + # nondeterministic, and the MLX norm-output cast silently flips. + from core.training.training import ( + _coerce_seed, + _coerce_optional_bool, + _coerce_optional_nonneg_float, + ) + + self.assertEqual(_coerce_seed(None), 3407) + self.assertEqual(_coerce_seed("123"), 123) + self.assertEqual(_coerce_seed("not-a-number"), 3407) + + self.assertTrue(_coerce_optional_bool(None, True)) + self.assertFalse(_coerce_optional_bool(None, False)) + self.assertFalse(_coerce_optional_bool("false", True)) + self.assertTrue(_coerce_optional_bool("true", False)) + + self.assertIsNone(_coerce_optional_nonneg_float("max_grad_value", None)) + self.assertEqual(_coerce_optional_nonneg_float("max_grad_value", "2.5"), 2.5) + self.assertEqual(_coerce_optional_nonneg_float("max_grad_value", 0), 0.0) + with self.assertRaises(ValueError): + _coerce_optional_nonneg_float("max_grad_value", -1) + self.assertIsNone(_coerce_optional_nonneg_float("max_grad_leaf_norm", None)) + self.assertEqual( + _coerce_optional_nonneg_float("max_grad_leaf_norm", "1.3"), + 1.3, + ) + with self.assertRaises(ValueError): + _coerce_optional_nonneg_float("max_grad_leaf_norm", -1) + + def test_mlx_worker_feature_detects_optional_mlx_config_fields(self): + # `cast_norm_output_to_input_dtype`, `dataset_order`, + # `max_grad_leaf_norm`, and `append_eos` ship in the paired + # unsloth-zoo update. Until that floor is in place, the + # worker must gate them so releases that predate those fields can + # still construct MLXTrainingConfig without TypeError. + source = (_BACKEND_ROOT / "core" / "training" / "worker.py").read_text() + + self.assertIn( + 'getattr(MLXTrainingConfig, "__dataclass_fields__", {})', + source, + ) + self.assertIn('if "cast_norm_output_to_input_dtype" in _supported_fields:', source) + self.assertIn('if "dataset_order" in _supported_fields:', source) + self.assertIn('if "max_grad_leaf_norm" in _supported_fields:', source) + self.assertIn( + 'mlx_config_kwargs["max_grad_leaf_norm"] = max_grad_leaf_norm', + source, + ) + self.assertIn('if "append_eos" in _supported_fields:', source) + self.assertIn('format_type == "raw"', source) + self.assertIn('mlx_config_kwargs["append_eos"] = bool(raw_text_mode)', source) + # The unconditional kwargs must NOT include any gated field. + # Use proper paren tracking; `source.find(")", ...)` would stop at + # the first close paren inside the dict body (e.g. + # `int(config.get("save_steps", 0) or 0)`) and miss any future + # unconditional addition of the gated fields later in the dict. + unconditional_block_start = source.find("mlx_config_kwargs = dict(") + self.assertNotEqual(unconditional_block_start, -1) + depth = 0 + i = unconditional_block_start + len("mlx_config_kwargs = dict") + end = i + while i < len(source): + ch = source[i] + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + if depth == 0: + end = i + 1 + break + i += 1 + unconditional = source[unconditional_block_start:end] + self.assertNotIn("cast_norm_output_to_input_dtype", unconditional) + self.assertNotIn("dataset_order", unconditional) + self.assertNotIn("max_grad_leaf_norm", unconditional) + self.assertNotIn("append_eos", unconditional) def test_training_route_forwards_embedding_learning_rate(self): training_route = _load_route_module( diff --git a/studio/frontend/src/features/training/api/mappers.ts b/studio/frontend/src/features/training/api/mappers.ts index ead719f825..773c5581ae 100644 --- a/studio/frontend/src/features/training/api/mappers.ts +++ b/studio/frontend/src/features/training/api/mappers.ts @@ -111,6 +111,7 @@ export function buildTrainingStartPayload( eval_steps: config.evalSteps, weight_decay: config.weightDecay, max_grad_norm: 0.0, + max_grad_value: null, random_seed: config.randomSeed, packing: isEmbedding ? false : config.packing, optim: config.optimizerType, diff --git a/studio/frontend/src/features/training/types/api.ts b/studio/frontend/src/features/training/types/api.ts index 8490d5ee6f..b466276bc2 100644 --- a/studio/frontend/src/features/training/types/api.ts +++ b/studio/frontend/src/features/training/types/api.ts @@ -37,6 +37,7 @@ export interface TrainingStartRequest { eval_steps: number; weight_decay: number; max_grad_norm: number; + max_grad_value?: number | null; random_seed: number; packing: boolean; optim: string; diff --git a/tests/python/test_vision_lora_targeting.py b/tests/python/test_vision_lora_targeting.py new file mode 100644 index 0000000000..0a27569efd --- /dev/null +++ b/tests/python/test_vision_lora_targeting.py @@ -0,0 +1,43 @@ +from pathlib import Path +import re + +import torch + + +def test_vlm_lora_regex_respects_language_only_with_explicit_targets(): + from unsloth_zoo.peft_utils import get_peft_regex + + class FakeVLM(torch.nn.Module): + def __init__(self): + super().__init__() + self.language_model = torch.nn.Module() + self.language_model.layers = torch.nn.ModuleList([torch.nn.Module()]) + self.language_model.layers[0].self_attn = torch.nn.Module() + self.language_model.layers[0].self_attn.q_proj = torch.nn.Linear(4, 4) + self.vision_tower = torch.nn.Module() + self.vision_tower.vision_model = torch.nn.Module() + self.vision_tower.vision_model.encoder = torch.nn.Module() + self.vision_tower.vision_model.encoder.layers = torch.nn.ModuleList([torch.nn.Module()]) + self.vision_tower.vision_model.encoder.layers[0].self_attn = torch.nn.Module() + self.vision_tower.vision_model.encoder.layers[0].self_attn.q_proj = torch.nn.Linear( + 4, 4 + ) + + regex = get_peft_regex( + FakeVLM(), + finetune_vision_layers = False, + finetune_language_layers = True, + finetune_attention_modules = True, + finetune_mlp_modules = True, + target_modules = ["q_proj"], + ) + + assert re.search(regex, "language_model.layers.0.self_attn.q_proj") + assert not re.search(regex, "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj") + + +def test_fast_vision_model_wraps_explicit_targets_when_layer_filters_are_used(): + source = Path("unsloth/models/vision.py").read_text() + + assert "target_modules = get_peft_regex(" in source + assert "target_modules = list(target_modules)" in source diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py index 6bf0288206..e58b2098d8 100644 --- a/tests/studio/run_real_mlx_smoke.py +++ b/tests/studio/run_real_mlx_smoke.py @@ -11,9 +11,10 @@ python run_real_mlx_smoke.py reload --format {lora|merged|gguf} --dir D `train` loads gemma-3-270m-it, applies LoRA, probes pre/post loss+grad, -overfits one repeated row, generates, saves in lora/merged_16bit/gguf -(gguf best-effort), and writes train_metrics.json. `reload` reopens each -saved format in a fresh process and writes _reload_metrics.json. +overfits one repeated row for 30 deterministic steps (batch 2, accum 3), +generates, saves in lora/merged_16bit/gguf (gguf best-effort), and writes +train_metrics.json. `reload` reopens each saved format in a fresh process +and writes _reload_metrics.json. GGUF export and LoRA reload fixes land in unslothai/unsloth-zoo#627. @@ -120,10 +121,9 @@ def _compute_loss_and_grad_norm(model, tokenizer, text: str) -> tuple[float, flo import mlx.nn as nn from mlx.utils import tree_flatten + # Match Studio's text dataset path: Studio passes exactly the formatted + # text to the tokenizer and does not append EOS behind the user's back. ids = list(tokenizer.encode(text)) - eos_id = getattr(tokenizer, "eos_token_id", None) - if eos_id is not None: - ids.append(int(eos_id)) if len(ids) < 2: raise RuntimeError(f"text too short to compute loss: {len(ids)} tokens") @@ -268,10 +268,9 @@ def cmd_train(args) -> int: lr_scheduler_type = "constant", optim = "adamw", weight_decay = 0.0, - # Elementwise value clip is cheaper than norm clip on MLX (no - # cross-tree reduction) and has a higher 13-seed pass rate at this - # fixture (value=1.0 62%, norm=1.0 46%). Pin both: value wins when - # both > 0, so disable norm. + # Pin the elementwise clip to match the 13-seed-tested fixture + # (value=1.0 62% pass, norm=1.0 46%). Zoo's new MLX default is + # max_grad_leaf_norm=1.0; explicit value wins, norm disabled. max_grad_norm = 0.0, max_grad_value = 1.0, logging_steps = 1, @@ -329,9 +328,14 @@ def _on_step( } # logging_steps=1 + max_steps=N -> N callbacks; track config so the # gate auto-follows if max_steps is bumped again. + expected_logged_steps = int(config.max_steps) assert ( - len(losses_per_step) == config.max_steps - ), f"expected {config.max_steps} logged steps, got {losses_per_step}" + len(losses_per_step) == expected_logged_steps + ), f"expected {expected_logged_steps} logged steps, got {losses_per_step}" + if "train_steps" in train_result: + assert int(train_result["train_steps"]) == expected_logged_steps, ( + f"expected train_steps={expected_logged_steps}, got " f"{train_result['train_steps']}" + ) for i, l in enumerate(losses_per_step): # Allow exact 0.0: fp16 per-step loss underflows to 0.0 after # the LoRA reaches loss=0 around step ~10 with this fixture + diff --git a/unsloth/models/vision.py b/unsloth/models/vision.py index e8161427d5..66f9cf3d1b 100644 --- a/unsloth/models/vision.py +++ b/unsloth/models/vision.py @@ -1399,6 +1399,25 @@ def get_peft_model( ) else: assert type(target_modules) in (list, tuple, str) + if type(target_modules) in (list, tuple) and ( + not finetune_vision_layers + or not finetune_language_layers + or not finetune_attention_modules + or not finetune_mlp_modules + ): + print( + "Unsloth: Explicit target_modules are constrained by the " + "finetune_(vision|language|attention|mlp) filters; adapters " + "attach only where both select." + ) + target_modules = get_peft_regex( + model, + finetune_vision_layers = finetune_vision_layers, + finetune_language_layers = finetune_language_layers, + finetune_attention_modules = finetune_attention_modules, + finetune_mlp_modules = finetune_mlp_modules, + target_modules = list(target_modules), + ) if hasattr(model, "vllm_engine"): if (