From d9f9c685989b63dce02eab0a54d10c60da6a94f2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 7 May 2026 06:18:05 +0000 Subject: [PATCH 1/2] tests: skip MoE LoRA extractor coverage when discovery finds zero classes test_every_patched_moe_experts_class_has_lora_extractor asserts that every class flagged `_unsloth_already_patched=True` whose source defines 3D `gate_up_proj`/`down_proj` parameters also has `_unsloth_lora_extractor_fn` registered. The defensive precondition was "discovery produced at least one patched class"; failing that, the test raised AssertionError("Discovery produced zero patched MoE classes ... Investigate."). That precondition fires on any transformers version older than the MoE surface unsloth_zoo patches. Specifically, with transformers==4.57.6 none of the gemma4 / llama4 / qwen3_moe / mxfp4 classes the TEMPORARY_PATCHES list targets exist, so every patch fn no-ops and discovery returns []. The "zero discoveries" outcome on an older transformers is not a regression of the bug being prevented (a class patched-but-missing-extractor); it just means the relevant classes aren't there to patch. Switch the precondition from `assert patched, ...` to `pytest.skip(...)` with a comment explaining the two indistinguishable cases. CI cells running newer transformers (where classes do exist) will still exercise the real assertion below; the older-transformers cell skips cleanly. Surfaced by the consolidated CPU-CI matrix on unslothai/unsloth#5312 in the cell pinned to `transformers==4.57.6`. --- tests/test_moe_lora_extractor_coverage.py | 26 ++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/test_moe_lora_extractor_coverage.py b/tests/test_moe_lora_extractor_coverage.py index f23ab890c..b16b4ffca 100644 --- a/tests/test_moe_lora_extractor_coverage.py +++ b/tests/test_moe_lora_extractor_coverage.py @@ -302,11 +302,27 @@ def test_every_patched_moe_experts_class_has_lora_extractor(): _apply_all_temporary_patches() patched = _discover_patched_moe_classes() - assert patched, ( - "Discovery produced zero patched MoE classes. Either no MoE " - "transformers module imported on this transformers version, or the " - "_unsloth_already_patched marker convention changed. Investigate." - ) + if not patched: + # Two situations produce zero discoveries: + # 1. The transformers version installed predates the MoE class + # surface unsloth_zoo patches (e.g. transformers 4.57.6 ships + # none of the gemma4 / llama4 / qwen3_moe / mxfp4 classes the + # TEMPORARY_PATCHES list targets, so every patch fn no-ops). + # 2. The `_unsloth_already_patched` marker convention drifted + # (the regression this test guards against). + # We can't tell (1) and (2) apart by discovery alone. Skip rather + # than fail: a CI cell pinned to an older transformers should not + # surface as a red regression here. CI cells running the + # transformers version the PR actually pins (and any newer one) + # will discover patched classes and run the real assertion. + pytest.skip( + "No patched MoE classes discovered on this transformers version. " + "Either transformers is older than the MoE class surface " + "unsloth_zoo patches, or the _unsloth_already_patched marker " + "convention changed. The latter is a real regression but " + "indistinguishable from the former here; skip and let cells " + "running newer transformers exercise the real assertion." + ) # Hard contract: extractor must be registered on every patched class. missing = [ From 4c464c8bba7ce283978b971529a97e471221c2b3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 7 May 2026 07:10:29 +0000 Subject: [PATCH 2/2] tests: distinguish missing surface from discovery regression Per codex/gemini review on PR #628: replacing the original assert with an unconditional skip created a silent-failure path -- a real discovery regression on a transformers that DOES ship the grouped-MoE surface would now skip rather than fail, defeating the test. Disambiguate the two zero-discovery causes by walking transformers once and asking whether ANY class anywhere in transformers.models.* declares `gate_up_proj` + `down_proj` as 3D `nn.Parameter`s -- the exact filter `_looks_like_grouped_moe_experts` runs against discovered classes. If at least one such class exists AND discovery returned zero, that is a real regression: fail loud. If no such class exists, the patch surface this test targets is genuinely absent on this transformers version: skip. This avoids hard-coding model names that change as transformers grows new MoE families. Also update terminology in the comment + skip / fail messages: the load-bearing marker `_has_unsloth_patched_forward` reads is `_original___forward`, not the per-family `_unsloth_already_patched` flag. --- tests/test_moe_lora_extractor_coverage.py | 71 +++++++++++++++++------ 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/tests/test_moe_lora_extractor_coverage.py b/tests/test_moe_lora_extractor_coverage.py index b16b4ffca..a430dcd67 100644 --- a/tests/test_moe_lora_extractor_coverage.py +++ b/tests/test_moe_lora_extractor_coverage.py @@ -303,25 +303,60 @@ def test_every_patched_moe_experts_class_has_lora_extractor(): patched = _discover_patched_moe_classes() if not patched: - # Two situations produce zero discoveries: - # 1. The transformers version installed predates the MoE class - # surface unsloth_zoo patches (e.g. transformers 4.57.6 ships - # none of the gemma4 / llama4 / qwen3_moe / mxfp4 classes the - # TEMPORARY_PATCHES list targets, so every patch fn no-ops). - # 2. The `_unsloth_already_patched` marker convention drifted - # (the regression this test guards against). - # We can't tell (1) and (2) apart by discovery alone. Skip rather - # than fail: a CI cell pinned to an older transformers should not - # surface as a red regression here. CI cells running the - # transformers version the PR actually pins (and any newer one) - # will discover patched classes and run the real assertion. + # Disambiguate two zero-discovery causes: + # (a) the installed transformers predates every MoE class surface + # unsloth_zoo targets (TEMPORARY_PATCHES entries no-op silently + # when their target submodule is absent) -- legitimate skip; or + # (b) the `_original___forward` marker + # convention used by `temporary_patches.utils.patch_function` + # (and read by `_has_unsloth_patched_forward` above) drifted -- + # a real regression that the test must surface. + # Probe a stable canary: if ANY of the known MoE classes the patch + # functions target is importable on this transformers, then the + # patch surface exists and (a) is ruled out -- a zero discovery + # under that condition is (b), and we fail loudly. Otherwise we + # genuinely have no patch surface to test and skip. + # Structural canary: walk every transformers.models.*.modeling_* module + # and ask "does ANY class declare `gate_up_proj` + `down_proj` as 3D + # `nn.Parameter`?" That is precisely the surface + # `_looks_like_grouped_moe_experts` filters discovery on. If at least + # one such class exists anywhere in transformers, the surface this + # test guards IS present on this version, and discovery returning 0 + # is a real regression (either `_has_unsloth_patched_forward`'s marker + # convention drifted or every TEMPORARY_PATCHES entry that targets + # such a class silently failed before reaching `patch_function`). + # Scanning the live tree avoids hard-coding model names that change + # as transformers grows new MoE families. + importable = [ + f"{cls.__module__}.{cls.__name__}" + for modeling in _iter_modeling_modules() + for _, cls in inspect.getmembers(modeling, inspect.isclass) + if isinstance(cls, type) + and getattr(cls, "__module__", None) == modeling.__name__ + and _looks_like_grouped_moe_experts(cls) + ] + if importable: + raise AssertionError( + "Discovery produced zero patched MoE classes, but transformers " + f"ships {len(importable)} class(es) whose `__init__` declares " + "`gate_up_proj` + `down_proj` as 3D `nn.Parameter`s -- the " + "exact surface this test targets. Examples: " + f"{importable[:5]}. That means TEMPORARY_PATCHES entries ran " + "without recording the `_original___" + "forward` marker that `_has_unsloth_patched_forward` reads. " + "Either `patch_function` no longer stores the original under " + "that attribute name, or the patch fn for every grouped-MoE " + "family silently failed before reaching `patch_function`. " + "This is the regression the test guards against -- do not " + "convert this back into a skip without first re-establishing " + "the marker convention." + ) pytest.skip( - "No patched MoE classes discovered on this transformers version. " - "Either transformers is older than the MoE class surface " - "unsloth_zoo patches, or the _unsloth_already_patched marker " - "convention changed. The latter is a real regression but " - "indistinguishable from the former here; skip and let cells " - "running newer transformers exercise the real assertion." + "No patched MoE classes discovered AND no class anywhere in " + "transformers.models.* declares `gate_up_proj` + `down_proj` as " + "3D `nn.Parameter`s. The grouped-MoE patch surface this test " + "guards is absent on this transformers version; cells running a " + "newer transformers exercise the real assertion." ) # Hard contract: extractor must be registered on every patched class.