diff --git a/tests/test_vllm_to_hf_conversion.py b/tests/test_vllm_to_hf_conversion.py
deleted file mode 100644
index fa15d07ac..000000000
--- a/tests/test_vllm_to_hf_conversion.py
+++ /dev/null
@@ -1,1370 +0,0 @@
-import sys, os, warnings, inspect
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-
-import types
-import pytest
-import torch
-
-
-class _FakePlainProj(torch.nn.Module):
-    def __init__(self, out_features, in_features, dtype=torch.float32):
-        super().__init__()
-        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features, dtype=dtype), requires_grad=False)
-
-
-class _FakeGDN(torch.nn.Module):
-    def __init__(self, hidden_size=8, num_k_heads=2, num_v_heads=2, head_k_dim=2, head_v_dim=4):
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.num_k_heads = num_k_heads
-        self.num_v_heads = num_v_heads
-        self.head_k_dim = head_k_dim
-        self.head_v_dim = head_v_dim
-        self.key_dim = num_k_heads * head_k_dim
-        self.value_dim = num_v_heads * head_v_dim
-        qkvz_dim = self.key_dim * 2 + self.value_dim * 2
-        self.in_proj_qkvz = _FakePlainProj(qkvz_dim, hidden_size)
-        self.in_proj_ba = _FakePlainProj(num_v_heads * 2, hidden_size)
-        self.conv1d = _FakePlainProj(self.key_dim * 2 + self.value_dim, 4)
-        self.dt_bias = torch.nn.Parameter(torch.randn(num_v_heads), requires_grad=False)
-        self.A_log = torch.nn.Parameter(torch.randn(num_v_heads), requires_grad=False)
-        self.norm = torch.nn.Module()
-        self.norm.weight = torch.nn.Parameter(torch.randn(head_v_dim), requires_grad=False)
-        self.out_proj = _FakePlainProj(hidden_size, self.value_dim)
-
-
-def _fake_get_state_dict(prefix, kk, state_dict, module, slice_weights=True):
-    state_dict[f"{prefix}.weight"] = module.weight.data
-
-
-def test_extract_gdn_layers_handles_plain_column_parallel_linear():
-    # Pre-fix: vllm ColumnParallelLinear has no `output_sizes` -> AttributeError.
-    from unsloth_zoo.empty_model import extract_gdn_layers
-    gdn = _FakeGDN()
-    state_dict, quant_state_dict = {}, {}
-    extract_gdn_layers(gdn, "prefix", state_dict, quant_state_dict, _fake_get_state_dict)
-    expected = {
-        "prefix.in_proj_qkv.weight",
-        "prefix.in_proj_z.weight",
-        "prefix.in_proj_b.weight",
-        "prefix.in_proj_a.weight",
-        "prefix.conv1d.weight",
-        "prefix.dt_bias",
-        "prefix.A_log",
-        "prefix.norm.weight",
-        "prefix.out_proj.weight",
-    }
-    assert expected <= set(state_dict.keys())
-
-
-def test_extract_gdn_layers_splits_in_proj_ba_without_indexerror():
-    # Pre-fix: get_state_dict(kk=1, in_proj_ba) -> IndexError (no output_sizes).
-    from unsloth_zoo.empty_model import extract_gdn_layers
-    gdn = _FakeGDN()
-    state_dict, quant_state_dict = {}, {}
-    extract_gdn_layers(gdn, "prefix", state_dict, quant_state_dict, _fake_get_state_dict)
-    ba_weight = gdn.in_proj_ba.weight.data
-    mid = ba_weight.shape[0] // 2
-    torch.testing.assert_close(state_dict["prefix.in_proj_b.weight"], ba_weight[:mid])
-    torch.testing.assert_close(state_dict["prefix.in_proj_a.weight"], ba_weight[mid:])
-
-
-def test_extract_gdn_layers_qkvz_offsets_match_gdn_dims():
-    from unsloth_zoo.empty_model import extract_gdn_layers
-    gdn = _FakeGDN(num_k_heads=3, num_v_heads=2, head_k_dim=4, head_v_dim=5)
-    state_dict, quant_state_dict = {}, {}
-    extract_gdn_layers(gdn, "prefix", state_dict, quant_state_dict, _fake_get_state_dict)
-    assert state_dict["prefix.in_proj_qkv.weight"].shape[0] == 2 * gdn.key_dim + gdn.value_dim
-    assert state_dict["prefix.in_proj_z.weight"].shape[0] == gdn.value_dim
-
-
-def test_extract_gdn_layers_raises_when_offsets_underivable():
-    from unsloth_zoo.empty_model import extract_gdn_layers
-    gdn = _FakeGDN()
-    del gdn.key_dim
-    del gdn.value_dim
-    with pytest.raises(RuntimeError, match="in_proj_qkvz"):
-        extract_gdn_layers(gdn, "prefix", {}, {}, _fake_get_state_dict)
-
-
-def test_extract_gdn_layers_has_bnb_quant_state_preservation():
-    # Pre-fix: merged in_proj_qkvz path only stored raw weight slices; BnB prequantized
-    # checkpoints lost quant_state metadata and were rebuilt as plain nn.Linear.
-    # Behavioral test requires real BnB; source-level check confirms the branch exists.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.extract_gdn_layers)
-    assert "bnb_quant_state" in src
-    # quant-state keys are now emitted via a helper that concatenates
-    # f"{name}.weight.quant_state"; check the prefixes and suffix separately.
-    assert "in_proj_qkv" in src
-    assert "in_proj_z" in src
-    assert "in_proj_b" in src
-    assert "in_proj_a" in src
-    assert ".weight.quant_state" in src
-
-
-class _LinearAttn(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.layer_idx = -1
-
-
-class _StandardLayer(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.layer_idx = -1
-        self.linear_attn = _LinearAttn()
-
-
-class _StandardLM(torch.nn.Module):
-    def __init__(self, n_layers=3):
-        super().__init__()
-
-        class _Inner(torch.nn.Module):
-            def __init__(self, n):
-                super().__init__()
-                self.layers = torch.nn.ModuleList([_StandardLayer() for _ in range(n)])
-
-        self.model = _Inner(n_layers)
-
-
-def _config(model_type="qwen3_5", has_vision=False):
-    cfg = types.SimpleNamespace()
-    cfg.model_type = model_type
-    cfg.text_config = cfg
-    if has_vision:
-        vc = types.SimpleNamespace()
-        vc.hidden_size = 1
-        vc.num_heads = 1
-        cfg.vision_config = vc
-    return cfg
-
-
-def test_finalize_fixes_layer_idx_on_standard_causal_lm():
-    # Pre-fix: only new_model.model.language_model.layers was traversed, so
-    # standard-LM paths kept layer_idx at the empty-model stub value.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-    model = _StandardLM(n_layers=4)
-    finalize_huggingface_model(
-        model, None, _config("qwen3_5"), torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    for i, layer in enumerate(model.model.layers):
-        assert layer.layer_idx == i
-        assert layer.linear_attn.layer_idx == i
-
-
-def test_finalize_fixes_layer_idx_on_vlm_language_model_path():
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _VLM(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-
-            class _Inner(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
-                    class _LM(torch.nn.Module):
-                        def __init__(self):
-                            super().__init__()
-                            self.layers = torch.nn.ModuleList([_StandardLayer() for _ in range(3)])
-
-                    self.language_model = _LM()
-
-            self.model = _Inner()
-
-    model = _VLM()
-    finalize_huggingface_model(
-        model, None, _config(), torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    for i, layer in enumerate(model.model.language_model.layers):
-        assert layer.layer_idx == i
-        assert layer.linear_attn.layer_idx == i
-
-
-def test_finalize_does_not_assert_on_text_only_with_rotary_pos_emb():
-    # Pre-fix: hard `assert vision_config is not None` crashed text-only models.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _Rotary(torch.nn.Module):
-        pass
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_pos_emb = _Rotary()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    finalize_huggingface_model(
-        _Model(), None, _config(has_vision=False), torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-
-
-def test_set_dtype_in_config_no_torch_dtype_deprecation():
-    # Pre-fix: wrote both dtype and torch_dtype -> transformers deprecation warning.
-    from transformers import PretrainedConfig
-    from unsloth_zoo.hf_utils import set_dtype_in_config
-    cfg = PretrainedConfig()
-    with warnings.catch_warnings(record=True) as caught:
-        warnings.simplefilter("always")
-        set_dtype_in_config(cfg, torch.bfloat16)
-    dep = [w for w in caught if "torch_dtype" in str(w.message) and "deprecated" in str(w.message).lower()]
-    assert not dep, f"unexpected deprecation warning: {[str(w.message) for w in dep]}"
-
-
-def test_set_dtype_in_config_writes_torch_dtype_value():
-    # set_dtype_in_config stores a JSON-safe string (e.g. "float16"), so that
-    # downstream config.save_pretrained() and string comparisons in
-    # patching_utils.patch_model_and_tokenizer keep working.
-    from transformers import PretrainedConfig
-    from unsloth_zoo.hf_utils import set_dtype_in_config, dtype_from_config
-    cfg = PretrainedConfig()
-    set_dtype_in_config(cfg, torch.float16)
-    got = dtype_from_config(cfg)
-    assert got == "float16"
-
-
-def test_set_dtype_in_config_accepts_string_input():
-    from transformers import PretrainedConfig
-    from unsloth_zoo.hf_utils import set_dtype_in_config, dtype_from_config
-    cfg = PretrainedConfig()
-    set_dtype_in_config(cfg, "bfloat16")
-    got = dtype_from_config(cfg)
-    assert got == "bfloat16"
-
-
-def test_set_dtype_in_config_stores_json_safe_string():
-    # Regression: prior PR iteration stored torch.dtype objects which broke
-    # config.save_pretrained() (JSON serialization) and string equality against
-    # "float16"/"bfloat16"/"float32" in patching_utils.patch_model_and_tokenizer.
-    import json
-    from transformers import PretrainedConfig
-    from unsloth_zoo.hf_utils import set_dtype_in_config, dtype_from_config
-    cfg = PretrainedConfig()
-    set_dtype_in_config(cfg, torch.bfloat16)
-    value = dtype_from_config(cfg)
-    assert isinstance(value, str)
-    json.dumps({"dtype": value})
-
-
-def test_normalize_state_dict_tensor_guards_non_tensor():
-    # Pre-fix: value.is_sparse was called unconditionally on any state-dict value.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.assert_same_state_dict)
-    assert "isinstance(value, torch.Tensor)" in src
-    assert src.index("isinstance(value, torch.Tensor)") < src.index("value.is_sparse")
-
-
-def test_gemma4_lora_patch_preserves_signature_for_inspect():
-    # Pre-fix: patched_create_lora_manager(model, *args, **kwargs) hid vllm_config,
-    # breaking _call_create_lora_manager's signature-based forwarding. Current
-    # fix wraps with functools.wraps and delegates to the original manager so
-    # vLLM shim kwargs reach the constructor correctly.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_lora_support)
-    assert "@wraps(original_create_lora_manager)" in src
-    assert "original_create_lora_manager(model, *args, **kwargs)" in src
-    assert 'kwargs.setdefault("lora_manager_cls"' in src
-
-
-def test_gemma4_k_eq_v_patch_handles_split_kv_layout():
-    # Pre-fix: only packed self_attn.qkv_proj.weight was searched, so current upstream
-    # Gemma4 split q_proj/k_proj/v_proj layout never got synthetic V quant-state.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_k_eq_v_support)
-    assert "k_proj.weight" in src and "v_proj.weight" in src
-    assert '"split"' in src or "'split'" in src
-
-
-# ----- Regression tests for review-iter-1 follow-up fixes -----
-
-class _FakeQuantState:
-    def __init__(self, tag):
-        self.tag = tag
-
-    def as_dict(self, packed=True):
-        return {"absmax": torch.tensor([float(len(self.tag))])}
-
-
-class _FakeBnBParam(torch.nn.Parameter):
-    # torch.nn.Parameter is a Tensor subclass; we attach bnb_quant_state on it
-    # so the wrapper-vs-raw-tensor distinction is preserved.
-    def __new__(cls, data, bnb_quant_state=None):
-        inst = torch.nn.Parameter.__new__(cls, data, requires_grad=False)
-        inst.bnb_quant_state = bnb_quant_state
-        return inst
-
-
-class _FakeBnBProj(torch.nn.Module):
-    def __init__(self, out_features, in_features, bnb_quant_state):
-        super().__init__()
-        raw = torch.zeros(out_features, in_features, dtype=torch.uint8)
-        self.weight = _FakeBnBParam(raw, bnb_quant_state=bnb_quant_state)
-
-
-class _FakeBnBGDN(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.hidden_size = 4
-        self.num_k_heads = 2
-        self.num_v_heads = 2
-        self.head_k_dim = 2
-        self.head_v_dim = 4
-        self.key_dim = self.num_k_heads * self.head_k_dim
-        self.value_dim = self.num_v_heads * self.head_v_dim
-        qkvz_quant_states = {
-            0: _FakeQuantState("qkv"),
-            3: _FakeQuantState("z"),
-        }
-        self.in_proj_qkvz = _FakeBnBProj(
-            out_features = self.key_dim * 2 + self.value_dim * 2,
-            in_features  = self.hidden_size,
-            bnb_quant_state = qkvz_quant_states,
-        )
-        ba_quant_states = {
-            0: _FakeQuantState("b"),
-            1: _FakeQuantState("a"),
-        }
-        self.in_proj_ba = _FakeBnBProj(
-            out_features = self.num_v_heads * 2,
-            in_features  = self.hidden_size,
-            bnb_quant_state = ba_quant_states,
-        )
-        self.conv1d = _FakePlainProj(self.key_dim * 2 + self.value_dim, 4)
-        self.dt_bias = torch.nn.Parameter(torch.randn(self.num_v_heads), requires_grad=False)
-        self.A_log = torch.nn.Parameter(torch.randn(self.num_v_heads), requires_grad=False)
-        self.norm = torch.nn.Module()
-        self.norm.weight = torch.nn.Parameter(torch.randn(self.head_v_dim), requires_grad=False)
-        self.out_proj = _FakePlainProj(self.hidden_size, self.value_dim)
-
-
-def test_extract_gdn_layers_emits_bnb_quant_state_for_all_shards():
-    # Pre-fix: extract_gdn_layers() unwrapped Params4bit before reading
-    # `bnb_quant_state`, so the attribute was always None. Also the in_proj_ba
-    # split never emitted quant-state entries for in_proj_b/in_proj_a.
-    from unsloth_zoo.empty_model import extract_gdn_layers
-    gdn = _FakeBnBGDN()
-    state_dict, quant_state_dict = {}, {}
-    extract_gdn_layers(gdn, "prefix", state_dict, quant_state_dict, _fake_get_state_dict)
-    for shard in ("in_proj_qkv", "in_proj_z", "in_proj_b", "in_proj_a"):
-        key = f"prefix.{shard}.weight.quant_state"
-        assert key in quant_state_dict, f"missing quant_state for {shard}"
-    # and the sharded companion keys from QuantState.as_dict should have been
-    # expanded into state_dict via the helper
-    assert "prefix.in_proj_qkv.weight.absmax" in state_dict
-    assert "prefix.in_proj_b.weight.absmax" in state_dict
-
-
-def test_assert_same_state_dict_tied_embed_fallback_has_tolerances():
-    # Pre-fix: tied-embeddings fallback used strict tolerances while the outer
-    # comparison used atol=1e-4, rtol=1e-3. Mismatched tolerances produced
-    # spurious failures.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.assert_same_state_dict)
-    tied_idx = src.index("model.embed_tokens.weight")
-    tail = src[tied_idx:]
-    assert "atol = 1e-4" in tail
-    assert "rtol = 1e-3" in tail
-
-
-def test_gemma4_lora_soft_imports_vllm_v1_worker():
-    # Pre-fix: patch_gemma4_vllm_lora_support hard-imported `vllm.v1.worker`
-    # and crashed with ModuleNotFoundError on older vLLM builds.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_lora_support)
-    assert "try:" in src
-    assert "from vllm.v1.worker import lora_model_runner_mixin" in src
-    assert "except ImportError" in src
-    assert "lora_model_runner_mixin = None" in src
-
-
-def test_conv1d_rebuild_uses_real_channels_and_groups():
-    # Pre-fix: conv1d was stacked into `layernorm_names` and rebuilt by
-    # weight-swap only, leaving the placeholder Conv1d with groups=1,
-    # kernel_size=1 which crashes on first forward.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.convert_vllm_to_huggingface)
-    assert '".conv1d"' in src
-    assert "Conv1d(" in src
-    assert "groups = channels" in src
-    # conv1d is no longer classified as a layernorm
-    assert '"conv1d",' not in src
-
-
-def test_lm_head_extraction_collapsed_to_single_path():
-    # Pre-fix: two `elif` fallbacks for vllm_internals.language_model.lm_head
-    # and vllm_internals.lm_head were dead code because named_modules() already
-    # traverses the full subtree.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils._get_vllm_state_dict)
-    lm_start = src.index("# LM Head")
-    lm_block = src[lm_start : lm_start + 800]
-    assert "language_model.lm_head" not in lm_block
-    assert 'elif hasattr(vllm_internals, "lm_head")' not in lm_block
-
-
-def test_gemma4_k_eq_v_set_hoists_constant_check():
-    # Pre-fix: model_type == "gemma4" and attention_k_eq_v were evaluated on
-    # every iteration of the set comprehension. Current fix also routes the
-    # model-type check through the shared _is_gemma4_config helper so that
-    # text-only Gemma4 (model_type == "gemma4_text") is matched too.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils._get_vllm_state_dict)
-    assert 'if _is_gemma4_config(config) and getattr(text_config, "attention_k_eq_v"' in src
-    assert "gemma4_k_eq_v_layers = set()" in src
-
-
-def test_merger_linear_fc_moved_to_non_layered():
-    # Pre-fix: model.visual.merger.linear_fc1/linear_fc2 (no {kk} placeholder)
-    # sat in additional_layers and were reassigned once per layer iteration.
-    from unsloth_zoo.empty_model import get_model_layer_config
-    cfg = get_model_layer_config()
-    additional = set(cfg["additional_layers"])
-    non_layered = set(cfg["non_layered_components"])
-    assert "model.visual.merger.linear_fc1" not in additional
-    assert "model.visual.merger.linear_fc2" not in additional
-    assert "model.visual.merger.linear_fc1" in non_layered
-    assert "model.visual.merger.linear_fc2" in non_layered
-
-
-def test_finalize_does_not_overwrite_unrelated_submodule_config_dtype():
-    # Behavioral: a submodule that carries its own config (with a distinct
-    # identity from the top-level/text/vision/audio configs) must NOT get its
-    # dtype overwritten by finalize_huggingface_model.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _SubConfig:
-        def __init__(self, dtype):
-            self.dtype = dtype
-
-    class _SubModule(torch.nn.Module):
-        def __init__(self, dtype):
-            super().__init__()
-            self.config = _SubConfig(dtype)
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.sub = _SubModule(dtype="float32")
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList()
-
-    top_cfg = types.SimpleNamespace(model_type="llama", dtype="bfloat16")
-    top_cfg.text_config = top_cfg
-
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, top_cfg, torch.bfloat16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    # Unknown submodule config must keep its original dtype.
-    assert model.sub.config.dtype == "float32"
-    # Top-level config is a known config and should be updated to bfloat16.
-    assert top_cfg.dtype == "bfloat16"
-
-
-def test_finalize_keeps_gemma4_rotary_buffers_float32_after_dtype_cast():
-    # Behavioral: on Gemma4, even after finalize casts the model to bfloat16/
-    # float16, rotary_emb buffers must remain in float32 for rotary math.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _RotaryCfg:
-        pass
-
-    class _FakeRotaryEmb(torch.nn.Module):
-        # Mimics the minimal interface finalize touches: a `config` attribute
-        # plus float buffers that should survive at float32 on Gemma4.
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            self.config = config if config is not None else _RotaryCfg()
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float32))
-            self.register_buffer("original_inv_freq", torch.arange(4, dtype=torch.float32))
-            self.attention_scaling = torch.tensor(1.0, dtype=torch.float32)
-
-    class _Attn(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_emb = _FakeRotaryEmb(config=_RotaryCfg())
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layer_idx = -1
-            self.self_attn = _Attn()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    cfg = types.SimpleNamespace(model_type="gemma4")
-    cfg.text_config = cfg
-
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, cfg, torch.bfloat16,
-        quantization_config={}, bnb_config=None,
-    )
-    rotary = model.model.layers[0].self_attn.rotary_emb
-    assert rotary.inv_freq.dtype == torch.float32
-    assert rotary.original_inv_freq.dtype == torch.float32
-
-
-def test_finalize_non_gemma4_rotary_buffers_follow_model_dtype():
-    # Behavioral sanity check: for non-Gemma4 models the rotary buffer dtype
-    # should follow the requested model dtype (buffer_dtype = dtype branch).
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _RotaryCfg:
-        pass
-
-    class _FakeRotaryEmb(torch.nn.Module):
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            self.config = config if config is not None else _RotaryCfg()
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float32))
-
-    class _Attn(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_emb = _FakeRotaryEmb(config=_RotaryCfg())
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layer_idx = -1
-            self.self_attn = _Attn()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    cfg = types.SimpleNamespace(model_type="llama")
-    cfg.text_config = cfg
-
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, cfg, torch.bfloat16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    rotary = model.model.layers[0].self_attn.rotary_emb
-    # Rotary inv_freq is kept at float32 for all archs to preserve RoPE precision.
-    assert rotary.inv_freq.dtype == torch.float32
-
-
-def test_set_dtype_in_config_else_branch_picks_correct_field():
-    # Pre-fix: the else-branch selection was inverted. This exercises the
-    # neither-attribute path explicitly.
-    from unsloth_zoo.hf_utils import set_dtype_in_config, HAS_TORCH_DTYPE
-
-    class _Bare:
-        pass
-
-    obj = _Bare()
-    set_dtype_in_config(obj, torch.float16)
-    expected_field = "torch_dtype" if HAS_TORCH_DTYPE else "dtype"
-    other_field = "dtype" if HAS_TORCH_DTYPE else "torch_dtype"
-    assert getattr(obj, expected_field, None) == "float16"
-    # Only one field should be written (no leakage into the other slot).
-    assert getattr(obj, other_field, None) is None
-
-
-def test_assert_same_state_dict_ignores_quantstate_entries():
-    # Behavioral: _normalize_state_dict_tensor returns None for non-tensor
-    # values like BnB QuantState dicts, and the comparison loop skips those.
-    # Previously these entries caused an AttributeError masked into failures.
-    from unsloth_zoo.vllm_utils import assert_same_state_dict
-
-    w = torch.randn(4, 4)
-    old = {"x.weight": w, "x.weight.quant_state": {"some": "metadata"}}
-    new = {"x.weight": w, "x.weight.quant_state": {"some": "metadata"}}
-    # Must not raise: the QuantState-shaped dict is skipped, the tensor matches.
-    assert_same_state_dict(old, new)
-
-
-def test_normalize_state_dict_tensor_handles_parameter():
-    # Behavioral: a Parameter is detached and normalized to a tensor.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.assert_same_state_dict)
-    # Smoke: full comparison with a Parameter on both sides.
-    p_old = torch.nn.Parameter(torch.ones(2, 2), requires_grad=False)
-    p_new = torch.nn.Parameter(torch.ones(2, 2), requires_grad=False)
-    vllm_utils.assert_same_state_dict({"w": p_old}, {"w": p_new})
-    # And returning None for a non-tensor is reachable via the guarded path.
-    assert "return None" in src
-
-
-class _FakeLinearModule(torch.nn.Module):
-    def __init__(self, out_features, in_features):
-        super().__init__()
-        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features), requires_grad=False)
-
-
-class _FakeGemma4Layer(torch.nn.Module):
-    # Minimal stand-in so hasattr(layer, "per_layer_input_gate") hits the new
-    # extraction branch without needing a real Gemma4 model.
-    def __init__(self, hidden=4):
-        super().__init__()
-        self.per_layer_input_gate = _FakeLinearModule(hidden, hidden)
-        self.per_layer_projection = _FakeLinearModule(hidden, hidden)
-
-
-def test_gemma4_per_layer_extraction_emits_state_dict_entries():
-    # Behavioral: when a decoder layer exposes per_layer_input_gate /
-    # per_layer_projection, extraction must populate state_dict with those
-    # paths so the reconstruction templates have something to read.
-    state_dict = {}
-
-    def fake_get_state_dict(prefix, kk, sd, module, slice_weights=True):
-        sd[f"{prefix}.weight"] = module.weight.data
-
-    layer = _FakeGemma4Layer()
-    kk = 0
-    prefix = "model.language_model"
-    # Mirror the exact calls the fix adds in _get_vllm_state_dict so the test
-    # pins the shape of the emitted keys without reproducing all of
-    # _get_vllm_state_dict's setup.
-    if hasattr(layer, "per_layer_input_gate"):
-        fake_get_state_dict(
-            f"{prefix}.layers.{kk}.per_layer_input_gate",
-            0, state_dict, layer.per_layer_input_gate,
-        )
-    if hasattr(layer, "per_layer_projection"):
-        fake_get_state_dict(
-            f"{prefix}.layers.{kk}.per_layer_projection",
-            0, state_dict, layer.per_layer_projection,
-        )
-    assert "model.language_model.layers.0.per_layer_input_gate.weight" in state_dict
-    assert "model.language_model.layers.0.per_layer_projection.weight" in state_dict
-
-
-def test_set_additional_modules_loads_visual_merger_linear_fc():
-    # Regression: the "linear" filter in set_additional_modules dropped
-    # model.visual.merger.linear_fc1/2 after the PR moved them into
-    # non_layered_components. set_additional_modules must now restore them.
-    from unsloth_zoo.empty_model import set_additional_modules
-
-    class _LM(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.embed_tokens = torch.nn.Embedding(2, 1)
-            self.norm = torch.nn.LayerNorm(1)
-
-    class _Merger(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.linear_fc1 = torch.nn.Linear(1, 1, bias=False)
-            self.linear_fc2 = torch.nn.Linear(1, 1, bias=False)
-
-    class _Visual(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.merger = _Merger()
-
-    class _Inner(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.language_model = _LM()
-            self.visual = _Visual()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = _Inner()
-            self.lm_head = torch.nn.Linear(1, 2, bias=False)
-
-    model = _Model()
-    fc1_target = torch.full((1, 1), 7.0)
-    fc2_target = torch.full((1, 1), 9.0)
-    quant_state_dict = {
-        "model.language_model.embed_tokens.weight": torch.zeros(2, 1),
-        "model.language_model.norm.weight": torch.ones(1),
-        "lm_head.weight": torch.zeros(2, 1),
-        "model.visual.merger.linear_fc1.weight": fc1_target,
-        "model.visual.merger.linear_fc2.weight": fc2_target,
-    }
-    cfg = types.SimpleNamespace(pad_token_id=0, text_config=types.SimpleNamespace(tie_word_embeddings=False))
-    set_additional_modules(model, quant_state_dict, cfg)
-    torch.testing.assert_close(model.model.visual.merger.linear_fc1.weight.data, fc1_target)
-    torch.testing.assert_close(model.model.visual.merger.linear_fc2.weight.data, fc2_target)
-
-
-def test_get_vllm_state_dict_extracts_layernorm_when_layer_lacks_mlp():
-    # Regression: the early `continue` for layers without `mlp` previously
-    # short-circuited before the layernorm extraction loop, dropping
-    # input_layernorm.weight on linear-attention / MoE-only layers.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils._get_vllm_state_dict)
-    layernorm_idx = src.index('layer_config[\'layernorms\']')
-    no_mlp_idx = src.index('if not hasattr(layer, "mlp"):')
-    assert layernorm_idx < no_mlp_idx, (
-        "layernorm extraction loop must run before the no-mlp early continue "
-        "so layernorms are exported for every decoder layer"
-    )
-
-
-def test_finalize_huggingface_model_dtype_propagates_to_replaced_live_config():
-    # Regression: copy_attributes can replace new_model.config with a config
-    # object whose id() differs from the input `config`, so the id-keyed
-    # dtype reapply loop missed it. After the fix, the live config tree is
-    # also brought up to date.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _LiveCfg:
-        def __init__(self, dtype):
-            self.dtype = dtype
-            self.text_config = self
-            self.model_type = "llama"
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.config = _LiveCfg("bfloat16")
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList()
-
-    input_cfg = types.SimpleNamespace(model_type="llama", dtype="bfloat16")
-    input_cfg.text_config = input_cfg
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, input_cfg, torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    assert model.config.dtype == "float16"
-
-
-def test_finalize_huggingface_model_vision_rotary_uses_identity_check():
-    # Regression: previously vision rotary classification compared __class__
-    # of the rotary's config against vision_config's class, which misfires
-    # when text and vision configs share a Python class. Identity-based
-    # check must not reroute a text rotary to vision_config in that case.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _SharedCfg:
-        def __init__(self, hidden_size=4):
-            self.hidden_size = hidden_size
-
-    text_cfg_obj = _SharedCfg(8)
-    vision_cfg_obj = _SharedCfg(16)
-
-    captured = {}
-
-    class _Rotary(torch.nn.Module):
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            self.config = config
-            captured["last_hidden"] = config.hidden_size
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float32))
-
-    class _Attn(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_emb = _Rotary(config=text_cfg_obj)
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layer_idx = -1
-            self.self_attn = _Attn()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    cfg = types.SimpleNamespace(model_type="llama")
-    cfg.text_config = text_cfg_obj
-    cfg.vision_config = vision_cfg_obj
-
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, cfg, torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    assert captured["last_hidden"] == text_cfg_obj.hidden_size, (
-        "rotary using text_config must not be re-classified as a vision rotary "
-        "just because the two configs share a Python class"
-    )
-
-
-def test_layer_scalar_keeps_buffer_registration_after_conversion():
-    # Regression: the `if layer_name in quant_state_dict` branch in
-    # convert_vllm_to_huggingface always wrapped the value in nn.Parameter,
-    # silently moving HF Gemma4 layer_scalar from `_buffers` to `_parameters`.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.convert_vllm_to_huggingface)
-    assert "_buffers" in src
-    assert 'getattr(parent, "_buffers"' in src or "parent._buffers" in src
-
-
-def test_assert_same_state_dict_uses_tight_tolerance_for_same_dtype():
-    # Regression: assert_same_state_dict previously applied atol=1e-4 /
-    # rtol=1e-3 unconditionally, masking weight-extraction errors on
-    # same-dtype non-FP8 weights. The relaxed tolerance must now only
-    # apply to the dtype-mismatch / FP8 upcast branch.
-    from unsloth_zoo.vllm_utils import assert_same_state_dict
-    a = torch.randn(8, 8, dtype=torch.float32)
-    b = a.clone()
-    b[0, 0] += 5e-4
-    raised = False
-    try:
-        assert_same_state_dict({"w": a}, {"w": b})
-    except Exception:
-        raised = True
-    assert raised, "5e-4 fp32 mismatch must fail the tight torch default tolerance"
-
-
-def test_conv1d_branch_requires_linear_attn_in_layer_name():
-    # Regression: `endswith(".conv1d")` would silently rebuild any future
-    # non-GDN .conv1d layer as depthwise. Branch must require linear_attn.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.convert_vllm_to_huggingface)
-    assert 'endswith(".conv1d") and "linear_attn" in layer_name' in src
-
-
-def test_gemma4_lora_patch_covers_both_classes():
-    # Regression: only Gemma4ForConditionalGeneration was patched, so
-    # text-only Gemma4ForCausalLM still hit the unsupported-LoRA path.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_lora_support)
-    assert "Gemma4ForCausalLM" in src
-    assert "_unsloth_gemma4_class_patched" in src
-
-
-def test_get_model_layer_config_includes_gemma4_top_level_ple_modules():
-    # Regression: top-level Gemma4 PLE modules (embed_tokens_per_layer,
-    # per_layer_model_projection, per_layer_projection_norm) were missing
-    # from extraction tables, leaving them with random init.
-    from unsloth_zoo.empty_model import get_model_layer_config
-    cfg = get_model_layer_config()
-    non_layered = set(cfg["non_layered_components"])
-    assert "model.language_model.embed_tokens_per_layer" in non_layered
-    assert "model.language_model.per_layer_model_projection" in non_layered
-    assert "model.language_model.per_layer_projection_norm" in non_layered
-
-
-def test_finalize_non_gemma4_rotary_stays_fp32_through_to_dtype():
-    # Regression: the non-Gemma4 branch previously skipped the float32 rotary
-    # buffer restoration after new_model.to(dtype), downcasting inv_freq /
-    # original_inv_freq to bf16/fp16 for Qwen3.5 and other non-Gemma4 models.
-    # Must exercise the (quantization_config == {} and bnb_config is None)
-    # path so .to(dtype) actually runs.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _Cfg:
-        pass
-
-    class _Rotary(torch.nn.Module):
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            self.config = config if config is not None else _Cfg()
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float32))
-            self.register_buffer("original_inv_freq", torch.arange(4, dtype=torch.float32))
-
-    class _Attn(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_emb = _Rotary(config=_Cfg())
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layer_idx = -1
-            self.self_attn = _Attn()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    cfg = types.SimpleNamespace(model_type="llama")
-    cfg.text_config = cfg
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, cfg, torch.bfloat16,
-        quantization_config={}, bnb_config=None,
-    )
-    rotary = model.model.layers[0].self_attn.rotary_emb
-    assert rotary.inv_freq.dtype == torch.float32
-    assert rotary.original_inv_freq.dtype == torch.float32
-
-
-def test_finalize_tolerates_rotary_rebuild_failure_without_crashing():
-    # Regression: module.rotary_emb.__class__(config=..., device=...) can
-    # raise for Gemma4 multimodal rotary when copy_attributes drifts the
-    # config identity so the vision rotary ends up with a text config shape.
-    # finalize_huggingface_model must catch the exception, keep the existing
-    # rotary instance, and still float32-lift its buffers.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _BadCfg:
-        pass
-
-    class _ExplodingRotary(torch.nn.Module):
-        calls = 0
-
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            _ExplodingRotary.calls += 1
-            if _ExplodingRotary.calls > 1:
-                raise KeyError("rope_type")
-            self.config = config
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float32))
-
-    class _Attn(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_emb = _ExplodingRotary(config=_BadCfg())
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layer_idx = -1
-            self.self_attn = _Attn()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    cfg = types.SimpleNamespace(model_type="gemma4")
-    cfg.text_config = cfg
-    model = _Model()
-    # Must not raise even though the rotary re-init raises KeyError on second call.
-    finalize_huggingface_model(
-        model, None, cfg, torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    rotary = model.model.layers[0].self_attn.rotary_emb
-    assert rotary.inv_freq.dtype == torch.float32
-
-
-def test_finalize_routes_vision_tower_rotary_to_vision_config_by_module_path():
-    # Regression: id()-based text/vision routing drifted after copy_attributes,
-    # misrouting vision rotary through text_config (which lacks the vision
-    # rope_parameters shape). The fix adds a module-path fallback so a rotary
-    # under 'vision_tower' is built with vision_config even when identity
-    # match fails.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _TextCfg:
-        hidden_size = 8
-        num_heads = 2
-
-    class _VisionCfg:
-        hidden_size = 16
-        num_heads = 2
-
-    captured = {}
-
-    class _Rotary(torch.nn.Module):
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            captured["config_hidden_size"] = getattr(config, "hidden_size", None)
-            self.config = config
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float32))
-
-    class _Inner(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            # New unrelated config instance so id() match against the top-level
-            # vision_config fails; module path must take over.
-            self.rotary_emb = _Rotary(config=object())
-
-    class _VisionTower(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.encoder = _Inner()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList()
-            self.model.vision_tower = _VisionTower()
-
-    cfg = types.SimpleNamespace(model_type="gemma4")
-    cfg.text_config = _TextCfg()
-    cfg.vision_config = _VisionCfg()
-
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, cfg, torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    assert captured["config_hidden_size"] == _VisionCfg.hidden_size, (
-        "vision-tower rotary must be rebuilt with vision_config even when "
-        "the config identity check fails"
-    )
-
-
-def test_extract_gdn_layers_dequantize_uses_unpacked_midpoint():
-    # Regression: `mid = ba_weight.shape[0] // 2` was computed on the packed
-    # uint8 Params4bit buffer (numel/2 shape), then reused to slice the
-    # dequantized full tensor whose shape[0] is out_features. When those two
-    # differ, in_proj_b / in_proj_a ended up with wrong rows.
-    from unsloth_zoo.empty_model import extract_gdn_layers
-
-    class _PlainProj(torch.nn.Module):
-        def __init__(self, out_features, in_features):
-            super().__init__()
-            self.weight = torch.nn.Parameter(
-                torch.randn(out_features, in_features), requires_grad=False,
-            )
-
-    class _FakeQS:
-        def as_dict(self, packed=True):
-            return {}
-
-    class _PackedParam(torch.nn.Parameter):
-        def __new__(cls, data, quant_states):
-            inst = torch.nn.Parameter.__new__(cls, data, requires_grad=False)
-            inst.bnb_quant_state = quant_states
-            return inst
-
-    class _BAProj(torch.nn.Module):
-        def __init__(self, packed_len):
-            super().__init__()
-            # Only index 0 has a QuantState -> triggers dequantize branch.
-            self.weight = _PackedParam(
-                torch.zeros(packed_len, dtype=torch.uint8),
-                {0: _FakeQS(), 1: None},
-            )
-
-    class _GDN(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.hidden_size = 4
-            self.num_k_heads = 2
-            self.num_v_heads = 4
-            self.head_k_dim = 2
-            self.head_v_dim = 4
-            self.key_dim = 4
-            self.value_dim = 16
-            self.in_proj_qkvz = _PlainProj(
-                2 * self.key_dim + 2 * self.value_dim, self.hidden_size,
-            )
-            # Packed length 12 -> packed mid 6. Dequantized shape below is 24 x 1
-            # so the correct mid is 12.
-            self.in_proj_ba = _BAProj(12)
-            self.conv1d = _PlainProj(self.key_dim * 2 + self.value_dim, 4)
-            self.dt_bias = torch.nn.Parameter(torch.randn(self.num_v_heads), requires_grad=False)
-            self.A_log = torch.nn.Parameter(torch.randn(self.num_v_heads), requires_grad=False)
-            self.norm = torch.nn.Module()
-            self.norm.weight = torch.nn.Parameter(
-                torch.randn(self.head_v_dim), requires_grad=False,
-            )
-            self.out_proj = _PlainProj(self.hidden_size, self.value_dim)
-
-    bnb = sys.modules.setdefault("bitsandbytes", types.ModuleType("bitsandbytes"))
-    bnb_fn = types.ModuleType("bitsandbytes.functional")
-
-    def fake_dequantize_4bit(data, quant_state=None):
-        return torch.arange(24, dtype=torch.float32).reshape(24, 1)
-
-    bnb_fn.dequantize_4bit = fake_dequantize_4bit
-    sys.modules["bitsandbytes.functional"] = bnb_fn
-
-    def _fake_get_state_dict(prefix, kk, sd, module, slice_weights=True):
-        sd[f"{prefix}.weight"] = module.weight.data
-
-    gdn = _GDN()
-    state_dict, quant_state_dict = {}, {}
-    extract_gdn_layers(gdn, "prefix", state_dict, quant_state_dict, _fake_get_state_dict)
-    b = state_dict["prefix.in_proj_b.weight"]
-    a = state_dict["prefix.in_proj_a.weight"]
-    assert b.shape[0] == 12, f"in_proj_b got {b.shape[0]} rows, expected 12 (dequantized mid)"
-    assert a.shape[0] == 12, f"in_proj_a got {a.shape[0]} rows, expected 12 (dequantized mid)"
-
-
-def test_lm_head_lookup_uses_exact_name_not_substring():
-    # Regression: `"lm_head" in name` would match a submodule named e.g.
-    # 'lm_head_norm' before the real 'lm_head', returning the wrong module.
-    # The fix requires an exact match or a .lm_head suffix.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils._get_vllm_state_dict)
-    assert 'name == "lm_head"' in src
-    assert 'name.endswith(".lm_head")' in src
-    # Loose substring test must not be present.
-    assert '"lm_head" in name' not in src
-
-
-# ----- Regression tests for review-iter-1 hardening -----
-
-
-def test_convert_regex_handles_trailing_digit_parameter_paths():
-    # Pre-fix: `re.sub(r"\.([\d]{1,})\.", r"[\1].", layer_name)` required a
-    # trailing dot, so a parameter-list-style key such as
-    # `model.language_model.embed_tokens_per_layer.0` was not converted to
-    # bracket form and `exec(...)` hit a SyntaxError.
-    import re
-    pattern = r"\.([\d]+)(?=\.|$)"
-    sub = lambda x: f"[{x.group(1)}]"
-    assert re.sub(pattern, sub, "model.language_model.embed_tokens_per_layer.0") \
-        == "model.language_model.embed_tokens_per_layer[0]"
-    assert re.sub(pattern, sub, "model.layers.12.self_attn.q_proj") \
-        == "model.layers[12].self_attn.q_proj"
-    assert re.sub(pattern, sub, "model.visual.merger.linear_fc1") \
-        == "model.visual.merger.linear_fc1"
-
-
-def test_convert_vllm_to_huggingface_uses_robust_bracket_regex():
-    # The Parameter-assignment path for `if layer_name in quant_state_dict`
-    # must use the anchor-or-end regex so that keys ending in `.DIGIT` get
-    # converted to bracket form.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.convert_vllm_to_huggingface)
-    assert r'r"\.([\d]+)(?=\.|$)"' in src
-    param_branch_anchor = "# for attributes of type nn.Parameter, there's no .weight"
-    idx = src.index(param_branch_anchor)
-    nearby = src[idx:idx + 400]
-    assert r'r"\.([\d]+)(?=\.|$)"' in nearby
-    assert r'r"\.([\d]{1,})\."' not in nearby
-
-
-def test_finalize_rotary_reinit_failure_skips_float32_lift():
-    # Regression: a bare `try/except Exception: pass` on rotary reinit used
-    # to float32-lift buffers on the stale rotary. The fix only lifts when
-    # reinit succeeds so wrong-shape placeholder buffers do not get blessed.
-    from unsloth_zoo.empty_model import finalize_huggingface_model
-
-    class _BadCfg:
-        pass
-
-    class _ExplodingRotary(torch.nn.Module):
-        calls = 0
-
-        def __init__(self, config=None, device=None):
-            super().__init__()
-            _ExplodingRotary.calls += 1
-            if _ExplodingRotary.calls > 1:
-                raise KeyError("rope_type")
-            self.config = config
-            self.register_buffer("inv_freq", torch.arange(4, dtype=torch.float16))
-
-    class _Attn(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.rotary_emb = _ExplodingRotary(config=_BadCfg())
-
-    class _Layer(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.layer_idx = -1
-            self.self_attn = _Attn()
-
-    class _Model(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = torch.nn.Module()
-            self.model.layers = torch.nn.ModuleList([_Layer()])
-
-    cfg = types.SimpleNamespace(model_type="gemma4")
-    cfg.text_config = cfg
-    model = _Model()
-    finalize_huggingface_model(
-        model, None, cfg, torch.float16,
-        quantization_config={"x": 1}, bnb_config=None,
-    )
-    rotary = model.model.layers[0].self_attn.rotary_emb
-    # reinit raised -> buffer dtype unchanged from pre-call (fp16)
-    assert rotary.inv_freq.dtype == torch.float16
-
-
-def test_is_gemma4_config_matches_both_variants():
-    from unsloth_zoo.empty_model import _is_gemma4_config
-
-    top_only = types.SimpleNamespace(model_type="gemma4")
-    assert _is_gemma4_config(top_only)
-
-    nested_text_only = types.SimpleNamespace(model_type="unrelated")
-    nested_text_only.text_config = types.SimpleNamespace(model_type="gemma4_text")
-    assert _is_gemma4_config(nested_text_only)
-
-    text_only_causal = types.SimpleNamespace(model_type="gemma4_text")
-    text_only_causal.text_config = text_only_causal
-    assert _is_gemma4_config(text_only_causal)
-
-    not_gemma4 = types.SimpleNamespace(model_type="llama")
-    not_gemma4.text_config = not_gemma4
-    assert not _is_gemma4_config(not_gemma4)
-
-    assert not _is_gemma4_config(None)
-
-
-def test_load_vllm_routes_gemma4_gate_through_helper():
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.load_vllm)
-    assert "_is_gemma4_config(config)" in src
-    assert 'getattr(config, "model_type", None) == "gemma4"' not in src
-
-
-def test_load_vllm_gemma4_patch_runs_after_bnb_autodetect():
-    # Regression: the Gemma4 k_eq_v patch was gated on the caller-provided
-    # `use_bitsandbytes` before model-name / quant_method auto-detection, so
-    # `-bnb-4bit` models with use_bitsandbytes=False at call time would skip
-    # the patch. The fix moves the gate below the autodetect line.
-    from unsloth_zoo import vllm_utils
-    src = inspect.getsource(vllm_utils.load_vllm)
-    autodetect_anchor = "use_bitsandbytes = use_bitsandbytes or"
-    gate_anchor = "patch_gemma4_vllm_k_eq_v_support()"
-    assert autodetect_anchor in src
-    assert gate_anchor in src
-    assert src.index(autodetect_anchor) < src.index(gate_anchor)
-
-
-def test_patch_gemma4_vllm_lora_support_preserves_embedding_modules():
-    # Regression: `cls.embedding_modules = {}` clobbered a pre-existing
-    # embedding registry on the vLLM class, which vLLM's LoRA manager uses
-    # to route adapters to embedding layers. The fix guards the assignment
-    # so it only runs when the attribute is absent.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_lora_support)
-    assert 'if not hasattr(cls, "embedding_modules"):' in src
-    guard_idx = src.index('if not hasattr(cls, "embedding_modules"):')
-    assign_idx = src.index("cls.embedding_modules = {}")
-    assert guard_idx < assign_idx, (
-        "embedding_modules assignment must sit inside the hasattr guard"
-    )
-
-
-def test_patch_gemma4_vllm_lora_support_guards_gemma4_mm_import():
-    # Regression: a hard `from vllm...gemma4_mm import ...` at top of the
-    # patch function crashed with ModuleNotFoundError on text-only Gemma4
-    # vLLM builds. The fix wraps each class import in try/except.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_lora_support)
-    mm_line = "from vllm.model_executor.models.gemma4_mm import Gemma4ForConditionalGeneration"
-    assert mm_line in src
-    mm_idx = src.index(mm_line)
-    pre = src[:mm_idx]
-    assert pre.rstrip().endswith("try:")
-    assert "if not classes_to_patch:" in src
-    assert "return" in src[src.index("if not classes_to_patch:"):]
-
-
-def test_patch_gemma4_vllm_k_eq_v_support_guards_private_loader_attr():
-    # Regression: hasattr(BitsAndBytesModelLoader._stack_quantization_states, ...)
-    # raised AttributeError on vLLM builds where the private method was
-    # renamed or absent. Fix routes through getattr with a None default.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_k_eq_v_support)
-    assert 'getattr(\n        BitsAndBytesModelLoader, "_stack_quantization_states", None' in src \
-        or 'getattr(BitsAndBytesModelLoader, "_stack_quantization_states", None' in src
-    assert "if stack_quantization_states is None:" in src
-
-
-def test_patch_gemma4_vllm_k_eq_v_support_searches_hf_style_prefix():
-    # Regression: _get_gemma4_k_eq_v_pairs only searched
-    # ("language_model.model", "model") prefixes, missing HF-style
-    # model.language_model for multimodal Gemma4.
-    from unsloth_zoo import empty_model
-    src = inspect.getsource(empty_model.patch_gemma4_vllm_k_eq_v_support)
-    assert '"model.language_model"' in src
-    assert '"language_model.model"' in src
-
-
-def test_patch_gemma4_vllm_lora_support_early_returns_when_no_classes():
-    import sys as _sys
-    import types as _types
-    from unsloth_zoo import empty_model
-
-    stub_packages = {
-        "vllm": _types.ModuleType("vllm"),
-        "vllm.model_executor": _types.ModuleType("vllm.model_executor"),
-        "vllm.model_executor.models": _types.ModuleType("vllm.model_executor.models"),
-        "vllm.model_executor.models.interfaces": _types.ModuleType("vllm.model_executor.models.interfaces"),
-        "vllm.lora": _types.ModuleType("vllm.lora"),
-        "vllm.lora.model_manager": _types.ModuleType("vllm.lora.model_manager"),
-    }
-    for name in stub_packages:
-        stub_packages[name].__path__ = []
-    stub_packages["vllm.model_executor.models.interfaces"].supports_lora = lambda model: False
-
-    class _FakeLoRAManager:
-        def __init__(self, *args, **kwargs):
-            pass
-
-    class _FakeCreate:
-        pass
-
-    def fake_create(model, *args, **kwargs):
-        return None
-
-    stub_packages["vllm.lora.model_manager"].LoRAModelManager = _FakeLoRAManager
-    stub_packages["vllm.lora.model_manager"].create_lora_manager = fake_create
-    stub_packages["vllm.model_executor.models"].gemma4_mm = None  # do not register submodule
-
-    saved = {}
-    for name, mod in stub_packages.items():
-        saved[name] = _sys.modules.get(name)
-        _sys.modules[name] = mod
-    # Ensure neither gemma4 nor gemma4_mm submodules resolve.
-    for missing in (
-        "vllm.model_executor.models.gemma4",
-        "vllm.model_executor.models.gemma4_mm",
-        "vllm.v1",
-        "vllm.v1.worker",
-    ):
-        saved[missing] = _sys.modules.get(missing)
-        _sys.modules[missing] = None
-    try:
-        # Must return without raising when no gemma4 class importable.
-        empty_model.patch_gemma4_vllm_lora_support()
-        # And the fake create_lora_manager must not have been replaced.
-        assert stub_packages["vllm.lora.model_manager"].create_lora_manager is fake_create
-    finally:
-        for name, prev in saved.items():
-            if prev is None:
-                _sys.modules.pop(name, None)
-            else:
-                _sys.modules[name] = prev
-
-
-def test_patch_gemma4_vllm_k_eq_v_support_noop_when_private_attr_missing():
-    import sys as _sys
-    import types as _types
-    from unsloth_zoo import empty_model
-
-    fake_pkg = _types.ModuleType("vllm.model_executor.model_loader.bitsandbytes_loader")
-
-    class _FakeLoader:
-        pass
-
-    fake_pkg.BitsAndBytesModelLoader = _FakeLoader
-    saved = {}
-    for name in (
-        "vllm",
-        "vllm.model_executor",
-        "vllm.model_executor.model_loader",
-        "vllm.model_executor.model_loader.bitsandbytes_loader",
-    ):
-        saved[name] = _sys.modules.get(name)
-    for name in ("vllm", "vllm.model_executor", "vllm.model_executor.model_loader"):
-        if _sys.modules.get(name) is None:
-            _sys.modules[name] = _types.ModuleType(name)
-            _sys.modules[name].__path__ = []
-    _sys.modules["vllm.model_executor.model_loader.bitsandbytes_loader"] = fake_pkg
-    try:
-        empty_model.patch_gemma4_vllm_k_eq_v_support()
-        assert not hasattr(_FakeLoader, "_stack_quantization_states")
-    finally:
-        for name, prev in saved.items():
-            if prev is None:
-                _sys.modules.pop(name, None)
-            else:
-                _sys.modules[name] = prev