diff --git a/.github/workflows/studio-export-fix-ci.yml b/.github/workflows/studio-export-fix-ci.yml
new file mode 100644
index 000000000..699b78d16
--- /dev/null
+++ b/.github/workflows/studio-export-fix-ci.yml
@@ -0,0 +1,62 @@
+name: studio-export-fix-ci
+
+on:
+  push:
+    branches: [main, nightly]
+    paths:
+      - "unsloth_zoo/llama_cpp.py"
+      - "tests/test_quantize_gguf_q2_k_l.py"
+      - "tests/test_convert_hf_to_gguf_patcher.py"
+      - ".github/workflows/studio-export-fix-ci.yml"
+  pull_request:
+    paths:
+      - "unsloth_zoo/llama_cpp.py"
+      - "tests/test_quantize_gguf_q2_k_l.py"
+      - "tests/test_convert_hf_to_gguf_patcher.py"
+      - ".github/workflows/studio-export-fix-ci.yml"
+
+concurrency:
+  group: studio-export-fix-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  studio-export-fix:
+    name: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      # Cap the matrix at 3 jobs in flight: one leg per OS, so at most one Windows runner,
+      # leaving headroom under the repo-level 5-concurrent-Windows-runner limit for other jobs.
+      max-parallel: 3
+      matrix:
+        os: [ubuntu-latest, macos-14, windows-latest]
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 15
+    env:
+      # Token for the live-upstream tests: 5000 req/h instead of 60 req/h on raw.githubusercontent.com.
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      UNSLOTH_COMPILE_DISABLE: '1'
+      PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: pip
+
+      - name: Install minimal test deps
+        run: |
+          python -m pip install --upgrade pip
+          # Pure-Python tests: monkeypatch subprocess + AST-parse upstream files.
+          # No torch / transformers needed. Keep slim so Windows cold start stays under a minute.
+          python -m pip install pytest psutil requests tqdm
+
+      - name: Run patcher + q2_k_l unit tests
+        shell: bash
+        run: |
+          pytest -v \
+            tests/test_quantize_gguf_q2_k_l.py \
+            tests/test_convert_hf_to_gguf_patcher.py
diff --git a/tests/test_convert_hf_to_gguf_patcher.py b/tests/test_convert_hf_to_gguf_patcher.py
new file mode 100644
index 000000000..04ff37827
--- /dev/null
+++ b/tests/test_convert_hf_to_gguf_patcher.py
@@ -0,0 +1,512 @@
+"""Tests for the layout-aware convert_hf_to_gguf.py patcher.
+
+Covers the helpers that distinguish upstream llama.cpp's old monolithic
+convert_hf_to_gguf.py from the new conversion/ package layout, plus the
+in-place branding patch on conversion/base.py and the Qwen2MoE-skip path.
+
+Two flavours:
+
+  - synthetic_*: hand-crafted fixture trees that match the upstream layouts
+    structurally; no network. These are the load-bearing CI gates.
+  - latest_*  : pulls the current files from raw.githubusercontent.com and
+    asserts the patcher still understands master. Skipped when offline.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+
+def _load_llama_cpp_module():
+    repo_root = Path(__file__).resolve().parents[1]
+    module_path = repo_root / "unsloth_zoo" / "llama_cpp.py"
+    spec = importlib.util.spec_from_file_location("llama_cpp_under_test_patcher", module_path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+# --- Synthetic fixtures matching upstream layouts ---------------------------
+
+# A minimal but realistic stand-in for the new `convert_hf_to_gguf.py`
+# entrypoint. The structural anchor we detect on is `from conversion import`.
+_PACKAGE_ENTRYPOINT = b"""\
+#!/usr/bin/env python3
+import argparse
+import sys
+from pathlib import Path
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+from conversion import (
+    ModelBase,
+    ModelType,
+    get_model_architecture,
+    get_model_class,
+    logger,
+    print_registered_models,
+)
+"""
+
+# A minimal stand-in for conversion/base.py containing the canonical
+# Metadata.load call site at 8-space indent (matches conversion/base.py:912).
+_PACKAGE_BASE_PY = b"""\
+import gguf
+from enum import IntEnum
+
+
+class ModelType(IntEnum):
+    TEXT = 0
+    MMPROJ = 1
+
+
+class ModelBase:
+    _model_classes = {ModelType.TEXT: {}, ModelType.MMPROJ: {}}
+
+    def prepare_metadata(self, vocab_only):
+        total_params, shared_params, expert_params, expert_count = (0, 0, 0, 0)
+
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
+
+        if self.remote_hf_model_id:
+            self.metadata.name = self.remote_hf_model_id
+"""
+
+# A minimal stand-in for conversion/__init__.py with realistic TEXT_MODEL_MAP
+# and MMPROJ_MODEL_MAP dict literals (matches __init__.py:19-231,234-283).
+_PACKAGE_INIT_PY = b"""\
+from __future__ import annotations
+from .base import ModelBase, ModelType
+
+
+TEXT_MODEL_MAP: dict[str, str] = {
+    "LlamaForCausalLM": "llama",
+    "MistralForCausalLM": "llama",
+    "Qwen3ForCausalLM": "qwen",
+    "Qwen2MoeForCausalLM": "qwen",
+    "Qwen3MoeForCausalLM": "qwen",
+    "Gemma3ForCausalLM": "gemma",
+}
+
+
+MMPROJ_MODEL_MAP: dict[str, str] = {
+    "LlavaForConditionalGeneration": "llava",
+    "Gemma3ForConditionalGeneration": "gemma",
+}
+
+
+def load_all_models() -> None:
+    pass
+
+
+def get_model_class(name, mmproj=False):
+    return ModelBase
+"""
+
+# Stand-in for the new conversion/qwen.py: contains both expert-key literals
+# in the same find_hparam call (upstream already handles the alias).
+_PACKAGE_QWEN_PY = b"""\
+from .base import ModelBase
+
+
+class Qwen2MoeModel(ModelBase):
+    def set_gguf_parameters(self):
+        n_experts = self.find_hparam(["num_local_experts", "num_experts"])
+        return n_experts
+"""
+
+# A minimal stand-in for the OLD monolithic convert_hf_to_gguf.py. Note: NO
+# `from conversion import` anywhere; that is the structural anchor for layout
+# detection. ModelBase and ModelType are defined inline.
+_MONOLITH = b"""\
+import argparse
+import gguf
+from enum import IntEnum
+
+
+class ModelType(IntEnum):
+    TEXT = 0
+    MMPROJ = 1
+
+
+class ModelBase:
+    _model_classes = {ModelType.TEXT: {"LlamaForCausalLM": object}, ModelType.MMPROJ: {}}
+
+    def prepare_metadata(self):
+        self.metadata = gguf.Metadata.load(override, card, name, params)
+
+        if self.remote_hf_model_id:
+            self.metadata.name = self.remote_hf_model_id
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--outfile", default=None)
+"""
+
+
+@pytest.fixture
+def package_layout(tmp_path):
+    """Build a synthetic new-layout llama.cpp tree on disk and return its root."""
+    root = tmp_path / "llama.cpp"
+    root.mkdir()
+    (root / "convert_hf_to_gguf.py").write_bytes(_PACKAGE_ENTRYPOINT)
+    conv = root / "conversion"
+    conv.mkdir()
+    (conv / "__init__.py").write_bytes(_PACKAGE_INIT_PY)
+    (conv / "base.py").write_bytes(_PACKAGE_BASE_PY)
+    (conv / "qwen.py").write_bytes(_PACKAGE_QWEN_PY)
+    return root
+
+
+@pytest.fixture
+def monolith_layout(tmp_path):
+    """Build a synthetic old-layout llama.cpp tree on disk and return its root."""
+    root = tmp_path / "llama.cpp"
+    root.mkdir()
+    (root / "convert_hf_to_gguf.py").write_bytes(_MONOLITH)
+    return root
+
+
+# --- Layout detection -------------------------------------------------------
+
+
+def test_detect_layout_returns_package_for_new_tree(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    entry_bytes = (package_layout / "convert_hf_to_gguf.py").read_bytes()
+    assert llama_cpp._detect_converter_layout(entry_bytes, str(package_layout)) == "package"
+
+
+def test_detect_layout_returns_monolith_for_old_tree(monolith_layout):
+    llama_cpp = _load_llama_cpp_module()
+    entry_bytes = (monolith_layout / "convert_hf_to_gguf.py").read_bytes()
+    assert llama_cpp._detect_converter_layout(entry_bytes, str(monolith_layout)) == "monolith"
+
+
+def test_detect_layout_falls_back_to_monolith_when_conversion_dir_missing(tmp_path):
+    """Entrypoint has the `from conversion import` anchor but the package dir is
+    absent on disk -> treat as monolith (defensive)."""
+    llama_cpp = _load_llama_cpp_module()
+    assert llama_cpp._detect_converter_layout(_PACKAGE_ENTRYPOINT, str(tmp_path)) == "monolith"
+
+
+# --- Arch enumeration from conversion/__init__.py ---------------------------
+
+
+def test_extract_text_model_map_keys(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    init_py = package_layout / "conversion" / "__init__.py"
+    keys = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "TEXT_MODEL_MAP")
+    assert {"LlamaForCausalLM", "Qwen3ForCausalLM", "Gemma3ForCausalLM"} <= keys
+    assert "Qwen2MoeForCausalLM" in keys
+
+
+def test_extract_mmproj_model_map_keys(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    init_py = package_layout / "conversion" / "__init__.py"
+    keys = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "MMPROJ_MODEL_MAP")
+    assert "LlavaForConditionalGeneration" in keys
+    assert "Gemma3ForConditionalGeneration" in keys
+
+
+def test_extract_returns_empty_for_missing_dict(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    init_py = package_layout / "conversion" / "__init__.py"
+    keys = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "NON_EXISTENT_MAP")
+    assert keys == set()
+
+
+def test_extract_returns_empty_for_unparseable_file(tmp_path):
+    """If conversion/__init__.py is missing or unparseable, we get an empty set
+    rather than raising — patcher then warns but does not abort."""
+    llama_cpp = _load_llama_cpp_module()
+    assert llama_cpp._extract_dict_keys_from_conversion_init(str(tmp_path / "nope.py"), "TEXT_MODEL_MAP") == set()
+
+
+# --- Branding patch on conversion/base.py -----------------------------------
+
+
+def test_branding_patch_applies_and_is_idempotent(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    base_py = package_layout / "conversion" / "base.py"
+
+    # First call: applies.
+    assert llama_cpp._apply_branding_patch_to_base(str(base_py)) == "applied"
+    content = base_py.read_bytes()
+    assert b"# UNSLOTH_BRANDING_APPLIED" in content
+    assert b"self.metadata.quantized_by = 'Unsloth'" in content
+    assert b"self.metadata.repo_url = 'https://huggingface.co/unsloth'" in content
+    assert b"self.metadata.tags = ['unsloth', 'llama.cpp']" in content
+
+    # Second call: no-op (idempotent).
+    assert llama_cpp._apply_branding_patch_to_base(str(base_py)) == "already-applied"
+    # File contents should be unchanged after the second call.
+    assert base_py.read_bytes() == content
+
+
+def test_branding_patch_pattern_missing_when_metadata_load_absent(tmp_path):
+    """A conversion/base.py without the Metadata.load call returns 'pattern-missing'."""
+    llama_cpp = _load_llama_cpp_module()
+    base_py = tmp_path / "base.py"
+    base_py.write_bytes(b"# completely different file content\n")
+    assert llama_cpp._apply_branding_patch_to_base(str(base_py)) == "pattern-missing"
+
+
+def test_branding_patch_preserves_lines_around_target(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    base_py = package_layout / "conversion" / "base.py"
+    original = base_py.read_bytes()
+    llama_cpp._apply_branding_patch_to_base(str(base_py))
+    patched = base_py.read_bytes()
+
+    # The Metadata.load line itself is preserved verbatim.
+    assert b"self.metadata = gguf.Metadata.load(" in patched
+    # Code that followed the target (the if self.remote_hf_model_id... block)
+    # is still present after the patch (we only inserted lines, not deleted).
+    assert b"if self.remote_hf_model_id:" in patched
+    assert b"self.metadata.name = self.remote_hf_model_id" in patched
+    # File grew (we added 4 branding lines + marker), not shrank.
+    assert len(patched) > len(original)
+
+
+# --- Qwen expert-key alias detection ---------------------------------------
+
+
+def test_qwen_aliases_detected_when_both_keys_present(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    qwen_py = package_layout / "conversion" / "qwen.py"
+    assert llama_cpp._qwen_already_handles_expert_aliases(str(qwen_py)) is True
+
+
+def test_qwen_aliases_not_detected_when_only_one_key_present(tmp_path):
+    llama_cpp = _load_llama_cpp_module()
+    qwen_py = tmp_path / "qwen.py"
+    qwen_py.write_bytes(b'n = self.hparams["num_experts"]\n')  # only num_experts
+    assert llama_cpp._qwen_already_handles_expert_aliases(str(qwen_py)) is False
+
+
+# --- Cache-key invalidation (sibling info) ---------------------------------
+
+
+def test_conversion_sibling_info_changes_when_base_py_changes(package_layout):
+    llama_cpp = _load_llama_cpp_module()
+    info_before = llama_cpp._conversion_sibling_info(str(package_layout))
+    assert info_before is not None
+
+    # Touch base.py with new content (mtime + size both change).
+    base_py = package_layout / "conversion" / "base.py"
+    base_py.write_bytes(base_py.read_bytes() + b"\n# extra trailing comment\n")
+
+    info_after = llama_cpp._conversion_sibling_info(str(package_layout))
+    assert info_after is not None
+    assert info_after != info_before, (
+        "_conversion_sibling_info must change when conversion/base.py changes, "
+        "so the @lru_cache(1) entry is invalidated"
+    )
+
+
+def test_conversion_sibling_info_none_for_monolith(monolith_layout):
+    llama_cpp = _load_llama_cpp_module()
+    assert llama_cpp._conversion_sibling_info(str(monolith_layout)) is None
+
+
+# --- _get_llama_cpp_dir resolution (addresses PR #667 review) ---------------
+
+
+def test_llama_cpp_dir_defaults_when_no_local_script():
+    llama_cpp = _load_llama_cpp_module()
+    assert llama_cpp._get_llama_cpp_dir(None) == llama_cpp.LLAMA_CPP_DEFAULT_DIR
+
+
+def test_llama_cpp_dir_resolves_to_source_dir_when_local_script_set(tmp_path):
+    """UNSLOTH_LLAMA_CPP_SCRIPTS_DIR override: the patcher must operate
+    against the directory containing the selected converter, not the
+    hard-coded default. Mirrors `_resolve_local_convert_script`'s 3-tuple
+    return shape `(abs_path, mtime_ns, size)`."""
+    llama_cpp = _load_llama_cpp_module()
+    custom = tmp_path / "custom_llama_cpp"
+    custom.mkdir()
+    src = custom / "convert_hf_to_gguf.py"
+    src.write_bytes(b"# placeholder\n")
+    local_info = (str(src), src.stat().st_mtime_ns, src.stat().st_size)
+    assert llama_cpp._get_llama_cpp_dir(local_info) == str(custom)
+
+
+def test_package_layout_does_not_require_module_import(tmp_path, monkeypatch):
+    """Regression for Codex P1 on 3a9a23c: when UNSLOTH_LLAMA_CPP_SCRIPTS_DIR
+    points at a package-layout checkout, the patcher must NOT call
+    `_load_module_from_path` on the entrypoint. Importing it would resolve
+    `from conversion import ...` against LLAMA_CPP_DEFAULT_DIR (a different
+    dir than the override) and raise ModuleNotFoundError, aborting the
+    patcher before AST arch extraction + branding could run.
+
+    We assert the contract by replacing `_load_module_from_path` with a
+    sentinel that fails the test if called, then driving the patcher end-
+    to-end with `UNSLOTH_LLAMA_CPP_SCRIPTS_DIR` set."""
+    llama_cpp = _load_llama_cpp_module()
+
+    # Build a custom package-layout checkout in tmp_path. We extend the
+    # shared fixture with a parse_args() stub so the end-to-end pipeline can
+    # finish its flag-parsing step on the patched file (the real upstream
+    # entrypoint has these calls; the shared fixture omits them because no
+    # other test exercises the full pipeline).
+    entry_with_args = _PACKAGE_ENTRYPOINT + (
+        b"\n"
+        b"def parse_args():\n"
+        b"    parser = argparse.ArgumentParser()\n"
+        b"    parser.add_argument(\"model\")\n"
+        b"    parser.add_argument(\"--outfile\", default=None)\n"
+        b"    parser.add_argument(\"--outtype\", default=\"f16\")\n"
+        b"    parser.add_argument(\"--vocab-only\", action=\"store_true\")\n"
+        b"    return parser.parse_args()\n"
+    )
+    root = tmp_path / "custom_llama_cpp"
+    root.mkdir()
+    (root / "convert_hf_to_gguf.py").write_bytes(entry_with_args)
+    conv = root / "conversion"
+    conv.mkdir()
+    (conv / "__init__.py").write_bytes(_PACKAGE_INIT_PY)
+    (conv / "base.py").write_bytes(_PACKAGE_BASE_PY)
+    (conv / "qwen.py").write_bytes(_PACKAGE_QWEN_PY)
+
+    monkeypatch.setenv("UNSLOTH_LLAMA_CPP_SCRIPTS_DIR", str(root))
+
+    # Sentinel: any call here means the patcher fell through to the
+    # module-load path on package layout, which is the bug we're guarding.
+    called = {"hit": False}
+    def _trap(*a, **kw):
+        called["hit"] = True
+        raise AssertionError("monolith-only _load_module_from_path called on package layout")
+    monkeypatch.setattr(llama_cpp, "_load_module_from_path", _trap)
+
+    # Clear the cache before the call: the @lru_cache(1) key includes the
+    # resolved local_script_info, but a stale entry left behind by a previous
+    # test could still short-circuit this call.
+    llama_cpp._download_convert_hf_to_gguf_cached.cache_clear()
+
+    patched_path, text_archs, vision_archs = llama_cpp._download_convert_hf_to_gguf("regression_no_module_import")
+
+    assert called["hit"] is False
+    assert patched_path.endswith(".py")
+    assert "LlamaForCausalLM" in text_archs
+    assert text_archs == frozenset(text_archs)
+    # base.py was patched in place under the override dir.
+    assert b"# UNSLOTH_BRANDING_APPLIED" in (conv / "base.py").read_bytes()
+    # Cleanup for follow-on tests.
+    llama_cpp._download_convert_hf_to_gguf_cached.cache_clear()
+
+
+def test_patcher_anchors_on_custom_dir_when_override_set(tmp_path):
+    """Build a custom llama.cpp tree with the new package layout in a temp
+    dir, point a synthetic local_script_info at it, and confirm sibling
+    info + layout detection target THAT dir, not the hardcoded default."""
+    llama_cpp = _load_llama_cpp_module()
+    root = tmp_path / "custom_llama_cpp"
+    root.mkdir()
+    (root / "convert_hf_to_gguf.py").write_bytes(_PACKAGE_ENTRYPOINT)
+    conv = root / "conversion"
+    conv.mkdir()
+    (conv / "__init__.py").write_bytes(_PACKAGE_INIT_PY)
+    (conv / "base.py").write_bytes(_PACKAGE_BASE_PY)
+    (conv / "qwen.py").write_bytes(_PACKAGE_QWEN_PY)
+
+    local_info = (
+        str(root / "convert_hf_to_gguf.py"),
+        (root / "convert_hf_to_gguf.py").stat().st_mtime_ns,
+        (root / "convert_hf_to_gguf.py").stat().st_size,
+    )
+    resolved = llama_cpp._get_llama_cpp_dir(local_info)
+    assert resolved == str(root)
+    sib = llama_cpp._conversion_sibling_info(resolved)
+    assert sib is not None
+    assert sib[1][0] == str(conv / "base.py")  # base.py path in sibling tuple
+    layout = llama_cpp._detect_converter_layout(_PACKAGE_ENTRYPOINT, resolved)
+    assert layout == "package"
+
+
+# --- Network smoke against current upstream llama.cpp ----------------------
+
+
+@pytest.fixture
+def latest_llama_cpp(tmp_path):
+    """Fetch the current convert_hf_to_gguf.py + conversion/{__init__,base,qwen}.py
+    from raw.githubusercontent.com. Skips the test cleanly when offline or rate-
+    limited (raw.githubusercontent.com is documented at 60 req/hour unauthed)."""
+    requests = pytest.importorskip("requests")
+    root = tmp_path / "llama.cpp"
+    (root / "conversion").mkdir(parents=True)
+    base_url = "https://raw.githubusercontent.com/ggml-org/llama.cpp/master/"
+    files = {
+        "convert_hf_to_gguf.py": root / "convert_hf_to_gguf.py",
+        "conversion/__init__.py": root / "conversion" / "__init__.py",
+        "conversion/base.py": root / "conversion" / "base.py",
+        "conversion/qwen.py": root / "conversion" / "qwen.py",
+    }
+    headers = {}
+    if os.environ.get("GITHUB_TOKEN"):
+        headers["Authorization"] = f"Bearer {os.environ['GITHUB_TOKEN']}"
+    for rel, dest in files.items():
+        try:
+            r = requests.get(base_url + rel, timeout=15, headers=headers)
+        except requests.exceptions.RequestException as exc:
+            pytest.skip(f"network unreachable: {exc}")
+        if r.status_code in (403, 429, 503):
+            pytest.skip(f"upstream rate-limited / unavailable: HTTP {r.status_code}")
+        if r.status_code != 200:
+            pytest.skip(f"upstream missing {rel}: HTTP {r.status_code}")
+        dest.write_bytes(r.content)
+    return root
+
+
+def test_latest_upstream_detected_as_package_layout(latest_llama_cpp):
+    llama_cpp = _load_llama_cpp_module()
+    entry_bytes = (latest_llama_cpp / "convert_hf_to_gguf.py").read_bytes()
+    layout = llama_cpp._detect_converter_layout(entry_bytes, str(latest_llama_cpp))
+    assert layout == "package", "current llama.cpp master should match the new layout"
+
+
+def test_latest_upstream_branding_patch_applies(latest_llama_cpp):
+    """Against the live upstream conversion/base.py, the branding regex must
+    still match. If upstream changes the indentation or arguments of
+    Metadata.load, this test fails fast so we can update the regex."""
+    llama_cpp = _load_llama_cpp_module()
+    base_py = latest_llama_cpp / "conversion" / "base.py"
+    status = llama_cpp._apply_branding_patch_to_base(str(base_py))
+    assert status == "applied", f"branding patch did not apply to upstream base.py: {status}"
+    content = base_py.read_bytes()
+    assert b"# UNSLOTH_BRANDING_APPLIED" in content
+    assert b"self.metadata.quantized_by = 'Unsloth'" in content
+
+
+def test_latest_upstream_qwen_already_handles_aliases(latest_llama_cpp):
+    """Upstream Qwen module is expected to call find_hparam with both keys."""
+    llama_cpp = _load_llama_cpp_module()
+    qwen_py = latest_llama_cpp / "conversion" / "qwen.py"
+    if not qwen_py.exists():
+        pytest.skip("upstream conversion/qwen.py absent")
+    assert llama_cpp._qwen_already_handles_expert_aliases(str(qwen_py)) is True
+
+
+def test_latest_upstream_arch_enumeration_non_empty(latest_llama_cpp):
+    """TEXT_MODEL_MAP in upstream conversion/__init__.py must produce a non-empty
+    architecture allowlist. This is the assertion that would have caught the
+    original 'No supported architectures' warning if it had been a test."""
+    llama_cpp = _load_llama_cpp_module()
+    init_py = latest_llama_cpp / "conversion" / "__init__.py"
+    text_archs = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "TEXT_MODEL_MAP")
+    assert "LlamaForCausalLM" in text_archs, (
+        f"upstream TEXT_MODEL_MAP missing LlamaForCausalLM; "
+        f"got {sorted(text_archs)[:10]}..."
+    )
+    # The set should also contain at least some Qwen entries since this is the
+    # user's reported architecture family.
+    qwen_keys = {k for k in text_archs if k.startswith("Qwen")}
+    assert qwen_keys, f"upstream TEXT_MODEL_MAP has no Qwen* entries: {sorted(text_archs)[:20]}..."
diff --git a/tests/test_quantize_gguf_q2_k_l.py b/tests/test_quantize_gguf_q2_k_l.py
new file mode 100644
index 000000000..c6164ec44
--- /dev/null
+++ b/tests/test_quantize_gguf_q2_k_l.py
@@ -0,0 +1,186 @@
+"""Q2_K_L preset dispatch inside quantize_gguf.
+
+Q2_K_L is an Unsloth-side preset, not a native llama.cpp ftype. It maps to
+``llama-quantize --output-tensor-type q8_0 --token-embedding-type q8_0 IN OUT
+q2_k NTHREADS``. Before this fix, the MLX/Studio export path forwarded the raw
+``q2_k_l`` string to llama-quantize which then aborted with
+``main: invalid ftype 'q2_k_l'``.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import subprocess
+import sys
+from pathlib import Path
+from types import SimpleNamespace
+
+
+def _load_llama_cpp_module():
+    repo_root = Path(__file__).resolve().parents[1]
+    module_path = repo_root / "unsloth_zoo" / "llama_cpp.py"
+    spec = importlib.util.spec_from_file_location("llama_cpp_under_test", module_path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def _install_fake_subprocess_run(monkeypatch, llama_cpp):
+    """Replace subprocess.run with a capturing fake (no real binary invoked)."""
+
+    captured: dict[str, object] = {}
+
+    def fake_run(cmd, *args, **kwargs):
+        captured["cmd"] = cmd
+        captured["args"] = args
+        captured["kwargs"] = kwargs
+        return SimpleNamespace(stdout="ok", returncode=0)
+
+    monkeypatch.setattr(subprocess, "run", fake_run)
+    monkeypatch.setattr(llama_cpp.subprocess, "run", fake_run)
+    return captured
+
+
+def _stub_output_exists(monkeypatch):
+    """Pretend the output file was produced (no real quantization happened)."""
+
+    monkeypatch.setattr(Path, "exists", lambda self: True)
+    monkeypatch.setattr(Path, "stat", lambda self: SimpleNamespace(st_size=4096))
+
+
+def test_q2_k_l_expands_to_q2_k_with_output_and_embedding_q8_0(monkeypatch):
+    llama_cpp = _load_llama_cpp_module()
+    captured = _install_fake_subprocess_run(monkeypatch, llama_cpp)
+    _stub_output_exists(monkeypatch)
+
+    llama_cpp.quantize_gguf(
+        input_gguf="/tmp/in.gguf",
+        output_gguf="/tmp/out.gguf",
+        quant_type="q2_k_l",
+        quantizer_location="/usr/bin/llama-quantize",
+        n_threads=4,
+        print_output=False,
+    )
+
+    cmd = captured["cmd"]
+    assert isinstance(cmd, str), f"command should be a shell string (existing convention); got {type(cmd)!r}"
+    # The literal preset name must NOT reach llama-quantize.
+    assert "q2_k_l" not in cmd, f"q2_k_l leaked into llama-quantize command: {cmd!r}"
+    # The expanded ftype must appear, as a standalone token.
+    assert " q2_k " in cmd, f"q2_k token missing: {cmd!r}"
+    # Both preset flags must appear, in either order.
+    assert "--output-tensor-type q8_0" in cmd, f"--output-tensor-type q8_0 missing: {cmd!r}"
+    assert "--token-embedding-type q8_0" in cmd, f"--token-embedding-type q8_0 missing: {cmd!r}"
+    # Sanity: input/output paths and thread count are still present.
+    assert "/tmp/in.gguf" in cmd
+    assert "/tmp/out.gguf" in cmd
+    assert " 4" in cmd, f"n_threads missing: {cmd!r}"
+
+
+def test_q2_k_l_is_case_insensitive(monkeypatch):
+    """Studio frontend may send Q2_K_L / Q2_k_L / etc. Treat them identically."""
+
+    llama_cpp = _load_llama_cpp_module()
+    captured = _install_fake_subprocess_run(monkeypatch, llama_cpp)
+    _stub_output_exists(monkeypatch)
+
+    for variant in ("Q2_K_L", "q2_K_L", "  q2_k_l  "):
+        captured.clear()
+        llama_cpp.quantize_gguf(
+            input_gguf="/tmp/in.gguf",
+            output_gguf="/tmp/out.gguf",
+            quant_type=variant,
+            quantizer_location="/usr/bin/llama-quantize",
+            n_threads=4,
+            print_output=False,
+        )
+        cmd = captured["cmd"]
+        assert " q2_k " in cmd, f"variant {variant!r}: expansion missing: {cmd!r}"
+        assert "--output-tensor-type q8_0" in cmd
+        assert "--token-embedding-type q8_0" in cmd
+
+
+def test_other_quant_types_are_untouched(monkeypatch):
+    """Non-preset ftypes must traverse the original code path byte-for-byte.
+
+    Linux + Windows non-regression: ensures the q2_k_l branch does not affect
+    any other ftype. q3_k_l is a real llama.cpp ftype distinct from q2_k_l and
+    must be passed through verbatim.
+    """
+
+    llama_cpp = _load_llama_cpp_module()
+    captured = _install_fake_subprocess_run(monkeypatch, llama_cpp)
+    _stub_output_exists(monkeypatch)
+
+    for ftype in (
+        "q2_k", "q3_k_s", "q3_k_m", "q3_k_l",  # q3_k_l is a real ftype, NOT a preset
+        "q4_0", "q4_1", "q4_k_s", "q4_k_m",
+        "q5_0", "q5_1", "q5_k_s", "q5_k_m",
+        "q6_k", "q8_0", "bf16", "f16", "f32",
+    ):
+        captured.clear()
+        llama_cpp.quantize_gguf(
+            input_gguf="/tmp/in.gguf",
+            output_gguf="/tmp/out.gguf",
+            quant_type=ftype,
+            quantizer_location="/usr/bin/llama-quantize",
+            n_threads=4,
+            print_output=False,
+        )
+        cmd = captured["cmd"]
+        assert f" {ftype} " in cmd, f"ftype {ftype!r} not preserved: {cmd!r}"
+        assert "--output-tensor-type" not in cmd, (
+            f"ftype {ftype!r} accidentally picked up preset flags: {cmd!r}"
+        )
+        assert "--token-embedding-type" not in cmd, (
+            f"ftype {ftype!r} accidentally picked up preset flags: {cmd!r}"
+        )
+
+
+def test_q2_k_l_print_output_path_logs_preset_expansion(capsys, monkeypatch):
+    """When print_output=True the user sees both the original request and the expansion."""
+
+    llama_cpp = _load_llama_cpp_module()
+    _install_fake_subprocess_run(monkeypatch, llama_cpp)
+    _stub_output_exists(monkeypatch)
+
+    llama_cpp.quantize_gguf(
+        input_gguf="/tmp/in.gguf",
+        output_gguf="/tmp/out.gguf",
+        quant_type="q2_k_l",
+        quantizer_location="/usr/bin/llama-quantize",
+        n_threads=4,
+        print_output=True,
+    )
+
+    out = capsys.readouterr().out
+    assert "Quantizing to q2_k_l" in out, out
+    assert "Expanding Q2_K_L preset" in out, out
+
+
+def test_q2_k_l_error_message_keeps_original_preset_name(monkeypatch):
+    """If llama-quantize fails, the RuntimeError should mention q2_k_l (what the
+    user asked for) rather than q2_k (the rewritten internal ftype)."""
+
+    llama_cpp = _load_llama_cpp_module()
+
+    def failing_run(cmd, *args, **kwargs):
+        raise subprocess.CalledProcessError(returncode=1, cmd=cmd, output="boom")
+
+    monkeypatch.setattr(subprocess, "run", failing_run)
+    monkeypatch.setattr(llama_cpp.subprocess, "run", failing_run)
+
+    try:
+        llama_cpp.quantize_gguf(
+            input_gguf="/tmp/in.gguf",
+            output_gguf="/tmp/out.gguf",
+            quant_type="q2_k_l",
+            quantizer_location="/usr/bin/llama-quantize",
+            n_threads=4,
+            print_output=False,
+        )
+    except RuntimeError as exc:
+        assert "q2_k_l" in str(exc), f"error msg should keep preset name: {exc}"
+    else:
+        raise AssertionError("expected RuntimeError")
diff --git a/unsloth_zoo/llama_cpp.py b/unsloth_zoo/llama_cpp.py
index e5548a1cd..b7f04bbb4 100644
--- a/unsloth_zoo/llama_cpp.py
+++ b/unsloth_zoo/llama_cpp.py
@@ -31,6 +31,7 @@
 import os
 import time
 import re
+import ast
 import requests
 import json
 from tqdm.auto import tqdm as ProgressBar
@@ -921,14 +922,161 @@ def _load_module_from_path(filepath, module_name):
 pass
 
 
+# Marker line inserted with the branding patch; its presence means "already patched".
+_UNSLOTH_BRANDING_MARKER = b"# UNSLOTH_BRANDING_APPLIED"
+# Groups: (1) the Metadata.load(...) call, (2) line break(s) + indent, (3) the indent alone.
+_BRANDING_PATTERN = re.compile(
+    rb"(self\.metadata \= gguf\.Metadata\.load\(.+?\))([\n\r]+([\s\t]{4,}))", flags = re.MULTILINE)
+
+
+def _get_llama_cpp_dir(local_script_info):
+    """Return the directory of the converter script being patched: the parent of
+    local_script_info[0] when a local script was resolved, otherwise
+    LLAMA_CPP_DEFAULT_DIR. Callers locate the sibling conversion/ package from it."""
+    if local_script_info is None:
+        return LLAMA_CPP_DEFAULT_DIR
+    script_path = local_script_info[0]
+    return os.path.dirname(script_path)
+pass
+
+
+def _conversion_sibling_info(llama_cpp_dir):
+    """Fingerprint conversion/{__init__,base,qwen}.py under llama_cpp_dir as a
+    hashable 3-tuple of (path, st_mtime_ns, st_size) entries; the qwen.py slot is
+    None when that file is absent. Returns None when __init__.py or base.py is
+    missing (monolithic layout). Meant to be folded into the patcher cache key."""
+    package_dir = os.path.join(llama_cpp_dir, "conversion")
+    init_py, base_py, qwen_py = (
+        os.path.join(package_dir, fname) for fname in ("__init__.py", "base.py", "qwen.py")
+    )
+    if not os.path.isfile(init_py) or not os.path.isfile(base_py):
+        return None
+    def _fingerprint(path):
+        # stat() can still fail after isfile() succeeded; fall back to zeros.
+        try:
+            st = os.stat(path)
+        except OSError:
+            return (path, 0, 0)
+        return (path, st.st_mtime_ns, st.st_size)
+    init_entry, base_entry = _fingerprint(init_py), _fingerprint(base_py)
+    qwen_entry = _fingerprint(qwen_py) if os.path.isfile(qwen_py) else None
+    return (init_entry, base_entry, qwen_entry)
+pass
+
+
+def _detect_converter_layout(entry_content_bytes, llama_cpp_dir):
+    """Classify the converter as 'package' (conversion/ package layout) or
+    'monolith'. 'package' requires BOTH that the entrypoint bytes contain
+    `from conversion import` and that conversion/__init__.py and
+    conversion/base.py exist under llama_cpp_dir; everything else is 'monolith'."""
+    layout = "monolith"
+    try:
+        if b"from conversion import" in entry_content_bytes:
+            conv_dir = os.path.join(llama_cpp_dir, "conversion")
+            required = [os.path.join(conv_dir, fname) for fname in ("__init__.py", "base.py")]
+            if all(os.path.isfile(path) for path in required):
+                layout = "package"
+    except Exception:
+        # Best-effort: on any error keep 'monolith' so the legacy regex patches still run.
+        layout = "monolith"
+    return layout
+pass
+
+
+def _extract_dict_keys_from_conversion_init(conv_init_path, dict_name):
+    """Statically collect the string-literal keys of every dict literal assigned
+    (plain or annotated) to the name `dict_name`, e.g. TEXT_MODEL_MAP, in the
+    file at conv_init_path. The file is parsed with `ast`, never imported.
+    Returns a set of str; any read or parse failure yields an empty set."""
+    try:
+        with open(conv_init_path, "rb") as fh:
+            module_ast = ast.parse(fh.read())
+    except Exception:
+        return set()
+
+    def _binds_dict_name(node):
+        if isinstance(node, ast.Assign):
+            targets = node.targets
+        elif isinstance(node, ast.AnnAssign):
+            targets = [node.target]
+        else:
+            return False
+        return any(isinstance(t, ast.Name) and t.id == dict_name for t in targets)
+    found = set()
+    for node in ast.walk(module_ast):
+        if _binds_dict_name(node) and isinstance(node.value, ast.Dict):
+            found.update(k.value for k in node.value.keys if isinstance(k, ast.Constant) and isinstance(k.value, str))
+    return found
+pass
+
+
+def _apply_branding_patch_to_base(conv_base_path):
+    """Insert Unsloth branding after the first `self.metadata = gguf.Metadata.load(...)`
+    in the file at conv_base_path; a file already containing the marker is left as is.
+    Returns 'applied' / 'already-applied' / 'pattern-missing'. Read/write errors are
+    logged and reported as 'pattern-missing' so callers keep a three-value contract."""
+    try:
+        with open(conv_base_path, "rb") as f:
+            content = f.read()
+    except OSError as e:
+        logger.warning(f"Unsloth: Failed to read {conv_base_path} for the branding patch: {e}")
+        return "pattern-missing"
+    if _UNSLOTH_BRANDING_MARKER in content:
+        return "already-applied"
+    def _replace(match):
+        # suffix (group 2) already starts with the line break + indent of the next statement.
+        load_call, suffix, indent = match.group(1), match.group(2), match.group(3)
+        return (
+            load_call + b"\n"
+            + indent + _UNSLOTH_BRANDING_MARKER + b"\n"
+            + indent + b"if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n"
+            + indent + b"if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n"
+            + indent + b"if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']"
+            + suffix
+        )
+
+    new_content, n = _BRANDING_PATTERN.subn(_replace, content, count = 1)
+    if n == 0:
+        return "pattern-missing"
+    try:
+        with open(conv_base_path, "wb") as f:
+            f.write(new_content)
+    except OSError as e:
+        logger.warning(f"Unsloth: Failed to write the branding patch to {conv_base_path}: {e}")
+        return "pattern-missing"
+    return "applied"
+pass
+
+
+def _qwen_already_handles_expert_aliases(conv_qwen_path):
+    """Heuristic: True when the file at conv_qwen_path contains both byte strings
+    `num_local_experts` and `num_experts`, as it would with a lookup such as
+        self.find_hparam(["num_local_experts", "num_experts"])
+    This is a substring test, not a parse. An unreadable file returns False."""
+    try:
+        with open(conv_qwen_path, "rb") as fh:
+            source_bytes = fh.read()
+    except OSError:
+        return False
+    return all(token in source_bytes for token in (b"num_local_experts", b"num_experts"))
+pass
+
+
 def _download_convert_hf_to_gguf(name = "unsloth_convert_hf_to_gguf"):
-    # Resolve the env var on every call so changes between calls are honored;
-    # the resolved value is part of the cache key on the implementation below.
-    return _download_convert_hf_to_gguf_cached(name, _resolve_local_convert_script())
+    # Resolve the local-script override and the conversion/ sibling stats on
+    # every call; both are arguments of the @lru_cache'd implementation, so a
+    # re-pulled llama.cpp checkout re-runs the patcher. Siblings are looked up
+    # next to the converter being patched, not always LLAMA_CPP_DEFAULT_DIR.
+    # NOTE(review): the branding patch rewrites conversion/base.py, so the key changes once after it is applied.
+    local_script_info = _resolve_local_convert_script()
+    return _download_convert_hf_to_gguf_cached(
+        name,
+        local_script_info,
+        _conversion_sibling_info(_get_llama_cpp_dir(local_script_info)),
+    )
 
 
 @lru_cache(1)
-def _download_convert_hf_to_gguf_cached(name, _local_script_info):
+def _download_convert_hf_to_gguf_cached(name, _local_script_info, _conversion_info):
     # All Unsloth Zoo code licensed under LGPLv3
     # Downloads from llama.cpp's GitHub repository, or reads a local copy when
     # UNSLOTH_LLAMA_CPP_SCRIPTS_DIR is set. _local_script_info is
@@ -944,6 +1092,13 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info):
     text_archs = set()
     vision_archs = set()
     temp_original_file_path = None # Initialize for finally block
+    original_module_name = None    # Only set on the monolith branch
+    # Set by introspection; read by Patch 2 + Patch 3 below. Default to
+    # 'monolith' so a failed introspection still drives the legacy patches.
+    _layout = "monolith"
+    # Resolve once: same dir feeds layout detection, branding patch, Qwen
+    # check, sibling cache key. UNSLOTH_LLAMA_CPP_SCRIPTS_DIR overrides default.
+    _llama_cpp_dir = _get_llama_cpp_dir(_local_script_info)
 
     _local_script = _local_script_info[0] if _local_script_info is not None else None
 
@@ -976,65 +1131,89 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info):
             if original_content is None:
                 raise _last_err  # type: ignore[misc]
 
-        # 2. Introspect Original Script for Supported Architectures
+        # 2. Detect layout BEFORE attempting to import. The package-layout
+        # entrypoint does `from conversion import ...`, which a temp-file
+        # import resolves against LLAMA_CPP_DEFAULT_DIR -- so when the user
+        # set UNSLOTH_LLAMA_CPP_SCRIPTS_DIR to a different checkout, the
+        # import would ModuleNotFoundError and abort the patcher before we
+        # could reach the AST-based arch extraction path.
+        _layout = _detect_converter_layout(original_content, _llama_cpp_dir)
+        logger.info(f"Unsloth: convert_hf_to_gguf layout detected: {_layout}")
         logger.info("Unsloth: Identifying llama.cpp gguf supported architectures...")
-        with tempfile.NamedTemporaryFile(
-            mode='wb', suffix=".py", prefix="original_gguf_", dir=LLAMA_CPP_DEFAULT_DIR, delete=False
-        ) as temp_file:
-            temp_original_file_path = temp_file.name
-            temp_file.write(original_content)
-            temp_file.flush()
-
-        logger.debug(f"Loading module from temporary file: {temp_original_file_path}")
-        original_module_name = f"convert_hf_to_gguf_{os.path.basename(temp_original_file_path).split('.')[0]}"
-
-        # Set NO_LOCAL_GGUF to prevent the script from adding path again
-        old_env = os.environ.get('NO_LOCAL_GGUF')
-        os.environ['NO_LOCAL_GGUF'] = '1'
-
-        try:
-            module = _load_module_from_path(temp_original_file_path, original_module_name)
-        finally:
-            # Restore environment
-            if old_env is None:
-                os.environ.pop('NO_LOCAL_GGUF', None)
-            else:
-                os.environ['NO_LOCAL_GGUF'] = old_env
 
-        # --- Extract Supported Architectures (TEXT and VISION) ---
-        ModelBase = getattr(module, 'ModelBase', None)
-        ModelType = getattr(module, 'ModelType', None)
-
-        if ModelBase is None or ModelType is None:
-            logger.warning(
-                f"Unsloth: Failed to find 'ModelBase' or 'ModelType' in the original downloaded script. "
-                f"Structure might have changed. Cannot determine supported architectures."
-            )
-        elif not hasattr(ModelBase, '_model_classes') or not isinstance(ModelBase._model_classes, dict):
-             logger.warning(
-                f"Unsloth: 'ModelBase._model_classes' not found or not a dictionary in original script."
-                 " Cannot determine supported architectures."
-            )
+        if _layout == "package":
+            # Package layout: archs come from AST-parsing the static
+            # TEXT_MODEL_MAP / MMPROJ_MODEL_MAP in conversion/__init__.py.
+            # No module import required, so we skip the temp-write entirely.
+            conv_init_py = os.path.join(_llama_cpp_dir, "conversion", "__init__.py")
+            text_archs   = _extract_dict_keys_from_conversion_init(conv_init_py, "TEXT_MODEL_MAP")
+            vision_archs = _extract_dict_keys_from_conversion_init(conv_init_py, "MMPROJ_MODEL_MAP")
+            supported_types.update(text_archs)
+            supported_types.update(vision_archs)
+            if not supported_types:
+                logger.warning(
+                    "Unsloth: conversion/__init__.py parsed but TEXT_MODEL_MAP / "
+                    "MMPROJ_MODEL_MAP yielded no architecture keys. The arch "
+                    "allowlist will be empty; conversion will still attempt to run."
+                )
         else:
-            # Check for TEXT models
-            if hasattr(ModelType, 'TEXT') and ModelType.TEXT in ModelBase._model_classes:
-                if isinstance(ModelBase._model_classes[ModelType.TEXT], dict):
-                    text_archs = set(ModelBase._model_classes[ModelType.TEXT].keys())
-                    supported_types.update(text_archs)
-                else:
-                    logger.warning("Unsloth: ModelBase._model_classes[ModelType.TEXT] is not a dictionary.")
-            else:
-                logger.info("Unsloth: No TEXT model architectures found registered in the original script.")
+            # Monolith layout: original behaviour. Write the entrypoint to a
+            # temp file under LLAMA_CPP_DEFAULT_DIR and import it to read
+            # ModelBase._model_classes.
+            with tempfile.NamedTemporaryFile(
+                mode='wb', suffix=".py", prefix="original_gguf_", dir=LLAMA_CPP_DEFAULT_DIR, delete=False
+            ) as temp_file:
+                temp_original_file_path = temp_file.name
+                temp_file.write(original_content)
+                temp_file.flush()
+
+            logger.debug(f"Loading module from temporary file: {temp_original_file_path}")
+            original_module_name = f"convert_hf_to_gguf_{os.path.basename(temp_original_file_path).split('.')[0]}"
+
+            # Set NO_LOCAL_GGUF to prevent the script from adding path again
+            old_env = os.environ.get('NO_LOCAL_GGUF')
+            os.environ['NO_LOCAL_GGUF'] = '1'
 
-            # Check for VISION models
-            if hasattr(ModelType, 'MMPROJ') and ModelType.MMPROJ in ModelBase._model_classes:
-                if isinstance(ModelBase._model_classes[ModelType.MMPROJ], dict):
-                    vision_archs = set(ModelBase._model_classes[ModelType.MMPROJ].keys())
-                    supported_types.update(vision_archs)
+            try:
+                module = _load_module_from_path(temp_original_file_path, original_module_name)
+            finally:
+                if old_env is None:
+                    os.environ.pop('NO_LOCAL_GGUF', None)
                 else:
-                    logger.warning("Unsloth: ModelBase._model_classes[ModelType.MMPROJ] is not a dictionary.")
+                    os.environ['NO_LOCAL_GGUF'] = old_env
+            ModelBase = getattr(module, 'ModelBase', None)
+            ModelType = getattr(module, 'ModelType', None)
+
+            if ModelBase is None or ModelType is None:
+                logger.warning(
+                    f"Unsloth: Failed to find 'ModelBase' or 'ModelType' in the original downloaded script. "
+                    f"Structure might have changed. Cannot determine supported architectures."
+                )
+            elif not hasattr(ModelBase, '_model_classes') or not isinstance(ModelBase._model_classes, dict):
+                 logger.warning(
+                    f"Unsloth: 'ModelBase._model_classes' not found or not a dictionary in original script."
+                     " Cannot determine supported architectures."
+                )
             else:
-                 logger.info("Unsloth: No VISION model architectures found registered in the original script.")
+                # Check for TEXT models
+                if hasattr(ModelType, 'TEXT') and ModelType.TEXT in ModelBase._model_classes:
+                    if isinstance(ModelBase._model_classes[ModelType.TEXT], dict):
+                        text_archs = set(ModelBase._model_classes[ModelType.TEXT].keys())
+                        supported_types.update(text_archs)
+                    else:
+                        logger.warning("Unsloth: ModelBase._model_classes[ModelType.TEXT] is not a dictionary.")
+                else:
+                    logger.info("Unsloth: No TEXT model architectures found registered in the original script.")
+
+                # Check for VISION models
+                if hasattr(ModelType, 'MMPROJ') and ModelType.MMPROJ in ModelBase._model_classes:
+                    if isinstance(ModelBase._model_classes[ModelType.MMPROJ], dict):
+                        vision_archs = set(ModelBase._model_classes[ModelType.MMPROJ].keys())
+                        supported_types.update(vision_archs)
+                    else:
+                        logger.warning("Unsloth: ModelBase._model_classes[ModelType.MMPROJ] is not a dictionary.")
+                else:
+                     logger.info("Unsloth: No VISION model architectures found registered in the original script.")
         # --- End Architecture Extraction ---
 
         # Convert final set to frozenset for immutability (good practice for cache keys/return values)
@@ -1047,8 +1226,8 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info):
                 f"Unsloth: No supported architectures (TEXT or VISION) could be determined from the original script."
             )
 
-        # Cleanup module reference
-        if original_module_name in sys.modules:
+        # Cleanup module reference (only set on the monolith branch)
+        if original_module_name is not None and original_module_name in sys.modules:
              del sys.modules[original_module_name]
 
     except Exception as e:
@@ -1085,41 +1264,73 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info):
 
 
 
-        # Patch 2: Metadata Branding
+        # Patch 2: Metadata Branding.
+        # Monolith: target lives in the entrypoint; patch the in-memory bytes.
+        # Package: target moved to conversion/base.py; patch that file in place
+        # (idempotent via _UNSLOTH_BRANDING_MARKER) since the entrypoint just
+        # imports ModelBase from it at runtime.
         try:
-            metadata_patch_applied = False
-            new_patched_content = re.sub(
-                rb"(self\.metadata \= gguf\.Metadata\.load\(.+?\))([\n\r]+([\s\t]{4,}))",
-                rb"\1\n"
-                rb"\3if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n"
-                rb"\3if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n"
-                rb"\3if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']\n"
-                rb"\2",
-                patched_content, count=1, flags=re.MULTILINE
-            )
-            if new_patched_content != patched_content: patched_content = new_patched_content; metadata_patch_applied = True
-            if not metadata_patch_applied:
-                 if re.search(rb"self\.metadata \= gguf\.Metadata\.load\(", patched_content): logger.warning("Unsloth: Metadata branding patch target found, but regex failed to apply.")
-                 else: logger.warning("Unsloth: Metadata branding patch target 'self.metadata = gguf.Metadata.load(...)' not found.")
+            if _layout == "package":
+                conv_base_py = os.path.join(_llama_cpp_dir, "conversion", "base.py")
+                _branding_status = _apply_branding_patch_to_base(conv_base_py)
+                if _branding_status == "applied":
+                    logger.info(f"Unsloth: Metadata branding patch applied to {conv_base_py}.")
+                elif _branding_status == "already-applied":
+                    logger.info(f"Unsloth: Metadata branding patch already present in {conv_base_py} (idempotent skip).")
+                else:
+                    logger.warning(
+                        f"Unsloth: Metadata branding patch target not found in {conv_base_py}. "
+                        f"Upstream may have refactored Metadata.load again."
+                    )
+            else:
+                metadata_patch_applied = False
+                new_patched_content = re.sub(
+                    rb"(self\.metadata \= gguf\.Metadata\.load\(.+?\))([\n\r]+([\s\t]{4,}))",
+                    rb"\1\n"
+                    rb"\3if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n"
+                    rb"\3if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n"
+                    rb"\3if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']\n"
+                    rb"\2",
+                    patched_content, count=1, flags=re.MULTILINE
+                )
+                if new_patched_content != patched_content: patched_content = new_patched_content; metadata_patch_applied = True
+                if not metadata_patch_applied:
+                     if re.search(rb"self\.metadata \= gguf\.Metadata\.load\(", patched_content): logger.warning("Unsloth: Metadata branding patch target found, but regex failed to apply.")
+                     else: logger.warning("Unsloth: Metadata branding patch target 'self.metadata = gguf.Metadata.load(...)' not found.")
         except Exception as e: logger.error(f"Unsloth: Error applying metadata branding patch: {e}", exc_info=True); raise
 
 
-        # Patch 3: Qwen2MoE/Qwen3MoE num_experts fix
+        # Patch 3: Qwen2MoE / Qwen3MoE num_experts fix.
+        # Package layout uses find_hparam(["num_local_experts", "num_experts"])
+        # already, so the legacy patch is obsolete and its warning misleading.
+        # Skip it (info-log) on new layout; run unchanged on monolith.
         try:
-            # Use a single regex to handle both quote styles
-            num_experts_pattern = rb'n_experts = self\.hparams\[(["\'])num_experts\1\]'
-            replacement = (
-                b"# Qwen3MoE seems to use num_local_experts instead of num_experts\n"
-                b"            n_experts = self.hparams.get('num_experts', None) or self.hparams.get('num_local_experts')"
-            )
+            _qwen_handled = False
+            if _layout == "package":
+                conv_qwen_py = os.path.join(_llama_cpp_dir, "conversion", "qwen.py")
+                if os.path.isfile(conv_qwen_py) and _qwen_already_handles_expert_aliases(conv_qwen_py):
+                    logger.info(
+                        "Unsloth: Qwen2MoE expert-key alias already handled upstream "
+                        "(conversion/qwen.py uses find_hparam([num_local_experts, num_experts])) "
+                        "-- legacy patch skipped."
+                    )
+                    _qwen_handled = True
+
+            if not _qwen_handled:
+                # Use a single regex to handle both quote styles
+                num_experts_pattern = rb'n_experts = self\.hparams\[(["\'])num_experts\1\]'
+                replacement = (
+                    b"# Qwen3MoE seems to use num_local_experts instead of num_experts\n"
+                    b"            n_experts = self.hparams.get('num_experts', None) or self.hparams.get('num_local_experts')"
+                )
 
-            new_patched_content = re.sub(num_experts_pattern, replacement, patched_content)
-            num_experts_patch_applied = (new_patched_content != patched_content)
+                new_patched_content = re.sub(num_experts_pattern, replacement, patched_content)
+                num_experts_patch_applied = (new_patched_content != patched_content)
 
-            if num_experts_patch_applied:
-                patched_content = new_patched_content
-            else:
-                logger.warning("Unsloth: Qwen2MoE num_experts patch target not found.")
+                if num_experts_patch_applied:
+                    patched_content = new_patched_content
+                else:
+                    logger.warning("Unsloth: Qwen2MoE num_experts patch target not found.")
 
         except Exception as e:
             logger.error(f"Unsloth: Error applying Qwen2MoE num_experts patch: {e}", exc_info=True)
@@ -1587,10 +1798,26 @@ def _quote(s):
         import shlex
         return shlex.quote(s)
 
-    command = f"{_quote(quantizer_location)} {_quote(input_gguf)} {_quote(output_gguf)} {quant_type} {n_threads}"
+    # Q2_K_L is an Unsloth preset (q2_k + q8_0 output / embedding tensors),
+    # not a native llama.cpp ftype. Expand here so every caller shares one path.
+    _display_quant_type = quant_type
+    _extra_flags = ""
+    if str(quant_type).strip().lower() == "q2_k_l":
+        _extra_flags = "--output-tensor-type q8_0 --token-embedding-type q8_0 "
+        quant_type = "q2_k"
+
+    command = (
+        f"{_quote(quantizer_location)} {_extra_flags}"
+        f"{_quote(input_gguf)} {_quote(output_gguf)} {quant_type} {n_threads}"
+    )
 
     if print_output:
-        print(f"Unsloth: Quantizing to {quant_type}...")
+        print(f"Unsloth: Quantizing to {_display_quant_type}...")
+        if _extra_flags:
+            print(
+                "Unsloth: Expanding Q2_K_L preset "
+                "(q2_k + --output-tensor-type q8_0 --token-embedding-type q8_0)."
+            )
 
     try:
         if print_output:
@@ -1603,7 +1830,7 @@ def _quote(s):
     except subprocess.CalledProcessError as e:
         if print_output and hasattr(e, 'stdout') and e.stdout:
             print(e.stdout)
-        raise RuntimeError(f"Failed to quantize {input_gguf} to {quant_type}: {e}")
+        raise RuntimeError(f"Failed to quantize {input_gguf} to {_display_quant_type}: {e}")
 
     # Verify output exists and get size using pathlib
     output_path = Path(output_gguf)