diff --git a/.github/workflows/studio-export-fix-ci.yml b/.github/workflows/studio-export-fix-ci.yml new file mode 100644 index 000000000..699b78d16 --- /dev/null +++ b/.github/workflows/studio-export-fix-ci.yml @@ -0,0 +1,62 @@ +name: studio-export-fix-ci + +on: + push: + branches: [main, nightly] + paths: + - "unsloth_zoo/llama_cpp.py" + - "tests/test_quantize_gguf_q2_k_l.py" + - "tests/test_convert_hf_to_gguf_patcher.py" + - ".github/workflows/studio-export-fix-ci.yml" + pull_request: + paths: + - "unsloth_zoo/llama_cpp.py" + - "tests/test_quantize_gguf_q2_k_l.py" + - "tests/test_convert_hf_to_gguf_patcher.py" + - ".github/workflows/studio-export-fix-ci.yml" + +concurrency: + group: studio-export-fix-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + studio-export-fix: + name: ${{ matrix.os }} + strategy: + fail-fast: false + # Cap matrix at 3 in flight so Windows stays under the repo-level + # 5-concurrent-Windows-runner limit when this job runs alongside others. + max-parallel: 3 + matrix: + os: [ubuntu-latest, macos-14, windows-latest] + runs-on: ${{ matrix.os }} + timeout-minutes: 15 + env: + # 5000/h vs 60/h on raw.githubusercontent.com for the live-upstream tests. + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + UNSLOTH_COMPILE_DISABLE: '1' + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install minimal test deps + run: | + python -m pip install --upgrade pip + # Pure-Python tests: monkeypatch subprocess + AST-parse upstream files. + # No torch / transformers needed. Keep slim so Windows cold start stays under a minute. 
+ python -m pip install pytest psutil requests tqdm + + - name: Run patcher + q2_k_l unit tests + shell: bash + run: | + pytest -v \ + tests/test_quantize_gguf_q2_k_l.py \ + tests/test_convert_hf_to_gguf_patcher.py diff --git a/tests/test_convert_hf_to_gguf_patcher.py b/tests/test_convert_hf_to_gguf_patcher.py new file mode 100644 index 000000000..04ff37827 --- /dev/null +++ b/tests/test_convert_hf_to_gguf_patcher.py @@ -0,0 +1,512 @@ +"""Tests for the layout-aware convert_hf_to_gguf.py patcher. + +Covers the helpers that distinguish upstream llama.cpp's old monolithic +convert_hf_to_gguf.py from the new conversion/ package layout, plus the +in-place branding patch on conversion/base.py and the Qwen2MoE-skip path. + +Two flavours: + + - synthetic_*: hand-crafted fixture trees that match the upstream layouts + structurally; no network. These are the load-bearing CI gates. + - latest_* : pulls the current files from raw.githubusercontent.com and + asserts the patcher still understands master. Skipped when offline. +""" + +from __future__ import annotations + +import importlib.util +import os +import sys +from pathlib import Path + +import pytest + + +def _load_llama_cpp_module(): + repo_root = Path(__file__).resolve().parents[1] + module_path = repo_root / "unsloth_zoo" / "llama_cpp.py" + spec = importlib.util.spec_from_file_location("llama_cpp_under_test_patcher", module_path) + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +# --- Synthetic fixtures matching upstream layouts --------------------------- + +# A minimal but realistic stand-in for the new `convert_hf_to_gguf.py` +# entrypoint. The structural anchor we detect on is `from conversion import`. 
+_PACKAGE_ENTRYPOINT = b"""\ +#!/usr/bin/env python3 +import argparse +import sys +from pathlib import Path + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +from conversion import ( + ModelBase, + ModelType, + get_model_architecture, + get_model_class, + logger, + print_registered_models, +) +""" + +# A minimal stand-in for conversion/base.py containing the canonical +# Metadata.load call site at 8-space indent (matches conversion/base.py:912). +_PACKAGE_BASE_PY = b"""\ +import gguf +from enum import IntEnum + + +class ModelType(IntEnum): + TEXT = 0 + MMPROJ = 1 + + +class ModelBase: + _model_classes = {ModelType.TEXT: {}, ModelType.MMPROJ: {}} + + def prepare_metadata(self, vocab_only): + total_params, shared_params, expert_params, expert_count = (0, 0, 0, 0) + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id +""" + +# A minimal stand-in for conversion/__init__.py with realistic TEXT_MODEL_MAP +# and MMPROJ_MODEL_MAP dict literals (matches __init__.py:19-231,234-283). +_PACKAGE_INIT_PY = b"""\ +from __future__ import annotations +from .base import ModelBase, ModelType + + +TEXT_MODEL_MAP: dict[str, str] = { + "LlamaForCausalLM": "llama", + "MistralForCausalLM": "llama", + "Qwen3ForCausalLM": "qwen", + "Qwen2MoeForCausalLM": "qwen", + "Qwen3MoeForCausalLM": "qwen", + "Gemma3ForCausalLM": "gemma", +} + + +MMPROJ_MODEL_MAP: dict[str, str] = { + "LlavaForConditionalGeneration": "llava", + "Gemma3ForConditionalGeneration": "gemma", +} + + +def load_all_models() -> None: + pass + + +def get_model_class(name, mmproj=False): + return ModelBase +""" + +# Stand-in for the new conversion/qwen.py: contains both expert-key literals +# in the same find_hparam call (upstream already handles the alias). 
+_PACKAGE_QWEN_PY = b"""\ +from .base import ModelBase + + +class Qwen2MoeModel(ModelBase): + def set_gguf_parameters(self): + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + return n_experts +""" + +# A minimal stand-in for the OLD monolithic convert_hf_to_gguf.py. Note: NO +# `from conversion import` anywhere; that is the structural anchor for layout +# detection. ModelBase and ModelType are defined inline. +_MONOLITH = b"""\ +import argparse +import gguf +from enum import IntEnum + + +class ModelType(IntEnum): + TEXT = 0 + MMPROJ = 1 + + +class ModelBase: + _model_classes = {ModelType.TEXT: {"LlamaForCausalLM": object}, ModelType.MMPROJ: {}} + + def prepare_metadata(self): + self.metadata = gguf.Metadata.load(override, card, name, params) + + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--outfile", default=None) +""" + + +@pytest.fixture +def package_layout(tmp_path): + """Build a synthetic new-layout llama.cpp tree on disk and return its root.""" + root = tmp_path / "llama.cpp" + root.mkdir() + (root / "convert_hf_to_gguf.py").write_bytes(_PACKAGE_ENTRYPOINT) + conv = root / "conversion" + conv.mkdir() + (conv / "__init__.py").write_bytes(_PACKAGE_INIT_PY) + (conv / "base.py").write_bytes(_PACKAGE_BASE_PY) + (conv / "qwen.py").write_bytes(_PACKAGE_QWEN_PY) + return root + + +@pytest.fixture +def monolith_layout(tmp_path): + """Build a synthetic old-layout llama.cpp tree on disk and return its root.""" + root = tmp_path / "llama.cpp" + root.mkdir() + (root / "convert_hf_to_gguf.py").write_bytes(_MONOLITH) + return root + + +# --- Layout detection ------------------------------------------------------- + + +def test_detect_layout_returns_package_for_new_tree(package_layout): + llama_cpp = _load_llama_cpp_module() + entry_bytes = (package_layout / "convert_hf_to_gguf.py").read_bytes() + assert 
llama_cpp._detect_converter_layout(entry_bytes, str(package_layout)) == "package" + + +def test_detect_layout_returns_monolith_for_old_tree(monolith_layout): + llama_cpp = _load_llama_cpp_module() + entry_bytes = (monolith_layout / "convert_hf_to_gguf.py").read_bytes() + assert llama_cpp._detect_converter_layout(entry_bytes, str(monolith_layout)) == "monolith" + + +def test_detect_layout_falls_back_to_monolith_when_conversion_dir_missing(tmp_path): + """Entrypoint has the `from conversion import` anchor but the package dir is + absent on disk -> treat as monolith (defensive).""" + llama_cpp = _load_llama_cpp_module() + assert llama_cpp._detect_converter_layout(_PACKAGE_ENTRYPOINT, str(tmp_path)) == "monolith" + + +# --- Arch enumeration from conversion/__init__.py --------------------------- + + +def test_extract_text_model_map_keys(package_layout): + llama_cpp = _load_llama_cpp_module() + init_py = package_layout / "conversion" / "__init__.py" + keys = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "TEXT_MODEL_MAP") + assert {"LlamaForCausalLM", "Qwen3ForCausalLM", "Gemma3ForCausalLM"} <= keys + assert "Qwen2MoeForCausalLM" in keys + + +def test_extract_mmproj_model_map_keys(package_layout): + llama_cpp = _load_llama_cpp_module() + init_py = package_layout / "conversion" / "__init__.py" + keys = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "MMPROJ_MODEL_MAP") + assert "LlavaForConditionalGeneration" in keys + assert "Gemma3ForConditionalGeneration" in keys + + +def test_extract_returns_empty_for_missing_dict(package_layout): + llama_cpp = _load_llama_cpp_module() + init_py = package_layout / "conversion" / "__init__.py" + keys = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "NON_EXISTENT_MAP") + assert keys == set() + + +def test_extract_returns_empty_for_unparseable_file(tmp_path): + """If conversion/__init__.py is missing or unparseable, we get an empty set + rather than raising — patcher then warns but does 
not abort.""" + llama_cpp = _load_llama_cpp_module() + assert llama_cpp._extract_dict_keys_from_conversion_init(str(tmp_path / "nope.py"), "TEXT_MODEL_MAP") == set() + + +# --- Branding patch on conversion/base.py ----------------------------------- + + +def test_branding_patch_applies_and_is_idempotent(package_layout): + llama_cpp = _load_llama_cpp_module() + base_py = package_layout / "conversion" / "base.py" + + # First call: applies. + assert llama_cpp._apply_branding_patch_to_base(str(base_py)) == "applied" + content = base_py.read_bytes() + assert b"# UNSLOTH_BRANDING_APPLIED" in content + assert b"self.metadata.quantized_by = 'Unsloth'" in content + assert b"self.metadata.repo_url = 'https://huggingface.co/unsloth'" in content + assert b"self.metadata.tags = ['unsloth', 'llama.cpp']" in content + + # Second call: no-op (idempotent). + assert llama_cpp._apply_branding_patch_to_base(str(base_py)) == "already-applied" + # File contents should be unchanged after the second call. + assert base_py.read_bytes() == content + + +def test_branding_patch_pattern_missing_when_metadata_load_absent(tmp_path): + """A conversion/base.py without the Metadata.load call returns 'pattern-missing'.""" + llama_cpp = _load_llama_cpp_module() + base_py = tmp_path / "base.py" + base_py.write_bytes(b"# completely different file content\n") + assert llama_cpp._apply_branding_patch_to_base(str(base_py)) == "pattern-missing" + + +def test_branding_patch_preserves_lines_around_target(package_layout): + llama_cpp = _load_llama_cpp_module() + base_py = package_layout / "conversion" / "base.py" + original = base_py.read_bytes() + llama_cpp._apply_branding_patch_to_base(str(base_py)) + patched = base_py.read_bytes() + + # The Metadata.load line itself is preserved verbatim. + assert b"self.metadata = gguf.Metadata.load(" in patched + # Code that followed the target (the if self.remote_hf_model_id... block) + # is still present after the patch (we only inserted lines, not deleted). 
+ assert b"if self.remote_hf_model_id:" in patched + assert b"self.metadata.name = self.remote_hf_model_id" in patched + # File grew (we added 4 branding lines + marker), not shrank. + assert len(patched) > len(original) + + +# --- Qwen expert-key alias detection --------------------------------------- + + +def test_qwen_aliases_detected_when_both_keys_present(package_layout): + llama_cpp = _load_llama_cpp_module() + qwen_py = package_layout / "conversion" / "qwen.py" + assert llama_cpp._qwen_already_handles_expert_aliases(str(qwen_py)) is True + + +def test_qwen_aliases_not_detected_when_only_one_key_present(tmp_path): + llama_cpp = _load_llama_cpp_module() + qwen_py = tmp_path / "qwen.py" + qwen_py.write_bytes(b'n = self.hparams["num_experts"]\n') # only num_experts + assert llama_cpp._qwen_already_handles_expert_aliases(str(qwen_py)) is False + + +# --- Cache-key invalidation (sibling info) --------------------------------- + + +def test_conversion_sibling_info_changes_when_base_py_changes(package_layout): + llama_cpp = _load_llama_cpp_module() + info_before = llama_cpp._conversion_sibling_info(str(package_layout)) + assert info_before is not None + + # Touch base.py with new content (mtime + size both change). 
+ base_py = package_layout / "conversion" / "base.py" + base_py.write_bytes(base_py.read_bytes() + b"\n# extra trailing comment\n") + + info_after = llama_cpp._conversion_sibling_info(str(package_layout)) + assert info_after is not None + assert info_after != info_before, ( + "_conversion_sibling_info must change when conversion/base.py changes, " + "so the @lru_cache(1) entry is invalidated" + ) + + +def test_conversion_sibling_info_none_for_monolith(monolith_layout): + llama_cpp = _load_llama_cpp_module() + assert llama_cpp._conversion_sibling_info(str(monolith_layout)) is None + + +# --- _get_llama_cpp_dir resolution (addresses PR #667 review) --------------- + + +def test_llama_cpp_dir_defaults_when_no_local_script(): + llama_cpp = _load_llama_cpp_module() + assert llama_cpp._get_llama_cpp_dir(None) == llama_cpp.LLAMA_CPP_DEFAULT_DIR + + +def test_llama_cpp_dir_resolves_to_source_dir_when_local_script_set(tmp_path): + """UNSLOTH_LLAMA_CPP_SCRIPTS_DIR override: the patcher must operate + against the directory containing the selected converter, not the + hard-coded default. Mirrors `_resolve_local_convert_script`'s 3-tuple + return shape `(abs_path, mtime_ns, size)`.""" + llama_cpp = _load_llama_cpp_module() + custom = tmp_path / "custom_llama_cpp" + custom.mkdir() + src = custom / "convert_hf_to_gguf.py" + src.write_bytes(b"# placeholder\n") + local_info = (str(src), src.stat().st_mtime_ns, src.stat().st_size) + assert llama_cpp._get_llama_cpp_dir(local_info) == str(custom) + + +def test_package_layout_does_not_require_module_import(tmp_path, monkeypatch): + """Regression for Codex P1 on 3a9a23c: when UNSLOTH_LLAMA_CPP_SCRIPTS_DIR + points at a package-layout checkout, the patcher must NOT call + `_load_module_from_path` on the entrypoint. 
Importing it would resolve + `from conversion import ...` against LLAMA_CPP_DEFAULT_DIR (a different + dir than the override) and raise ModuleNotFoundError, aborting the + patcher before AST arch extraction + branding could run. + + We assert the contract by replacing `_load_module_from_path` with a + sentinel that fails the test if called, then driving the patcher end- + to-end with `UNSLOTH_LLAMA_CPP_SCRIPTS_DIR` set.""" + llama_cpp = _load_llama_cpp_module() + + # Build a custom package-layout checkout in tmp_path. We extend the + # shared fixture with a parse_args() stub so the end-to-end pipeline can + # finish its flag-parsing step on the patched file (the real upstream + # entrypoint has these calls; the shared fixture omits them because no + # other test exercises the full pipeline). + entry_with_args = _PACKAGE_ENTRYPOINT + ( + b"\n" + b"def parse_args():\n" + b" parser = argparse.ArgumentParser()\n" + b" parser.add_argument(\"model\")\n" + b" parser.add_argument(\"--outfile\", default=None)\n" + b" parser.add_argument(\"--outtype\", default=\"f16\")\n" + b" parser.add_argument(\"--vocab-only\", action=\"store_true\")\n" + b" return parser.parse_args()\n" + ) + root = tmp_path / "custom_llama_cpp" + root.mkdir() + (root / "convert_hf_to_gguf.py").write_bytes(entry_with_args) + conv = root / "conversion" + conv.mkdir() + (conv / "__init__.py").write_bytes(_PACKAGE_INIT_PY) + (conv / "base.py").write_bytes(_PACKAGE_BASE_PY) + (conv / "qwen.py").write_bytes(_PACKAGE_QWEN_PY) + + monkeypatch.setenv("UNSLOTH_LLAMA_CPP_SCRIPTS_DIR", str(root)) + + # Sentinel: any call here means the patcher fell through to the + # module-load path on package layout, which is the bug we're guarding. 
+ called = {"hit": False} + def _trap(*a, **kw): + called["hit"] = True + raise AssertionError("monolith-only _load_module_from_path called on package layout") + monkeypatch.setattr(llama_cpp, "_load_module_from_path", _trap) + + # Cache must be cleared between runs because @lru_cache(1) keys include + # the resolved local_script_info -- but a stale entry from a previous + # test would short-circuit the new call. + llama_cpp._download_convert_hf_to_gguf_cached.cache_clear() + + patched_path, text_archs, vision_archs = llama_cpp._download_convert_hf_to_gguf("regression_no_module_import") + + assert called["hit"] is False + assert patched_path.endswith(".py") + assert "LlamaForCausalLM" in text_archs + assert text_archs == frozenset(text_archs) + # base.py was patched in place under the override dir. + assert b"# UNSLOTH_BRANDING_APPLIED" in (conv / "base.py").read_bytes() + # Cleanup for follow-on tests. + llama_cpp._download_convert_hf_to_gguf_cached.cache_clear() + + +def test_patcher_anchors_on_custom_dir_when_override_set(tmp_path): + """Build a custom llama.cpp tree with the new package layout in a temp + dir, point a synthetic local_script_info at it, and confirm sibling + info + layout detection target THAT dir, not the hardcoded default.""" + llama_cpp = _load_llama_cpp_module() + root = tmp_path / "custom_llama_cpp" + root.mkdir() + (root / "convert_hf_to_gguf.py").write_bytes(_PACKAGE_ENTRYPOINT) + conv = root / "conversion" + conv.mkdir() + (conv / "__init__.py").write_bytes(_PACKAGE_INIT_PY) + (conv / "base.py").write_bytes(_PACKAGE_BASE_PY) + (conv / "qwen.py").write_bytes(_PACKAGE_QWEN_PY) + + local_info = ( + str(root / "convert_hf_to_gguf.py"), + (root / "convert_hf_to_gguf.py").stat().st_mtime_ns, + (root / "convert_hf_to_gguf.py").stat().st_size, + ) + resolved = llama_cpp._get_llama_cpp_dir(local_info) + assert resolved == str(root) + sib = llama_cpp._conversion_sibling_info(resolved) + assert sib is not None + assert sib[1][0] == str(conv / 
"base.py") # base.py path in sibling tuple + layout = llama_cpp._detect_converter_layout(_PACKAGE_ENTRYPOINT, resolved) + assert layout == "package" + + +# --- Network smoke against current upstream llama.cpp ---------------------- + + +@pytest.fixture +def latest_llama_cpp(tmp_path): + """Fetch the current convert_hf_to_gguf.py + conversion/{__init__,base,qwen}.py + from raw.githubusercontent.com. Skips the test cleanly when offline or rate- + limited (raw.githubusercontent.com is documented at 60 req/hour unauthed).""" + requests = pytest.importorskip("requests") + root = tmp_path / "llama.cpp" + (root / "conversion").mkdir(parents=True) + base_url = "https://raw.githubusercontent.com/ggml-org/llama.cpp/master/" + files = { + "convert_hf_to_gguf.py": root / "convert_hf_to_gguf.py", + "conversion/__init__.py": root / "conversion" / "__init__.py", + "conversion/base.py": root / "conversion" / "base.py", + "conversion/qwen.py": root / "conversion" / "qwen.py", + } + headers = {} + if os.environ.get("GITHUB_TOKEN"): + headers["Authorization"] = f"Bearer {os.environ['GITHUB_TOKEN']}" + for rel, dest in files.items(): + try: + r = requests.get(base_url + rel, timeout=15, headers=headers) + except requests.exceptions.RequestException as exc: + pytest.skip(f"network unreachable: {exc}") + if r.status_code in (403, 429, 503): + pytest.skip(f"upstream rate-limited / unavailable: HTTP {r.status_code}") + if r.status_code != 200: + pytest.skip(f"upstream missing {rel}: HTTP {r.status_code}") + dest.write_bytes(r.content) + return root + + +def test_latest_upstream_detected_as_package_layout(latest_llama_cpp): + llama_cpp = _load_llama_cpp_module() + entry_bytes = (latest_llama_cpp / "convert_hf_to_gguf.py").read_bytes() + layout = llama_cpp._detect_converter_layout(entry_bytes, str(latest_llama_cpp)) + assert layout == "package", "current llama.cpp master should match the new layout" + + +def test_latest_upstream_branding_patch_applies(latest_llama_cpp): + """Against the 
live upstream conversion/base.py, the branding regex must + still match. If upstream changes the indentation or arguments of + Metadata.load, this test fails fast so we can update the regex.""" + llama_cpp = _load_llama_cpp_module() + base_py = latest_llama_cpp / "conversion" / "base.py" + status = llama_cpp._apply_branding_patch_to_base(str(base_py)) + assert status == "applied", f"branding patch did not apply to upstream base.py: {status}" + content = base_py.read_bytes() + assert b"# UNSLOTH_BRANDING_APPLIED" in content + assert b"self.metadata.quantized_by = 'Unsloth'" in content + + +def test_latest_upstream_qwen_already_handles_aliases(latest_llama_cpp): + """Upstream Qwen module is expected to call find_hparam with both keys.""" + llama_cpp = _load_llama_cpp_module() + qwen_py = latest_llama_cpp / "conversion" / "qwen.py" + if not qwen_py.exists(): + pytest.skip("upstream conversion/qwen.py absent") + assert llama_cpp._qwen_already_handles_expert_aliases(str(qwen_py)) is True + + +def test_latest_upstream_arch_enumeration_non_empty(latest_llama_cpp): + """TEXT_MODEL_MAP in upstream conversion/__init__.py must produce a non-empty + architecture allowlist. This is the assertion that would have caught the + original 'No supported architectures' warning if it had been a test.""" + llama_cpp = _load_llama_cpp_module() + init_py = latest_llama_cpp / "conversion" / "__init__.py" + text_archs = llama_cpp._extract_dict_keys_from_conversion_init(str(init_py), "TEXT_MODEL_MAP") + assert "LlamaForCausalLM" in text_archs, ( + f"upstream TEXT_MODEL_MAP missing LlamaForCausalLM; " + f"got {sorted(text_archs)[:10]}..." + ) + # The set should also contain at least some Qwen entries since this is the + # user's reported architecture family. + qwen_keys = {k for k in text_archs if k.startswith("Qwen")} + assert qwen_keys, f"upstream TEXT_MODEL_MAP has no Qwen* entries: {sorted(text_archs)[:20]}..." 
diff --git a/tests/test_quantize_gguf_q2_k_l.py b/tests/test_quantize_gguf_q2_k_l.py new file mode 100644 index 000000000..c6164ec44 --- /dev/null +++ b/tests/test_quantize_gguf_q2_k_l.py @@ -0,0 +1,186 @@ +"""Q2_K_L preset dispatch inside quantize_gguf. + +Q2_K_L is an Unsloth-side preset, not a native llama.cpp ftype. It maps to +``llama-quantize --output-tensor-type q8_0 --token-embedding-type q8_0 IN OUT +q2_k NTHREADS``. Before this fix, the MLX/Studio export path forwarded the raw +``q2_k_l`` string to llama-quantize which then aborted with +``main: invalid ftype 'q2_k_l'``. +""" + +from __future__ import annotations + +import importlib.util +import subprocess +import sys +from pathlib import Path +from types import SimpleNamespace + + +def _load_llama_cpp_module(): + repo_root = Path(__file__).resolve().parents[1] + module_path = repo_root / "unsloth_zoo" / "llama_cpp.py" + spec = importlib.util.spec_from_file_location("llama_cpp_under_test", module_path) + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def _install_fake_subprocess_run(monkeypatch, llama_cpp): + """Replace subprocess.run with a capturing fake (no real binary invoked).""" + + captured: dict[str, object] = {} + + def fake_run(cmd, *args, **kwargs): + captured["cmd"] = cmd + captured["args"] = args + captured["kwargs"] = kwargs + return SimpleNamespace(stdout="ok", returncode=0) + + monkeypatch.setattr(subprocess, "run", fake_run) + monkeypatch.setattr(llama_cpp.subprocess, "run", fake_run) + return captured + + +def _stub_output_exists(monkeypatch): + """Pretend the output file was produced (no real quantization happened).""" + + monkeypatch.setattr(Path, "exists", lambda self: True) + monkeypatch.setattr(Path, "stat", lambda self: SimpleNamespace(st_size=4096)) + + +def test_q2_k_l_expands_to_q2_k_with_output_and_embedding_q8_0(monkeypatch): + llama_cpp = _load_llama_cpp_module() + captured = 
_install_fake_subprocess_run(monkeypatch, llama_cpp) + _stub_output_exists(monkeypatch) + + llama_cpp.quantize_gguf( + input_gguf="/tmp/in.gguf", + output_gguf="/tmp/out.gguf", + quant_type="q2_k_l", + quantizer_location="/usr/bin/llama-quantize", + n_threads=4, + print_output=False, + ) + + cmd = captured["cmd"] + assert isinstance(cmd, str), f"command should be a shell string (existing convention); got {type(cmd)!r}" + # The literal preset name must NOT reach llama-quantize. + assert "q2_k_l" not in cmd, f"q2_k_l leaked into llama-quantize command: {cmd!r}" + # The expanded ftype must appear, as a standalone token. + assert " q2_k " in cmd, f"q2_k token missing: {cmd!r}" + # Both preset flags must appear, in either order. + assert "--output-tensor-type q8_0" in cmd, f"--output-tensor-type q8_0 missing: {cmd!r}" + assert "--token-embedding-type q8_0" in cmd, f"--token-embedding-type q8_0 missing: {cmd!r}" + # Sanity: input/output paths and thread count are still present. + assert "/tmp/in.gguf" in cmd + assert "/tmp/out.gguf" in cmd + assert " 4" in cmd, f"n_threads missing: {cmd!r}" + + +def test_q2_k_l_is_case_insensitive(monkeypatch): + """Studio frontend may send Q2_K_L / Q2_k_L / etc. Treat them identically.""" + + llama_cpp = _load_llama_cpp_module() + captured = _install_fake_subprocess_run(monkeypatch, llama_cpp) + _stub_output_exists(monkeypatch) + + for variant in ("Q2_K_L", "q2_K_L", " q2_k_l "): + captured.clear() + llama_cpp.quantize_gguf( + input_gguf="/tmp/in.gguf", + output_gguf="/tmp/out.gguf", + quant_type=variant, + quantizer_location="/usr/bin/llama-quantize", + n_threads=4, + print_output=False, + ) + cmd = captured["cmd"] + assert " q2_k " in cmd, f"variant {variant!r}: expansion missing: {cmd!r}" + assert "--output-tensor-type q8_0" in cmd + assert "--token-embedding-type q8_0" in cmd + + +def test_other_quant_types_are_untouched(monkeypatch): + """Non-preset ftypes must traverse the original code path byte-for-byte. 
+ + Linux + Windows non-regression: ensures the q2_k_l branch does not affect + any other ftype. q3_k_l is a real llama.cpp ftype distinct from q2_k_l and + must be passed through verbatim. + """ + + llama_cpp = _load_llama_cpp_module() + captured = _install_fake_subprocess_run(monkeypatch, llama_cpp) + _stub_output_exists(monkeypatch) + + for ftype in ( + "q2_k", "q3_k_s", "q3_k_m", "q3_k_l", # q3_k_l is a real ftype, NOT a preset + "q4_0", "q4_1", "q4_k_s", "q4_k_m", + "q5_0", "q5_1", "q5_k_s", "q5_k_m", + "q6_k", "q8_0", "bf16", "f16", "f32", + ): + captured.clear() + llama_cpp.quantize_gguf( + input_gguf="/tmp/in.gguf", + output_gguf="/tmp/out.gguf", + quant_type=ftype, + quantizer_location="/usr/bin/llama-quantize", + n_threads=4, + print_output=False, + ) + cmd = captured["cmd"] + assert f" {ftype} " in cmd, f"ftype {ftype!r} not preserved: {cmd!r}" + assert "--output-tensor-type" not in cmd, ( + f"ftype {ftype!r} accidentally picked up preset flags: {cmd!r}" + ) + assert "--token-embedding-type" not in cmd, ( + f"ftype {ftype!r} accidentally picked up preset flags: {cmd!r}" + ) + + +def test_q2_k_l_print_output_path_logs_preset_expansion(capsys, monkeypatch): + """When print_output=True the user sees both the original request and the expansion.""" + + llama_cpp = _load_llama_cpp_module() + _install_fake_subprocess_run(monkeypatch, llama_cpp) + _stub_output_exists(monkeypatch) + + llama_cpp.quantize_gguf( + input_gguf="/tmp/in.gguf", + output_gguf="/tmp/out.gguf", + quant_type="q2_k_l", + quantizer_location="/usr/bin/llama-quantize", + n_threads=4, + print_output=True, + ) + + out = capsys.readouterr().out + assert "Quantizing to q2_k_l" in out, out + assert "Expanding Q2_K_L preset" in out, out + + +def test_q2_k_l_error_message_keeps_original_preset_name(monkeypatch): + """If llama-quantize fails, the RuntimeError should mention q2_k_l (what the + user asked for) rather than q2_k (the rewritten internal ftype).""" + + llama_cpp = _load_llama_cpp_module() + 
# One-line marker written next to the injected branding statements; its
# presence anywhere in conversion/base.py means the patch was already applied.
_UNSLOTH_BRANDING_MARKER = b"# UNSLOTH_BRANDING_APPLIED"
# group 1: the `self.metadata = gguf.Metadata.load(...)` call (single line,
#          because `.` does not cross `\n`),
# group 2: the line break(s) plus the leading whitespace of the next statement,
# group 3: that leading whitespace alone, reused to indent the inserted lines.
_BRANDING_PATTERN = re.compile(
    rb"(self\.metadata \= gguf\.Metadata\.load\(.+?\))([\n\r]+([\s\t]{4,}))",
    flags = re.MULTILINE,
)


def _get_llama_cpp_dir(local_script_info):
    """Resolve the directory holding the converter being patched.

    `local_script_info` is either None or a tuple whose first element is the
    path of a locally supplied convert script (the tests in this repo describe
    it as `(abs_path, mtime_ns, size)`). When it is given, the directory of
    that script is returned; otherwise LLAMA_CPP_DEFAULT_DIR is used.
    This is the single anchor for layout detection, the branding patch, the
    Qwen check, and the sibling-info cache key."""
    if local_script_info is None:
        return LLAMA_CPP_DEFAULT_DIR
    return os.path.dirname(local_script_info[0])
pass


def _conversion_sibling_info(llama_cpp_dir):
    """Return hashable stat info for conversion/{__init__,base,qwen}.py.

    The result is a 3-tuple `(init_info, base_info, qwen_info)` where each
    entry is `(path, st_mtime_ns, st_size)`; `qwen_info` is None when
    conversion/qwen.py does not exist. Returns None when either
    conversion/__init__.py or conversion/base.py is missing (monolithic
    layout). Callers fold the value into a cache key so that a changed
    checkout produces a different key."""
    conv_dir = os.path.join(llama_cpp_dir, "conversion")
    init_py = os.path.join(conv_dir, "__init__.py")
    base_py = os.path.join(conv_dir, "base.py")
    qwen_py = os.path.join(conv_dir, "qwen.py")

    if not os.path.isfile(init_py) or not os.path.isfile(base_py):
        return None

    def _describe(path):
        # A file that vanishes between the isfile() check and stat() still
        # yields a stable, hashable placeholder instead of raising.
        try:
            st = os.stat(path)
        except OSError:
            return (path, 0, 0)
        return (path, st.st_mtime_ns, st.st_size)

    qwen_info = _describe(qwen_py) if os.path.isfile(qwen_py) else None
    return (_describe(init_py), _describe(base_py), qwen_info)
pass


def _detect_converter_layout(entry_content_bytes, llama_cpp_dir):
    """Return 'package' for the new conversion/ package layout, else 'monolith'.

    Detection is structural: the entrypoint content must contain
    `from conversion import` AND both conversion/__init__.py and
    conversion/base.py must exist under `llama_cpp_dir`.

    `entry_content_bytes` is normally bytes; a `str` is accepted as well and
    encoded first, so text content is not silently misdetected as monolith."""
    try:
        content = entry_content_bytes
        if isinstance(content, str):
            # bytes-in-str raises TypeError, which the best-effort handler
            # below would turn into a wrong 'monolith' answer.
            content = content.encode("utf-8", errors = "replace")
        if b"from conversion import" not in content:
            return "monolith"
        conv_dir = os.path.join(llama_cpp_dir, "conversion")
        required = ("__init__.py", "base.py")
        if all(os.path.isfile(os.path.join(conv_dir, fname)) for fname in required):
            return "package"
    except Exception:
        # Detection is deliberately best-effort: on any unusable argument or
        # I/O problem fall back to monolith so the legacy regex patches still run.
        pass
    return "monolith"
pass


def _extract_dict_keys_from_conversion_init(conv_init_path, dict_name):
    """Collect the string-literal keys of a dict assigned to `dict_name`.

    The file at `conv_init_path` is parsed with `ast` (never imported or
    executed). Both plain (`NAME = {...}`) and annotated
    (`NAME: T = {...}`) assignments are recognised, anywhere in the tree.
    Returns a set of key strings; an empty set when the file cannot be read
    or parsed, or when no matching dict literal exists."""
    try:
        with open(conv_init_path, "rb") as f:
            tree = ast.parse(f.read())
    except Exception:
        # Missing or unparseable file: report "no keys" rather than raising,
        # so the caller can decide how to react.
        return set()

    keys = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, ast.AnnAssign):
            targets = [node.target]
        else:
            continue
        # AnnAssign may have no value at all; only dict literals are harvested.
        if not isinstance(node.value, ast.Dict):
            continue
        if not any(isinstance(t, ast.Name) and t.id == dict_name for t in targets):
            continue
        keys.update(
            key.value
            for key in node.value.keys
            if isinstance(key, ast.Constant) and isinstance(key.value, str)
        )
    return keys
pass


def _apply_branding_patch_to_base(conv_base_path):
    """Insert Unsloth metadata branding after the first
    `self.metadata = gguf.Metadata.load(...)` statement in `conv_base_path`.

    The inserted lines are indented like the statement that follows the call
    and use the same line-ending style (LF or CRLF) as the matched text, so a
    CRLF checkout does not end up with mixed line endings. The file is
    rewritten in place. Idempotent via a one-line marker.

    Returns:
        'applied'          the file was rewritten with the branding lines,
        'already-applied'  the marker is already present (file untouched),
        'pattern-missing'  the call site was not found; the same status is
                           also returned when the file cannot be read or
                           written."""
    try:
        with open(conv_base_path, "rb") as f:
            content = f.read()
    except OSError:
        return "pattern-missing"
    if _UNSLOTH_BRANDING_MARKER in content:
        return "already-applied"

    branding_lines = (
        _UNSLOTH_BRANDING_MARKER,
        b"if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'",
        b"if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'",
        b"if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']",
    )

    def _replace(match):
        load_call = match.group(1)
        suffix = match.group(2)  # line break(s) + indent of the next statement
        indent = match.group(3)
        # Follow the file's own newline convention for the inserted lines.
        newline = b"\r\n" if suffix.startswith(b"\r\n") else b"\n"
        inserted = b"".join(newline + indent + line for line in branding_lines)
        return load_call + inserted + suffix

    new_content, n = _BRANDING_PATTERN.subn(_replace, content, count = 1)
    if n == 0:
        return "pattern-missing"
    try:
        with open(conv_base_path, "wb") as f:
            f.write(new_content)
    except OSError:
        # Kept within the documented three-value status vocabulary.
        return "pattern-missing"
    return "applied"
pass
def _qwen_already_handles_expert_aliases(conv_qwen_path):
    """Tell whether conversion/qwen.py mentions both expert-count hparam keys.

    The file is read as raw bytes and searched for the two substrings
    `num_local_experts` and `num_experts`. Finding both is taken as a sign
    that upstream already resolves either key, e.g. via
        self.find_hparam(["num_local_experts", "num_experts"])
    in which case the legacy num_experts patch has nothing to do and its
    "target not found" warning would only mislead.

    Returns False when either key is absent or the file cannot be read.
    """
    required_keys = (b"num_local_experts", b"num_experts")
    try:
        with open(conv_qwen_path, "rb") as handle:
            source_bytes = handle.read()
    except OSError:
        # Unreadable / missing file: report "not handled" so the caller
        # falls back to the legacy patch path.
        return False
    return all(key in source_bytes for key in required_keys)
pass
Default to + # 'monolith' so a failed introspection still drives the legacy patches. + _layout = "monolith" + # Resolve once: same dir feeds layout detection, branding patch, Qwen + # check, sibling cache key. UNSLOTH_LLAMA_CPP_SCRIPTS_DIR overrides default. + _llama_cpp_dir = _get_llama_cpp_dir(_local_script_info) _local_script = _local_script_info[0] if _local_script_info is not None else None @@ -976,65 +1131,89 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info): if original_content is None: raise _last_err # type: ignore[misc] - # 2. Introspect Original Script for Supported Architectures + # 2. Detect layout BEFORE attempting to import. The package-layout + # entrypoint does `from conversion import ...`, which a temp-file + # import resolves against LLAMA_CPP_DEFAULT_DIR -- so when the user + # set UNSLOTH_LLAMA_CPP_SCRIPTS_DIR to a different checkout, the + # import would ModuleNotFoundError and abort the patcher before we + # could reach the AST-based arch extraction path. 
+ _layout = _detect_converter_layout(original_content, _llama_cpp_dir) + logger.info(f"Unsloth: convert_hf_to_gguf layout detected: {_layout}") logger.info("Unsloth: Identifying llama.cpp gguf supported architectures...") - with tempfile.NamedTemporaryFile( - mode='wb', suffix=".py", prefix="original_gguf_", dir=LLAMA_CPP_DEFAULT_DIR, delete=False - ) as temp_file: - temp_original_file_path = temp_file.name - temp_file.write(original_content) - temp_file.flush() - - logger.debug(f"Loading module from temporary file: {temp_original_file_path}") - original_module_name = f"convert_hf_to_gguf_{os.path.basename(temp_original_file_path).split('.')[0]}" - - # Set NO_LOCAL_GGUF to prevent the script from adding path again - old_env = os.environ.get('NO_LOCAL_GGUF') - os.environ['NO_LOCAL_GGUF'] = '1' - - try: - module = _load_module_from_path(temp_original_file_path, original_module_name) - finally: - # Restore environment - if old_env is None: - os.environ.pop('NO_LOCAL_GGUF', None) - else: - os.environ['NO_LOCAL_GGUF'] = old_env - # --- Extract Supported Architectures (TEXT and VISION) --- - ModelBase = getattr(module, 'ModelBase', None) - ModelType = getattr(module, 'ModelType', None) - - if ModelBase is None or ModelType is None: - logger.warning( - f"Unsloth: Failed to find 'ModelBase' or 'ModelType' in the original downloaded script. " - f"Structure might have changed. Cannot determine supported architectures." - ) - elif not hasattr(ModelBase, '_model_classes') or not isinstance(ModelBase._model_classes, dict): - logger.warning( - f"Unsloth: 'ModelBase._model_classes' not found or not a dictionary in original script." - " Cannot determine supported architectures." - ) + if _layout == "package": + # Package layout: archs come from AST-parsing the static + # TEXT_MODEL_MAP / MMPROJ_MODEL_MAP in conversion/__init__.py. + # No module import required, so we skip the temp-write entirely. 
+ conv_init_py = os.path.join(_llama_cpp_dir, "conversion", "__init__.py") + text_archs = _extract_dict_keys_from_conversion_init(conv_init_py, "TEXT_MODEL_MAP") + vision_archs = _extract_dict_keys_from_conversion_init(conv_init_py, "MMPROJ_MODEL_MAP") + supported_types.update(text_archs) + supported_types.update(vision_archs) + if not supported_types: + logger.warning( + "Unsloth: conversion/__init__.py parsed but TEXT_MODEL_MAP / " + "MMPROJ_MODEL_MAP yielded no architecture keys. The arch " + "allowlist will be empty; conversion will still attempt to run." + ) else: - # Check for TEXT models - if hasattr(ModelType, 'TEXT') and ModelType.TEXT in ModelBase._model_classes: - if isinstance(ModelBase._model_classes[ModelType.TEXT], dict): - text_archs = set(ModelBase._model_classes[ModelType.TEXT].keys()) - supported_types.update(text_archs) - else: - logger.warning("Unsloth: ModelBase._model_classes[ModelType.TEXT] is not a dictionary.") - else: - logger.info("Unsloth: No TEXT model architectures found registered in the original script.") + # Monolith layout: original behaviour. Write the entrypoint to a + # temp file under LLAMA_CPP_DEFAULT_DIR and import it to read + # ModelBase._model_classes. 
+ with tempfile.NamedTemporaryFile( + mode='wb', suffix=".py", prefix="original_gguf_", dir=LLAMA_CPP_DEFAULT_DIR, delete=False + ) as temp_file: + temp_original_file_path = temp_file.name + temp_file.write(original_content) + temp_file.flush() + + logger.debug(f"Loading module from temporary file: {temp_original_file_path}") + original_module_name = f"convert_hf_to_gguf_{os.path.basename(temp_original_file_path).split('.')[0]}" + + # Set NO_LOCAL_GGUF to prevent the script from adding path again + old_env = os.environ.get('NO_LOCAL_GGUF') + os.environ['NO_LOCAL_GGUF'] = '1' - # Check for VISION models - if hasattr(ModelType, 'MMPROJ') and ModelType.MMPROJ in ModelBase._model_classes: - if isinstance(ModelBase._model_classes[ModelType.MMPROJ], dict): - vision_archs = set(ModelBase._model_classes[ModelType.MMPROJ].keys()) - supported_types.update(vision_archs) + try: + module = _load_module_from_path(temp_original_file_path, original_module_name) + finally: + if old_env is None: + os.environ.pop('NO_LOCAL_GGUF', None) else: - logger.warning("Unsloth: ModelBase._model_classes[ModelType.MMPROJ] is not a dictionary.") + os.environ['NO_LOCAL_GGUF'] = old_env + ModelBase = getattr(module, 'ModelBase', None) + ModelType = getattr(module, 'ModelType', None) + + if ModelBase is None or ModelType is None: + logger.warning( + f"Unsloth: Failed to find 'ModelBase' or 'ModelType' in the original downloaded script. " + f"Structure might have changed. Cannot determine supported architectures." + ) + elif not hasattr(ModelBase, '_model_classes') or not isinstance(ModelBase._model_classes, dict): + logger.warning( + f"Unsloth: 'ModelBase._model_classes' not found or not a dictionary in original script." + " Cannot determine supported architectures." 
+ ) else: - logger.info("Unsloth: No VISION model architectures found registered in the original script.") + # Check for TEXT models + if hasattr(ModelType, 'TEXT') and ModelType.TEXT in ModelBase._model_classes: + if isinstance(ModelBase._model_classes[ModelType.TEXT], dict): + text_archs = set(ModelBase._model_classes[ModelType.TEXT].keys()) + supported_types.update(text_archs) + else: + logger.warning("Unsloth: ModelBase._model_classes[ModelType.TEXT] is not a dictionary.") + else: + logger.info("Unsloth: No TEXT model architectures found registered in the original script.") + + # Check for VISION models + if hasattr(ModelType, 'MMPROJ') and ModelType.MMPROJ in ModelBase._model_classes: + if isinstance(ModelBase._model_classes[ModelType.MMPROJ], dict): + vision_archs = set(ModelBase._model_classes[ModelType.MMPROJ].keys()) + supported_types.update(vision_archs) + else: + logger.warning("Unsloth: ModelBase._model_classes[ModelType.MMPROJ] is not a dictionary.") + else: + logger.info("Unsloth: No VISION model architectures found registered in the original script.") # --- End Architecture Extraction --- # Convert final set to frozenset for immutability (good practice for cache keys/return values) @@ -1047,8 +1226,8 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info): f"Unsloth: No supported architectures (TEXT or VISION) could be determined from the original script." ) - # Cleanup module reference - if original_module_name in sys.modules: + # Cleanup module reference (only set on the monolith branch) + if original_module_name is not None and original_module_name in sys.modules: del sys.modules[original_module_name] except Exception as e: @@ -1085,41 +1264,73 @@ def _download_convert_hf_to_gguf_cached(name, _local_script_info): - # Patch 2: Metadata Branding + # Patch 2: Metadata Branding. + # Monolith: target lives in the entrypoint; patch the in-memory bytes. 
+ # Package: target moved to conversion/base.py; patch that file in place + # (idempotent via _UNSLOTH_BRANDING_MARKER) since the entrypoint just + # imports ModelBase from it at runtime. try: - metadata_patch_applied = False - new_patched_content = re.sub( - rb"(self\.metadata \= gguf\.Metadata\.load\(.+?\))([\n\r]+([\s\t]{4,}))", - rb"\1\n" - rb"\3if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n" - rb"\3if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n" - rb"\3if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']\n" - rb"\2", - patched_content, count=1, flags=re.MULTILINE - ) - if new_patched_content != patched_content: patched_content = new_patched_content; metadata_patch_applied = True - if not metadata_patch_applied: - if re.search(rb"self\.metadata \= gguf\.Metadata\.load\(", patched_content): logger.warning("Unsloth: Metadata branding patch target found, but regex failed to apply.") - else: logger.warning("Unsloth: Metadata branding patch target 'self.metadata = gguf.Metadata.load(...)' not found.") + if _layout == "package": + conv_base_py = os.path.join(_llama_cpp_dir, "conversion", "base.py") + _branding_status = _apply_branding_patch_to_base(conv_base_py) + if _branding_status == "applied": + logger.info(f"Unsloth: Metadata branding patch applied to {conv_base_py}.") + elif _branding_status == "already-applied": + logger.info(f"Unsloth: Metadata branding patch already present in {conv_base_py} (idempotent skip).") + else: + logger.warning( + f"Unsloth: Metadata branding patch target not found in {conv_base_py}. " + f"Upstream may have refactored Metadata.load again." 
+ ) + else: + metadata_patch_applied = False + new_patched_content = re.sub( + rb"(self\.metadata \= gguf\.Metadata\.load\(.+?\))([\n\r]+([\s\t]{4,}))", + rb"\1\n" + rb"\3if hasattr(self.metadata, 'quantized_by'): self.metadata.quantized_by = 'Unsloth'\n" + rb"\3if hasattr(self.metadata, 'repo_url'): self.metadata.repo_url = 'https://huggingface.co/unsloth'\n" + rb"\3if hasattr(self.metadata, 'tags'): self.metadata.tags = ['unsloth', 'llama.cpp']\n" + rb"\2", + patched_content, count=1, flags=re.MULTILINE + ) + if new_patched_content != patched_content: patched_content = new_patched_content; metadata_patch_applied = True + if not metadata_patch_applied: + if re.search(rb"self\.metadata \= gguf\.Metadata\.load\(", patched_content): logger.warning("Unsloth: Metadata branding patch target found, but regex failed to apply.") + else: logger.warning("Unsloth: Metadata branding patch target 'self.metadata = gguf.Metadata.load(...)' not found.") except Exception as e: logger.error(f"Unsloth: Error applying metadata branding patch: {e}", exc_info=True); raise - # Patch 3: Qwen2MoE/Qwen3MoE num_experts fix + # Patch 3: Qwen2MoE / Qwen3MoE num_experts fix. + # Package layout uses find_hparam(["num_local_experts", "num_experts"]) + # already, so the legacy patch is obsolete and its warning misleading. + # Skip it (info-log) on new layout; run unchanged on monolith. 
try: - # Use a single regex to handle both quote styles - num_experts_pattern = rb'n_experts = self\.hparams\[(["\'])num_experts\1\]' - replacement = ( - b"# Qwen3MoE seems to use num_local_experts instead of num_experts\n" - b" n_experts = self.hparams.get('num_experts', None) or self.hparams.get('num_local_experts')" - ) + _qwen_handled = False + if _layout == "package": + conv_qwen_py = os.path.join(_llama_cpp_dir, "conversion", "qwen.py") + if os.path.isfile(conv_qwen_py) and _qwen_already_handles_expert_aliases(conv_qwen_py): + logger.info( + "Unsloth: Qwen2MoE expert-key alias already handled upstream " + "(conversion/qwen.py uses find_hparam([num_local_experts, num_experts])) " + "-- legacy patch skipped." + ) + _qwen_handled = True + + if not _qwen_handled: + # Use a single regex to handle both quote styles + num_experts_pattern = rb'n_experts = self\.hparams\[(["\'])num_experts\1\]' + replacement = ( + b"# Qwen3MoE seems to use num_local_experts instead of num_experts\n" + b" n_experts = self.hparams.get('num_experts', None) or self.hparams.get('num_local_experts')" + ) - new_patched_content = re.sub(num_experts_pattern, replacement, patched_content) - num_experts_patch_applied = (new_patched_content != patched_content) + new_patched_content = re.sub(num_experts_pattern, replacement, patched_content) + num_experts_patch_applied = (new_patched_content != patched_content) - if num_experts_patch_applied: - patched_content = new_patched_content - else: - logger.warning("Unsloth: Qwen2MoE num_experts patch target not found.") + if num_experts_patch_applied: + patched_content = new_patched_content + else: + logger.warning("Unsloth: Qwen2MoE num_experts patch target not found.") except Exception as e: logger.error(f"Unsloth: Error applying Qwen2MoE num_experts patch: {e}", exc_info=True) @@ -1587,10 +1798,26 @@ def _quote(s): import shlex return shlex.quote(s) - command = f"{_quote(quantizer_location)} {_quote(input_gguf)} {_quote(output_gguf)} {quant_type} 
{n_threads}" + # Q2_K_L is an Unsloth preset (q2_k + q8_0 output / embedding tensors), + # not a native llama.cpp ftype. Expand here so every caller shares one path. + _display_quant_type = quant_type + _extra_flags = "" + if str(quant_type).strip().lower() == "q2_k_l": + _extra_flags = "--output-tensor-type q8_0 --token-embedding-type q8_0 " + quant_type = "q2_k" + + command = ( + f"{_quote(quantizer_location)} {_extra_flags}" + f"{_quote(input_gguf)} {_quote(output_gguf)} {quant_type} {n_threads}" + ) if print_output: - print(f"Unsloth: Quantizing to {quant_type}...") + print(f"Unsloth: Quantizing to {_display_quant_type}...") + if _extra_flags: + print( + "Unsloth: Expanding Q2_K_L preset " + "(q2_k + --output-tensor-type q8_0 --token-embedding-type q8_0)." + ) try: if print_output: @@ -1603,7 +1830,7 @@ def _quote(s): except subprocess.CalledProcessError as e: if print_output and hasattr(e, 'stdout') and e.stdout: print(e.stdout) - raise RuntimeError(f"Failed to quantize {input_gguf} to {quant_type}: {e}") + raise RuntimeError(f"Failed to quantize {input_gguf} to {_display_quant_type}: {e}") # Verify output exists and get size using pathlib output_path = Path(output_gguf)