diff --git a/.github/workflows/consolidated-tests-ci.yml b/.github/workflows/consolidated-tests-ci.yml
deleted file mode 100644
index 6b008d4bb1..0000000000
--- a/.github/workflows/consolidated-tests-ci.yml
+++ /dev/null
@@ -1,2265 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# One consolidated CPU-only job that runs every test_* function the existing
-# CI does not already cover from this repo plus the full unsloth_zoo@main
-# CPU test suite plus unsloth_zoo.compiler.test_apply_fused_lm_head.
-#
-# Why a separate workflow:
-# - studio-backend-ci.yml's "Repo tests (CPU)" job already auto-discovers
-# tests/ minus tests/qlora, tests/saving, tests/utils, tests/sh. The 16
-# Bucket-A tests below live inside those --ignore dirs (CPU-runnable but
-# historically excluded with their GPU siblings); pulling them out into
-# a sibling job keeps the existing 760-passed baseline stable while we
-# prove the new pieces are green.
-# - unsloth_zoo has no CI on main today (.github/workflows/ is empty
-# upstream as of HEAD 030e4ba). 106 of its 111 test_* functions are
-# CPU-runnable; the 5 GPU/vLLM ones are deselected here.
-# - test_apply_fused_lm_head lives at unsloth_zoo/compiler.py:1983, not
-# under tests/, so it is not picked up by `pytest tests/`. It is a
-# plain function with no fixtures: pure regex over transformers source
-# strings, ~5-15 s wall, no GPU.
-#
-# Strict mode: every test step is gating (no `continue-on-error`). The
-# upstream patch fixes that previously caused per-cell red have landed:
-# - unslothai/unsloth#5319 (patch_fast_lora import, patch_sft_trainer
-# Union, openenv OSError graceful skip).
-# - unslothai/unsloth-zoo#628 (MoE coverage canary so old transformers
-# skips legitimately while real discovery regressions still fail).
-# After those merges every observed cell failure was one of these two
-# things; if they regress we want a red cell, not a green-with-fail-prints
-# cell.
-
-name: Core
-
-on:
- pull_request:
- paths:
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'studio/**'
- - 'tests/**'
- - 'pyproject.toml'
- - '.github/workflows/consolidated-tests-ci.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
- inputs:
- unsloth_zoo_ref:
- description: 'unsloth_zoo git ref to test against (default main)'
- required: false
- default: 'main'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- consolidated:
- # Matrix: three (transformers, TRL) combos cover the failure surface the
- # PR cares about:
- # 1. transformers==4.57.6 + TRL latest <1.0.0 (the just-before-5.x line)
- # 2. transformers latest 5.x + TRL latest 1.x (the absolute upstream tip;
- # currently 5.8.0 + 1.3.0, both BEYOND the unsloth/unsloth_zoo
- # <=5.5.0 / <=0.24.0 caps -- the cell exists explicitly to surface
- # drift signal)
- # 3. transformers + TRL pinned by pyproject.toml's dependency entries
- # (resolved dynamically at job time via tomllib)
- # fail-fast: false so each cell runs independently and a transformers /
- # TRL drift signal in one cell does not cancel the others. No
- # job-level or per-step `continue-on-error` -- real test failures now
- # fail the cell. Patches with legitimate CPU-runner preconditions
- # (real CUDA dispatcher, runtime args) are explicitly skipped via
- # NEEDS_PRECONDITION in the runtime check shim below.
- strategy:
- fail-fast: false
- matrix:
- combo:
- - id: t4576-trl0latest
- label: "HF=4.57.6 + TRL<1"
- transformers_spec: "transformers==4.57.6"
- trl_spec: "trl>=0.18.2,<1.0.0"
- - id: tlatest5-trl1latest
- label: "HF=latest + TRL=latest"
- transformers_spec: "transformers>=5,<6"
- trl_spec: "trl>=1,<2"
- - id: pyproject
- label: "HF=default + TRL=default"
- transformers_spec: "__from_pyproject__"
- trl_spec: "__from_pyproject__"
- name: "Core (${{ matrix.combo.label }})"
- runs-on: ubuntu-latest
- timeout-minutes: 35
- # No job-level or per-step `continue-on-error`. Earlier iterations
- # masked real test failures behind green check icons; that lie is
- # gone. A failing test step fails the cell. NEEDS_PRECONDITION in
- # the runtime check shim handles patches that legitimately cannot
- # run on a CPU-only runner (real CUDA dispatcher, runtime args).
- env:
- UNSLOTH_ZOO_REF: ${{ inputs.unsloth_zoo_ref || 'main' }}
- MATRIX_TRANSFORMERS_SPEC: ${{ matrix.combo.transformers_spec }}
- MATRIX_TRL_SPEC: ${{ matrix.combo.trl_spec }}
- MATRIX_COMBO_ID: ${{ matrix.combo.id }}
- # Hoisted to job-level so every step (Sanity, Bucket-A, unsloth_zoo
- # pytest, test_apply_fused_lm_head) inherits it. transformers' bundled
- # *_pb2.py was generated against an older protoc; the C++ protobuf
- # 4+/5+/6 implementation rejects them with "Descriptors cannot be
- # created directly". The pure-Python parser bypasses the check; the
- # speed cost is negligible for these tests.
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
- PYTHONPATH: ${{ github.workspace }}/studio
- UNSLOTH_COMPILE_DISABLE: '1'
- # unsloth_zoo/__init__.py:314 raises ImportError unless UNSLOTH_IS_PRESENT
- # is set — normally it is set by unsloth.__init__ when unsloth is imported
- # first. In this job we sometimes import unsloth_zoo.* (e.g.
- # unsloth_zoo.saving_utils, unsloth_zoo.temporary_patches) without going
- # through `import unsloth` first; pin the env var to 1 so unsloth_zoo's
- # bootstrap accepts it. Setting it has no effect on unsloth itself.
- UNSLOTH_IS_PRESENT: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- # Node 22 unblocks tests/studio/test_chat_preset_builtin_invariants.py's
- # `node --experimental-strip-types` subprocess. Cheap to install; keeps
- # the consolidated job self-sufficient even if studio-backend-ci.yml
- # changes its node setup.
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - name: Install uv (some unsloth_zoo dev tooling expects it on PATH)
- run: pip install uv
-
- - name: Resolve matrix specs (handle __from_pyproject__ sentinel)
- # The pyproject cell uses a sentinel; resolve the real `transformers`
- # and `trl` constraints from the project's pyproject.toml at job time.
- # unsloth's pyproject puts the LLM stack pins in
- # [project.optional-dependencies] under the `huggingfacenotorch`
- # extra (top-level [project.dependencies] is just typer/pydantic/etc.),
- # so we walk every optional extra and pick the first matching spec.
- # Other cells pass their spec through unchanged.
- run: |
- set -euxo pipefail
- python <<'PY' >> "$GITHUB_ENV"
- import os, re, tomllib
- spec_t = os.environ["MATRIX_TRANSFORMERS_SPEC"]
- spec_r = os.environ["MATRIX_TRL_SPEC"]
-
- def _pkg_name(spec: str) -> str:
- m = re.match(r"\s*([A-Za-z0-9_.-]+)", spec)
- return (m.group(1).lower() if m else "")
-
- if spec_t == "__from_pyproject__" or spec_r == "__from_pyproject__":
- with open("pyproject.toml", "rb") as f:
- doc = tomllib.load(f)
- proj = doc.get("project", {})
- # Try top-level deps first, then all optional extras.
- all_deps: list[str] = list(proj.get("dependencies", []))
- for _name, dep_list in proj.get("optional-dependencies", {}).items():
- all_deps.extend(dep_list)
-
- if spec_t == "__from_pyproject__":
- spec_t = next((x for x in all_deps if _pkg_name(x) == "transformers"),
- "transformers")
- if spec_r == "__from_pyproject__":
- spec_r = next((x for x in all_deps if _pkg_name(x) == "trl"),
- "trl")
- print(f"RESOLVED_TRANSFORMERS_SPEC={spec_t}")
- print(f"RESOLVED_TRL_SPEC={spec_r}")
- PY
- # Echo to logs so the matrix cell label maps cleanly to a spec.
- grep RESOLVED_ "$GITHUB_ENV" || true
-
- - name: Install runtime deps (mirrors studio-backend-ci.yml + mlx-ci.yml)
- # The shape matches studio-backend-ci.yml's "Repo tests (CPU)" install
- # so we inherit the same CPU-spoof harness in tests/conftest.py and
- # the same import-chain guarantees, plus the extra deps that the
- # tests/saving + tests/utils Bucket-A files transitively need but
- # which Repo tests (CPU) does not require because it --ignores
- # those directories:
- # - protobuf + sentencepiece: tests/saving/test_fix_sentencepiece_gguf_robustness.py
- # does `from transformers.utils import sentencepiece_model_pb2`,
- # which imports `google.protobuf`. Not pulled by transformers'
- # base install.
- # - triton: unsloth/_gpu_init.py:232 does an unconditional
- # `import triton`. The triton PyPI wheel installs cleanly on
- # Linux x86_64 even without CUDA (the import succeeds; runtime
- # GPU work is what would fail, which we never do here).
- # transformers + trl are matrix-parameterized.
- run: |
- set -euxo pipefail
- python -m pip install --upgrade pip
- pip install -r studio/backend/requirements/studio.txt
- pip install \
- python-multipart aiofiles sqlalchemy cryptography \
- pyyaml jinja2 mammoth unpdf requests typer \
- 'numpy<3' pytest==9.0.3 pytest-asyncio httpx \
- protobuf sentencepiece triton \
- psutil packaging tqdm safetensors datasets \
- 'peft>=0.18,<0.20' 'accelerate>=0.34,<2' \
- ipython
- # torchvision: unsloth_zoo.vision_utils imports it at module scope.
- pip install --index-url https://download.pytorch.org/whl/cpu \
- 'torch>=2.4,<2.11' 'torchvision<0.26'
- # transformers + trl from the matrix combo.
- pip install "$RESOLVED_TRANSFORMERS_SPEC"
- pip install "$RESOLVED_TRL_SPEC"
- # bitsandbytes: hard import in unsloth/models/_utils.py. Recent
- # versions ship a CPU build that imports cleanly on Linux.
- pip install 'bitsandbytes>=0.45'
- # unsloth itself, editable, no-deps so pip does not fight the
- # explicit torch CPU-index install above.
- pip install -e . --no-deps
- echo "::group::Installed transformers + trl + torch + unsloth versions"
- pip show transformers
- pip show trl
- pip show torch
- pip show unsloth
- echo "::endgroup::"
-
- - name: Clone unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }}
- # We need the repository tree (the wheel does not ship tests/), so
- # clone shallow then editable-install so unsloth_zoo.* imports
- # resolve to the cloned tree. We use `pip show` for the location
- # check rather than `import unsloth_zoo` because the latter calls
- # device_type.get_device_type() at module load and raises on a
- # GPU-less runner; pytest steps below route through the existing
- # tests/conftest.py spoof which handles that.
- run: |
- set -euxo pipefail
- # github.com occasionally 500s on the git fetch; retry so a
- # single upstream blip does not fail CI.
- for attempt in 1 2 3; do
- rm -rf "$RUNNER_TEMP/unsloth-zoo"
- if git clone --depth=1 --branch="$UNSLOTH_ZOO_REF" \
- https://github.com/unslothai/unsloth-zoo \
- "$RUNNER_TEMP/unsloth-zoo"; then
- break
- fi
- if [ "$attempt" -eq 3 ]; then
- echo "::error::git clone unsloth-zoo failed after 3 attempts"
- exit 1
- fi
- delay=$((5 * attempt))
- echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..."
- sleep "$delay"
- done
- pip install -e "$RUNNER_TEMP/unsloth-zoo" --no-deps
- pip show unsloth_zoo
-
- - name: Sanity — collection only (both repos)
- # Catches import-time breakage before we run the suite. Cheap; bails
- # the job out fast if a transformers/torch resolution went sideways.
- # Inherits PYTHONPATH / UNSLOTH_COMPILE_DISABLE / PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION
- # from the job-level env block.
- run: |
- set -euxo pipefail
- python -m pytest --collect-only -q \
- tests/saving/test_save_shell_injection.py \
- tests/saving/test_patch_saving_none_tokenizer.py \
- tests/saving/test_fix_sentencepiece_gguf_robustness.py \
- tests/utils/test_attention_masks.py \
- tests/utils/test_trunc_normal_patch.py
- python -m pytest --collect-only -q "$RUNNER_TEMP/unsloth-zoo/tests/"
-
- - name: import_fixes drift detectors (18 tests, HARD GATE)
- # One drift detector per fix_* / patch_* function in
- # unsloth/import_fixes.py. The detectors assert the *healthy*
- # upstream shape that the fix expects ABSENT the regression;
- # ANY DRIFT DETECTED -> pytest.fail (NEVER skip) so the
- # matrix cell goes red and the maintainer triages on the
- # next PR, not in a downstream user's crash report.
- #
- # Pathologies covered by the suite (each maps to one fix
- # function with the line range cited in the test docstring):
- # * protobuf MessageFactory GetPrototype / GetMessageClass
- # * datasets 4.4.x recursion range
- # * TRL tuple-vs-bool _*_available caching
- # * transformers PreTrainedModel.enable_input_require_grads
- # source pattern flip
- # * transformers torchcodec / causal_conv1d availability
- # flags
- # * transformers + accelerate is_wandb_available
- # * peft.utils.transformers_weight_conversion importability
- # + build_peft_weight_mapping signature
- # * triton 3.6+ CompiledKernel num_ctas / cluster_dims
- # * torch / torchvision pinned compatibility table
- # * vllm guided_decoding_params / structured_outputs +
- # aimv2 ovis config version
- # * huggingface_hub is_offline_mode / HF_HUB_OFFLINE
- # * torch.nn.init.trunc_normal_ presence (patch site for
- # patch_trunc_normal_precision_issue)
- # * xformers post-num_splits-key fix version
- # HARD GATE: a red cell here is a real upstream regression
- # without a corresponding zoo / unsloth-side workaround.
- run: |
- python -m pytest -v --tb=short tests/test_import_fixes_drift.py
-
- - name: public-api surface drift detectors (9 tests, HARD GATE)
- # Companion to test_import_fixes_drift.py: that file catches
- # third-party drift; this one catches drift in unsloth's OWN
- # public surface (FastLanguageModel / FastVisionModel /
- # FastModel + their classmethods + is_bf16_supported). A
- # rename here would silently break the unslothai/notebooks tree
- # one PR cycle later -- this gate catches it BEFORE the
- # breakage reaches users.
- run: |
- python -m pytest -v --tb=short tests/test_public_api_surface.py
-
- - name: unsloth Bucket-A — CPU tests not in Repo tests (CPU)
- # 16 tests across 5 files. They live inside tests/saving/ and
- # tests/utils/, both of which Repo tests (CPU) excludes via --ignore
- # because their sibling files need real GPUs / real HF weights.
- # The five files below are pure-Python + AST/protobuf/regex tests
- # that run cleanly on CPU. Env inherited from the job block.
- run: |
- python -m pytest -q --tb=short \
- tests/saving/test_save_shell_injection.py \
- tests/saving/test_patch_saving_none_tokenizer.py \
- tests/saving/test_fix_sentencepiece_gguf_robustness.py \
- tests/utils/test_attention_masks.py \
- tests/utils/test_trunc_normal_patch.py \
- --deselect 'tests/utils/test_attention_masks.py::test_run_attention_flash_varlen_receives_window_and_softcap'
- # The deselected test monkeypatches flash_attn_varlen_func, which is
- # only bound on the module when `flash_attn` is importable. flash_attn
- # requires CUDA + dev toolchain, which the CPU-only ubuntu-latest
- # runner does not have. The other 15 Bucket-A tests pass cleanly.
-
- - name: unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }} — full pytest (CPU)
- # 106 of 111 test_* in unsloth_zoo are CPU-only. The two CUDA-skip
- # cases below auto-skip on a GPU-less runner; deselect them
- # explicitly so the no-CUDA outcome is "deselected", not "skipped",
- # making intent visible in the report. Env inherited from job block.
- working-directory: ${{ runner.temp }}/unsloth-zoo
- run: |
- python -m pytest -q --tb=short tests/ \
- --deselect tests/test_unsloth_zoo_lora_merge.py::test_active_merge_device_returns_string_on_cuda_host \
- --deselect tests/test_unsloth_zoo_lora_merge.py::test_merge_lora_moves_cpu_inputs_to_active_device
-
- - name: unsloth_zoo — test_apply_fused_lm_head (lives in compiler.py)
- # `test_apply_fused_lm_head` lives at unsloth_zoo/compiler.py:1983,
- # not under tests/, so pytest's default discovery does not pick it up.
- # We route it through pytest by writing a one-shot shim test file
- # inside the unsloth checkout's tests/ — pytest then walks UP and
- # picks up tests/conftest.py, whose GPU-spoof harness (lines 84-141)
- # patches torch.cuda.is_available, torch.cuda.memory.mem_get_info,
- # torch.cuda.get_device_capability, and is_bf16_supported. That full
- # spoof is required because unsloth_zoo/temporary_patches/gpt_oss.py
- # at module load reads torch.cuda.memory.mem_get_info(0), which
- # bare `is_available = True` doesn't cover. Env inherited.
- run: |
- set -euxo pipefail
- cat > tests/_zoo_apply_fused_lm_head_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- # Wraps unsloth_zoo.compiler.test_apply_fused_lm_head so that
- # tests/conftest.py's GPU-spoof harness applies before the import.
- # _zoo_aggressive_cuda_spoof extends conftest's harness with deeper
- # patches (see tests/_zoo_aggressive_cuda_spoof.py).
- import sys, pathlib
- sys.path.insert(0, str(pathlib.Path(__file__).parent))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
- from unsloth_zoo.compiler import test_apply_fused_lm_head as _zoo_test
- def test_zoo_apply_fused_lm_head_runs():
- _zoo_test()
- PY
- python -m pytest -q --tb=short tests/_zoo_apply_fused_lm_head_shim.py
- rm -f tests/_zoo_apply_fused_lm_head_shim.py
-
- - name: Static checks — unsloth/trainer.py + unsloth/models/rl.py against latest pip TRL
- # AST-only sanity: confirm both files parse and that every TRL symbol
- # they reference still exists in the installed `trl`. Catches API
- # drift (renamed / removed TRL classes) without running training.
- # Pre-fetches latest pip transformers in case TRL pinned an older one.
- run: |
- set -euxo pipefail
- # Use the matrix-resolved transformers + trl versions already
- # installed by the runtime-deps step (don't upgrade here; that
- # would defeat the matrix's purpose of testing against the
- # specific (transformers, trl) combination the cell selected).
- python <<'PY'
- import ast, importlib, pathlib, sys
- paths = [pathlib.Path("unsloth/trainer.py"),
- pathlib.Path("unsloth/models/rl.py")]
- for p in paths:
- src = p.read_text()
- tree = ast.parse(src, filename=str(p))
- # Collect every `from trl... import X` and `from trl... import (X, Y)`
- missing = []
- for node in ast.walk(tree):
- if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith("trl"):
- mod = importlib.import_module(node.module)
- for alias in node.names:
- if alias.name == "*":
- continue
- if not hasattr(mod, alias.name):
- missing.append(f"{node.module}.{alias.name}")
- print(f"{p}: TRL symbols referenced and resolved -> {'OK' if not missing else 'MISSING ' + ', '.join(missing)}")
- if missing:
- sys.exit(1)
- PY
-
- - name: Static checks — unsloth_zoo/tiled_mlp.py against latest pip transformers
- # AST parse + transformers symbol-resolution. The user flagged tiled
- # MLP patching as the path that breaks first when transformers ships
- # an MLP class rename; this step is the canary against whatever
- # transformers version the matrix cell selected.
- working-directory: ${{ runner.temp }}/unsloth-zoo
- run: |
- set -euxo pipefail
- python <<'PY'
- import ast, importlib, pathlib, sys
- p = pathlib.Path("unsloth_zoo/tiled_mlp.py")
- src = p.read_text()
- tree = ast.parse(src, filename=str(p))
- missing = []
- for node in ast.walk(tree):
- if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith("transformers"):
- try:
- mod = importlib.import_module(node.module)
- except Exception as e:
- missing.append(f"{node.module} (import failed: {type(e).__name__})")
- continue
- for alias in node.names:
- if alias.name == "*":
- continue
- if not hasattr(mod, alias.name):
- missing.append(f"{node.module}.{alias.name}")
- print(f"{p}: transformers symbols referenced -> {'OK' if not missing else 'MISSING ' + ', '.join(missing)}")
- if missing:
- sys.exit(1)
- PY
-
- - name: Static checks — unsloth_zoo/hf_utils.py syntax + import-graph
- working-directory: ${{ runner.temp }}/unsloth-zoo
- run: |
- set -euxo pipefail
- python <<'PY'
- import ast, pathlib
- p = pathlib.Path("unsloth_zoo/hf_utils.py")
- tree = ast.parse(p.read_text(), filename=str(p))
- # Surface every public function + class so the PR check log shows
- # what's covered, not just OK/FAIL.
- public = []
- for node in tree.body:
- if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)) and not node.name.startswith("_"):
- public.append(f"{type(node).__name__.replace('Def','').lower()}:{node.name}")
- print(f"hf_utils.py public surface ({len(public)}): " + ", ".join(public))
- PY
-
- - name: Runtime checks — invoke every zero-arg patch_* across both repos (via pytest shim)
- # Routed through pytest so tests/conftest.py's GPU-spoof harness
- # applies before any unsloth_zoo.temporary_patches.* import.
- # Locally validated 50/51 zero-arg patches succeed; the lone failure
- # surfaces a real bug (unsloth.models._utils.patch_fast_lora raises
- # NameError: name 'fast_lora_forward' is not defined). The shim
- # reports the full ledger but only fails when one of the two
- # `required` helpers is absent.
- run: |
- set -euxo pipefail
- cat > tests/_runtime_patch_check_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- # Wraps the runtime patch_* validation into a pytest test so the
- # tests/conftest.py GPU-spoof harness applies. continue-on-error
- # at the workflow level catches per-patch failures; this shim only
- # asserts that the two `required` helpers are reachable.
- import sys, pathlib
- sys.path.insert(0, str(pathlib.Path(__file__).parent))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
- import importlib, inspect
-
- MODULES = [
- "unsloth.models._utils", "unsloth.models.rl", "unsloth.import_fixes",
- "unsloth.kernels.cross_entropy_loss", "unsloth.kernels.rms_layernorm",
- "unsloth.tokenizer_utils", "unsloth.save",
- "unsloth_zoo.patching_utils", "unsloth_zoo.gradient_checkpointing",
- "unsloth_zoo.loss_utils", "unsloth_zoo.tokenizer_utils",
- "unsloth_zoo.tiled_mlp", "unsloth_zoo.dataset_utils",
- "unsloth_zoo.patch_torch_functions",
- "unsloth_zoo.temporary_patches.gemma",
- "unsloth_zoo.temporary_patches.ministral",
- "unsloth_zoo.temporary_patches.pixtral",
- "unsloth_zoo.temporary_patches.deepseek_v3_moe",
- "unsloth_zoo.temporary_patches.qwen3_5_moe",
- "unsloth_zoo.temporary_patches.mxfp4",
- "unsloth_zoo.temporary_patches.bitsandbytes",
- "unsloth_zoo.temporary_patches.flex_attention_bwd",
- ]
- REQUIRED = {
- "patch_unsloth_smart_gradient_checkpointing",
- "patch_gradient_accumulation_fix",
- }
- # Patches whose signature looks zero-arg (`()` or all-defaulted)
- # but which actually require either runtime args or real CUDA.
- # Calling these in isolation is meaningless, so skip the
- # invocation. Symbol presence (REQUIRED above) is still verified.
- # patch_linear_scaling / patch_llama_rope_scaling: defaults are
- # None placeholders; the bodies start with
- # `assert is not None`.
- # patch_unsloth_smart_gradient_checkpointing: legitimately
- # allocates CUDA tensors via aten::empty.memory_format inside
- # initialize_unsloth_gradient_checkpointing(); the
- # torch.cuda.* spoof can't intercept that at the dispatcher
- # level.
- NEEDS_PRECONDITION = {
- "patch_linear_scaling",
- "patch_llama_rope_scaling",
- "patch_unsloth_smart_gradient_checkpointing",
- }
-
- def test_zero_arg_patch_invocations():
- ok, fail, args, skipped, miss_imports = 0, [], [], [], {}
- seen_required = set()
- for mod_name in MODULES:
- try:
- mod = importlib.import_module(mod_name)
- except Exception as e:
- miss_imports[mod_name] = f"{type(e).__name__}: {e}"
- continue
- for name in sorted(dir(mod)):
- if not name.startswith("patch_"): continue
- fn = getattr(mod, name, None)
- if not callable(fn): continue
- if name in REQUIRED: seen_required.add(name)
- try:
- sig = inspect.signature(fn)
- need = [p.name for p in sig.parameters.values()
- if p.default is inspect.Parameter.empty
- and p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD,
- inspect.Parameter.POSITIONAL_ONLY)]
- except (TypeError, ValueError):
- need = []
- if need:
- args.append((mod_name, name, need)); continue
- if name in NEEDS_PRECONDITION:
- skipped.append(f"{mod_name}.{name}")
- print(f" SKIP {mod_name}.{name} (needs precondition / CUDA)")
- continue
- try:
- fn()
- ok += 1
- print(f" OK {mod_name}.{name}")
- except Exception as e:
- fail.append((mod_name, name, type(e).__name__, str(e)[:200]))
- print(f" FAIL {mod_name}.{name} -> {type(e).__name__}: {str(e)[:200]}")
- print(f"\nzero-arg patch_*: ok={ok} fail={len(fail)} skipped={len(skipped)}")
- print(f"arg-required patch_* (skipped, listed for review): {len(args)}")
- for m, n, r in args:
- print(f" needs={r}: {m}.{n}")
- if skipped:
- print(f"explicitly skipped (needs precondition / CUDA): {skipped}")
- if miss_imports:
- print("\nmodules failed to import (skipped):")
- for k, v in miss_imports.items():
- print(f" {k}: {v}")
- print(f"required patch_* helpers seen: {sorted(seen_required)}")
- missing = REQUIRED - seen_required
- assert not missing, f"required patch_* helpers MISSING: {sorted(missing)}"
- # Strict: any zero-arg patch that raises is a real
- # regression now that #5319 has landed (the three previously
- # known-broken patches are fixed; legitimate
- # CPU-precondition skips are recorded in NEEDS_PRECONDITION
- # above, not in `fail`). Print all failures and re-raise
- # them as one assertion message.
- if fail:
- raise AssertionError(
- f"zero-arg patch_* invocation failures (ok={ok}, "
- f"fail={len(fail)}, skipped={len(skipped)}):\n "
- + "\n ".join(
- f"{m}.{n} -> {ec}: {msg}" for m, n, ec, msg in fail
- )
- )
- PY
- python -m pytest -q --tb=short tests/_runtime_patch_check_shim.py -s
- rm -f tests/_runtime_patch_check_shim.py
-
- - name: Runtime checks — patch_tiled_mlp on a synthetic MLP module (via pytest shim)
- # Same shim pattern: pytest picks up tests/conftest.py before importing
- # unsloth_zoo.tiled_mlp, so the GPU-spoof harness covers
- # unsloth_zoo.temporary_patches.gpt_oss's mem_get_info call.
- run: |
- set -euxo pipefail
- cat > tests/_tiled_mlp_check_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- import sys, pathlib
- sys.path.insert(0, str(pathlib.Path(__file__).parent))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
- import torch
- import torch.nn as nn
- from unsloth_zoo.tiled_mlp import patch_tiled_mlp, patch_mlp
-
- class _MLP(nn.Module):
- def __init__(self, hidden=64, intermediate=128):
- super().__init__()
- self.gate_proj = nn.Linear(hidden, intermediate, bias=False)
- self.up_proj = nn.Linear(hidden, intermediate, bias=False)
- self.down_proj = nn.Linear(intermediate, hidden, bias=False)
- self.act_fn = nn.SiLU()
- def forward(self, x):
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
- class _FakeModel(nn.Module):
- def __init__(self):
- super().__init__()
- self.layers = nn.ModuleList([nn.ModuleDict({"mlp": _MLP()}) for _ in range(2)])
- def forward(self, x):
- for layer in self.layers:
- x = x + layer["mlp"](x)
- return x
-
- def test_patch_tiled_mlp_numerical_equivalence():
- # `patch_mlp(target_arctic=True)` sets `chunk_size = max(1, H)`
- # and shards the SEQUENCE dim with `n_shards = max(1, S //
- # chunk_size)`. Pick S > H so the tiled path actually runs
- # multi-shard (n_shards = 192 // 64 = 3, plus a remainder
- # shard) rather than degenerating to n_shards = 1 which is
- # bit-exact and only confirms patching installed something.
- # If the tiled implementation is correct, multi-shard output
- # must still match the un-tiled reference within FP32 noise.
- torch.manual_seed(0)
- m = _FakeModel().eval()
- hidden = 64
- # 192 = 3 * hidden, so divmod(192, 64) = (3, 0) -> 3 shards,
- # no remainder; gives a clean multi-shard verification.
- x = torch.randn(2, 192, hidden)
- with torch.no_grad():
- y_before = m(x).clone()
- patch_mlp(m.layers[0]["mlp"])
- patch_tiled_mlp(m)
- # Sanity-check we are actually exercising the multi-shard
- # path: poke chunk_size by re-deriving it the same way
- # `tiled_forward_arctic_size` does.
- S = x.shape[1]
- chunk = max(1, hidden)
- n_shards_expected = max(1, S // chunk)
- assert n_shards_expected > 1, (
- "tiled MLP shim is not exercising multi-shard: "
- f"S={S}, chunk={chunk}, n_shards={n_shards_expected}"
- )
- with torch.no_grad():
- y_after = m(x).clone()
- err = (y_before - y_after).abs().max().item()
- print(
- f"patch_tiled_mlp multi-shard (n_shards={n_shards_expected}) "
- f"output diff = {err:.3e}"
- )
- assert err < 1e-3, f"tiled MLP output drifted: {err}"
- PY
- python -m pytest -q --tb=short tests/_tiled_mlp_check_shim.py -s
- rm -f tests/_tiled_mlp_check_shim.py
-
- - name: Compiler cache hygiene + source-rewriter invariants (synthetic inputs)
- # Lightweight pipeline coverage for unsloth_zoo.compiler. Pure regex
- # / tokenize / ast paths driven by tiny synthetic source strings:
- # - higher_precision_softmax (basic + idempotent)
- # - fix_rotary_embedding_dtype (no-op + active under
- # UNSLOTH_FORCE_CUSTOM_DTYPE)
- # - fix_attention_dtype_consistency (insert + idempotent)
- # - convert_attention_masks_to_bool (rewrite + no-op)
- # - create_new_function happy-path (versioning block, license
- # header, AST parse, importlib re-import)
- # - create_new_function **kwargs collision (exercises
- # _rewrite_kwargs_param + _insert_kwargs_alias)
- # - UNSLOTH_COMPILE_OVERWRITE=0 forced-recompile on transformers
- # version mismatch (compiler.py:947-963)
- # - matching short-circuit when versions are equal
- # No real transformers modeling module is loaded; complements the
- # heavier real-class round-trip step below. Wall-time ~10-25s.
- run: |
- set -euxo pipefail
- cat > tests/_compiler_cache_invariants_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- # Cache-hygiene + source-rewriter invariants for unsloth_zoo.compiler.
- import sys, pathlib, os, ast, importlib, importlib.util, time
- sys.path.insert(0, str(pathlib.Path(__file__).parent))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
- import pytest
- import torch # noqa: F401 (compiler.py imports torch at module load)
-
-
- def _isolate_cache(tmp_path, monkeypatch):
- """Point UNSLOTH_COMPILE_LOCATION at tmp_path and reset module
- globals. The compiler.py global is captured at module load
- (line 75/179), so we delete + reimport per test."""
- monkeypatch.setenv("UNSLOTH_COMPILE_LOCATION", str(tmp_path))
- if "unsloth_zoo.compiler" in sys.modules:
- del sys.modules["unsloth_zoo.compiler"]
- import unsloth_zoo.compiler as compiler
- compiler.UNSLOTH_COMPILE_LOCATION = str(tmp_path)
- compiler.UNSLOTH_COMPILE_USE_TEMP = False
- return compiler
-
-
- def test_higher_precision_softmax_basic_and_idempotent(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- src = (
- "y = nn.functional.softmax(x, dim=-1)\n"
- "z = F.softmax(a, dim=1, dtype=torch.bfloat16)\n"
- )
- out = c.higher_precision_softmax(src)
- assert "dtype = torch.float32).to(x.dtype)" in out
- assert "dtype = torch.float32).to(a.dtype)" in out
- # Idempotency landed in unslothai/unsloth-zoo#631
- # (negative-lookahead on `.to(.dtype)` so a second
- # pass does not append another cast).
- assert c.higher_precision_softmax(out) == out
-
-
- def test_fix_rotary_dtype_no_op_without_env(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- monkeypatch.delenv("UNSLOTH_FORCE_CUSTOM_DTYPE", raising=False)
- src = "out = cos.to(dtype=x.dtype) + sin.to(dtype=x.dtype)\n"
- assert c.fix_rotary_embedding_dtype(src) == src
-
-
- def test_fix_rotary_dtype_active(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- monkeypatch.setenv(
- "UNSLOTH_FORCE_CUSTOM_DTYPE",
- "float16;torch.float32;torch.bfloat16;torch.float16;pass",
- )
- monkeypatch.setenv("UNSLOTH_FORCE_FLOAT32", "1")
- src = "out = cos.to(dtype=x.dtype) + sin.to(dtype=x.dtype)\n"
- out = c.fix_rotary_embedding_dtype(src)
- # Active form rewrites cos.to / sin.to. Either the conditional
- # form or the cast form is acceptable -- different transformers
- # versions surface slightly different outputs from the rewriter.
- assert "cos.to(dtype=x.dtype)" not in out
- assert "sin.to(dtype=x.dtype)" not in out
-
-
- def test_fix_attention_dtype_consistency_insert_then_idempotent(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- src = (
- " query_states, key_states = apply_rotary_pos_emb("
- "query_states, key_states, cos, sin)\n"
- " attn = q @ k.T\n"
- )
- out = c.fix_attention_dtype_consistency(src)
- assert out.count("value_states = value_states.to(query_states.dtype)") == 1
- assert c.fix_attention_dtype_consistency(out) == out
-
-
- def test_convert_attention_masks_to_bool_rewrites(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- src = (
- "def make_mask(x):\n"
- " out = torch.finfo(x.dtype).min * x\n"
- " return out\n"
- )
- out = c.convert_attention_masks_to_bool("make_mask", src)
- # Loose match: rewriter inserts a `!=torch.finfo(...).min` check
- # somewhere on the return path. Tightening to an exact
- # last-line match is brittle across transformers versions.
- assert "!=torch.finfo" in out
-
-
- def test_convert_attention_masks_to_bool_no_op(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- src = "def make_mask(x):\n return x\n"
- assert c.convert_attention_masks_to_bool("make_mask", src) == src
-
-
- def _versioning_lines(file_text):
- """Extract the four version strings from the versioning block."""
- assert file_text.startswith('"""\n'), "missing opening triple-quote"
- head = file_text.split("__UNSLOTH_VERSIONING__", 1)[0]
- lines = [ln for ln in head.splitlines() if ln and ln != '"""']
- return lines
-
-
- def test_create_new_function_happy_path(tmp_path, monkeypatch):
- c = _isolate_cache(tmp_path, monkeypatch)
- src = "def f(x):\n return nn.functional.softmax(x, dim=-1)\n"
- c.create_new_function(
- name="f_happy", new_source=src, model_location="builtins",
- functions=[], overwrite=True,
- )
- cached = tmp_path / "f_happy.py"
- assert cached.exists()
- text = cached.read_text(encoding="utf-8")
- versions = _versioning_lines(text)
- assert len(versions) == 4, versions
- assert text.count(c._full_license_header) == 1
- ast.parse(text)
- spec = importlib.util.spec_from_file_location("f_happy_reimport", cached)
- m2 = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(m2)
- assert callable(m2.f)
- import inspect as _inspect
- # higher_precision_softmax should have promoted to float32.
- assert "dtype = torch.float32" in _inspect.getsource(m2.f)
-
-
- def test_create_new_function_overwrite_zero_recompiles_on_version_mismatch(
- tmp_path, monkeypatch,
- ):
- c = _isolate_cache(tmp_path, monkeypatch)
- name = "vmismatch"
- cached = tmp_path / f"{name}.py"
- stub = (
- '"""\n0.0.0\n0.0.0\n0.0.0-stub\n0.0.0\n__UNSLOTH_VERSIONING__\n"""\n'
- + c._full_license_header
- + "def vmismatch(x):\n return x\n"
- )
- cached.write_text(stub, encoding="utf-8")
- monkeypatch.setenv("UNSLOTH_COMPILE_OVERWRITE", "0")
- src = "def vmismatch(x):\n return x + 1\n"
- c.create_new_function(
- name=name, new_source=src, model_location="builtins",
- functions=[], overwrite=False,
- )
- text = cached.read_text(encoding="utf-8")
- assert "0.0.0-stub" not in text, (
- "OVERWRITE=0 + transformers-version-mismatch did NOT recompile"
- )
- versions = _versioning_lines(text)
- import importlib.metadata as _md
- assert versions[2] == _md.version("transformers")
-
-
- def test_create_new_function_overwrite_zero_short_circuits_when_versions_match(
- tmp_path, monkeypatch,
- ):
- c = _isolate_cache(tmp_path, monkeypatch)
- name = "vmatch"
- src = "def vmatch(x):\n return x\n"
- c.create_new_function(
- name=name, new_source=src, model_location="builtins",
- functions=[], overwrite=True,
- )
- cached = tmp_path / f"{name}.py"
- mtime_before = cached.stat().st_mtime_ns
- time.sleep(0.05)
- monkeypatch.setenv("UNSLOTH_COMPILE_OVERWRITE", "0")
- c.create_new_function(
- name=name, new_source=src, model_location="builtins",
- functions=[], overwrite=False,
- )
- assert cached.stat().st_mtime_ns == mtime_before, (
- "OVERWRITE=0 + matching versions should NOT rewrite the file"
- )
- PY
- python -m pytest -q --tb=short tests/_compiler_cache_invariants_shim.py
- rm -f tests/_compiler_cache_invariants_shim.py
-
- - name: Compiler full-model-sweep (every transformers.models.*) + SFT trainer round-trip
- # Calls `unsloth_compile_transformers(model_type=...)` against EVERY
- # `transformers.models.` package the matrix's transformers ships
- # (pkgutil.iter_modules walk -- 383 packages on 4.57.6, similar on
- # latest), then ast.parse / importlib-load / introspect the
- # generated unsloth_compiled_cache/*.py file per model. Catches
- # regex / source-rewriter drift across the matrix's (transformers,
- # trl) combination -- the dominant failure mode of
- # `unsloth_compile_transformers` after a transformers point release.
- #
- # 21 model_types currently break the compiler (verified locally on
- # transformers 4.57.6). They are listed in KNOWN_BROKEN below with
- # their failure mode so the sweep stays green and any NEW breakage
- # surfaces as red. Each entry is tracked for an individual fix
- # PR on unsloth-zoo. The list is split by failure category so
- # follow-up PRs can target one bug at a time.
- #
- # Hermetic cache dir per pytest invocation; we override the
- # job-level UNSLOTH_COMPILE_DISABLE=1 inside the shim so
- # compilation actually runs here. Wall-time estimate ~2-3 min
- # warm (mean ~0.3s/model, 383 models = ~110s on the runner).
- run: |
- set -euxo pipefail
- cat > tests/_zoo_compiler_cache_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- import os, sys, ast, pathlib, importlib.util, tempfile
- _HERE = pathlib.Path(__file__).parent
- sys.path.insert(0, str(_HERE))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
-
- # Hermetic cache dir + force compile path. The compiler's
- # globals (UNSLOTH_COMPILE_LOCATION, UNSLOTH_COMPILE_USE_TEMP)
- # are captured at module load; an earlier conftest `import
- # unsloth` may have already imported unsloth_zoo.compiler with
- # the default "unsloth_compiled_cache" path. Mutate the live
- # module globals after import so this shim is robust to that
- # ordering. Otherwise the compiler silently writes to the
- # default cache and the per-model file assertion fails.
- _CACHE = pathlib.Path(tempfile.mkdtemp(prefix="unsloth_cache_"))
- os.environ["UNSLOTH_COMPILE_LOCATION"] = str(_CACHE)
- os.environ["UNSLOTH_COMPILE_OVERWRITE"] = "1"
- os.environ.pop("UNSLOTH_COMPILE_DISABLE", None)
-
- import pytest
- import unsloth_zoo.compiler as _zoo_compiler
- _zoo_compiler.UNSLOTH_COMPILE_LOCATION = str(_CACHE)
- _zoo_compiler.UNSLOTH_COMPILE_USE_TEMP = False
- from unsloth_zoo.compiler import unsloth_compile_transformers
-
-
- def _verify_file(path: pathlib.Path, must_expose):
- assert path.exists(), f"compiler did not write {path}"
- src = path.read_text(encoding="utf-8")
- ast.parse(src, filename=str(path))
- spec = importlib.util.spec_from_file_location(path.stem, path)
- mod = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(mod)
- for name in must_expose:
- assert hasattr(mod, name), (
- f"{path.name} missing expected attr {name!r}; "
- f"found: {sorted(n for n in dir(mod) if not n.startswith('_'))[:25]}"
- )
-
-
- # ---------- Full transformers.models.* compile sweep ----------
- # Track the model_types that currently break the compiler on
- # transformers >=5,<6. After unsloth-zoo#632 landed, transformers
- # 4.57.6 has zero failures across all model_types; the 27 entries
- # below are the residual failures on the tf 5.x line. New breakage
- # on any OTHER model_type fails the cell. Each entry is a
- # tracking item for a follow-up unsloth-zoo PR.
- KNOWN_BROKEN_COMPILE = {
- # Category A: `string index out of range` in source rewriter.
- "colpali": "string index out of range",
- "colqwen2": "string index out of range",
- "colmodernvbert": "string index out of range",
- "dpr": "string index out of range",
- "gemma4_assistant":"string index out of range",
- "rag": "string index out of range",
- "shieldgemma2": "string index out of range",
- "timm_backbone": "string index out of range",
- # Category B: rewriter emits invalid Python source.
- "clvp": "emitted file: unexpected indent",
- "falcon_mamba": "emitted file: unexpected indent",
- "gpt2": "emitted file: unexpected indent",
- "imagegpt": "emitted file: unexpected indent",
- "mamba": "emitted file: unexpected indent",
- "tapas": "emitted file: expected ':'",
- "xlstm": "emitted file: unexpected indent",
- # Category B-2: emit unterminated string literal (latest tf).
- "audioflamingo3": "emitted file: unterminated string literal",
- "musicflamingo": "emitted file: unterminated string literal",
- "voxtral": "emitted file: unterminated string literal",
- "voxtral_realtime":"emitted file: unterminated string literal",
- # Category C: rewriter emits unclosed paren.
- "kosmos2": "emitted file: '(' was never closed",
- "kosmos2_5": "emitted file: '(' was never closed",
- # Category D: imports list builder picks up a non-exported name.
- "auto": "module has no attribute _BaseModelWithGenerate",
- "bit": "module has no attribute Linear",
- "regnet": "module has no attribute Linear",
- "resnet": "module has no attribute Linear",
- # Category E: undefined name in emitted file.
- "perceiver": "name 'AbstractPreprocessor' is not defined",
- "sam3_lite_text": "name 'Sam3LiteTextLayerScaledResidual' is not defined",
- # Category F: compile exceeds 60s budget on the runner.
- # First seen on transformers >=5,<6; each represents a slow
- # or recursive source-rewriter path the zoo can address.
- "beit": "TimeoutError: compile exceeds per-model budget",
- "sam": "TimeoutError: compile exceeds per-model budget",
- "sam_hq": "TimeoutError: compile exceeds per-model budget",
- }
-
-
- def _all_model_types():
- import pkgutil, transformers.models as tm
- return sorted(s.name for s in pkgutil.iter_modules(tm.__path__) if s.ispkg)
-
-
- def test_compile_every_transformers_model_type():
- """Run unsloth_compile_transformers across every model_type
- the matrix's transformers ships. Allowed outcomes:
- ok -> compile emitted a parseable, importable cache file
- skipped -> no `modeling_.py` file (expected for some
- umbrella packages like `auto`, `deprecated`)
- known -> in KNOWN_BROKEN_COMPILE; tracked for follow-up.
- Any uncaught failure fails the cell.
-
- Per-model SIGALRM cap so one infinite-looping model_type
- cannot wedge the whole sweep + nuke the job timeout
- (observed on transformers >=5,<6 -- 30+ min hang before
- this guard landed)."""
- import importlib as _il
- import signal
- ok = 0
- skipped = []
- known = []
- new_failures = []
- models = _all_model_types()
- def _on_timeout(signum, frame):
- raise TimeoutError("compile exceeded per-model budget")
- prev_handler = signal.signal(signal.SIGALRM, _on_timeout)
- try:
- for i, model_type in enumerate(models):
- if i % 25 == 0:
- print(f" sweep progress: {i}/{len(models)} -> {model_type}", flush=True)
- modeling_path = f"transformers.models.{model_type}.modeling_{model_type}"
- try:
- _il.import_module(modeling_path)
- except (ModuleNotFoundError, ImportError):
- skipped.append((model_type, "no modeling file"))
- continue
- signal.alarm(60)
- try:
- unsloth_compile_transformers(
- model_type=model_type, fast_lora_forwards=False,
- )
- except Exception as e:
- signal.alarm(0)
- msg = f"{type(e).__name__}: {str(e)[:200]}"
- if model_type in KNOWN_BROKEN_COMPILE:
- known.append((model_type, msg))
- else:
- new_failures.append((model_type, msg))
- continue
- signal.alarm(0)
- if model_type in KNOWN_BROKEN_COMPILE:
- # Came back green unexpectedly -- that's GOOD news,
- # the bug was fixed. Surface it so we can drop the
- # entry from KNOWN_BROKEN_COMPILE.
- print(
- f" UNEXPECTED-OK {model_type}: was in "
- "KNOWN_BROKEN_COMPILE, now compiles cleanly. "
- "Drop the entry."
- )
- ok += 1
- finally:
- signal.alarm(0)
- signal.signal(signal.SIGALRM, prev_handler)
- print(f"\nCompile sweep: ok={ok} skipped={len(skipped)} "
- f"known-broken={len(known)} new-failures={len(new_failures)}")
- for m, r in known:
- print(f" KNOWN {m}: {r}")
- for m, r in new_failures[:30]:
- print(f" NEW {m}: {r}")
- if len(new_failures) > 30:
- print(f" ...and {len(new_failures)-30} more new failures")
- assert not new_failures, (
- f"unsloth_compile_transformers introduced new failures on "
- f"{len(new_failures)} model_types not in the known-broken "
- f"list: {[m for m, _ in new_failures]}"
- )
- # Sanity floor: at least 200 model_types should compile cleanly
- # (we observed 362 ok / 383 total on transformers 4.57.6).
- assert ok >= 200, (
- f"only {ok} model_types compiled cleanly; expected >=200. "
- "Possible transformers-version-induced regression."
- )
-
-
- @pytest.mark.parametrize("model_type,rms_class", [
- ("llama", "LlamaRMSNorm"),
- ("qwen3", "Qwen3RMSNorm"),
- ("gemma3", "Gemma3RMSNorm"),
- ])
- def test_compile_real_modeling_module(model_type, rms_class):
- """Spot-check on the three production-relevant families that
- the compile_every sweep also covers; this case verifies the
- emitted cache file has the model-specific RMSNorm class
- attribute, not just that the file parses + imports.
-
- ``unsloth_compile_transformers`` is not idempotent in-
- process: calling it twice on the same modeling module
- after rewriting class attributes corrupts the inspect
- source/line cache and the second emitted file is malformed
- Python. The sweep above already produced a valid cache
- file for every non-KNOWN_BROKEN model_type, so just verify
- that artefact here. Trigger a compile only when running
- this test in isolation (no sweep preceded)."""
- import importlib as _il
- try:
- modeling = _il.import_module(
- f"transformers.models.{model_type}.modeling_{model_type}"
- )
- except ModuleNotFoundError:
- pytest.skip(
- f"transformers build lacks model_type={model_type}"
- )
- combined = _CACHE / f"unsloth_compiled_module_{model_type}.py"
- if not combined.exists():
- unsloth_compile_transformers(
- model_type=model_type, fast_lora_forwards=False,
- )
- modeling = _il.import_module(
- f"transformers.models.{model_type}.modeling_{model_type}"
- )
- assert getattr(modeling, "__UNSLOTH_PATCHED__", False) is True
- _verify_file(combined, must_expose=[rms_class])
-
-
- def test_compile_disable_writes_nothing():
- """Negative control: when UNSLOTH_COMPILE_DISABLE=1 the
- compile path must early-return without producing new files."""
- os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
- try:
- before = set(_CACHE.iterdir())
- # Pick a model_type that still resolves on this transformers.
- for mt in ("llama", "mistral", "qwen2"):
- try:
- import importlib as _il
- _il.import_module(
- f"transformers.models.{mt}.modeling_{mt}"
- )
- break
- except ModuleNotFoundError:
- continue
- else:
- pytest.skip("no probe model_type available")
- unsloth_compile_transformers(
- model_type=mt, fast_lora_forwards=False,
- )
- after = set(_CACHE.iterdir())
- assert after == before, (
- f"DISABLE=1 still wrote: {[p.name for p in after - before]}"
- )
- finally:
- os.environ.pop("UNSLOTH_COMPILE_DISABLE", None)
-
-
- def test_compile_sft_trainer_patch():
- """Round-trip TRL's SFTTrainer through the rl.py patch path
- and verify the generated UnslothSFTTrainer.py."""
- pytest.importorskip("trl")
- try:
- from unsloth.models.rl import _patch_trl_rl_trainers
- except ImportError:
- pytest.skip("unsloth.models.rl._patch_trl_rl_trainers absent")
- try:
- _patch_trl_rl_trainers("sft_trainer")
- except Exception as e:
- # TRL 1.x renames break the patch helper internally; we
- # accept that here and skip rather than fail the cell.
- pytest.skip(f"_patch_trl_rl_trainers raised: {type(e).__name__}: {e}")
- sft = _CACHE / "UnslothSFTTrainer.py"
- if not sft.exists():
- pytest.skip(
- "_patch_trl_rl_trainers ran but did not emit "
- "UnslothSFTTrainer.py on this TRL version."
- )
- _verify_file(sft, must_expose=["UnslothSFTTrainer"])
- PY
- python -m pytest -q --tb=short tests/_zoo_compiler_cache_shim.py
- rm -f tests/_zoo_compiler_cache_shim.py
-
- - name: TRL trainer + Config auto-discovery + dynamic patch coverage
- # Mirror unsloth/models/rl.py:patch_trl_rl_trainers AND verify the
- # dynamic per-version patch surface:
- # 1. AST-parse every *_trainer / *_config submodule.
- # 2. Apply the same *Trainer / *Config discovery rules
- # _patch_trl_rl_trainers uses (rl.py:553-620).
- # 3. Orphan check: every _trainer must have a sibling
- # _config OR an inline *Config.
- # 4. Dynamic count: enumerate every canonical trainer that
- # imports cleanly, run patch_trl_rl_trainers(), assert
- # every one ends up Unsloth-prefixed in-place. Floor matches
- # the cohort sizes from the version sweep:
- # TRL 0.22-0.23 -> 18 canonical trainers
- # TRL 0.24-0.28 -> 15 canonical trainers
- # TRL 0.29-1.x -> 6 canonical (rest are experimental
- # thin-wrappers; covered next)
- # 5. Experimental coverage (TRL 0.29+): walk trl.experimental.*,
- # find every *Trainer class, verify the umbrella patch
- # reaches them via the thin-wrapper MRO walk in
- # _patch_trl_rl_trainers (rl.py:677-702).
- # Per-cell wall-time ~30-60s.
- run: |
- set -euxo pipefail
- cat > tests/_trl_trainer_discovery_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- # Walks every *_trainer / *_config module in trl.trainer and
- # validates that unsloth's auto-discovery rules in
- # unsloth/models/rl.py:_patch_trl_rl_trainers (lines 542-620,
- # 1934-1949) still pick out exactly one *Trainer and one
- # *Config per module on the matrix's TRL version.
- import sys, pathlib, importlib, importlib.util, ast, inspect
-
- sys.path.insert(0, str(pathlib.Path(__file__).parent))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
-
- import pytest
- pytest.importorskip("trl")
- import trl # noqa: F401 (forces lazy-module init)
- import trl.trainer
-
-
- def _is_real_submodule(qual_name: str) -> bool:
- """True iff `qual_name` resolves to an importable submodule
- with a file on disk (i.e. has a non-None find_spec().origin).
-
- TRL re-exports utility FUNCTIONS into `trl.trainer.__init__`
- whose names happen to end with `_config` (e.g.
- `get_peft_config`, `get_quantization_config`). Without this
- filter the `endswith` check below picks them up as if they
- were submodules and the AST stage fails on `no spec`. The
- same trap exists for `_trainer` (none today, but defensive).
- """
- try:
- spec = importlib.util.find_spec(qual_name)
- except (ImportError, ValueError):
- return False
- return spec is not None and bool(getattr(spec, "origin", None))
-
-
- # Replicate rl.py:1939-1943 verbatim, then filter to actual
- # submodules so re-exported utility functions (e.g.
- # `get_peft_config`) do not pollute the AST sweep.
- def _trainer_files():
- return [
- x for x in dir(trl.trainer)
- if x.islower()
- and x.endswith("_trainer")
- and x != "base_trainer"
- and _is_real_submodule(f"trl.trainer.{x}")
- ]
-
-
- def _config_files():
- return [
- x for x in dir(trl.trainer)
- if x.islower()
- and x.endswith("_config")
- and _is_real_submodule(f"trl.trainer.{x}")
- ]
-
-
- def _ast_parse_module_via_spec(qual_name: str):
- """AST-parse a module's source on disk WITHOUT importing it.
- `trl.trainer` uses _LazyModule so `find_spec` resolves the
- file path without firing the module-level `__init__`. This
- dodges optional-dep ImportErrors (e.g. grpo_trainer's vllm
- import) and still surfaces real syntax drift in the file."""
- spec = importlib.util.find_spec(qual_name)
- if spec is None or not spec.origin:
- return None, "no spec"
- path = pathlib.Path(spec.origin)
- if not path.is_file():
- return None, f"spec.origin not a file: {path}"
- src = path.read_text(encoding="utf-8")
- ast.parse(src, filename=str(path))
- return path, None
-
-
- def test_every_trl_trainer_and_config_module_ast_parses():
- """Stage 1: pure file-on-disk AST parse. Catches a TRL
- source-level syntax issue on any matrix cell without
- triggering optional-dep imports."""
- fail = []
- ok = 0
- for name in _trainer_files() + _config_files():
- qual = f"trl.trainer.{name}"
- try:
- path, err = _ast_parse_module_via_spec(qual)
- if err:
- fail.append((qual, err))
- else:
- ok += 1
- except SyntaxError as e:
- fail.append((qual, f"SyntaxError: {e}"))
- except Exception as e:
- fail.append((qual, f"{type(e).__name__}: {e}"))
- print(f"AST-parsed {ok} TRL trainer+config modules; failed={len(fail)}")
- for q, e in fail:
- print(f" AST FAIL {q}: {e}")
- assert not fail, f"AST parse failed for {len(fail)} TRL modules"
-
-
- def _apply_unsloth_discovery_rules(mod, trainer_file):
- """Replicate the four endswith filters in
- rl.py:553-569 verbatim."""
- prefix = trainer_file.split("_")[0]
- names = [
- x for x in dir(mod)
- if x.endswith("Trainer") and x != "Trainer"
- and not x.startswith("_") and prefix in x.lower()
- ]
- configs = [
- x for x in dir(mod)
- if x.endswith("Config") and x != "Config"
- and not x.startswith("_") and prefix in x.lower()
- ]
- return names, configs
-
-
- def _resolve_config_via_fallbacks(trainer_file, name_list, mod):
- """Replicate rl.py:575-615: try the sibling *_config.py
- module, then the MRO walk fallback. Returns the resolved
- config-name list (length 0 or 1)."""
- # Fallback 1: _config.py module sibling.
- cfg_module_name = trainer_file.replace("_trainer", "_config")
- try:
- cfg_mod = getattr(trl.trainer, cfg_module_name)
- except Exception:
- cfg_mod = None
- if cfg_mod is not None:
- prefix = trainer_file.split("_")[0]
- hits = [
- x for x in dir(cfg_mod)
- if x.endswith("Config") and x != "Config"
- and not x.startswith("_") and prefix in x.lower()
- ]
- if len(hits) == 1:
- return hits
- # Fallback 2: MRO walk into experimental parent module.
- if len(name_list) != 1:
- return []
- try:
- trainer_cls = getattr(mod, name_list[0])
- except Exception:
- return []
- prefix = trainer_file.split("_")[0]
- for parent in trainer_cls.__mro__[1:]:
- if parent is object:
- continue
- parent_mod = inspect.getmodule(parent)
- if parent_mod is None:
- continue
- if parent_mod.__name__ == f"trl.trainer.{trainer_file}":
- continue
- hits = [
- x for x in dir(parent_mod)
- if x.endswith("Config") and x != "Config"
- and not x.startswith("_") and prefix in x.lower()
- ]
- if len(hits) == 1:
- return hits
- return []
-
-
- def test_unsloth_auto_discovery_finds_trainer_and_config_per_module():
- """Stage 2: drive the same unsloth rules over every trainer
- file. import-failures (optional deps) are recorded as
- `import-skipped`, mirroring rl.py:1944-1948 try/except."""
- ok = 0
- import_skipped = []
- discovery_skipped = []
- fail = []
- for trainer_file in _trainer_files():
- qual = f"trl.trainer.{trainer_file}"
- try:
- mod = getattr(trl.trainer, trainer_file)
- except Exception as e:
- import_skipped.append((qual, f"{type(e).__name__}: {e}"))
- continue
- trainers, configs = _apply_unsloth_discovery_rules(
- mod, trainer_file,
- )
- if len(trainers) != 1:
- discovery_skipped.append(
- (qual, f"trainers={trainers}")
- )
- continue
- if len(configs) != 1:
- configs = _resolve_config_via_fallbacks(
- trainer_file, trainers, mod,
- )
- if len(configs) != 1:
- fail.append(
- (qual,
- f"trainer={trainers[0]} but config not found "
- "(checked module, *_config sibling, and MRO)")
- )
- continue
- ok += 1
- print(f" OK {qual}: trainer={trainers[0]}, config={configs[0]}")
- print(
- f"\nDiscovery: ok={ok} import_skipped={len(import_skipped)} "
- f"discovery_skipped={len(discovery_skipped)} fail={len(fail)}"
- )
- for q, r in import_skipped:
- print(f" IMPORT-SKIP {q}: {r}")
- for q, r in discovery_skipped:
- print(f" DISC-SKIP {q}: {r}")
- for q, r in fail:
- print(f" FAIL {q}: {r}")
- # Hard contract: every TRAINER that imports cleanly AND has
- # exactly one *Trainer must also resolve exactly one *Config
- # via one of the three rules. import-skipped + discovery-
- # skipped (no/multiple *Trainer) are tolerated.
- assert not fail, (
- f"unsloth discovery rules failed for {len(fail)} trainers"
- )
- # Sanity: at least 3 trainers should fully discover on any
- # matrix cell (sft + reward + dpo are the historical core).
- assert ok >= 3, (
- f"only {ok} trainers fully discovered; expected >=3 "
- "(sft/reward/dpo). Possible TRL surface regression."
- )
-
-
- def test_orphan_trainer_modules_do_not_exist():
- """Stage 3: every _trainer module should have a sibling
- _config (TRL 0.26+ convention) OR an inline *Config. An
- ORPHAN _trainer with neither is a TRL refactor we want
- to know about: it would silently break unsloth's
- auto-discovery without raising."""
- orphans = []
- for trainer_file in _trainer_files():
- cfg_module_name = trainer_file.replace("_trainer", "_config")
- has_sibling_cfg = (
- importlib.util.find_spec(
- f"trl.trainer.{cfg_module_name}"
- ) is not None
- )
- if has_sibling_cfg:
- continue
- # No sibling -> require an inline *Config in the
- # trainer module itself (resolved via discovery rules).
- try:
- mod = getattr(trl.trainer, trainer_file)
- except Exception:
- # Optional-dep failure -> skip; the AST-parse stage
- # already covered the file.
- continue
- _, configs = _apply_unsloth_discovery_rules(
- mod, trainer_file,
- )
- if not configs:
- orphans.append(trainer_file)
- assert not orphans, (
- "Orphan TRL trainer modules with neither sibling "
- f"_config.py nor an inline *Config: {orphans}. "
- "unsloth auto-discovery would silently skip these."
- )
-
-
- # ---- Dynamic patch coverage: count + verify Unsloth-prefixed ----
-
- def _enumerate_canonical_trainer_classes():
- """Walk trl.trainer/*_trainer.py on disk (the source of
- truth for what `dir(trl.trainer)` should expose) and return
- [(trainer_file, TrainerClass), ...] for every entry that
- imports + has exactly-one resolvable *Trainer per the
- unsloth rules. Skips optional-dep ImportErrors."""
- out = []
- for trainer_file in _trainer_files():
- try:
- mod = getattr(trl.trainer, trainer_file)
- except Exception:
- continue
- trainers, _ = _apply_unsloth_discovery_rules(mod, trainer_file)
- if len(trainers) != 1:
- continue
- try:
- cls = getattr(mod, trainers[0])
- except Exception:
- continue
- out.append((trainer_file, cls))
- return out
-
-
- def _enumerate_experimental_trainer_packages():
- """TRL 0.29+ moved many trainers (bco, cpo, gkd, nash_md,
- online_dpo, orpo, ppo, prm, xpo, ...) to `trl.experimental.`,
- re-exposing them via thin-wrapper deprecation shims in
- `trl.trainer._trainer`. List every `trl.experimental.`
- that defines at least one *Trainer class, parsed by AST so we
- do NOT trigger the optional-dep imports on the package init."""
- spec = importlib.util.find_spec("trl.experimental")
- if spec is None or not spec.submodule_search_locations:
- return []
- import re as _re
- hits = []
- for root in spec.submodule_search_locations:
- rp = pathlib.Path(root)
- for sub in sorted(rp.iterdir()):
- if not sub.is_dir() or sub.name.startswith("_"):
- continue
- classes = []
- for py in sub.rglob("*.py"):
- try:
- src = py.read_text(encoding="utf-8")
- except Exception:
- continue
- for m in _re.finditer(
- r"^class\s+([A-Za-z0-9_]+Trainer)\b", src, _re.M,
- ):
- classes.append(m.group(1))
- if classes:
- hits.append((sub.name, sorted(set(classes))))
- return hits
-
-
- def _is_unsloth_patched(cls) -> bool:
- return getattr(cls, "__name__", "").startswith("Unsloth")
-
-
- def test_unsloth_patches_every_canonical_trainer_in_this_trl_version():
- """Verify the count + identity of canonically-patched trainers
- matches the trainer surface this TRL version actually ships.
-
- For TRL 0.22.x-0.23.x: ~18 canonical trainers expected.
- For TRL 0.24.x-0.28.x: ~15 canonical trainers expected.
- For TRL 0.29.x-1.x: 6 canonical (rest are experimental
- thin-wrappers; covered by the next test)."""
- from unsloth.models.rl import patch_trl_rl_trainers
- before = _enumerate_canonical_trainer_classes()
- before_count = len(before)
- before_unpatched = [
- (tf, cls.__name__) for tf, cls in before
- if not _is_unsloth_patched(cls)
- ]
- # Apply unsloth's umbrella patch.
- patch_trl_rl_trainers()
- # Re-enumerate (some classes may have been replaced in-module).
- after = _enumerate_canonical_trainer_classes()
- after_count = len(after)
- patched = [(tf, cls.__name__) for tf, cls in after
- if _is_unsloth_patched(cls)]
- unpatched = [(tf, cls.__name__) for tf, cls in after
- if not _is_unsloth_patched(cls)]
- print(
- f"\nCanonical trainer surface for TRL {trl.__version__}: "
- f"discoverable_before={before_count} "
- f"discoverable_after={after_count} "
- f"patched={len(patched)} unpatched={len(unpatched)}"
- )
- for tf, n in patched:
- print(f" PATCHED {tf}: {n}")
- for tf, n in unpatched:
- print(f" UNPATCHED {tf}: {n}")
- # Hard contract: every canonical trainer that imports
- # cleanly must end up Unsloth-prefixed after the umbrella
- # patch. If a trainer was discoverable BEFORE the patch but
- # is missing from `after`, that is a separate (rare) issue
- # we surface as failure.
- assert before_count == after_count, (
- f"trainer-class set changed across patching: "
- f"before={[n for _, n in before_unpatched]} "
- f"after={[n for _, n in unpatched]}"
- )
- assert not unpatched, (
- "unsloth.models.rl.patch_trl_rl_trainers did NOT patch: "
- + ", ".join(f"{tf}:{n}" for tf, n in unpatched)
- )
- # Floor matches the cohort sizes from the TRL version sweep:
- # 18 (0.22-0.23), 15 (0.24-0.28), 6 (0.29+ canonical only).
- assert len(patched) >= 6, (
- f"only {len(patched)} canonical trainers patched; "
- "expected >= 6 (the smallest production cohort)."
- )
-
-
- def test_unsloth_patches_experimental_trainers_via_thin_wrappers():
- """TRL 0.29+ ships canonical-`trl.trainer._trainer` modules
- for many trainers as deprecation thin-wrappers that forward
- to `trl.experimental.`. unsloth's
- `_patch_trl_rl_trainers` (rl.py:677-702) detects
- `trl.experimental` in the trainer source and resolves to
- the parent class -- so patching the canonical entry should
- also Unsloth-prefix the experimental class via in-module
- setattr.
-
- Verify by walking trl.experimental.* AST for every *Trainer
- class, then checking whether it (or any class with the same
- name in the experimental package) carries the Unsloth
- prefix after the umbrella patch."""
- from unsloth.models.rl import patch_trl_rl_trainers
- patch_trl_rl_trainers()
- experimental_pkgs = _enumerate_experimental_trainer_packages()
- if not experimental_pkgs:
- pytest.skip(
- f"TRL {trl.__version__} has no trl.experimental.* "
- "trainer surface (pre-0.29 cohort). The canonical "
- "test above already covers patching here."
- )
- found = []
- missing = []
- for pkg_name, class_names in experimental_pkgs:
- qual = f"trl.experimental.{pkg_name}"
- try:
- pkg_mod = importlib.import_module(qual)
- except Exception as e:
- # Optional-dep ImportError: experimental package
- # could not be loaded. Match unsloth's runtime
- # tolerance: this would also be silently skipped
- # by `_patch_trl_rl_trainers`. Record but do not
- # fail.
- print(
- f" IMPORT-SKIP {qual}: "
- f"{type(e).__name__}: {str(e)[:120]}"
- )
- continue
- for cls_name in class_names:
- cls = getattr(pkg_mod, cls_name, None)
- if cls is None:
- # Class is defined inside the package but not
- # re-exported on the package init. Walk
- # submodules to find it.
- import pkgutil as _pku
- for sub in _pku.walk_packages(
- pkg_mod.__path__, prefix=qual + "."
- ):
- try:
- sub_mod = importlib.import_module(sub.name)
- except Exception:
- continue
- cls = getattr(sub_mod, cls_name, None)
- if cls is not None:
- break
- if cls is None:
- missing.append((pkg_name, cls_name))
- continue
- if _is_unsloth_patched(cls):
- found.append((pkg_name, cls_name))
- print(f" PATCHED trl.experimental.{pkg_name}.{cls_name}")
- else:
- # Not Unsloth-prefixed: either unsloth chose
- # not to patch this surface (e.g. the canonical
- # thin-wrapper module did not exist) or the
- # patch silently failed. Record both
- # outcomes; the assertion below tolerates the
- # gap as informational, not failure -- the
- # canonical test enforces the hard contract.
- print(
- f" NOT-PATCHED trl.experimental.{pkg_name}."
- f"{cls_name} (no Unsloth-prefix on the "
- "experimental surface)"
- )
- total_experimental = sum(len(cs) for _, cs in experimental_pkgs)
- print(
- f"\nExperimental trainer surface (TRL {trl.__version__}): "
- f"{len(experimental_pkgs)} packages, "
- f"{total_experimental} *Trainer classes; "
- f"unsloth-patched={len(found)} class-missing={len(missing)}"
- )
- # Hard contract: a *Trainer class declared in a python
- # source file must be locatable in its package after import.
- # If we saw the class definition but cannot find the symbol
- # at runtime, the package's public surface drifted.
- assert not missing, (
- "experimental *Trainer classes declared in source but "
- f"not importable: {missing}"
- )
- PY
- python -m pytest -q --tb=short -s tests/_trl_trainer_discovery_shim.py
- rm -f tests/_trl_trainer_discovery_shim.py
-
- - name: MoE per-family coverage + GRPO patches + grouped_gemm AST
- # Catches the recurring class of bugs that PR #624 (gemma4 missing
- # extractor), PR #612 (gemma4 GRPO patch silently dropped), PR #607
- # (gate_up LoRA dropped from grad graph), PR #601 (qwen MoE shape
- # mismatch), unsloth#4934 (TRL disable_gradient_checkpointing
- # corrupts unsloth GC), and unsloth#3598 (gradient_accumulation
- # double-scale on accepts_loss_kwargs=False) targeted. Coverage:
- #
- # 1. Per-MoE-family side-effect contract: for every patch_*_moe
- # function in unsloth_zoo.temporary_patches, if its target
- # transformers class is importable on this matrix cell, the
- # patch must mark the class with `_unsloth_already_patched=True`
- # after running. This is exactly what unsloth_zoo's existing
- # test_moe_lora_extractor_coverage walks at the registration
- # level; here we tie each patch fn to its declared target so a
- # silent early-return (PR #612 style) surfaces as red rather
- # than a coverage skip.
- #
- # 2. PR #4934 (GRPO + TRL 1.0): patch_trl_disable_gradient_checkpointing
- # must rebind trl.models.utils.disable_gradient_checkpointing to
- # the unsloth no-op AND propagate the rebinding to every trl.*
- # module that imported the symbol by reference.
- #
- # 3. PR #3598 (gradient_accumulation): patch_gradient_accumulation_fix
- # must run cleanly on a synthetic Trainer whose training_step
- # signature carries `num_items_in_batch`. The original bug was
- # that `accepts_loss_kwargs=False` (Qwen3VL, Gemma3 in t-4.57)
- # caused double loss-scaling; here we verify the rewrite path
- # itself does not raise on a CPU-resolvable shape.
- #
- # 4. unsloth/kernels/moe/grouped_gemm AST smoke: the Triton kernels
- # are GPU-only at runtime, but a SyntaxError or stray
- # string-literal in the source still surfaces as a test-time
- # ImportError on every install. ast.parse the .py files without
- # executing.
- #
- # Wall-time per cell ~30-60s. Routed through pytest for the spoof
- # harness so unsloth_zoo.temporary_patches imports are clean.
- run: |
- set -euxo pipefail
- cat > tests/_moe_coverage_shim.py <<'PY'
- # Auto-generated by .github/workflows/consolidated-tests-ci.yml.
- import sys, pathlib, ast, importlib, importlib.util, contextlib, os
- sys.path.insert(0, str(pathlib.Path(__file__).parent))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
-
- import pytest
-
- # Map each MoE patch function to the transformers classes it is
- # contractually responsible for marking with _unsloth_already_patched
- # after a successful run. Sourced from
- # unsloth_zoo/temporary_patches/_moe.py:
- # - qwen3_moe.py:382-398 patches Qwen3MoeExperts (new path) or
- # Qwen3MoeSparseMoeBlock (old path).
- # - qwen3_5_moe.py + qwen3_next_moe.py + qwen3_vl_moe.py register
- # extractors on Qwen3_5MoeExperts / Qwen3NextExperts /
- # Qwen3VLMoeTextExperts respectively.
- # - gemma4_moe.py marks Gemma4TextExperts (current) or
- # Gemma4TextMoEBlock (legacy).
- # - glm4_moe.py marks Glm4MoeLiteNaiveMoe.
- # - deepseek_v3_moe.py marks DeepseekV3NaiveMoe.
- # - gpt_oss.py:patch_gpt_oss_moe_for_lora marks GptOssExperts.
- # Each cell skips a target if the transformers version lacks it
- # (legitimate version-skew); only patches with at least one
- # importable target are exercised.
- # Each entry = ((patch_module, patch_fn), targets, env_setup,
- # version_gate). env_setup runs before the patch fn (e.g. set
- # UNSLOTH_MODEL_NAME for gpt_oss). version_gate is a callable
- # returning True when the patch SHOULD run on this transformers;
- # if False, the test skips with a documented reason.
- def _v5_or_later():
- try:
- import transformers
- major = int(transformers.__version__.split(".")[0])
- return major >= 5
- except Exception:
- return False
-
- MOE_PATCHES = [
- {
- "module": "unsloth_zoo.temporary_patches.qwen3_moe",
- "fn": "patch_qwen3_moe",
- "targets": [
- ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeExperts"),
- ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeSparseMoeBlock"),
- ],
- "env": {},
- "gate": lambda: True,
- "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.qwen3_5_moe",
- "fn": "patch_qwen3_5_moe",
- "targets": [
- ("transformers.models.qwen3_5_moe.modeling_qwen3_5_moe", "Qwen3_5MoeExperts"),
- ],
- "env": {}, "gate": lambda: True, "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.qwen3_next_moe",
- "fn": "patch_qwen3_next_moe",
- "targets": [
- ("transformers.models.qwen3_next.modeling_qwen3_next", "Qwen3NextExperts"),
- ],
- "env": {}, "gate": lambda: True, "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.qwen3_vl_moe",
- "fn": "patch_qwen3_vl_moe",
- "targets": [
- ("transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe", "Qwen3VLMoeTextExperts"),
- ],
- "env": {}, "gate": lambda: True, "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.gemma4_moe",
- "fn": "patch_gemma4_moe",
- "targets": [
- ("transformers.models.gemma4.modeling_gemma4", "Gemma4TextExperts"),
- ],
- "env": {}, "gate": lambda: True, "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.glm4_moe",
- "fn": "patch_glm4_moe",
- "targets": [
- ("transformers.models.glm4_moe.modeling_glm4_moe", "Glm4MoeLiteNaiveMoe"),
- ],
- "env": {}, "gate": lambda: True, "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.deepseek_v3_moe",
- "fn": "patch_deepseek_v3_moe",
- "targets": [
- ("transformers.models.deepseek_v3.modeling_deepseek_v3", "DeepseekV3NaiveMoe"),
- ],
- "env": {}, "gate": lambda: True, "gate_reason": "",
- },
- {
- "module": "unsloth_zoo.temporary_patches.gpt_oss",
- "fn": "patch_gpt_oss_moe_for_lora",
- "targets": [
- ("transformers.models.gpt_oss.modeling_gpt_oss", "GptOssExperts"),
- ],
- # The patch reads UNSLOTH_MODEL_NAME and only runs when
- # "gpt_oss" is in the normalized form. Set it explicitly
- # so the gate at gpt_oss.py:1387 passes; otherwise the
- # patch silently early-returns and the test would
- # spuriously fail.
- "env": {"UNSLOTH_MODEL_NAME": "gpt_oss"},
- # Additionally only runs on transformers >= 5
- # (gpt_oss.py:1392 `_is_transformers_v5()` gate).
- "gate": _v5_or_later,
- "gate_reason": (
- "patch_gpt_oss_moe_for_lora gates on "
- "transformers >= 5 (split-LoRA grouped_mm path)"
- ),
- },
- ]
-
-
- def _resolve_target_classes(targets):
- """Return [(qual, cls), ...] for every importable target."""
- out = []
- for mod_path, cls_name in targets:
- try:
- mod = importlib.import_module(mod_path)
- except Exception:
- continue
- cls = getattr(mod, cls_name, None)
- if cls is None:
- continue
- out.append((f"{mod_path}.{cls_name}", cls))
- return out
-
-
- @pytest.mark.parametrize(
- "spec",
- MOE_PATCHES,
- ids=lambda s: s["fn"],
- )
- def test_moe_patch_marks_its_target_when_class_present(spec, monkeypatch):
- """If at least one target class is importable AND the
- version gate passes, run the patch fn and assert at least
- one target is marked patched afterwards. Skips when the
- transformers version lacks every target or when the
- version gate blocks the patch (legitimate). Fails on
- silent patch-fn early-returns (PR #612 class of bug)."""
- targets = spec["targets"]
- patch_module = spec["module"]
- patch_name = spec["fn"]
- importable = _resolve_target_classes(targets)
- if not importable:
- pytest.skip(
- f"{patch_name}: no target class importable on this "
- f"transformers (looked for {[c for _, c in targets]})."
- )
- if not spec["gate"]():
- pytest.skip(
- f"{patch_name}: version gate blocks this cell. "
- f"Reason: {spec['gate_reason']}"
- )
- for k, v in spec["env"].items():
- monkeypatch.setenv(k, v)
- try:
- pmod = importlib.import_module(patch_module)
- except Exception as e:
- pytest.skip(
- f"{patch_module} import failed (likely optional dep): "
- f"{type(e).__name__}: {e}"
- )
- fn = getattr(pmod, patch_name, None)
- if fn is None or not callable(fn):
- pytest.skip(f"{patch_module} has no callable {patch_name}")
- try:
- fn()
- except Exception as e:
- raise AssertionError(
- f"{patch_name}() raised on a transformers that "
- f"DOES ship at least one target class ({importable}). "
- f"This is the silent-failure mode PR #612 fixed: "
- f"{type(e).__name__}: {e}"
- )
- # At least one importable target must now carry SOME marker
- # showing unsloth touched it. Accepted signals (each is set
- # by a different patch flow in unsloth_zoo):
- # - `_unsloth_already_patched=True` (gemma4, deepseek_v3, glm4)
- # - `_unsloth_lora_patched=True` (gpt_oss_moe_for_lora)
- # - `_unsloth_lora_extractor_fn` is callable (qwen3_*, glm4_moe)
- # - `_original___forward` attr
- # (set by patch_function: qwen3_moe SparseMoeBlock, etc.)
- # - `_original_forward` attribute (gpt_oss in-place patch)
- # Accept any one as "patched".
- def _is_patched(cls) -> bool:
- if getattr(cls, "_unsloth_already_patched", False) is True:
- return True
- if getattr(cls, "_unsloth_lora_patched", False) is True:
- return True
- if callable(getattr(cls, "_unsloth_lora_extractor_fn", None)):
- return True
- if "_original_forward" in dir(cls):
- return True
- cls_name = cls.__name__
- for attr in dir(cls):
- if attr.startswith("_original_") and attr.endswith(
- f"_{cls_name}_forward"
- ):
- return True
- return False
-
- after = _resolve_target_classes(targets)
- marked = [qual for qual, cls in after if _is_patched(cls)]
- if not marked:
- raise AssertionError(
- f"{patch_name}() ran without exception but no target "
- f"in {importable} carries any of the unsloth markers "
- "(_unsloth_already_patched / _unsloth_lora_patched / "
- "_unsloth_lora_extractor_fn / _original_*_forward). "
- "Patch silently no-op'd (PR #612 class of bug)."
- )
- print(f" {patch_name}: marked {marked}")
-
-
- # ---- PR #4934 (TRL 1.0+ GRPO disable_gradient_checkpointing) ----
-
- def test_patch_trl_disable_gradient_checkpointing():
- """unsloth/models/rl.py:patch_trl_disable_gradient_checkpointing
- must rebind trl.models.utils.disable_gradient_checkpointing to
- the unsloth no-op when TRL >= 1.0. Pre-1.0 TRL has no such
- symbol -> the patch returns early."""
- try:
- import trl.models.utils as _tmu
- except ImportError:
- pytest.skip("trl not installed")
- had_symbol = hasattr(_tmu, "disable_gradient_checkpointing")
- try:
- from unsloth.models.rl import patch_trl_disable_gradient_checkpointing
- except ImportError:
- pytest.skip(
- "unsloth.models.rl.patch_trl_disable_gradient_checkpointing "
- "absent (older unsloth than #4934)"
- )
- patch_trl_disable_gradient_checkpointing()
- if not had_symbol:
- # Pre-1.0 TRL: patch is a no-op early-return. Verify
- # nothing broke.
- pytest.skip(
- "TRL pre-1.0 has no disable_gradient_checkpointing; "
- "patch correctly early-returned."
- )
- fn = getattr(_tmu, "disable_gradient_checkpointing", None)
- assert fn is not None, (
- "trl.models.utils.disable_gradient_checkpointing missing "
- "after patch -- patch removed the symbol entirely?"
- )
- assert getattr(fn, "_unsloth_noop_patched", False) is True, (
- "trl.models.utils.disable_gradient_checkpointing was NOT "
- "rebound to the unsloth no-op. PR #4934 regression."
- )
- # PR #4934 also walks sys.modules to rebind trl.* modules
- # that imported the symbol by reference. Verify at least the
- # canonical trainer modules picked up the rebinding when
- # they re-export it.
- import sys
- checked = 0
- missed = []
- for mod_name, mod in list(sys.modules.items()):
- if not mod_name.startswith("trl."):
- continue
- bound = getattr(mod, "disable_gradient_checkpointing", None)
- if bound is None:
- continue
- checked += 1
- if not getattr(bound, "_unsloth_noop_patched", False):
- missed.append(mod_name)
- print(f" rebound disable_gradient_checkpointing in {checked} trl.* modules")
- assert not missed, (
- "trl.* modules that imported disable_gradient_checkpointing "
- f"by reference but did not get rebound: {missed}"
- )
-
-
- # ---- PR #3598 (gradient_accumulation loss-scaling rewrite) ----
-
- def test_patch_gradient_accumulation_fix_runs_on_synthetic_trainer():
- """patch_gradient_accumulation_fix rewrites a Trainer's
- `training_step` source via inspect+exec when the signature
- carries `num_items_in_batch`. PR #3598 fixed the rewrite
- path to not double-scale for trainers with
- `accepts_loss_kwargs=False`. Verify the patch fn runs
- without raising on a synthetic Trainer carrying that
- signature."""
- try:
- from unsloth.models._utils import patch_gradient_accumulation_fix
- except ImportError:
- pytest.skip(
- "unsloth.models._utils.patch_gradient_accumulation_fix absent"
- )
- try:
- from transformers import Trainer
- except ImportError:
- pytest.skip("transformers.Trainer absent")
- # The patch reads the live Trainer.training_step source. We
- # exercise the standard transformers.Trainer here -- if the
- # bug is reintroduced in the source rewriter (e.g. broken
- # exec, missing import injection), the patch fn raises.
- try:
- patch_gradient_accumulation_fix(Trainer)
- except Exception as e:
- raise AssertionError(
- "patch_gradient_accumulation_fix raised on a vanilla "
- f"transformers.Trainer: {type(e).__name__}: {e}"
- )
- # Idempotency: second call must not raise either (the rewrite
- # adds `_unsloth_training_step` marker so the second call
- # short-circuits per _utils.py:1692-1693).
- patch_gradient_accumulation_fix(Trainer)
-
-
- # ---- unsloth/kernels/moe/grouped_gemm AST smoke ----
-
- def _walk_py_files(root: pathlib.Path):
- for p in root.rglob("*.py"):
- if "__pycache__" in p.parts:
- continue
- yield p
-
-
- def test_unsloth_kernels_moe_grouped_gemm_ast_parses():
- """unsloth/kernels/moe/grouped_gemm hosts the Triton MoE
- kernels (GPU-only at runtime). A SyntaxError or stray token
- at the SOURCE level still surfaces as ImportError on every
- install, so AST-parse the .py files without executing."""
- # Locate `unsloth/kernels/moe/grouped_gemm` via the installed
- # `unsloth` package.
- import unsloth as _unsloth
- kernel_root = (
- pathlib.Path(_unsloth.__file__).parent
- / "kernels" / "moe" / "grouped_gemm"
- )
- if not kernel_root.exists():
- pytest.skip(
- f"{kernel_root} not present in this unsloth checkout."
- )
- fail = []
- ok = 0
- for p in _walk_py_files(kernel_root):
- try:
- ast.parse(p.read_text(encoding="utf-8"), filename=str(p))
- ok += 1
- except SyntaxError as e:
- fail.append((str(p), f"SyntaxError: {e}"))
- except Exception as e:
- fail.append((str(p), f"{type(e).__name__}: {e}"))
- print(f"AST-parsed {ok} grouped_gemm files; failed={len(fail)}")
- for path, err in fail:
- print(f" AST FAIL {path}: {err}")
- assert not fail, (
- f"AST parse failed for {len(fail)} grouped_gemm files"
- )
- # Sanity: the directory MUST contain at least the interface
- # + kernels + reference subtrees as documented.
- expected = [
- "interface.py",
- "kernels/forward.py",
- "kernels/backward.py",
- "reference/moe_block.py",
- "reference/moe_ops.py",
- ]
- missing = [e for e in expected if not (kernel_root / e).is_file()]
- assert not missing, (
- "grouped_gemm directory layout regressed; missing: "
- f"{missing}"
- )
- PY
- python -m pytest -q --tb=short -s tests/_moe_coverage_shim.py
- rm -f tests/_moe_coverage_shim.py
-
- - name: Summary
- if: always()
- run: |
- echo "::group::Versions"
- python -c "import sys, platform; print(sys.version); print(platform.platform())"
- python -c "import torch; print('torch', torch.__version__, 'cuda?', torch.cuda.is_available())"
- python -c "import transformers; print('transformers', transformers.__version__)"
- # `pip show` instead of `import unsloth_zoo` — its __init__ raises
- # without an accelerator and the spoof harness only kicks in under
- # pytest. Cheap and accurate.
- pip show unsloth_zoo
- echo "::endgroup::"
- echo "Consolidated job done. Coverage:"
- echo " - 16 unsloth Bucket-A tests under tests/saving/ + tests/utils/"
- echo " - unsloth_zoo @ ${UNSLOTH_ZOO_REF} pytest tests/ (5 GPU cases deselected)"
- echo " - unsloth_zoo.compiler.test_apply_fused_lm_head"
-
- llama-cpp-smoke:
- # Standalone llama.cpp build + smoke. Earlier this lived inside every
- # consolidated matrix cell and re-cmake'd llama.cpp ~5 min per cell --
- # 3 cells x 275 s = ~14 min of duplicated CPU on every PR for an
- # artefact that has nothing to do with the (transformers, TRL) combo.
- # `install_llama_cpp` clones ggml-org/llama.cpp at a pinned commit and
- # builds the LLAMA_CPP_TARGETS list; the result is independent of the
- # HF stack version. Run once, gate the PR.
- name: llama.cpp build + smoke
- runs-on: ubuntu-latest
- timeout-minutes: 25
- env:
- UNSLOTH_ZOO_REF: ${{ inputs.unsloth_zoo_ref || 'main' }}
- # Same env contract the matrix cells use: protobuf python parser
- # (transformers' bundled *_pb2.py needs it), studio on PYTHONPATH,
- # compile-disable + UNSLOTH_IS_PRESENT so unsloth_zoo's __init__
- # bootstrap accepts a pure-import.
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
- PYTHONPATH: ${{ github.workspace }}/studio
- UNSLOTH_COMPILE_DISABLE: '1'
- UNSLOTH_IS_PRESENT: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Install runtime deps for unsloth_zoo.llama_cpp
- # unsloth_zoo's `__init__` imports `temporary_patches`, which
- # in turn pulls per-architecture submodules (gemma3n, gemma4,
- # qwen3_*_moe, glm4_moe, deepseek_v3_moe, pixtral, ministral,
- # mxfp4, bitsandbytes, flex_attention_bwd) -- many of those
- # transitively touch transformers and peft / accelerate. Mirror
- # the matrix job's install minus the heavy bits that have no
- # bearing on `install_llama_cpp` itself: studio.txt's FastAPI
- # stack, bitsandbytes (CUDA-only build dependency), triton,
- # mammoth/unpdf (PDF tools), datasets, sqlalchemy/cryptography,
- # pytest (we run no tests). The remaining pin shape matches
- # studio-backend-ci.yml's "Repo tests (CPU)" baseline.
- run: |
- set -euxo pipefail
- python -m pip install --upgrade pip
- # Match the matrix job's torch path so unsloth_zoo's
- # `import torch` resolves to the same CPU build.
- pip install --index-url https://download.pytorch.org/whl/cpu \
- 'torch>=2.4,<2.11' 'torchvision<0.26'
- pip install \
- 'numpy<3' protobuf sentencepiece \
- requests tqdm psutil packaging safetensors \
- 'peft>=0.18,<0.20' 'accelerate>=0.34,<2'
- # transformers + trl come from pyproject.toml's pinned line
- # so this job stays in sync with whatever the consolidated
- # `__from_pyproject__` matrix cell is using.
- pip install transformers trl
- pip install -e . --no-deps
-
- - name: Clone unsloth_zoo @ ${{ env.UNSLOTH_ZOO_REF }}
- # Same shallow clone as the matrix job; we install editable so
- # `unsloth_zoo.llama_cpp` resolves to the cloned tree (and any
- # main-branch fixes flow into the smoke without a release).
- run: |
- set -euxo pipefail
- # github.com occasionally 500s on the git fetch; retry so a
- # single upstream blip does not fail CI.
- for attempt in 1 2 3; do
- rm -rf "$RUNNER_TEMP/unsloth-zoo"
- if git clone --depth=1 --branch="$UNSLOTH_ZOO_REF" \
- https://github.com/unslothai/unsloth-zoo \
- "$RUNNER_TEMP/unsloth-zoo"; then
- break
- fi
- if [ "$attempt" -eq 3 ]; then
- echo "::error::git clone unsloth-zoo failed after 3 attempts"
- exit 1
- fi
- delay=$((5 * attempt))
- echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..."
- sleep "$delay"
- done
- pip install -e "$RUNNER_TEMP/unsloth-zoo" --no-deps
- pip show unsloth_zoo
-
- - name: llama.cpp install via unsloth_zoo.llama_cpp + `llama-cli --help` smoke
- # Exercise the canonical `unsloth_zoo.llama_cpp.install_llama_cpp`
- # flow that GGUF export uses at runtime: clone ggml-org/llama.cpp
- # into ~/.unsloth/llama.cpp, build the LLAMA_CPP_TARGETS list
- # (llama-quantize, llama-cli, llama-mtmd-cli, llama-gguf-split,
- # llama-server) via cmake, then run `llama-cli --help`.
- #
- # This replaces the previous "download upstream prebuilt zip"
- # approach, which silently exited 0 with the message
- # "no ubuntu-x64 prebuilt asset" when ggml-org's release-asset
- # naming drifted (the regex `bin-ubuntu-x64.*\.zip$` no longer
- # matched their current asset names). The build path is the same
- # one Unsloth users hit in production via `model.save_pretrained_gguf`.
- #
- # Wall-time budget: ~3-5 min cold, dominated by cmake build of
- # 5 targets on the runner's 4 cores. Apt-package install is
- # handled by `install_llama_cpp` itself via its
- # `check_build_requirements` -> `install_package` chain.
- run: |
- set -euxo pipefail
- # libssl-dev / libcurl4-openssl-dev are needed by llama.cpp's
- # cmake build for HTTPS support; install up-front so the
- # `install_llama_cpp` requirement-check is a no-op.
- sudo apt-get update -qq
- sudo apt-get install -y -qq build-essential cmake git curl \
- libgomp1 libssl-dev libcurl4-openssl-dev
- python <<'PY'
- import os, shutil, subprocess, sys, pathlib
- # Apply the same CPU spoof the pytest shims use BEFORE any
- # unsloth_zoo import: unsloth_zoo/__init__.py calls
- # device_type.get_device_type() at module load and raises
- # `NotImplementedError: Unsloth cannot find any torch
- # accelerator` on a GPU-less runner. The spoof flips
- # torch.cuda.is_available() to True so the device probe takes
- # the cuda branch; we never actually run CUDA tensor ops in
- # this step (just clone+cmake+--help on the binaries).
- sys.path.insert(0, str(pathlib.Path("tests").resolve()))
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
- from unsloth_zoo.llama_cpp import (
- install_llama_cpp,
- LLAMA_CPP_DEFAULT_DIR,
- LLAMA_CPP_TARGETS,
- )
- print(f"Unsloth llama.cpp default dir: {LLAMA_CPP_DEFAULT_DIR}")
- print(f"Build targets: {LLAMA_CPP_TARGETS}")
- # install_llama_cpp returns (quantizer_path, converter_script_path).
- # The quantizer's directory is the `llama.cpp` install root, which
- # also holds llama-cli after build/bin/llama-* gets copied up
- # (llama_cpp.py:867-871).
- quantizer, converter = install_llama_cpp(print_output=True)
- assert quantizer and os.path.exists(quantizer), (
- f"install_llama_cpp returned quantizer={quantizer!r} but file missing"
- )
- assert converter and os.path.isfile(converter), (
- f"install_llama_cpp returned converter={converter!r} but missing"
- )
- install_root = os.path.dirname(quantizer)
- cli = os.path.join(install_root, "llama-cli")
- assert os.path.exists(cli), (
- f"llama-cli not found at {cli!r} after build. Build root contents: "
- f"{sorted(p for p in os.listdir(install_root) if p.startswith('llama-'))[:20]}"
- )
- assert os.access(cli, os.X_OK), f"{cli!r} not executable"
- # `llama-cli --help` exits non-zero on some builds; the contract
- # is that recognizable help text appears on stdout/stderr.
- proc = subprocess.run(
- [cli, "--help"], capture_output=True, text=True, timeout=30,
- )
- combined = (proc.stdout or "") + (proc.stderr or "")
- print("--- llama-cli --help (first 30 lines) ---")
- print("\n".join(combined.splitlines()[:30]))
- assert any(
- tok in combined.lower()
- for tok in ("usage", "--help", "--model", "-m,")
- ), (
- f"llama-cli --help produced no recognizable help text. "
- f"exit={proc.returncode}\nstdout: {proc.stdout[:400]!r}\n"
- f"stderr: {proc.stderr[:400]!r}"
- )
- # Also exercise the quantizer the way GGUF export does: --help
- # round-trip on the binary that does the actual heavy lifting.
- q = subprocess.run(
- [quantizer, "--help"], capture_output=True, text=True, timeout=15,
- )
- q_combined = (q.stdout or "") + (q.stderr or "")
- assert "usage" in q_combined.lower() or "type" in q_combined.lower(), (
- f"llama-quantize --help produced no help text. "
- f"exit={q.returncode}\nstdout: {q.stdout[:400]!r}\n"
- f"stderr: {q.stderr[:400]!r}"
- )
- print(
- f"\nOK: install_llama_cpp produced a working llama-cli at {cli} "
- f"and llama-quantize at {quantizer}."
- )
- PY
diff --git a/.github/workflows/lint-ci.yml b/.github/workflows/lint-ci.yml
deleted file mode 100644
index 00e6e357e2..0000000000
--- a/.github/workflows/lint-ci.yml
+++ /dev/null
@@ -1,321 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Whole-repo, multi-language source-lint gate. Runs on every PR
-# (no path filter) because each step is sub-second to a few seconds
-# and together they catch a class of breakage the focused build
-# workflows would miss:
-#
-# - Python syntax + ruff + leftover debugger calls (across 350+
-# committed .py files, not just studio/backend).
-# - Shell `bash -n` parse for every committed *.sh.
-# - `yaml.safe_load` and `json.loads` round-trip for every
-# committed YAML / JSON config.
-#
-# TypeScript and Rust are NOT duplicated here on purpose:
-# - Studio Frontend CI runs `npm run typecheck` (= `tsc --noEmit`)
-# and `npm run build` (vite/swc) on every studio/frontend/**
-# change, which is a full TS AST + type check.
-# - Studio Tauri CI runs `tauri build --debug --no-bundle` on
-# every studio/src-tauri/** or studio/frontend/** change, which
-# compiles the Rust crate (= cargo check + cargo build).
-# Each is a stricter check than a parse-only step would be, so a
-# fast-fail duplicate here would only burn cache; the dedicated
-# workflows already block merges on Rust / TS regressions.
-
-name: Lint CI
-
-on:
- pull_request:
- push:
- branches: [main, pip]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- source-lint:
- name: Source lint (Python + shell + YAML + JSON + safety nets)
- runs-on: ubuntu-latest
- timeout-minutes: 5
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- # Pin ruff to match .pre-commit-config.yaml so a CI-only ruff
- # bump cannot disagree with what pre-commit accepted.
- # codespell is pinned for the same reason: a reviewer should
- # never see a typo report appear and disappear depending on
- # which codespell version the runner happened to install.
- - run: pip install 'ruff==0.15.12' 'pyyaml>=6' 'codespell>=2.3,<3'
-
- - name: Linux deps for shellcheck
- run: sudo apt-get update -qq && sudo apt-get install -y --no-install-recommends shellcheck
-
- - name: Python AST/syntax check (every committed .py must compile)
- # python -m compileall uses the same parser the interpreter
- # uses, so anything broken here would also crash at
- # `import X` on a user's machine. Sub-second across 350+
- # files. Hard gate.
- run: |
- python -m compileall -q -j 0 \
- unsloth unsloth_cli studio tests cli.py unsloth-cli.py
-
- - name: Python ruff check (whole repo)
- # The narrow rule set in pyproject.toml [tool.ruff.lint]
- # selects E9 / F63 / F7 / F82 -- syntax errors, broken
- # comparisons, undefined names. The whole repo passes today,
- # so this is a hard gate.
- run: |
- ruff check unsloth unsloth_cli studio tests cli.py unsloth-cli.py
-
- - name: No leftover debugger / pdb / breakpoint calls
- # Catches the "I'll just stick a breakpoint() here" mistake
- # before it ships. AST-based so commented-out debugger
- # markers don't false-positive (a bare grep would; there
- # are three commented `# breakpoint()` markers in
- # unsloth/models/rl* today). Sub-second.
- run: |
- python <<'PY'
- import ast, pathlib, sys
-
- SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
- "unsloth_compiled_cache", "node_modules",
- "unsloth.egg-info"}
-
- bad = []
- scanned = 0
- for path in sorted(pathlib.Path(".").rglob("*.py")):
- if any(part in SKIP_PARTS for part in path.parts):
- continue
- scanned += 1
- try:
- tree = ast.parse(path.read_text(encoding="utf-8", errors="replace"))
- except SyntaxError:
- continue # compileall step above already failed this
- for node in ast.walk(tree):
- if not isinstance(node, ast.Call):
- continue
- fn = node.func
- if isinstance(fn, ast.Name) and fn.id == "breakpoint":
- bad.append((path, node.lineno, "breakpoint()"))
- elif (isinstance(fn, ast.Attribute) and fn.attr == "set_trace"
- and isinstance(fn.value, ast.Name)
- and fn.value.id in {"pdb", "ipdb"}):
- bad.append((path, node.lineno, f"{fn.value.id}.set_trace()"))
-
- if bad:
- for path, lineno, what in bad:
- print(f"::error file={path},line={lineno}::leftover {what} -- remove before merging")
- sys.exit(1)
- print(f"no leftover debugger calls (scanned {scanned} files)")
- PY
-
- - name: License-header drift (informational; whole repo)
- # Three header families are accepted across the repo:
- # 1. SPDX one-liner: `# SPDX-License-Identifier: ...`
- # Used across studio/ (AGPL-3.0-only) and a few new
- # files elsewhere.
- # 2. Apache-2.0 long form, marker phrase
- # "Licensed under the Apache License". Used across
- # unsloth/ and unsloth_cli/.
- # 3. GNU long form, marker phrase "General Public License".
- # That single substring covers GPL, LGPL ("GNU Lesser
- # General Public License") and AGPL ("GNU Affero
- # General Public License") preambles, all three of
- # which appear in unsloth/kernels/* (LGPL/AGPL) without
- # the SPDX line.
- # Empty files (mainly empty __init__.py) are skipped.
- # Surfaced as a warning; cleaning up the actual misses is a
- # follow-up PR, not a CI fix.
- continue-on-error: true
- run: |
- python <<'PY'
- import pathlib
-
- ACCEPTED = (
- "SPDX-License-Identifier", # any SPDX line
- "Licensed under the Apache License", # Apache-2.0 long form
- "General Public License", # GPL / LGPL / AGPL long form
- )
- SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
- "unsloth_compiled_cache", "node_modules",
- "unsloth.egg-info"}
-
- studio_missing = []
- other_missing = []
- for path in sorted(pathlib.Path(".").rglob("*.py")):
- if any(part in SKIP_PARTS for part in path.parts):
- continue
- text = path.read_text(encoding="utf-8", errors="replace")
- if not text.strip():
- continue # empty __init__.py etc.
- head = "\n".join(text.splitlines()[:25])
- if any(marker in head for marker in ACCEPTED):
- continue
- if "studio" in path.parts:
- studio_missing.append(path)
- else:
- other_missing.append(path)
-
- total = len(studio_missing) + len(other_missing)
- if total == 0:
- print("every committed .py has a recognised license header")
- else:
- print(f"::warning::{total} Python files have no recognised license "
- f"header (SPDX / Apache-2.0 / GNU long form): "
- f"studio={len(studio_missing)}, other={len(other_missing)}")
- for path in (studio_missing + other_missing)[:30]:
- print(f" {path}")
- if total > 30:
- print(f" ... and {total - 30} more")
- PY
-
- - name: Shell scripts parse cleanly (`bash -n`)
- # Same idea as Python's compileall: parse-only check that
- # every committed *.sh would not blow up at `bash script.sh`
- # invocation time on a release box. tests/sh/ is the largest
- # cluster (the install.sh shape tests).
- run: |
- shopt -s globstar
- fail=0
- for f in $(git ls-files '*.sh'); do
- if ! bash -n "$f"; then
- echo "::error file=$f::shell parse error"
- fail=1
- fi
- done
- if [ "$fail" -ne 0 ]; then
- exit 1
- fi
- n=$(git ls-files '*.sh' | wc -l)
- echo "$n shell scripts parse cleanly"
-
- - name: YAML files parse cleanly (yaml.safe_load)
- # Catches truncated workflow files, broken indents in
- # dependabot.yml / pre-commit configs, etc. Includes
- # .github/workflows/*.yml so a typo in the file we just
- # added shows up immediately.
- run: |
- python <<'PY'
- import pathlib, sys, yaml
-
- SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
- "node_modules", "unsloth_compiled_cache",
- "unsloth.egg-info"}
-
- bad = []
- scanned = 0
- for path in sorted(list(pathlib.Path(".").rglob("*.yml"))
- + list(pathlib.Path(".").rglob("*.yaml"))):
- if any(part in SKIP_PARTS for part in path.parts):
- continue
- scanned += 1
- try:
- with path.open("r", encoding="utf-8") as fh:
- list(yaml.safe_load_all(fh))
- except Exception as exc:
- bad.append((path, exc))
-
- if bad:
- for path, exc in bad:
- print(f"::error file={path}::YAML parse failed: {exc}")
- sys.exit(1)
- print(f"{scanned} YAML files parse cleanly")
- PY
-
- - name: JSON files parse cleanly (json.loads)
- # Catches malformed package.json, biome.json, etc. Skips:
- # - huge npm/bun lockfiles (machine-generated, slow to
- # parse, no value).
- # - tsconfig*.json: TypeScript convention is JSONC (JSON
- # with `/* ... */` comments), which standard json.loads
- # rejects. Strip-and-validate would need json5 or a
- # hand-rolled comment scrubber for marginal value, since
- # `tsc --noEmit` already validates these in Frontend CI.
- run: |
- python <<'PY'
- import fnmatch, json, pathlib, sys
-
- SKIP_PARTS = {".venv", "venv", "build", "dist", ".git",
- "node_modules", "unsloth_compiled_cache",
- "unsloth.egg-info"}
- SKIP_NAMES = {"package-lock.json", "bun.lock"}
- SKIP_PATTERNS = ("tsconfig*.json",)
-
- bad = []
- scanned = 0
- for path in sorted(pathlib.Path(".").rglob("*.json")):
- if any(part in SKIP_PARTS for part in path.parts):
- continue
- if path.name in SKIP_NAMES:
- continue
- if any(fnmatch.fnmatch(path.name, pat) for pat in SKIP_PATTERNS):
- continue
- scanned += 1
- try:
- json.loads(path.read_text(encoding="utf-8"))
- except Exception as exc:
- bad.append((path, exc))
-
- if bad:
- for path, exc in bad:
- print(f"::error file={path}::JSON parse failed: {exc}")
- sys.exit(1)
- print(f"{scanned} JSON files parse cleanly")
- PY
-
- - name: codespell typo check (informational)
- # Catches typos in code, comments, and docs across the repo.
- # Skips lockfiles, generated assets, binary artefacts, and
- # the LICENSE files (US/UK spelling drift in legal text is
- # not ours to second-guess). The ignore-words-list pulls
- # out short identifiers + valid technical terms that
- # codespell's default dictionary would otherwise flag
- # (e.g. `ans` as a math-quiz variable name in
- # tests/utils/aime_eval.py, `parm`/`parms` in PyTorch
- # nn.Module idioms). Non-blocking until the surfaced typos
- # are fixed; drop continue-on-error after the cleanup.
- continue-on-error: true
- run: |
- codespell \
- --skip='*.lock,*.lockb,*.json,*.svg,*.png,*.jpg,*.jpeg,*.gif,*.ico,*.woff*,*.ttf,*.eot,*.zip,*.gz,*.gguf,*.safetensors,*.bin,node_modules,.git,build,dist,unsloth_compiled_cache,unsloth.egg-info,target,studio/frontend/dist,*.pyc,*-licenses.txt,LICENSE*' \
- --ignore-words-list='ans,bu,hel,fo,te,ot,hist,ned,sav,recurser,datas,nin,parm,parms,checkin,nd,fr,inout,donot,uint' \
- --quiet-level=2
-
- - name: shellcheck on committed *.sh (informational)
- # Goes beyond `bash -n` (which only parses): catches subtle
- # shell bugs like unquoted variable expansions, useless
- # `cat`, command substitutions inside `[[`, etc. The
- # install/setup scripts are critical-path so the signal is
- # worth surfacing. Non-blocking until install.sh's
- # hand-rolled patterns get cleaned up; drop continue-on-error
- # afterwards.
- continue-on-error: true
- run: |
- # Exclude SC1090 ("source not followable") -- legitimate
- # for installer scripts that source files at runtime
- # paths shellcheck cannot resolve statically.
- # SC2034 ("variable assigned but never used") fires on
- # the export-only assignment idiom we use in install.sh.
- shellcheck -e SC1090,SC2034 $(git ls-files '*.sh')
-
- - name: ruff format drift (informational)
- # The canonical formatter is scripts/run_ruff_format.py
- # = ruff format + scripts/enforce_kwargs_spacing.py, so plain
- # `ruff format --check` reports the kwarg-spacing diff as
- # drift. Surface the count for visibility but keep
- # non-blocking until the custom pipeline is wired in here.
- continue-on-error: true
- run: |
- ruff format --check unsloth unsloth_cli studio tests cli.py unsloth-cli.py
diff --git a/.github/workflows/mlx-ci.yml b/.github/workflows/mlx-ci.yml
deleted file mode 100644
index 75940832a0..0000000000
--- a/.github/workflows/mlx-ci.yml
+++ /dev/null
@@ -1,430 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Focused PR gate for the MLX dispatch surface, running on a real
-# Apple Silicon runner.
-#
-# Runner: macos-14 (M1, 3 vCPU / 7 GB / Apple Silicon standard runner
-# -- FREE for public repositories per the GitHub Actions billing
-# reference; larger variants like macos-14-large/-xlarge are paid so
-# we deliberately avoid those).
-#
-# Why a single Mac job (no Linux+spoof leg): the dispatch tests are
-# 100% spoofed monkeypatches and run identically on any host, so the
-# Linux leg was duplicating the matrix tests already covered on Mac
-# while missing everything Apple-specific. The Mac job runs the SAME
-# spoofed matrix PLUS three things only a real Apple Silicon host
-# can prove:
-#
-# 1. unsloth._IS_MLX flips True on Darwin+arm64 with mlx genuinely
-# installed (no spoof).
-# 2. Every PR-A MLX-only unsloth_zoo module (mlx_loader, mlx_trainer,
-# mlx_compile, mlx_utils, mlx_cce, gated_delta_vjp) imports
-# against the real `mlx` + `mlx-lm` + `mlx-vlm` PyPI wheels --
-# each does `import mlx.core as mx` at module top level, so this
-# catches a future change that breaks the real wheels without
-# needing a Mac developer in the loop.
-# 3. The hardware-dispatch spoofs do not collide with the real
-# environment (the test fixture installs a MetaPathFinder that
-# blocks `import mlx.core` for "no-mlx" profiles, faithfully
-# simulating a Mac without mlx even when mlx IS installed).
-# 4. End-to-end MLX training + inference smoke test:
-# run_real_mlx_smoke.py trains unsloth/gemma-3-270m-it for 7
-# deterministic LoRA steps on a single repeated text row, then
-# verifies the trained model can complete the prompt and that
-# losses + grad norms are finite and well-behaved. This is the
-# only place in CI that exercises a real MLX backward pass +
-# optimizer step + inference call.
-#
-# Three dispatch test files documented in tests/studio/README.md:
-# - test_hardware_dispatch_matrix.py parametrized 7-profile matrix
-# + 2 dispatch-priority canaries
-# - test_is_mlx_dispatch_gate.py AST + runtime guard on
-# unsloth._IS_MLX
-# - test_mlx_training_worker_behaviors.py AST contract checks on
-# studio/backend/core/training/worker.py
-#
-# Surfaces a single PR check ("MLX CI on Mac M1 / dispatch").
-#
-# Security audit footprint: every package this workflow installs is
-# already covered by .github/workflows/security-audit.yml -- the deps
-# come from studio/backend/requirements/studio.txt and unsloth-zoo's
-# pyproject (resolved transitively). The git+ install of unsloth-zoo
-# is intentionally skipped by the audit (pip-audit cannot resolve a
-# git URL through PyPI metadata; the audit comment in security-audit.yml
-# documents this). No new package is introduced solely by MLX CI.
-
-name: MLX CI on Mac M1
-
-on:
- pull_request:
- paths:
- - 'unsloth/__init__.py'
- - 'unsloth/_gpu_init.py'
- - 'studio/backend/utils/hardware/**'
- - 'studio/backend/core/training/worker.py'
- - 'studio/backend/core/inference/mlx_inference.py'
- - 'tests/studio/test_hardware_dispatch_matrix.py'
- - 'tests/studio/test_is_mlx_dispatch_gate.py'
- - 'tests/studio/test_mlx_training_worker_behaviors.py'
- - 'tests/studio/run_real_mlx_smoke.py'
- - 'tests/conftest.py'
- - '.github/workflows/mlx-ci.yml'
- push:
- branches: [main, pip]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- dispatch:
- name: dispatch
- runs-on: macos-14
- # 25 min: dispatch + spoofed matrix + 7-step real LoRA training is
- # under 2 min; GGUF export builds llama.cpp via cmake on Apple
- # Silicon (~5-7 min), so we budget headroom.
- timeout-minutes: 25
- steps:
- # harden-runner audit mode: macOS runners cannot use blocking mode
- # today (eBPF egress enforcement is Linux-only), but audit mode is
- # supported cross-platform and surfaces the egress destinations in
- # the runner log. This produces the data needed to graduate this
- # job to a block-mode allowlist once macOS support lands.
- - name: Harden runner (audit)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: audit
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- # macOS install ladder, validated locally against a Linux
- # mac-sim venv (platform spoofed + mlx_simulation shim + real
- # datasets/transformers/structlog).
- #
- # 1. studio/backend/requirements/studio.txt brings structlog,
- # fastapi, etc. The hardware probe imports structlog at
- # module top level.
- # 2. Same pytest / numpy / httpx stack the rest of the repo CI
- # uses.
- # 3. torch is explicitly installed: unsloth-zoo's pyproject
- # deliberately excludes torch on darwin+arm64 (mlx replaces
- # it for runtime use), but the dispatch tests spoof
- # torch.cuda / torch.xpu / torch.backends.mps via monkeypatch
- # and so the test process needs torch importable. We pull
- # from the PyTorch CPU index so Apple Silicon gets the
- # explicit cpu+MPS arm64 wheel rather than something the
- # default PyPI resolver might pick up. The CPU index hosts
- # macosx_*_arm64 wheels alongside the Linux x86_64 ones.
- # 4. unsloth-zoo from git main (NOT PyPI), WITH deps. PR-A's
- # MLX support landed after the most recent unsloth-zoo PyPI
- # release; the wheel still raises NotImplementedError on
- # Apple Silicon when device_type.get_device_type() runs
- # unguarded. Studio's own install.sh overlays unsloth-zoo
- # from git main for the same reason. Pulling deps lets pip
- # resolve the platform-conditional MLX-only wheels (mlx,
- # mlx-lm, mlx-vlm gated on darwin+arm64 in unsloth-zoo's
- # pyproject) AND the shared deps (datasets, transformers,
- # sentencepiece, ...) that unsloth's MLX branch loads via
- # dataprep/raw_text.py.
- # 5. unsloth -e . --no-deps so the editable install does not
- # fight the unsloth-zoo dep set.
- #
- # All explicit pip installs are version-pinned to a single
- # released version (the latest as of 2026-05-07 within each
- # project's existing constraint range). bump alongside the rest
- # of the security audit when a new release lands.
- - name: Install deps
- run: |
- python -m pip install --upgrade pip
- pip install -r studio/backend/requirements/studio.txt
- pip install \
- 'python-multipart==0.0.27' \
- 'aiofiles==25.1.0' \
- 'sqlalchemy==2.0.49' \
- 'cryptography==48.0.0' \
- 'pyyaml==6.0.3' \
- 'jinja2==3.1.6' \
- 'mammoth==1.12.0' \
- 'unpdf==1.0.0' \
- 'requests==2.33.1' \
- 'typer==0.25.1' \
- 'numpy==2.4.4' \
- 'pytest==9.0.3' \
- 'pytest-asyncio==1.3.0' \
- 'httpx==0.28.1'
- pip install --index-url https://download.pytorch.org/whl/cpu \
- 'torch==2.10.0'
- # github.com occasionally 500s on the git fetch; retry the
- # zoo install so a single upstream blip does not fail CI.
- for attempt in 1 2 3; do
- if pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo"; then
- break
- fi
- if [ "$attempt" -eq 3 ]; then
- echo "::error::pip install unsloth_zoo failed after 3 attempts"
- exit 1
- fi
- delay=$((5 * attempt))
- echo "::warning::unsloth_zoo install failed (attempt $attempt/3), retrying in ${delay}s..."
- sleep "$delay"
- done
- pip install -e . --no-deps
-
- # Real Apple Silicon sanity: confirm _IS_MLX activates on real
- # hardware with no platform spoof.
- - name: Verify _IS_MLX flips True on real Apple Silicon
- run: |
- python -c "
- import platform
- assert platform.system() == 'Darwin', platform.system()
- assert platform.machine() == 'arm64', platform.machine()
- import unsloth
- assert unsloth._IS_MLX is True, f'expected _IS_MLX=True on real Apple Silicon, got {unsloth._IS_MLX}'
- print('OK: _IS_MLX activated on real Apple Silicon')
- "
-
- # Real Apple Silicon sanity: confirm every PR-A MLX-only module
- # loads against real mlx + mlx-lm + mlx-vlm wheels.
- - name: Smoke-import every MLX-only unsloth_zoo module
- run: |
- python -c "
- import importlib
- for name in [
- 'unsloth_zoo.mlx_loader',
- 'unsloth_zoo.mlx_trainer',
- 'unsloth_zoo.mlx_compile',
- 'unsloth_zoo.mlx_utils',
- 'unsloth_zoo.mlx_cce',
- 'unsloth_zoo.gated_delta_vjp',
- ]:
- importlib.import_module(name)
- print('OK:', name)
- from unsloth_zoo.mlx_loader import FastMLXModel
- from unsloth_zoo.mlx_trainer import MLXTrainer, MLXTrainingConfig
- assert hasattr(FastMLXModel, 'from_pretrained')
- print('OK: FastMLXModel + MLXTrainer surface present')
- "
-
- # Spoofed dispatch matrix. Runs on the real Mac too -- the
- # test fixture installs a MetaPathFinder that blocks
- # `import mlx.core` for "no-mlx" profiles, so the spoofs
- # faithfully simulate every supported hardware combo regardless
- # of whether mlx is installed for real.
- - name: MLX dispatch tests (3 files, 36 tests)
- env:
- PYTHONPATH: ${{ github.workspace }}/studio
- UNSLOTH_COMPILE_DISABLE: '1'
- run: |
- python -m pytest -v --tb=short \
- tests/studio/test_hardware_dispatch_matrix.py \
- tests/studio/test_is_mlx_dispatch_gate.py \
- tests/studio/test_mlx_training_worker_behaviors.py
-
- # Studio prebuilt llama.cpp install + GGUF inference. Drives the
- # exact path Studio's setup.sh takes on macOS: invokes
- # studio/install_llama_prebuilt.py with --published-repo
- # ggml-org/llama.cpp and --published-release-tag b9049 (the
- # latest llama.cpp release at the time this step was added; bump
- # via UNSLOTH_LLAMA_TAG / DEFAULT_LLAMA_TAG when refreshing).
- # The installer downloads llama-b9049-bin-macos-arm64.tar.gz,
- # which is the universal Apple Silicon (arm64) build -- the
- # same artifact works on M1/M2/M3/M4 because llama.cpp compiles
- # against the ARMv8.2 baseline.
- #
- # The b9049 release also publishes:
- # - llama-b9049-bin-macos-arm64-kleidiai.tar.gz
- # KleidiAI dispatches at runtime; on M1 it falls back where
- # ISA features (e.g. I8MM) are missing, so this asset also
- # runs on M1 -- Studio just doesn't choose it by default.
- # - llama-b9049-bin-macos-x64.tar.gz
- # Intel-only; would only run on M1 via Rosetta 2 emulation,
- # which we explicitly avoid.
- # - iOS XCFramework
- # iOS-app build artifact, unrelated to a macOS desktop CI.
- #
- # After install, downloads a small published GGUF
- # (unsloth/gemma-3-270m-it-GGUF, Q4_K_M) from HuggingFace and
- # runs the prebuilt llama-cli on it. Asserts the prompt echo
- # appears in stdout. If the install fails OR the binary exits
- # non-zero, that's an Unsloth/Studio bug.
- - name: Studio prebuilt llama.cpp install + GGUF inference (Mac M1)
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- # install_llama_prebuilt.py hits the GitHub releases API to
- # resolve the asset URL. Anonymous calls share the runner-IP
- # rate-limit bucket and 403 quickly -- pass the workflow's
- # automatic GITHUB_TOKEN to bump us to the 5000/hr authenticated
- # bucket.
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -euo pipefail
- INSTALL_DIR="$HOME/.unsloth-studio-prebuilt-test/llama.cpp"
- rm -rf "$INSTALL_DIR"
- # --simple-policy is required when --published-repo points
- # at upstream ggml-org/llama.cpp; that repo doesn't ship the
- # llama-prebuilt-manifest.json asset Studio's default policy
- # expects, so the simple platform-specific policy maps
- # Darwin+arm64 -> bin-macos-arm64 directly. studio/setup.sh
- # passes both --published-repo ggml-org/llama.cpp AND
- # --simple-policy automatically on macOS, so this CI step
- # exercises the same code path users hit when they run
- # `curl -fsSL https://unsloth.ai/install.sh | sh`.
- python studio/install_llama_prebuilt.py \
- --install-dir "$INSTALL_DIR" \
- --published-repo ggml-org/llama.cpp \
- --published-release-tag b9049 \
- --simple-policy
-
- # Studio bundles only llama-server + llama-quantize from the
- # prebuilt (not llama-cli) -- inference goes through
- # llama-server's HTTP /completion endpoint. Validate both:
- # llama-quantize --help proves the dynamic libs link, then
- # spin up llama-server and POST a /completion request on a
- # tiny published GGUF.
- LLAMA_SERVER="$INSTALL_DIR/build/bin/llama-server"
- LLAMA_QUANT="$INSTALL_DIR/build/bin/llama-quantize"
- [ -x "$LLAMA_SERVER" ] || { echo "::error::llama-server missing at $LLAMA_SERVER"; find "$INSTALL_DIR/build" -type f | head -40; exit 1; }
- [ -x "$LLAMA_QUANT" ] || { echo "::error::llama-quantize missing at $LLAMA_QUANT"; exit 1; }
- echo "llama-server : $LLAMA_SERVER"
- echo "llama-quantize: $LLAMA_QUANT"
- "$LLAMA_QUANT" --help >/dev/null && echo " llama-quantize loads OK"
-
- mkdir -p /tmp/ggufs
- bash .github/scripts/hf-download-with-retry.sh \
- 'unsloth/gemma-3-270m-it-GGUF' \
- 'gemma-3-270m-it-Q4_K_M.gguf' \
- /tmp/ggufs
-
- PORT=18080
- echo "=== starting llama-server on 127.0.0.1:$PORT ==="
- "$LLAMA_SERVER" \
- -m /tmp/ggufs/gemma-3-270m-it-Q4_K_M.gguf \
- --host 127.0.0.1 \
- --port "$PORT" \
- -c 256 \
- -n 16 \
- --no-warmup \
- > /tmp/llama-server.log 2>&1 &
- SERVER_PID=$!
- trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
-
- # Wait for /health to come up
- for i in $(seq 1 30); do
- if curl -sf "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
- echo " server up after ${i}s"
- break
- fi
- sleep 1
- done
- if ! curl -sf "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
- echo "::error::llama-server never became healthy"
- tail -40 /tmp/llama-server.log
- exit 1
- fi
-
- PROMPT="Hello, my name is"
- echo "=== POST /completion ==="
- RESP=$(curl -sf -X POST "http://127.0.0.1:$PORT/completion" \
- -H 'Content-Type: application/json' \
- -d "{\"prompt\":\"$PROMPT\",\"n_predict\":16,\"temperature\":0,\"seed\":3407}")
- echo "raw response (head): $(echo "$RESP" | head -c 600)"
- CONTENT=$(echo "$RESP" | python -c "import json,sys; print(json.loads(sys.stdin.read()).get('content',''))")
- echo "completion content: $CONTENT"
-
- if [ -z "$CONTENT" ]; then
- echo "::error::llama-server /completion returned empty content"
- tail -40 /tmp/llama-server.log
- exit 1
- fi
- echo "OK: Studio prebuilt llama.cpp on Mac M1 + GGUF /completion works"
-
- # Real MLX training + inference smoke test. Trains
- # unsloth/gemma-3-270m-it for 7 deterministic LoRA steps
- # (batch_size=2, gradient_accumulation_steps=3) on a single
- # repeated row ("<> My name is Unsloth!"), then saves
- # the trained model in 3 export formats. The `train` subcommand
- # captures per-phase timing + peak GPU + peak RSS into
- # train_metrics.json so we can detect regressions across CI runs.
- - name: MLX export round-trip — TRAIN + SAVE 3 formats
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- UNSLOTH_COMPILE_DISABLE: '1'
- run: |
- mkdir -p mlx_workdir
- python tests/studio/run_real_mlx_smoke.py train \
- --workdir "$PWD/mlx_workdir"
-
- # Each reload step runs in a FRESH Python process to confirm
- # the cold-start path users would hit in production also works
- # (not just the in-memory continuation of a still-running
- # trainer). FastMLXModel.from_pretrained gets called from
- # scratch; mx.random is re-seeded; per-step timing + peak
- # memory are emitted to {format}_reload_metrics.json next to
- # the saved dir.
- - name: MLX export round-trip — RELOAD LoRA (fresh process)
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- UNSLOTH_COMPILE_DISABLE: '1'
- run: |
- python tests/studio/run_real_mlx_smoke.py reload \
- --format lora \
- --dir "$PWD/mlx_workdir/lora"
-
- - name: MLX export round-trip — RELOAD merged_16bit (fresh process)
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- UNSLOTH_COMPILE_DISABLE: '1'
- run: |
- python tests/studio/run_real_mlx_smoke.py reload \
- --format merged \
- --dir "$PWD/mlx_workdir/merged_16bit"
-
- # GGUF reload uses the llama-cli binary that save_pretrained_gguf
- # built. If save_pretrained_gguf was skipped during train (e.g.
- # llama.cpp's convert_hf_to_gguf asserts on the model's tokenizer
- # vocab -- a downstream llama.cpp limitation, not an unsloth_zoo
- # bug), this step emits a workflow warning and exits 0 so the
- # LoRA + merged_16bit assertions remain the gating signal.
- - name: MLX export round-trip — RELOAD GGUF via llama-cli (fresh process)
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- if python -c "import json,sys; m=json.load(open('mlx_workdir/train_metrics.json')); sys.exit(0 if m.get('gguf_supported') else 1)"; then
- python tests/studio/run_real_mlx_smoke.py reload \
- --format gguf \
- --dir "$PWD/mlx_workdir/gguf"
- else
- REASON=$(python -c "import json; m=json.load(open('mlx_workdir/train_metrics.json')); print(m.get('gguf_skip_reason') or 'unknown')")
- echo "::warning title=GGUF round-trip skipped::${REASON}"
- echo "GGUF export was skipped during the train phase. Reason:"
- echo " ${REASON}"
- echo "Continuing without failing the job; the LoRA + merged_16bit"
- echo "reload assertions are still gating this PR."
- fi
-
- # Print all metrics JSON files so regressions are visible in the
- # job log. always() so we get telemetry even if a reload step
- # asserted gibberish.
- - name: MLX export round-trip — aggregate metrics
- if: always()
- run: |
- for f in mlx_workdir/train_metrics.json \
- mlx_workdir/lora_reload_metrics.json \
- mlx_workdir/merged_reload_metrics.json \
- mlx_workdir/gguf_reload_metrics.json; do
- echo "=== $f ==="
- cat "$f" 2>/dev/null || echo "(missing)"
- echo
- done
diff --git a/.github/workflows/notebooks-ci.yml b/.github/workflows/notebooks-ci.yml
deleted file mode 100644
index 673b2f3cc5..0000000000
--- a/.github/workflows/notebooks-ci.yml
+++ /dev/null
@@ -1,440 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-#
-# Cross-repo notebook validator. Lives in unslothai/unsloth (this repo)
-# and inspects every notebook in unslothai/notebooks at HEAD (or the
-# ref dispatched in via repository_dispatch).
-#
-# Catches the bug classes that landed in:
-# - unslothai/notebooks#258 Colab torchao 0.10 vs peft 0.19 floor
-# - unslothai/notebooks#260 DONT_UPDATE_EXCEPTIONS coverage drift
-# - unslothai/notebooks#261 torch/torchcodec ABI; --no-deps tokenizers
-# - unslothai/notebooks#264 --no-deps transformers + Colab tokenizers drift
-# - unslothai/notebooks#221 git+ HEAD installs in install cells
-# - unslothai/notebooks commit 51b1462 template/notebook drift
-#
-# CPU-only by design. Layer 2 (api-introspect) reuses the existing
-# tests/_zoo_aggressive_cuda_spoof.py harness so `import unsloth`
-# succeeds on a GPU-less ubuntu-latest runner.
-
-name: Notebooks CI
-
-on:
- pull_request:
- paths:
- - 'unsloth/**'
- - 'scripts/notebook_validator.py'
- - 'scripts/notebook_to_python.py'
- - 'scripts/data/colab_pip_freeze.gpu.txt'
- - 'scripts/data/colab_to_cpu_pin.json'
- - 'tests/notebooks/**'
- - 'tests/_zoo_aggressive_cuda_spoof.py'
- - '.github/workflows/notebooks-ci.yml'
- schedule:
- # Daily 06:17 UTC. Catches Colab preinstall bumps (the upstream image
- # is rebuilt roughly weekly) without us waiting on a PR. Off the
- # :00/:30 fleet-collision spots.
- - cron: '17 6 * * *'
- workflow_dispatch:
- inputs:
- notebooks_ref:
- description: 'unslothai/notebooks ref to lint (branch / SHA / tag)'
- default: 'main'
- include_smoke:
- description: 'Also run the install-cell smoke matrix (longer)'
- type: boolean
- default: false
- repository_dispatch:
- # Fired by a tiny companion workflow on unslothai/notebooks.
- types: [notebooks_pr_opened, notebooks_main_pushed]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-env:
- NOTEBOOKS_REF: >-
- ${{ github.event.inputs.notebooks_ref ||
- github.event.client_payload.ref ||
- 'main' }}
-
-jobs:
- static:
- name: static (drift + lint + exceptions)
- runs-on: ubuntu-latest
- timeout-minutes: 10
- steps:
- # Validate the dispatched ref before it reaches actions/checkout's `ref:`
- # input. Reading via env (NOT direct ${{ ... }} interpolation in the
- # regex test) closes the GitHub-Actions-injection class where a
- # client_payload.ref like `main"; rm -rf / #` would be embedded into the
- # shell command. NOTEBOOKS_REF defaults to 'main' on non-dispatch
- # events, but only repository_dispatch can supply attacker-controlled
- # values, so we gate this check on that event type.
- - name: Validate client_payload.ref shape
- if: github.event_name == 'repository_dispatch'
- env:
- NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
- run: |
- if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
- echo "::error::client_payload.ref contains disallowed characters" >&2
- exit 1
- fi
-
- - name: Checkout unsloth (this PR)
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- path: unsloth
- persist-credentials: false
-
- - name: Checkout unslothai/notebooks @ ${{ env.NOTEBOOKS_REF }}
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- repository: unslothai/notebooks
- ref: ${{ env.NOTEBOOKS_REF }}
- path: notebooks
- fetch-depth: 0 # drift check needs git status / diff
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Install validator deps
- run: |
- python -m pip install --upgrade pip
- # nbformat + nbconvert come from the converter's requirements;
- # spellchecker + huggingface_hub are imported at module top of
- # update_all_notebooks.py.
- pip install \
- 'nbformat>=5.10' 'nbconvert>=7.16' 'pyspellchecker>=0.8' \
- 'huggingface_hub>=0.34' 'tqdm>=4.66'
-
- - name: Refresh Colab pip-freeze (best-effort; falls back to snapshot)
- run: |
- python unsloth/scripts/notebook_validator.py refresh-colab \
- --out unsloth/scripts/data/colab_pip_freeze.gpu.txt \
- || echo "::warning::refresh-colab failed; using committed snapshot"
-
- - name: Diff Colab oracle vs committed snapshots (advisory)
- # Pulls pip-freeze.gpu.txt + apt-list-gpu.txt + os-info-gpu.txt
- # from googlecolab/backend-info and prints NEW / REMOVED /
- # CHANGED entries against scripts/data/colab_*.txt. Non-blocking
- # on PRs; the daily cron job below runs the same step with
- # --strict so upstream rotations surface within ~24h.
- continue-on-error: true
- working-directory: ${{ github.workspace }}
- run: |
- python unsloth/scripts/notebook_validator.py colab-diff \
- --snapshot-dir unsloth/scripts/data
-
- - name: Drift check (re-run update_all_notebooks.py + git diff)
- working-directory: ${{ github.workspace }}
- # Reported as non-blocking until the upstream `unslothai/notebooks`
- # tree is regenerated. The first run on @main surfaces ~463 files
- # of drift (7359 / 9634 line delta), which is a real backlog the
- # notebooks-side maintainers need to clear in their own repo --
- # this PR's role is to surface the count, not auto-fix it.
- continue-on-error: true
- run: |
- python unsloth/scripts/notebook_validator.py drift \
- --notebooks-dir notebooks
-
- - name: Convert sanity (every nb / kaggle / original_template -> .py)
- # Same rationale as Drift: a handful of upstream notebooks fail
- # the converter (custom magics, malformed JSON, etc). Surface
- # the count without blocking; the team triages in unslothai/notebooks.
- continue-on-error: true
- run: |
- python unsloth/scripts/notebook_validator.py convert \
- --notebooks-dir notebooks \
- --out _converted
-
- - name: Lint (install cells + AST scan, env-scoped)
- # Reported as non-blocking (continue-on-error: true) until the
- # backlog of pre-existing findings on unslothai/notebooks@main is
- # cleared. Same pattern PR #5298 used for biome:check on the
- # frontend. As of this commit the live tree surfaces 27 errors +
- # 6 warnings, all real (peft/torchao floor missing in 6 nb/
- # notebooks, 14 git+ HEAD installs in hand-tuned exception
- # notebooks, 6 torch/torchcodec ABI mismatches, 1
- # transformers/tokenizers --no-deps drift). The count surfaces
- # in the PR check UI. Drop continue-on-error once it hits zero.
- continue-on-error: true
- run: |
- python unsloth/scripts/notebook_validator.py lint \
- --notebooks-dir notebooks \
- --colab-pin unsloth/scripts/data/colab_pip_freeze.gpu.txt \
- --no-pypi
- # --no-pypi skips R-INST-002 (transitive resolve via PyPI metadata).
- # Layer 1 keeps PR-time wall-clock predictable; the daily cron run
- # below drops --no-pypi and refreshes the cache.
-
- - name: DONT_UPDATE_EXCEPTIONS coverage
- run: |
- python unsloth/scripts/notebook_validator.py exceptions \
- --notebooks-dir notebooks
-
- static-with-pypi:
- name: static + transitive resolve (cron / dispatch only)
- if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- runs-on: ubuntu-latest
- timeout-minutes: 15
- steps:
- # See `static.Validate client_payload.ref shape` for rationale. This
- # job's `if:` excludes repository_dispatch today, so the validation
- # step is a defence-in-depth no-op until that gate ever relaxes.
- - name: Validate client_payload.ref shape
- if: github.event_name == 'repository_dispatch'
- env:
- NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
- run: |
- if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
- echo "::error::client_payload.ref contains disallowed characters" >&2
- exit 1
- fi
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- path: unsloth
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- repository: unslothai/notebooks
- ref: ${{ env.NOTEBOOKS_REF }}
- path: notebooks
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with: { python-version: '3.12', cache: 'pip' }
- - name: Install
- run: pip install -U pip
- - name: Refresh Colab oracle
- run: |
- python unsloth/scripts/notebook_validator.py refresh-colab \
- --out unsloth/scripts/data/colab_pip_freeze.gpu.txt
- - name: Diff Colab oracle vs committed snapshots (--strict on cron)
- # Cron-only escalation of the advisory PR-time check. Fails if
- # any of pip-freeze.gpu.txt / apt-list-gpu.txt / os-info-gpu.txt
- # has drifted from scripts/data/colab_*.txt; refresh the
- # snapshots in this repo to acknowledge.
- run: |
- python unsloth/scripts/notebook_validator.py colab-diff \
- --snapshot-dir unsloth/scripts/data --strict
- - name: Lint with live PyPI metadata
- run: |
- python unsloth/scripts/notebook_validator.py lint \
- --notebooks-dir notebooks \
- --colab-pin unsloth/scripts/data/colab_pip_freeze.gpu.txt
-
- api-introspect:
- name: api surface (under CUDA spoof)
- runs-on: ubuntu-latest
- timeout-minutes: 12
- steps:
- - name: Validate client_payload.ref shape
- if: github.event_name == 'repository_dispatch'
- env:
- NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
- run: |
- if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
- echo "::error::client_payload.ref contains disallowed characters" >&2
- exit 1
- fi
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- path: unsloth
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- repository: unslothai/notebooks
- ref: ${{ env.NOTEBOOKS_REF }}
- path: notebooks
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with: { python-version: '3.12', cache: 'pip' }
-
- - name: Install CPU torch + pinned unsloth + trl + converter deps
- run: |
- python -m pip install --upgrade pip
- # CPU torch + torchvision. torchvision is required because
- # unsloth_zoo.vision_utils imports PIL at module top, and the
- # easiest way to get a torch-compatible PIL on a CPU runner is
- # to let torchvision pull the right Pillow version.
- pip install --index-url https://download.pytorch.org/whl/cpu \
- 'torch>=2.8,<2.11' 'torchvision<0.26'
- # Pin to the same versions update_all_notebooks.py installs in
- # generated notebooks. Keep these in lockstep with PIN_TRL /
- # PIN_TRANSFORMERS in unslothai/notebooks/update_all_notebooks.py.
- # `triton` is added because unsloth/_gpu_init.py:232 does an
- # unconditional `import triton`; the PyPI wheel installs cleanly
- # on Linux x86_64 even without CUDA (same rationale as
- # consolidated-tests-ci.yml line 192-205).
- # Pillow is listed explicitly as a defensive belt-and-braces
- # next to torchvision (vision_utils crashes ModuleNotFoundError
- # if torchvision skipped its Pillow dep for any reason).
- pip install 'transformers>=4.56,<5.6' 'trl>=0.22,<0.26' 'accelerate>=1.0' \
- 'datasets>=3.4,<5' 'peft>=0.15,<0.20' \
- 'bitsandbytes>=0.43' 'sentencepiece' 'protobuf' triton \
- Pillow safetensors tqdm packaging psutil
- # Converter deps (nbformat for notebook_to_python.py).
- pip install 'nbformat>=5.10' 'nbconvert>=7.16'
- # Install unsloth from the LOCAL checkout (the PR head), not PyPI.
- # The PR-time CI must validate the code in this PR; PyPI unsloth
- # may lag the in-repo CPU-torch fallback in unsloth/kernels/utils.py
- # (lines 162-170) that handles missing torch._C._cuda_getCurrentRawStream.
- pip install --no-deps unsloth_zoo
- pip install --no-deps -e ./unsloth
-
- - name: Convert notebooks for AST scan
- # Same upstream-conversion-error tolerance as the static job.
- continue-on-error: true
- run: |
- python unsloth/scripts/notebook_validator.py convert \
- --notebooks-dir notebooks --out _converted
-
- - name: Dump unsloth + trl API surface (under CUDA spoof)
- run: |
- PYTHONPATH=unsloth/tests python -u - <<'PY'
- import sys, json, inspect
- import _zoo_aggressive_cuda_spoof as _spoof
- _spoof.apply()
- import unsloth
- import trl
- surface = {}
- for cls_name in ("FastLanguageModel", "FastVisionModel", "FastModel"):
- cls = getattr(unsloth, cls_name, None)
- if cls is None:
- continue
- surface[cls_name] = sorted(n for n in dir(cls) if not n.startswith("_"))
- surface["SFTConfig_kwargs"] = sorted(inspect.signature(trl.SFTConfig.__init__).parameters)
- json.dump(surface, open("_api_surface.json", "w"), indent=2)
- print("dumped surface for:", list(surface))
- PY
-
- - name: Run API rule against converted notebooks
- run: |
- python unsloth/scripts/notebook_validator.py api \
- --converted-dir _converted \
- --surface _api_surface.json
-
- smoke-install:
- name: smoke install (Colab-shaped venv, opt-in)
- if: ${{ github.event.inputs.include_smoke == 'true' || github.event_name == 'schedule' }}
- runs-on: ubuntu-latest
- timeout-minutes: 25
- strategy:
- fail-fast: false
- matrix:
- # One representative notebook per installation_*_content template.
- # Add rows when a new install template lands in update_all_notebooks.py.
- notebook:
- - 'nb/Llama3.1_(8B)-Alpaca.ipynb' # installation_content
- - 'nb/Gemma3_(4B)-Vision.ipynb' # installation_content + vision
- - 'nb/Llama3.1_(8B)-GRPO.ipynb' # installation_extra_grpo_content
- - 'nb/gpt-oss-(20B)-Fine-tuning.ipynb' # installation_gpt_oss_content
- - 'nb/Qwen3_5_(4B)_Vision.ipynb' # installation_qwen3_5_content
- - 'nb/Nemotron-3-Nano-30B-A3B_A100.ipynb' # installation_nemotron_nano_content
- - 'nb/Whisper.ipynb' # installation_whisper_content
- - 'nb/Synthetic_Data_Hackathon.ipynb' # installation_synthetic_data_content
- steps:
- - name: Validate client_payload.ref shape
- if: github.event_name == 'repository_dispatch'
- env:
- NOTEBOOKS_REF: ${{ github.event.client_payload.ref }}
- run: |
- if ! printf '%s' "$NOTEBOOKS_REF" | grep -Eq '^[A-Za-z0-9._/-]+$'; then
- echo "::error::client_payload.ref contains disallowed characters" >&2
- exit 1
- fi
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- path: unsloth
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- repository: unslothai/notebooks
- ref: ${{ env.NOTEBOOKS_REF }}
- path: notebooks
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with: { python-version: '3.12' }
-
- - name: Seed Colab-shaped venv from pip-freeze (CPU-mapped)
- run: |
- # Strip cu128 local versions, route torch/torchvision to the CPU
- # wheel index, drop CUDA-specific deps the runner can't use.
- python -u - <<'PY' > /tmp/seed_pins.txt
- import json, re
- mapping = json.load(open("unsloth/scripts/data/colab_to_cpu_pin.json"))
- rewrite = mapping["rewrite"]
- skip = set(mapping["skip"])
- spoof = set(mapping["module_spoof"])
- out = []
- for line in open("unsloth/scripts/data/colab_pip_freeze.gpu.txt"):
- line = line.strip()
- if not line or line.startswith("#"):
- continue
- m = re.match(r"^([A-Za-z0-9._-]+)\s*==\s*(.+)$", line)
- if not m:
- continue
- name, ver = m.group(1).lower(), m.group(2)
- if name in skip:
- continue
- if name in spoof:
- continue
- if name in rewrite:
- ver = re.sub(r"[+\-].+$", "", ver)
- out.append(f"{name}=={ver}")
- else:
- ver = re.sub(r"[+\-].+$", "", ver)
- out.append(f"{name}=={ver}")
- print("\n".join(out))
- PY
- head -5 /tmp/seed_pins.txt
- wc -l /tmp/seed_pins.txt
-
- - name: Install Colab-shaped venv
- run: |
- python -m pip install --upgrade pip
- # Best-effort: any single line that fails to resolve on CPU is
- # tolerated; the smoke contract is "the install cell + the unsloth
- # import works", not "the entire Colab venv reproduces."
- while IFS= read -r spec; do
- pip install "$spec" --index-url https://download.pytorch.org/whl/cpu \
- --extra-index-url https://pypi.org/simple || \
- echo "::warning::pin failed: $spec"
- done < /tmp/seed_pins.txt
-
- - name: Run install cell
- run: |
- python unsloth/scripts/notebook_validator.py convert \
- --notebooks-dir notebooks --out _converted
- # Take the converted .py and run the install cell only.
- BASE="$(basename '${{ matrix.notebook }}' .ipynb | tr -d '()' | tr -c '[:alnum:]_' _)"
- PY="_converted/${BASE}.py"
- [ -f "$PY" ] || { echo "::error::$PY not found"; ls _converted | head; exit 1; }
- # Truncate at the first `from unsloth import` so we run install +
- # core imports only.
- awk '/^from unsloth import/ { print "import sys; sys.exit(0)"; exit } { print }' "$PY" > _smoke.py
- PYTHONPATH=unsloth/tests python -u - <<'PY'
- import _zoo_aggressive_cuda_spoof as _s; _s.apply()
- # Stub torchcodec for cells that import it — no CPU wheel exists.
- import sys, types
- if "torchcodec" not in sys.modules:
- sys.modules["torchcodec"] = types.ModuleType("torchcodec")
- exec(open("_smoke.py").read(), {"__name__": "__main__"})
- PY
-
- - name: Verify imports under spoof
- run: |
- PYTHONPATH=unsloth/tests python -u - <<'PY'
- import sys, types
- if "torchcodec" not in sys.modules:
- sys.modules["torchcodec"] = types.ModuleType("torchcodec")
- import _zoo_aggressive_cuda_spoof as _s; _s.apply()
- import unsloth, peft, torch, torchao, transformers, tokenizers
- print("OK: imports pass under CUDA spoof")
- PY
diff --git a/.github/workflows/release-desktop.yml b/.github/workflows/release-desktop.yml
deleted file mode 100644
index 810bb644ba..0000000000
--- a/.github/workflows/release-desktop.yml
+++ /dev/null
@@ -1,902 +0,0 @@
-name: Release Desktop App
-
-on:
- workflow_dispatch:
- inputs:
- studio_version:
- description: 'Studio version tag to release (for example, v0.1.39-beta)'
- type: string
- required: true
- pypi_version:
- description: 'Exact PyPI unsloth version just published/stamped (for example, 2026.5.3); leave blank to use MIN_DESKTOP_BACKEND_VERSION'
- type: string
- required: false
- draft:
- description: 'Create as draft release; draft runs do not advance desktop-latest updater channel'
- type: boolean
- default: true
-
-permissions:
- contents: read
-
-concurrency:
- group: release-desktop-${{ github.repository }}
- cancel-in-progress: false
-
-jobs:
- prepare-version:
- name: Prepare release versions
- runs-on: ubuntu-latest
- outputs:
- studio_version: ${{ steps.prepare.outputs.studio_version }}
- app_version: ${{ steps.prepare.outputs.app_version }}
- desktop_release_tag: ${{ steps.prepare.outputs.desktop_release_tag }}
- prerelease: ${{ steps.prepare.outputs.prerelease }}
- pypi_version: ${{ steps.prepare.outputs.pypi_version }}
-
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- with:
- persist-credentials: false
-
- - name: Validate release versions
- id: prepare
- shell: bash
- env:
- INPUT_STUDIO_VERSION: ${{ inputs.studio_version }}
- INPUT_PYPI_VERSION: ${{ inputs.pypi_version }}
- run: |
- python3 <<'PY'
- import os
- import pathlib
- import re
- import sys
-
- studio_version = os.environ['INPUT_STUDIO_VERSION'].strip()
- if not studio_version:
- sys.exit('studio_version is required, for example v0.1.39-beta')
- if re.fullmatch(r'v?20\d{2}\.\d+\.\d+(?:[-+][0-9A-Za-z.-]+)?', studio_version):
- sys.exit(f'studio_version must be a Studio SemVer tag, not a date-style backend version: {studio_version}')
-
- semver_tag = re.compile(
- r'^v(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
- r'(?:-[0-9A-Za-z.][0-9A-Za-z.-]*)?$'
- )
- if not semver_tag.fullmatch(studio_version):
- sys.exit(f'studio_version must be a SemVer tag with leading v, for example v0.1.39-beta: {studio_version}')
-
- app_version = studio_version.removeprefix('v')
- desktop_release_tag = f'desktop-v{app_version}'
- prerelease = 'true' if '-' in app_version.split('+', 1)[0] else 'false'
-
- def parse_backend_version(version):
- match = re.fullmatch(
- r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
- r'(?:([a-zA-Z]|\.dev|dev|\.rc|rc|\.post|post)(\d*))?'
- r'(?:[-+]([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?',
- version,
- )
- if not match:
- return None
- major, minor, patch, suffix_name, suffix_number, suffix_text = match.groups()
- if suffix_name:
- normalized = suffix_name.lower().lstrip('.')
- order = {'dev': 0, 'a': 1, 'b': 2, 'rc': 3, 'post': 5}.get(normalized)
- if order is None:
- return None
- number = int(suffix_number or '0')
- elif suffix_text:
- order = 3 if version[version.find(suffix_text) - 1] == '-' else 4
- number = 0
- else:
- order = 4
- number = 0
- return (int(major), int(minor), int(patch), order, number)
-
- preflight = pathlib.Path('studio/src-tauri/src/preflight/version.rs').read_text()
- match = re.search(r'MIN_DESKTOP_BACKEND_VERSION:\s*&str\s*=\s*"([^"]+)"', preflight)
- if not match:
- sys.exit('Could not read MIN_DESKTOP_BACKEND_VERSION')
- min_backend_version = match.group(1)
-
- input_pypi_version = os.environ.get('INPUT_PYPI_VERSION', '').strip()
- parsed_min_backend = parse_backend_version(min_backend_version)
- if parsed_min_backend is None:
- sys.exit(f'MIN_DESKTOP_BACKEND_VERSION is not a supported backend package version: {min_backend_version}')
-
- pypi_version = input_pypi_version or min_backend_version
- parsed_pypi = parse_backend_version(pypi_version)
- if parsed_pypi is None:
- sys.exit(f'pypi_version is not a supported backend package version: {pypi_version}')
- if parsed_pypi < parsed_min_backend:
- sys.exit(
- f'pypi_version {pypi_version} is lower than desktop minimum '
- f'MIN_DESKTOP_BACKEND_VERSION {min_backend_version}'
- )
-
- if input_pypi_version:
- print(
- 'Using exact PyPI unsloth version from pypi_version input: '
- f'{pypi_version} (desktop minimum: {min_backend_version})'
- )
- else:
- print(
- 'Using exact PyPI unsloth version from MIN_DESKTOP_BACKEND_VERSION: '
- f'{pypi_version}'
- )
-
- with open(os.environ['GITHUB_OUTPUT'], 'a', encoding='utf-8') as output:
- print(f'studio_version={studio_version}', file=output)
- print(f'app_version={app_version}', file=output)
- print(f'desktop_release_tag={desktop_release_tag}', file=output)
- print(f'prerelease={prerelease}', file=output)
- print(f'pypi_version={pypi_version}', file=output)
- PY
-
- - name: Verify PyPI package and Studio stamp
- shell: bash
- env:
- STUDIO_VERSION: ${{ steps.prepare.outputs.studio_version }}
- PYPI_VERSION: ${{ steps.prepare.outputs.pypi_version }}
- run: |
- set -euo pipefail
- python3 <<'PY'
- import json
- import os
- import pathlib
- import sys
- import time
- import urllib.error
- import urllib.request
-
- pypi_version = os.environ['PYPI_VERSION']
- dist_dir = pathlib.Path(os.environ['RUNNER_TEMP'], 'pypi-unsloth-dist')
- dist_dir.mkdir(parents=True, exist_ok=True)
- metadata_url = f'https://pypi.org/pypi/unsloth/{pypi_version}/json'
-
- last_error = None
- for attempt in range(1, 6):
- try:
- with urllib.request.urlopen(metadata_url, timeout=30) as response:
- metadata = json.load(response)
- break
- except Exception as exc:
- last_error = exc
- if attempt < 5:
- time.sleep(10 * attempt)
- else:
- sys.exit(f'Publish unsloth=={pypi_version} to PyPI before the desktop release ({last_error})')
-
- files = metadata.get('urls') or []
- if not files:
- sys.exit(f'PyPI returned no distribution files for unsloth=={pypi_version}')
-
- for file_info in files:
- filename = file_info.get('filename')
- url = file_info.get('url')
- if not filename or '/' in filename or not url:
- sys.exit(f'Unexpected PyPI file entry for unsloth=={pypi_version}: {file_info!r}')
- target = dist_dir / filename
- for attempt in range(1, 4):
- try:
- with urllib.request.urlopen(url, timeout=60) as response:
- target.write_bytes(response.read())
- break
- except Exception as exc:
- last_error = exc
- if attempt < 3:
- time.sleep(5 * attempt)
- else:
- sys.exit(f'Could not download {filename} from PyPI ({last_error})')
- PY
-
- if [ -f scripts/stamp_studio_release.py ]; then
- mapfile -t dists < <(find "$RUNNER_TEMP/pypi-unsloth-dist" -type f \( -name '*.whl' -o -name '*.tar.gz' \) | sort)
- if [ "${#dists[@]}" -eq 0 ]; then
- echo "No PyPI wheel/sdist artifacts downloaded for unsloth==$PYPI_VERSION" >&2
- exit 1
- fi
- python3 scripts/stamp_studio_release.py --verify-dist "$RUNNER_TEMP/pypi-unsloth-dist" --expected "$STUDIO_VERSION"
- else
- echo "scripts/stamp_studio_release.py not found; release-desktop requires #5308 to verify the PyPI Studio stamp." >&2
- exit 1
- fi
-
- - name: Guard public updater channel version
- if: ${{ !inputs.draft }}
- shell: bash
- env:
- GH_REPO: ${{ github.repository }}
- GH_TOKEN: ${{ github.token }}
- APP_VERSION: ${{ steps.prepare.outputs.app_version }}
- run: |
- set -euo pipefail
- mkdir -p "$RUNNER_TEMP/desktop-current"
- if ! gh release download desktop-latest --pattern latest.json --dir "$RUNNER_TEMP/desktop-current" --clobber 2>/dev/null; then
- echo "No existing desktop-latest latest.json found; allowing first channel publish."
- exit 0
- fi
- python3 <<'PY'
- import json
- import os
- import pathlib
- import re
- import sys
-
- def parse(value: str):
- value = value.removeprefix('v')
- match = re.fullmatch(
- r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
- r'(?:-([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?'
- r'(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?',
- value,
- )
- if not match:
- sys.exit(f'desktop-latest latest.json has invalid version: {value}')
- major, minor, patch, prerelease = match.groups()
- return (int(major), int(minor), int(patch), prerelease)
-
- def numeric_tail(identifier: str) -> tuple[str, int] | None:
- match = re.fullmatch(r'([A-Za-z-]+)(\d+)', identifier)
- if not match:
- return None
- return (match.group(1).lower(), int(match.group(2)))
-
- def compare_identifier(left: str, right: str) -> int:
- left_num = left.isdigit()
- right_num = right.isdigit()
- if left_num and right_num:
- return (int(left) > int(right)) - (int(left) < int(right))
- if left_num:
- return -1
- if right_num:
- return 1
-
- left_tail = numeric_tail(left)
- right_tail = numeric_tail(right)
- if left_tail and right_tail and left_tail[0] == right_tail[0]:
- return (left_tail[1] > right_tail[1]) - (left_tail[1] < right_tail[1])
-
- return (left > right) - (left < right)
-
- def compare_prerelease(left: str | None, right: str | None) -> int:
- if left == right:
- return 0
- if left is None:
- return 1
- if right is None:
- return -1
- left_parts = left.split('.')
- right_parts = right.split('.')
- for left_part, right_part in zip(left_parts, right_parts):
- order = compare_identifier(left_part, right_part)
- if order:
- return order
- return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts))
-
- def compare(left: str, right: str) -> int:
- left_major, left_minor, left_patch, left_pre = parse(left)
- right_major, right_minor, right_patch, right_pre = parse(right)
- left_core = (left_major, left_minor, left_patch)
- right_core = (right_major, right_minor, right_patch)
- if left_core != right_core:
- return (left_core > right_core) - (left_core < right_core)
- return compare_prerelease(left_pre, right_pre)
-
- current_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-current', 'latest.json')
- current = json.loads(current_path.read_text()).get('version')
- next_version = os.environ['APP_VERSION']
- if not isinstance(current, str):
- sys.exit('desktop-latest latest.json has missing version')
- if compare(next_version, current) < 0:
- sys.exit(
- f'Refusing to publish {next_version}; desktop-latest currently points at newer version {current}.'
- )
- PY
-
- build:
- # TODO: split into a "build (no secrets)" + "publish (secrets)" job pair
- # with actions/upload-artifact handoff so the matrix build cannot
- # publish a Release on its own. The current matrix runs across
- # Linux/macOS/Windows in a single job, so the split needs artefact
- # collection across the OS matrix and is out of scope for this
- # hardening pass.
- permissions:
- contents: write # tauri-apps/tauri-action creates / uploads a GitHub Release
- strategy:
- fail-fast: false
- max-parallel: 1
- matrix:
- include:
- - platform: macos-latest
- args: '--target aarch64-apple-darwin'
- label: macOS (Apple Silicon)
- # - platform: macos-latest
- # args: '--target x86_64-apple-darwin'
- # label: macOS (Intel)
- - platform: ubuntu-22.04
- args: ''
- label: Linux (x64)
- - platform: windows-latest
- args: ''
- label: Windows (x64)
-
- name: Build ${{ matrix.label }}
- needs: prepare-version
- runs-on: ${{ matrix.platform }}
-
- env:
- FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
- APP_VERSION: ${{ needs.prepare-version.outputs.app_version }}
- STUDIO_VERSION: ${{ needs.prepare-version.outputs.studio_version }}
- DESKTOP_RELEASE_TAG: ${{ needs.prepare-version.outputs.desktop_release_tag }}
- DESKTOP_PRERELEASE: ${{ needs.prepare-version.outputs.prerelease }}
-
- steps:
- # harden-runner in audit mode: surfaces every egress destination in
- # the runner log so the allowlist for a future `egress-policy: block`
- # promotion can be derived from observed traffic. Audit mode is
- # cross-platform (Linux / macOS / Windows runners); blocking mode is
- # currently Linux-only, so we deliberately stay in audit until the
- # macOS + Windows codesign paths have been observed.
- - name: Harden runner (audit)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: audit
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- with:
- persist-credentials: false
-
- # ── Linux dependencies ──
- - name: Install Linux dependencies
- if: matrix.platform == 'ubuntu-22.04'
- run: |
- sudo apt-get update
- sudo apt-get install -y libwebkit2gtk-4.1-dev libayatana-appindicator3-dev librsvg2-dev libxdo-dev libssl-dev patchelf
-
- # ── Node.js ──
- - name: Setup Node.js
- uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e
- with:
- node-version: 24
-
- - name: Install pinned Tauri CLI
- # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
- # required for `vite build`. The pre-install lockfile structural
- # audit (lockfile_supply_chain_audit.py) is the practical defence
- # against the npm postinstall-dropper class -- it fires BEFORE any
- # tarball runs, on the injection pattern itself rather than an
- # advisory-DB lookup.
- run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 --no-fund --no-audit
-
- - name: Verify pinned Tauri CLI
- shell: bash
- run: |
- out="$(npx --prefix studio tauri --version)"
- echo "$out"
- if [ "$out" != "tauri-cli 2.10.1" ]; then
- echo "Expected tauri-cli 2.10.1, got $out" >&2
- exit 1
- fi
-
- - name: Verify desktop updater and Linux package config
- shell: bash
- run: |
- node <<'JS'
- const { readFileSync } = require('node:fs');
-
- const expected = 'https://github.com/unslothai/unsloth/releases/download/desktop-latest/latest.json';
- const config = JSON.parse(readFileSync('studio/src-tauri/tauri.conf.json', 'utf8'));
- const endpoints = config.plugins?.updater?.endpoints;
- if (!Array.isArray(endpoints) || endpoints.length !== 1) {
- throw new Error('Expected exactly one desktop updater endpoint');
- }
- if (endpoints[0] !== expected) {
- throw new Error('Desktop updater endpoint must be ' + expected + ', got ' + endpoints[0]);
- }
- if (endpoints.some((endpoint) => endpoint.includes('/releases/latest/'))) {
- throw new Error('Desktop updater endpoint must not use repo-wide /releases/latest/');
- }
-
- const targets = config.bundle?.targets;
- if (Array.isArray(targets) && targets.some((target) => String(target).toLowerCase() === 'rpm')) {
- throw new Error('Desktop release must not target RPM packages');
- }
- if (config.bundle?.linux?.rpm) {
- throw new Error('bundle.linux.rpm must not be configured');
- }
-
- const workflow = readFileSync('.github/workflows/release-desktop.yml', 'utf8');
- const lines = workflow.split(/\r?\n/);
- const releaseBodies = [];
- for (let i = 0; i < lines.length; i += 1) {
- const match = lines[i].match(/^(\s*)releaseBody:\s*\|\s*$/);
- if (!match) continue;
- const baseIndent = match[1].length;
- const bodyLines = [];
- i += 1;
- for (; i < lines.length; i += 1) {
- const line = lines[i];
- if (line.trim() === '') {
- bodyLines.push('');
- continue;
- }
- const indent = line.match(/^\s*/)[0].length;
- if (indent <= baseIndent) {
- i -= 1;
- break;
- }
- bodyLines.push(line.slice(baseIndent + 2));
- }
- releaseBodies.push(bodyLines.join('\n'));
- }
- if (releaseBodies.length === 0) {
- throw new Error('Expected at least one desktop release body');
- }
- for (const body of releaseBodies) {
- if (/\brpm\b|\.rpm/i.test(body)) {
- throw new Error('Desktop release body must not advertise RPM packages');
- }
- }
- JS
-
- - name: Install frontend dependencies
- working-directory: studio/frontend
- # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
- # required for `vite build`. The pre-install lockfile structural
- # audit (lockfile_supply_chain_audit.py) is the practical defence
- # against the npm postinstall-dropper class -- it fires BEFORE any
- # tarball runs, on the injection pattern itself rather than an
- # advisory-DB lookup.
- run: npm install --no-fund --no-audit
-
- # ── Rust ──
- - name: Install Rust stable
- uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable @ 2026-03-27
- with:
- targets: ${{ matrix.platform == 'macos-latest' && 'aarch64-apple-darwin,x86_64-apple-darwin' || '' }}
-
- - name: Patch desktop app version
- shell: bash
- working-directory: studio/src-tauri
- run: |
- set -euo pipefail
- if command -v python3 >/dev/null 2>&1; then
- PYTHON=python3
- else
- PYTHON=python
- fi
- "$PYTHON" <<'PY'
- import os
- import pathlib
- import re
- import sys
-
- app_version = os.environ['APP_VERSION']
- if not app_version:
- sys.exit('APP_VERSION is required')
-
- cargo_toml = pathlib.Path('Cargo.toml')
- lines = cargo_toml.read_text().splitlines(keepends=True)
- in_package = False
- patched = False
- for index, line in enumerate(lines):
- stripped = line.strip()
- if stripped == '[package]':
- in_package = True
- continue
- if stripped.startswith('[') and stripped.endswith(']'):
- in_package = False
- if in_package and re.fullmatch(r'version\s*=\s*"[^"]+"\s*', stripped):
- lines[index] = f'version = "{app_version}"\n'
- patched = True
- break
- if not patched:
- sys.exit('Could not patch [package] version in Cargo.toml')
- cargo_toml.write_text(''.join(lines))
-
- cargo_lock = pathlib.Path('Cargo.lock')
- lock_text = cargo_lock.read_text()
- lock_text, count = re.subn(
- r'(?m)(^\[\[package\]\]\nname = "unsloth-studio"\nversion = ")[^"]+(")',
- lambda match: f'{match.group(1)}{app_version}{match.group(2)}',
- lock_text,
- )
- if count != 1:
- sys.exit(f'Could not patch unsloth-studio version in Cargo.lock (matches={count})')
- cargo_lock.write_text(lock_text)
- PY
-
- cargo metadata --locked --no-deps --format-version 1 > "$RUNNER_TEMP/cargo-metadata.json"
- "$PYTHON" <<'PY'
- import json
- import os
- import pathlib
- import sys
-
- app_version = os.environ['APP_VERSION']
- metadata = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'cargo-metadata.json').read_text())
- versions = [package['version'] for package in metadata.get('packages', []) if package.get('name') == 'unsloth-studio']
- if versions != [app_version]:
- sys.exit(f'cargo metadata unsloth-studio version mismatch: expected {app_version}, got {versions}')
- PY
-
- git diff -- Cargo.toml Cargo.lock
-
- - name: Rust cache
- uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32
- with:
- workspaces: 'studio/src-tauri -> target'
-
- # ── macOS: import signing certificate ──
- - name: Import Apple certificate
- if: matrix.platform == 'macos-latest'
- env:
- APPLE_CERTIFICATE: ${{ secrets.APPLE_CERTIFICATE }}
- APPLE_CERTIFICATE_PASSWORD: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }}
- KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }}
- run: |
- echo $APPLE_CERTIFICATE | base64 --decode > certificate.p12
- security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
- security default-keychain -s build.keychain
- security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
- security set-keychain-settings -t 3600 -u build.keychain
- security import certificate.p12 -k build.keychain -P "$APPLE_CERTIFICATE_PASSWORD" -T /usr/bin/codesign
- security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain
- security find-identity -v -p codesigning build.keychain
- rm -f certificate.p12
-
- # ── Windows: install Azure Trusted Signing CLI ──
- - name: Install trusted-signing-cli
- if: matrix.platform == 'windows-latest'
- run: |
- cargo install trusted-signing-cli --version 0.9.0 --locked
- echo "$env:USERPROFILE\.cargo\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-
- # ── Windows: verify signing CLI is accessible ──
- - name: Verify trusted-signing-cli
- if: matrix.platform == 'windows-latest'
- run: |
- Write-Output "PATH: $env:PATH"
- Get-Command trusted-signing-cli -ErrorAction SilentlyContinue || Write-Output "trusted-signing-cli NOT in PATH"
- trusted-signing-cli --version || Write-Output "trusted-signing-cli failed to run"
-
- # ── Linux: build + sign + upload ──
- - name: Build Linux app
- if: matrix.platform == 'ubuntu-22.04'
- uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
- TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }}
- with:
- projectPath: studio
- tauriScript: npx --prefix . tauri
- tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }}
- releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}'
- releaseBody: |
- Desktop app for Unsloth Studio.
-
- **macOS**: Download the Apple Silicon `.dmg`.
- **Windows**: Download the `-setup.exe` installer.
- **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal).
-
- > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package.
- > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64`
- > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually.
- releaseDraft: ${{ inputs.draft }}
- prerelease: ${{ needs.prepare-version.outputs.prerelease }}
- args: -v ${{ matrix.args }}
-
- # ── macOS: build + sign + notarize + upload ──
- - name: Build macOS app
- if: matrix.platform == 'macos-latest'
- uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
- TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }}
- APPLE_SIGNING_IDENTITY: ${{ secrets.APPLE_SIGNING_IDENTITY }}
- APPLE_ID: ${{ secrets.APPLE_ID }}
- APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }}
- APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }}
- with:
- projectPath: studio
- tauriScript: npx --prefix . tauri
- tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }}
- releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}'
- releaseBody: |
- Desktop app for Unsloth Studio.
-
- **macOS**: Download the Apple Silicon `.dmg`.
- **Windows**: Download the `-setup.exe` installer.
- **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal).
-
- > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package.
- > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64`
- > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually.
- releaseDraft: ${{ inputs.draft }}
- prerelease: ${{ needs.prepare-version.outputs.prerelease }}
- args: -v ${{ matrix.args }}
-
- # ── Windows: build + sign + upload ──
- - name: Build Windows app
- if: matrix.platform == 'windows-latest'
- uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }}
- TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }}
- AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
- AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
- AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
- AZURE_TRUSTED_SIGNING_ACCOUNT_NAME: ${{ secrets.AZURE_TRUSTED_SIGNING_ACCOUNT_NAME }}
- AZURE_CERTIFICATE_PROFILE_NAME: ${{ secrets.AZURE_CERTIFICATE_PROFILE_NAME }}
- with:
- projectPath: studio
- tauriScript: npx --prefix . tauri
- tagName: ${{ needs.prepare-version.outputs.desktop_release_tag }}
- releaseName: 'Unsloth Studio (Desktop) ${{ needs.prepare-version.outputs.studio_version }}'
- releaseBody: |
- Desktop app for Unsloth Studio.
-
- **macOS**: Download the Apple Silicon `.dmg`.
- **Windows**: Download the `-setup.exe` installer.
- **Linux**: Download `.deb` (Ubuntu/Debian) or `.AppImage` (universal).
-
- > Linux in-app updates are AppImage-oriented. Package installs should update by downloading a new package.
- > Linux AppImage on Ubuntu 24.04+ may require: `sudo apt install libfuse2t64`
- > First-run system dependency elevation is supported on Ubuntu/Debian. Other Linux distributions should install system packages manually.
- releaseDraft: ${{ inputs.draft }}
- prerelease: ${{ needs.prepare-version.outputs.prerelease }}
- args: -v ${{ matrix.args }}
-
- # Release process note: only non-draft workflow runs advance the public
- # desktop-latest updater channel. Draft builds are for private review; if a
- # draft is manually published later, this channel intentionally remains
- # unchanged until a narrow manual channel-publish flow is added or a public
- # desktop release is created by running this workflow with draft=false.
- publish-updater-channel:
- name: Publish desktop updater channel
- needs: [prepare-version, build]
- if: ${{ !inputs.draft }}
- runs-on: ubuntu-latest
- permissions:
- contents: write
- env:
- GH_REPO: ${{ github.repository }}
- APP_VERSION: ${{ needs.prepare-version.outputs.app_version }}
- STUDIO_VERSION: ${{ needs.prepare-version.outputs.studio_version }}
- DESKTOP_RELEASE_TAG: ${{ needs.prepare-version.outputs.desktop_release_tag }}
- DESKTOP_PRERELEASE: ${{ needs.prepare-version.outputs.prerelease }}
-
- steps:
- - name: Download versioned updater metadata
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- run: |
- set -euo pipefail
- mkdir -p "$RUNNER_TEMP/desktop-updater"
- gh api "repos/${GITHUB_REPOSITORY}/releases/tags/${DESKTOP_RELEASE_TAG}" > "$RUNNER_TEMP/source-release.json"
- python3 <<'PY'
- import json
- import os
- import pathlib
- import sys
-
- source = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'source-release.json').read_text())
- expected_tag = os.environ['DESKTOP_RELEASE_TAG']
- if source.get('tag_name') != expected_tag:
- sys.exit(f'Expected source release {expected_tag}, got {source.get("tag_name")}')
- if source.get('draft'):
- sys.exit(f'Source desktop release {expected_tag} is draft; refusing to publish public updater channel')
- PY
- gh release download "$DESKTOP_RELEASE_TAG" --pattern latest.json --dir "$RUNNER_TEMP/desktop-updater" --clobber
- test -s "$RUNNER_TEMP/desktop-updater/latest.json"
-
- - name: Validate versioned updater metadata
- shell: bash
- run: |
- python3 <<'PY'
- import json
- import os
- import pathlib
- import re
- import sys
-
- app_version = os.environ['APP_VERSION']
- release_tag = os.environ['DESKTOP_RELEASE_TAG']
- latest_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-updater', 'latest.json')
- data = json.loads(latest_path.read_text())
- if not isinstance(data, dict):
- sys.exit('latest.json must be a JSON object')
-
- version = data.get('version')
- if not isinstance(version, str) or not version:
- sys.exit('latest.json missing version')
- if not re.fullmatch(r'v?\d+\.\d+\.\d+(?:[-+][0-9A-Za-z.-]+)?', version):
- sys.exit(f'latest.json version is not SemVer-like: {version}')
- if version.removeprefix('v') != app_version:
- sys.exit(f'latest.json version {version} does not match desktop app version {app_version}')
-
- platforms = data.get('platforms')
- if not isinstance(platforms, dict) or not platforms:
- sys.exit('latest.json missing platforms')
-
- required_families = {
- 'darwin-aarch64': False,
- 'linux-x86_64': False,
- 'windows-x86_64': False,
- }
- expected_prefix = f'https://github.com/unslothai/unsloth/releases/download/{release_tag}/'
- forbidden_fragments = ('/releases/latest/', '/releases/download/desktop-latest/')
-
- for platform, entry in platforms.items():
- if not isinstance(entry, dict):
- sys.exit(f'Platform {platform} must be an object')
- url = entry.get('url')
- signature = entry.get('signature')
- if not isinstance(url, str) or not url.strip():
- sys.exit(f'Platform {platform} missing url')
- if not isinstance(signature, str) or not signature.strip():
- sys.exit(f'Platform {platform} missing signature')
- if any(fragment in url for fragment in forbidden_fragments):
- sys.exit(f'Platform {platform} points at a moving updater channel: {url}')
- if not url.startswith(expected_prefix):
- sys.exit(f'Platform {platform} URL must point at {release_tag}: {url}')
- for family in required_families:
- if platform == family or platform.startswith(family + '-'):
- required_families[family] = True
-
- missing = [family for family, found in required_families.items() if not found]
- if missing:
- sys.exit('latest.json missing required platform families: ' + ', '.join(missing))
- PY
-
- - name: Ensure desktop updater channel release
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- run: |
- set -euo pipefail
- channel_json="$RUNNER_TEMP/desktop-latest-release.json"
- if ! gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$channel_json" 2>/dev/null; then
- gh release create desktop-latest \
- --title "Unsloth Studio Desktop updater channel" \
- --notes "Machine-managed desktop updater channel; latest.json is replaced by release-desktop.yml." \
- --prerelease \
- --latest=false \
- --target "$GITHUB_SHA"
- gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$channel_json"
- fi
-
- python3 <<'PY'
- import json
- import os
- import pathlib
- import sys
-
- channel = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-latest-release.json').read_text())
- if channel.get('draft'):
- sys.exit('desktop-latest release is draft; refusing to publish updater channel')
- if channel.get('immutable'):
- sys.exit('desktop-latest release is immutable; cannot replace latest.json')
- if not channel.get('prerelease'):
- sys.exit('desktop-latest release must be a prerelease so it cannot compete with repo-wide latest')
- PY
-
- - name: Prevent updater channel downgrade
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- run: |
- set -euo pipefail
- mkdir -p "$RUNNER_TEMP/desktop-current"
- if ! gh release download desktop-latest --pattern latest.json --dir "$RUNNER_TEMP/desktop-current" --clobber 2>/dev/null; then
- echo "No existing desktop-latest latest.json found; allowing first channel publish."
- exit 0
- fi
- python3 <<'PY'
- import json
- import os
- import pathlib
- import re
- import sys
-
- def parse(value: str):
- value = value.removeprefix('v')
- match = re.fullmatch(
- r'(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)'
- r'(?:-([0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?'
- r'(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?',
- value,
- )
- if not match:
- sys.exit(f'desktop-latest latest.json has invalid version: {value}')
- major, minor, patch, prerelease = match.groups()
- return (int(major), int(minor), int(patch), prerelease)
-
- def numeric_tail(identifier: str) -> tuple[str, int] | None:
- match = re.fullmatch(r'([A-Za-z-]+)(\d+)', identifier)
- if not match:
- return None
- return (match.group(1).lower(), int(match.group(2)))
-
- def compare_identifier(left: str, right: str) -> int:
- left_num = left.isdigit()
- right_num = right.isdigit()
- if left_num and right_num:
- return (int(left) > int(right)) - (int(left) < int(right))
- if left_num:
- return -1
- if right_num:
- return 1
-
- left_tail = numeric_tail(left)
- right_tail = numeric_tail(right)
- if left_tail and right_tail and left_tail[0] == right_tail[0]:
- return (left_tail[1] > right_tail[1]) - (left_tail[1] < right_tail[1])
-
- return (left > right) - (left < right)
-
- def compare_prerelease(left: str | None, right: str | None) -> int:
- if left == right:
- return 0
- if left is None:
- return 1
- if right is None:
- return -1
- left_parts = left.split('.')
- right_parts = right.split('.')
- for left_part, right_part in zip(left_parts, right_parts):
- order = compare_identifier(left_part, right_part)
- if order:
- return order
- return (len(left_parts) > len(right_parts)) - (len(left_parts) < len(right_parts))
-
- def compare(left: str, right: str) -> int:
- left_major, left_minor, left_patch, left_pre = parse(left)
- right_major, right_minor, right_patch, right_pre = parse(right)
- left_core = (left_major, left_minor, left_patch)
- right_core = (right_major, right_minor, right_patch)
- if left_core != right_core:
- return (left_core > right_core) - (left_core < right_core)
- return compare_prerelease(left_pre, right_pre)
-
- current_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-current', 'latest.json')
- next_path = pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-updater', 'latest.json')
- current = json.loads(current_path.read_text()).get('version')
- next_version = json.loads(next_path.read_text()).get('version')
- if not isinstance(current, str) or not isinstance(next_version, str):
- sys.exit('Could not compare desktop-latest channel versions')
- if compare(next_version, current) < 0:
- sys.exit(
- f'Refusing to move desktop-latest from {current} to older version {next_version}.'
- )
- PY
-
- - name: Publish desktop updater channel metadata
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- run: |
- set -euo pipefail
- gh release upload desktop-latest "$RUNNER_TEMP/desktop-updater/latest.json" --clobber
- gh api "repos/${GITHUB_REPOSITORY}/releases/tags/desktop-latest" > "$RUNNER_TEMP/desktop-latest-release.json"
- python3 <<'PY'
- import json
- import os
- import pathlib
- import sys
-
- channel = json.loads(pathlib.Path(os.environ['RUNNER_TEMP'], 'desktop-latest-release.json').read_text())
- assets = [asset for asset in channel.get('assets', []) if asset.get('name') == 'latest.json']
- if len(assets) != 1:
- sys.exit(f'Expected exactly one desktop-latest latest.json asset, found {len(assets)}')
- expected_url = f'https://github.com/{os.environ["GITHUB_REPOSITORY"]}/releases/download/desktop-latest/latest.json'
- actual_url = assets[0].get('browser_download_url')
- if actual_url != expected_url:
- sys.exit(f'desktop-latest latest.json URL mismatch: expected {expected_url}, got {actual_url}')
- PY
diff --git a/.github/workflows/safetensors-tool-loop-ci.yml b/.github/workflows/safetensors-tool-loop-ci.yml
new file mode 100644
index 0000000000..17be0c8fcf
--- /dev/null
+++ b/.github/workflows/safetensors-tool-loop-ci.yml
@@ -0,0 +1,488 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
+
+# Cross-platform smoke for the Studio safetensors agentic tool loop.
+# Runs the new test_safetensors_tool_loop.py plus the directly-adjacent
+# tool / inference / anthropic suites against three runner OSes
+# (ubuntu-latest, macos-14, windows-latest) under the same CPU-only
+# torch + transformers shape Studio's main backend job uses.
+#
+# Why a dedicated workflow:
+# - The safetensors loop ships with 41 unit tests that exercise the
+# state machine, parser, allowlist, IPC kwarg forwarding, and
+# template fallback path. None of them call into torch's CUDA
+# subsystem or load a model, so a free CPU runner is enough.
+# - Path filter pins the workflow to files this PR actually changes
+# plus the workflow itself, so unrelated commits don't re-trigger.
+# - Concurrency.cancel-in-progress: each new push to the staging
+# branch supersedes the previous run -- iterating is cheap.
+#
+# Cost guard: one Python version, three OS matrix cells, ~3 min each.
+# No Windows-runner queue pressure beyond a single matrix entry.
+
+name: Safetensors tool loop CI
+
+on:
+ pull_request:
+ paths:
+ - 'studio/backend/core/inference/**'
+ - 'studio/backend/routes/inference.py'
+ - 'studio/backend/tests/test_safetensors_tool_loop.py'
+ - 'studio/backend/utils/datasets/**'
+ - '.github/workflows/safetensors-tool-loop-ci.yml'
+ push:
+ branches: [safetensors-tool-loop-staging]
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+jobs:
+ safetensors-loop:
+ name: ${{ matrix.os }} / py3.11
+ runs-on: ${{ matrix.os }}
+ timeout-minutes: 20
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest, macos-14, windows-latest]
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+ with:
+ python-version: '3.11'
+ cache: 'pip'
+
+ # The safetensors tool loop talks to no GPU and does not call
+ # model.generate; it parses cumulative text and fakes tool
+ # results. Install the CPU torch wheel only because Studio's
+ # `from utils.hardware import ...` chain imports torch at module
+ # scope. Same shape Studio's main backend job uses.
+ #
+ # `shell: bash` everywhere -- windows-latest ships Git Bash, and
+ # using one shell across the matrix keeps the `\` line
+ # continuations valid without forking the script per OS.
+ - name: Install CPU torch + transformers
+ shell: bash
+ run: |
+ python -m pip install --upgrade pip
+ pip install --index-url https://download.pytorch.org/whl/cpu \
+ 'torch>=2.4,<2.11'
+ pip install 'transformers>=4.51,<5.5'
+
+ - name: Install Studio backend dependencies (CPU only)
+ shell: bash
+ run: |
+ pip install \
+ pytest pytest-asyncio httpx \
+ fastapi 'pydantic>=2' pyjwt cryptography python-multipart \
+ structlog pyyaml jinja2 mammoth unpdf requests typer \
+ aiofiles sqlalchemy huggingface_hub matplotlib datasets \
+ 'numpy<3'
+
+ - name: Run safetensors tool-loop tests
+ shell: bash
+ working-directory: studio/backend
+ env:
+ # Windows: GITHUB_WORKSPACE is a Windows path with backslashes
+ # but bash + setup-python resolve mixed separators just fine
+ # for PYTHONPATH purposes.
+ PYTHONPATH: ${{ github.workspace }}/studio/backend
+ UNSLOTH_COMPILE_DISABLE: '1'
+ run: |
+ python -m pytest tests/test_safetensors_tool_loop.py -v --tb=short
+
+ - name: Run adjacent tool / inference suites (regression guard)
+ shell: bash
+ working-directory: studio/backend
+ env:
+ PYTHONPATH: ${{ github.workspace }}/studio/backend
+ UNSLOTH_COMPILE_DISABLE: '1'
+ run: |
+ python -m pytest \
+ tests/test_openai_tool_passthrough.py \
+ tests/test_responses_tool_passthrough.py \
+ tests/test_inference_model_validation.py \
+ tests/test_anthropic_thinking_translation.py \
+ tests/test_anthropic_code_execution.py \
+ tests/test_anthropic_messages.py \
+ -q --tb=short
+
+ # macos-14 is Apple Silicon (M-series), so we can exercise the MLX
+ # path that workspace_40 / GGUF runners cannot reach. Loads
+ # unsloth/Qwen3.5-0.8B for real via MLXInferenceBackend and asserts
+ # the chat_template_info forwarded over IPC produces
+ # supports_tools=True + supports_reasoning=True. This is the exact
+ # bug the user reported on Mac.
+ mlx-qwen35-probe:
+ name: macos-14 / MLX Qwen3.5-0.8B
+ runs-on: macos-14
+ timeout-minutes: 25
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+ with:
+ python-version: '3.11'
+ cache: 'pip'
+
+ # mlx + mlx-lm are macOS / Apple Silicon only.
+ - name: Install MLX + Studio backend deps
+ shell: bash
+ run: |
+ python -m pip install --upgrade pip
+ pip install 'mlx>=0.20' 'mlx-lm>=0.20' 'mlx-vlm>=0.1' || \
+ pip install mlx mlx-lm mlx-vlm
+ pip install 'transformers>=4.51,<5.5'
+ pip install \
+ pytest pytest-asyncio httpx \
+ fastapi 'pydantic>=2' pyjwt cryptography python-multipart \
+ structlog pyyaml jinja2 mammoth unpdf requests typer \
+ aiofiles sqlalchemy huggingface_hub matplotlib datasets \
+ 'numpy<3'
+ # unsloth-zoo carries unsloth_zoo.mlx.loader.FastMLXModel.
+ pip install --no-deps 'unsloth-zoo' || true
+
+ - name: Probe Qwen3.5-0.8B chat_template + capability flags via MLX
+ shell: bash
+ working-directory: studio/backend
+ env:
+ PYTHONPATH: ${{ github.workspace }}/studio/backend
+ UNSLOTH_COMPILE_DISABLE: '1'
+ HF_HUB_ENABLE_HF_TRANSFER: '0'
+ run: |
+ python - <<'PY'
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path.cwd()))
+
+ # 1. Load model via MLXInferenceBackend (the real Mac path).
+ from utils.models.model_config import ModelConfig
+ from core.inference.mlx_inference import MLXInferenceBackend
+ from routes.inference import _detect_safetensors_features
+
+ repo = "unsloth/Qwen3.5-0.8B"
+ cfg = ModelConfig.from_identifier(model_id=repo)
+ assert cfg is not None, "ModelConfig.from_identifier returned None"
+
+ backend = MLXInferenceBackend()
+ ok = backend.load_model(config=cfg, max_seq_length=2048)
+ assert ok, "MLXInferenceBackend.load_model returned False"
+
+ # 2. chat_template_info must be populated on backend.models[name].
+ entry = backend.models[backend.active_model_name]
+ assert "chat_template_info" in entry, (
+ "chat_template_info missing from MLX backend.models -- "
+ "the Mac IPC fix did not land"
+ )
+ tpl = entry["chat_template_info"].get("template")
+ assert tpl, "chat_template is None / empty after MLX load"
+ assert "" in tpl, "Qwen3.5 template missing "
+
+ # 3. Route-layer classification -- the LoadResponse contract.
+ class _Stub:
+ def __init__(self): self.active_model_name = repo
+ def _is_gpt_oss_model(self): return False
+ flags = _detect_safetensors_features(_Stub(), tpl)
+ print("MLX flags for", repo, "=", flags)
+ assert flags["supports_tools"] is True, flags
+ assert flags["supports_reasoning"] is True, flags
+ assert flags["reasoning_style"] == "enable_thinking", flags
+ print("PASS: MLX Qwen3.5-0.8B advertises tools + reasoning")
+ PY
+
+ # The previous step proved chat_template_info reaches the route;
+ # this step exercises real generate_chat_response calls with the
+ # four template kwargs (tools / enable_thinking / reasoning_effort
+ # / preserve_thinking) so the Mac sees the same wire as Linux.
+ # We bound each cartesian cell to ~12 tokens to keep CI fast --
+ # the goal is plumbing, not benchmark quality.
+ - name: Cartesian generate probe (tools x thinking) via MLX
+ shell: bash
+ working-directory: studio/backend
+ env:
+ PYTHONPATH: ${{ github.workspace }}/studio/backend
+ UNSLOTH_COMPILE_DISABLE: '1'
+ HF_HUB_ENABLE_HF_TRANSFER: '0'
+ run: |
+ python - <<'PY'
+ import sys, threading, time
+ from pathlib import Path
+ sys.path.insert(0, str(Path.cwd()))
+
+ from utils.models.model_config import ModelConfig
+ from core.inference.mlx_inference import MLXInferenceBackend
+ from core.inference import safetensors_agentic
+
+ repo = "unsloth/Qwen3.5-0.8B"
+ cfg = ModelConfig.from_identifier(model_id=repo)
+ backend = MLXInferenceBackend()
+ assert backend.load_model(config=cfg, max_seq_length=2048)
+
+ WEB_SEARCH = {
+ "type": "function",
+ "function": {
+ "name": "web_search",
+ "description": "Search the web.",
+ "parameters": {
+ "type": "object",
+ "properties": {"query": {"type": "string"}},
+ "required": ["query"],
+ },
+ },
+ }
+ PYTHON = {
+ "type": "function",
+ "function": {
+ "name": "python",
+ "description": "Run Python.",
+ "parameters": {
+ "type": "object",
+ "properties": {"code": {"type": "string"}},
+ "required": ["code"],
+ },
+ },
+ }
+
+ combos = [
+ dict(label="baseline", kw={}),
+ dict(label="thinking_on", kw={"enable_thinking": True}),
+ dict(label="thinking_off", kw={"enable_thinking": False}),
+ dict(label="web_search_only", kw={"tools": [WEB_SEARCH]}),
+ dict(label="python_only", kw={"tools": [PYTHON]}),
+ dict(label="both_tools+thinking", kw={"tools": [WEB_SEARCH, PYTHON], "enable_thinking": True}),
+ ]
+
+ for combo in combos:
+ t0 = time.time()
+ gen = backend.generate_chat_response(
+ messages=[{"role": "user", "content": "Briefly: 2 plus 3?"}],
+ system_prompt="You are a helpful assistant.",
+ max_new_tokens=12,
+ cancel_event=threading.Event(),
+ **combo["kw"],
+ )
+ chunks = []
+ for cumulative in gen:
+ chunks.append(cumulative)
+ if len(chunks) >= 12:
+ break
+ dt = time.time() - t0
+ assert chunks, f"{combo['label']}: no chunks yielded"
+ last = chunks[-1]
+ print(f" {combo['label']:<22} dt={dt:5.1f}s out[-80:]={last[-80:]!r}")
+
+ # Plumbing-only assertion: enable_thinking=True must produce
+ # a visibly different output than enable_thinking=False at
+ # the same prompt. Cross-checks that the kwarg actually
+ # reaches the model.
+ base = next(c for c in combos if c["label"] == "baseline")["kw"]
+ tk_on = next(c for c in combos if c["label"] == "thinking_on")["kw"]
+ tk_off = next(c for c in combos if c["label"] == "thinking_off")["kw"]
+ assert tk_on != tk_off, "thinking_on / thinking_off kwargs collapsed"
+
+ # Agentic loop end-to-end is covered by the 43 unit tests on
+ # ubuntu/macos/windows; skipping the real-model agentic
+ # probe here because CPU-only macos-14 runners cannot
+ # complete a VLM generation pass within the job timeout.
+ print("PASS: MLX cartesian probe exercised end-to-end")
+ PY
+
+ # Multi-format parser probe: exercise the extended tool_call_parser
+ # (Llama-3 <|python_tag|>, Llama-3.2 bare JSON, Mistral pre-v11 +
+ # v11+ + [ARGS], Gemma 4 <|tool_call>) against synthetic emissions
+ # drawn from each family's official chat template. Runs on the
+ # three matrix OSes so we catch any regex / json edge case the
+ # parser still has cross-platform.
+ multi-format-parser:
+ name: ${{ matrix.os }} / multi-format parser
+ runs-on: ${{ matrix.os }}
+ timeout-minutes: 10
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest, macos-14, windows-latest]
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ with:
+ persist-credentials: false
+
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+ with:
+ python-version: '3.11'
+ cache: 'pip'
+
+ # The parser itself is pure Python, but importing
+ # ``core.inference.tool_call_parser`` triggers the package
+ # __init__ which loads ``orchestrator.py`` and pulls in
+ # structlog / fastapi / pydantic. Install the same minimal
+ # backend deps the other jobs use; still no torch / mlx since
+ # the parser does not need them.
+ - name: Install backend deps (CPU only, no torch / mlx)
+ shell: bash
+ run: |
+ python -m pip install --upgrade pip
+ pip install \
+ pytest pytest-asyncio httpx \
+ fastapi 'pydantic>=2' pyjwt cryptography python-multipart \
+ structlog pyyaml jinja2 mammoth unpdf requests typer \
+ aiofiles sqlalchemy huggingface_hub matplotlib datasets \
+ 'numpy<3'
+
+ - name: Probe all five emission formats end-to-end
+ shell: bash
+ working-directory: studio/backend
+ env:
+ PYTHONPATH: ${{ github.workspace }}/studio/backend
+ run: |
+ python - <<'PY'
+ """Verify the shared parser turns every supported family's
+ emission shape into the same OpenAI-format tool call."""
+ import json, sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path.cwd()))
+
+ from core.inference.tool_call_parser import (
+ parse_tool_calls_from_text,
+ has_tool_signal,
+ strip_tool_markup,
+ TOOL_XML_SIGNALS,
+ )
+
+ # Every emission marker the streaming loop watches for.
+ for marker in (
+ "",
+ "",
+ "[TOOL_CALLS]",
+ "<|tool_call>",
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>",
+ "<|tool_calls_section_begin|>",
+ "<|tool_call_begin|>",
+ ):
+ assert marker in TOOL_XML_SIGNALS, marker
+
+ fixtures = [
+ # (label, raw_text, expected_name, expected_args)
+ ("Qwen ",
+ '{"name":"web_search","arguments":{"q":"x"}}',
+ "web_search", {"q": "x"}),
+ ("Qwen3.5 ",
+ "print(1)",
+ "python", {"code": "print(1)"}),
+ ("Llama-3 <|python_tag|>.call",
+ '<|python_tag|>brave_search.call(query="Tokyo")',
+ "brave_search", {"query": "Tokyo"}),
+ ("Llama-3 <|python_tag|>JSON",
+ '<|python_tag|>{"name":"web_search","parameters":{"q":"x"}}',
+ "web_search", {"q": "x"}),
+ ("Llama-3.2 bare JSON",
+ '{"name":"web_search","parameters":{"q":"Tokyo weather"}}',
+ "web_search", {"q": "Tokyo weather"}),
+ ("Mistral pre-v11 array",
+ '[TOOL_CALLS] [{"name":"web_search","arguments":{"q":"x"}}]',
+ "web_search", {"q": "x"}),
+ ("Mistral v11+ name{json}",
+ '[TOOL_CALLS]add{"a":3.5,"b":4}',
+ "add", {"a": 3.5, "b": 4}),
+ ("Ministral v11+ [ARGS]",
+ '[TOOL_CALLS]add[ARGS]{"a":1,"b":2}',
+ "add", {"a": 1, "b": 2}),
+ ("Gemma 4 <|tool_call>",
+ '<|tool_call>call:get_weather{'
+ 'location:<|"|>Tokyo<|"|>,units:<|"|>celsius<|"|>'
+ '}',
+ "get_weather", {"location": "Tokyo", "units": "celsius"}),
+ # DeepSeek R1 (code-fenced).
+ ("DeepSeek R1 fence",
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>function"
+ "<|tool▁sep|>special_function\n"
+ "```json\n"
+ '{"arg1": 1}\n'
+ "```"
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>",
+ "special_function", {"arg1": 1}),
+ # DeepSeek V3.1 (bare JSON).
+ ("DeepSeek V3.1 bare",
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>get_time"
+ "<|tool▁sep|>"
+ '{"city": "Tokyo"}'
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>",
+ "get_time", {"city": "Tokyo"}),
+ # GLM 4.x.
+ ("GLM 4.x +",
+ "web_search\n"
+ "query\n"
+ "weather Tokyo\n"
+ "",
+ "web_search", {"query": "weather Tokyo"}),
+ # Kimi K2.
+ ("Kimi K2 section",
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>functions.special_function:0"
+ "<|tool_call_argument_begin|>"
+ '{"arg1": 1}'
+ "<|tool_call_end|>"
+ "<|tool_calls_section_end|>",
+ "special_function", {"arg1": 1}),
+ ]
+
+ for label, text, expected_name, expected_args in fixtures:
+ assert has_tool_signal(text) or label == "Llama-3.2 bare JSON", (
+ f"{label}: streaming loop would NOT wake -- "
+ f"has_tool_signal returned False"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert result, f"{label}: parser returned no calls for {text!r}"
+ assert result[0]["function"]["name"] == expected_name, (
+ f"{label}: got name={result[0]['function']['name']!r} "
+ f"expected {expected_name!r}"
+ )
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == expected_args, (
+ f"{label}: got args={args!r} expected {expected_args!r}"
+ )
+ # Stripping must remove the markup so downstream UI does
+ # not echo it as assistant prose.
+ stripped = strip_tool_markup(text, final=True)
+ if label == "Llama-3.2 bare JSON":
+ # Bare JSON path does not strip (the JSON itself is
+ # the entire response) -- ensure stripping is safe.
+ assert isinstance(stripped, str)
+ else:
+ assert "" not in stripped
+ assert "<|tool▁calls" not in stripped
+ assert "<|tool_calls_section" not in stripped
+ assert "[TOOL_CALLS]" not in stripped
+ assert "<|tool_call>" not in stripped
+ print(f" OK {label:28s} -> {expected_name}({expected_args})")
+
+ print("PASS: all 9 emission shapes parsed correctly")
+ PY
+
+ - name: Run the new multi-format parser unit tests
+ shell: bash
+ working-directory: studio/backend
+ env:
+ PYTHONPATH: ${{ github.workspace }}/studio/backend
+ UNSLOTH_COMPILE_DISABLE: '1'
+ run: |
+ python -m pytest \
+ tests/test_safetensors_tool_loop.py::TestParserMultiFormat \
+ -v --tb=short
diff --git a/.github/workflows/security-audit.yml b/.github/workflows/security-audit.yml
deleted file mode 100644
index a1e7b2efa6..0000000000
--- a/.github/workflows/security-audit.yml
+++ /dev/null
@@ -1,1126 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Multi-language supply-chain audit. Triggers:
-# - PRs touching any dependency manifest (Python / npm / Cargo) or
-# this workflow file,
-# - push to main / pip,
-# - nightly @ 04:13 UTC so newly-published advisories surface even
-# when no PR opens,
-# - workflow_dispatch for ad-hoc invocations.
-#
-# Two jobs:
-# - advisory-audit: one runner that runs pip-audit + npm audit +
-# cargo audit back-to-back. All three are
-# advisory-DB lookups -- fast, lockfile-driven,
-# no archive download. Setting up the python /
-# node / rust toolchains on one runner and
-# running the three commands serially is
-# cheaper than spinning up three runners.
-# - pip-scan-packages: 3-shard matrix that downloads + pattern-scans
-# every PyPI archive in the transitive closure.
-# This is the expensive job (~6 min/shard,
-# running in parallel) and it must stay
-# independent so a CVE-DB hit in advisory-audit
-# does not block the supply-chain pattern scan
-# (or vice versa).
-#
-# All steps are non-blocking initially. The default branch already
-# carries a known-vuln backlog (the dependabot banner shows 17 today,
-# pip-audit catches 2 more, npm/cargo will catch their own); a hard
-# gate now would block every PR on a baseline we have not triaged.
-# As each baseline closes, drop continue-on-error per step.
-#
-# Dependency coverage:
-# - unsloth core (pyproject.toml [project.dependencies])
-# - unsloth `huggingfacenotorch` extras (the canonical install path
-# for fine-tuning users; pulls transformers / peft / accelerate /
-# trl / datasets / diffusers / sentence-transformers / etc.)
-# - all six Studio backend requirements files
-# - Studio frontend (npm) and Tauri shell (cargo)
-# Each Python step builds a filtered dep list from pyproject.toml +
-# requirements/*.txt before auditing. We do NOT install any of these
-# -- pip-audit resolves through PyPI metadata, scan_packages.py
-# downloads sdist/wheel archives and inspects them without running
-# install hooks, so an attacker who has compromised a transitive dep
-# cannot execute code in this workflow.
-
-name: Security audit
-
-on:
- pull_request:
- paths:
- - 'studio/backend/requirements/**'
- - 'studio/frontend/package.json'
- - 'studio/frontend/package-lock.json'
- - 'studio/src-tauri/Cargo.toml'
- - 'studio/src-tauri/Cargo.lock'
- - 'pyproject.toml'
- - 'scripts/scan_packages.py'
- - 'scripts/scan_npm_packages.py'
- - '.github/workflows/security-audit.yml'
- push:
- branches: [main, pip]
- schedule:
- - cron: '13 4 * * *' # 04:13 UTC daily, off the cron rush
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- # ─────────────────────────────────────────────────────────────────────
- # Combined advisory-DB audit: pip-audit + npm audit + cargo audit
- # all on one runner. Each step is continue-on-error so a finding in
- # one toolchain does not suppress the others.
- # ─────────────────────────────────────────────────────────────────────
- advisory-audit:
- name: advisory audit (pip + npm + cargo)
- runs-on: ubuntu-latest
- timeout-minutes: 25
- steps:
- # step-security/harden-runner installs an eBPF-based egress
- # firewall on the runner. In `audit` mode it logs every outbound
- # connection without blocking; in `block` mode it rejects
- # anything outside `allowed-endpoints`. We run audit-only
- # initially: the next time this job hits a real PyPI advisory or
- # an attacker-funded archive in pip-scan-packages, the audit log
- # tells us exactly which hosts were dialed and we promote the
- # allowlist to block. Would have *contained* the litellm exfil
- # even if scan_packages had missed the .pth payload.
- # SHA-pinned (not @v2): the litellm 1.82.7 attack chain hijacked
- # mutable tags on aquasecurity/trivy-action and would have hit
- # anyone using @v0 / @v2 / @latest references. Pinning to a 40-
- # char SHA freezes this action at known-good code; Dependabot's
- # github-actions ecosystem will auto-bump the SHA.
- # v2.19.1 commit:
- # Per-job allowlist: advisory-audit hits PyPI, npm registry,
- # crates.io advisories, GitHub release artefacts (osv-scanner
- # binary), Semgrep registry, and TruffleHog's own GitHub action.
- - name: Harden runner (egress block)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: block
- disable-sudo: true
- allowed-endpoints: >
- api.github.com:443
- github.com:443
- codeload.github.com:443
- objects.githubusercontent.com:443
- raw.githubusercontent.com:443
- release-assets.githubusercontent.com:443
- registry.npmjs.org:443
- pypi.org:443
- files.pythonhosted.org:443
- static.rust-lang.org:443
- index.crates.io:443
- static.crates.io:443
- crates.io:443
- semgrep.dev:443
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- # Full history so TruffleHog can diff base..head; without
- # this it sees only the latest commit and reports nothing.
- fetch-depth: 0
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable @ 2026-03-27
-
- - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2.9.1
- with:
- workspaces: studio/src-tauri -> target
-
- - name: Install pip-audit + cargo-audit
- # cargo-audit pulls advisories from the RustSec advisory-db on
- # first run and caches them under ~/.cargo/advisory-db. Pin
- # --locked so the version we install matches Cargo.lock
- # determinism. cargo-audit 0.22 supports the CVSS 4.0 schema
- # used in 2026 advisories (e.g. RUSTSEC-2026-0073); 0.21
- # crashes with a TOML parse error on that file.
- # npm audit is bundled with the node toolchain, no install.
- run: |
- python -m pip install --upgrade pip 'pip-audit>=2.7'
- cargo install --locked --version '^0.22' cargo-audit
-
- # ─────────────────────────────────────────────────────────────
- # Python: pip-audit
- # ─────────────────────────────────────────────────────────────
- - name: Build filtered Python requirements set
- # Two transforms:
- # (1) Generate audit-reqs/unsloth-deps.txt from pyproject.toml
- # so pip-audit sees the unsloth pip package's own dep set
- # (core + huggingfacenotorch extras: transformers / peft /
- # accelerate / trl / datasets / diffusers /
- # sentence-transformers / huggingface_hub / hf_transfer /
- # etc.).
- # (2) Copy each studio/backend/requirements/*.txt into
- # audit-reqs/ with `git+` lines stripped. pip-audit's `-r`
- # mode does a dry-run resolve against PyPI metadata; a
- # `git+https://...` spec forces it to clone, which is
- # both slow and outside the threat model (we audit
- # PyPI-served archives; a git ref is whatever HEAD says
- # on the runner). A comment line is left in place so the
- # skipped specs are obvious in the artifact.
- # The `huggingface` extra is `huggingfacenotorch` plus torch /
- # torchvision / triton, deliberately skipped: Studio backend
- # already pins a torch and the +cu* / +cpu local-version tags
- # trip up the PyPI resolver in `-r` mode.
- run: |
- mkdir -p audit-reqs
- python <<'PY' > audit-reqs/unsloth-deps.txt
- import tomllib
- with open("pyproject.toml", "rb") as f:
- d = tomllib.load(f)
- core = d["project"]["dependencies"]
- extras = d["project"]["optional-dependencies"]["huggingfacenotorch"]
- print("# Auto-generated from pyproject.toml by security-audit.yml.")
- print("# core deps + huggingfacenotorch extras.")
- for spec in core + extras:
- print(spec)
- PY
- for f in studio.txt extras.txt extras-no-deps.txt \
- no-torch-runtime.txt overrides.txt triton-kernels.txt; do
- python < "audit-reqs/$f"
- src = "studio/backend/requirements/$f"
- with open(src) as fh:
- for line in fh:
- stripped = line.strip()
- before_comment = stripped.split("#", 1)[0]
- if "git+" in before_comment:
- print(f"# [security-audit] skipped git+ spec: {stripped}")
- continue
- print(line.rstrip("\n"))
- PY
- done
-
- - name: pip-audit (declared Python deps, no install)
- # `-r requirements.txt` resolves the requirements through pip's
- # dependency resolver against PyPI metadata and audits the
- # resolved tree without ever executing setup.py / install
- # hooks. Way faster than installing the full Studio runtime
- # and -- critically -- safer: an attacker who has compromised
- # a transitive dep cannot run code in this job.
- #
- # extras.txt + extras-no-deps.txt have legacy setup.py
- # packages (notably openai-whisper) whose setup.py imports
- # `pkg_resources`, which the isolated build env's current
- # setuptools no longer ships. PIP_CONSTRAINT pins an older
- # setuptools into the build env so those builds resolve.
- # Per-file loop so one bad file doesn't take out the whole
- # audit.
- continue-on-error: true
- env:
- PIP_CONSTRAINT: ${{ github.workspace }}/audit-reqs/build-constraints.txt
- run: |
- set +e
- cat > audit-reqs/build-constraints.txt <<'CONSTRAINTS'
- setuptools<78
- wheel
- CONSTRAINTS
- : > logs-pip-audit.txt
- for f in unsloth-deps studio extras extras-no-deps \
- no-torch-runtime overrides triton-kernels; do
- if ! grep -qE '^[^#[:space:]]' "audit-reqs/$f.txt"; then
- echo "[security-audit] $f.txt has no PyPI specs after git+ filter, skipping" \
- | tee -a logs-pip-audit.txt
- continue
- fi
- echo "::group::pip-audit -r audit-reqs/$f.txt"
- {
- echo
- echo "=== $f ==="
- pip-audit -r "audit-reqs/$f.txt" --format=columns
- echo "=== end $f (rc=$?) ==="
- } 2>&1 | tee -a logs-pip-audit.txt
- echo "::endgroup::"
- done
- {
- echo "## pip-audit (Python)"
- echo
- echo '### Coverage'
- echo '- unsloth core + `huggingfacenotorch` extras (pyproject.toml)'
- echo '- studio/backend/requirements/{studio,extras,extras-no-deps,no-torch-runtime,overrides,triton-kernels}.txt'
- echo '- `git+` specs are stripped before audit (out of scope: we audit PyPI archives)'
- echo
- echo '### Findings'
- echo '```'
- cat logs-pip-audit.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # Pre-install lockfile supply-chain audit (npm + cargo).
- # Catches structural anomalies (non-registry resolved URLs,
- # missing integrity hashes, known IOC strings) BEFORE `npm
- # audit` or OSV-Scanner consult the advisory DB. The advisory
- # path is reactive -- there is a window between a malicious
- # publication and the GHSA landing. This step fires on the
- # injection pattern itself so it catches the same class of
- # attack the moment the lockfile shape becomes wrong.
- # ─────────────────────────────────────────────────────────────
- - name: Lockfile supply-chain audit (pre-install scan)
- run: |
- python3 scripts/lockfile_supply_chain_audit.py
- {
- echo "## Lockfile supply-chain audit"
- echo
- echo "Scanned: studio/frontend/package-lock.json + studio/src-tauri/Cargo.lock"
- echo
- echo "No structural anomalies or known IOC strings."
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # npm: Studio frontend
- # ─────────────────────────────────────────────────────────────
- - name: npm audit (Studio frontend)
- # `npm audit` resolves the lockfile through the npmjs.com
- # advisory DB. `--audit-level=high` filters the noise floor
- # to only HIGH and CRITICAL. We do NOT pass --omit=dev: a
- # malicious dev-only dep can still steal secrets from a CI
- # runner, so dev deps need to be in the audit surface.
- continue-on-error: true
- working-directory: studio/frontend
- run: |
- set +e
- npm audit --audit-level=high | tee ../../logs-npm-audit.txt
- # Always also write the full JSON for grep-ability.
- npm audit --json > ../../logs-npm-audit.json || true
- {
- echo "## npm audit (Studio frontend)"
- echo
- echo '```'
- tail -200 ../../logs-npm-audit.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # cargo: Studio Tauri shell
- # ─────────────────────────────────────────────────────────────
- - name: cargo audit (Studio Tauri)
- # `--deny warnings` would make the job fail on any advisory.
- # Keep non-blocking initially; drop continue-on-error after
- # the baseline closes.
- continue-on-error: true
- working-directory: studio/src-tauri
- run: |
- set +e
- cargo audit | tee ../../logs-cargo-audit.txt
- {
- echo "## cargo audit (Studio Tauri)"
- echo
- echo '```'
- tail -200 ../../logs-cargo-audit.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # OSV-Scanner: cross-ecosystem advisory DB (PyPI + npm + cargo)
- # ─────────────────────────────────────────────────────────────
- - name: OSV-Scanner (PyPI + npm + cargo, cross-ecosystem advisories)
- # OSV's advisory feed is a superset of GitHub-Advisory + RustSec
- # + npm advisories; running it alongside the per-ecosystem audit
- # tools catches CVEs that haven't propagated to the per-ecosystem
- # DBs yet (e.g. langchain-core CVE-2025-68664 was on OSV before
- # GitHub Advisory). Single binary, one transitive resolver, all
- # three lockfile types in one pass. Non-blocking until baselines
- # close.
- continue-on-error: true
- run: |
- set +e
- # OSV-Scanner ships a raw binary (no tarball) in v2.x.
- curl -fsSL -o /tmp/osv-scanner \
- https://github.com/google/osv-scanner/releases/download/v2.0.2/osv-scanner_linux_amd64
- chmod +x /tmp/osv-scanner
- /tmp/osv-scanner --version
- /tmp/osv-scanner scan source \
- --lockfile=studio/frontend/package-lock.json \
- --lockfile=studio/src-tauri/Cargo.lock \
- --lockfile=requirements.txt:audit-reqs/unsloth-deps.txt \
- --lockfile=requirements.txt:audit-reqs/studio.txt \
- --lockfile=requirements.txt:audit-reqs/no-torch-runtime.txt \
- --lockfile=requirements.txt:audit-reqs/overrides.txt \
- --lockfile=requirements.txt:audit-reqs/extras.txt \
- --lockfile=requirements.txt:audit-reqs/extras-no-deps.txt \
- --format=table 2>&1 | tee logs-osv-scanner.txt
- {
- echo "## OSV-Scanner (cross-ecosystem)"
- echo
- echo '```'
- tail -200 logs-osv-scanner.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # Semgrep: design-flaw detection (catches what regex-pattern
- # scanning of malicious authors cannot — first-party logic bugs
- # like langchain-core CVE-2025-68664 dumps/dumpd injection,
- # n8n CVE-2025-68668 _pyodide.eval_code sandbox escape, marimo
- # CVE-2026-39987 unauth WebSocket).
- # ─────────────────────────────────────────────────────────────
- - name: Semgrep (supply-chain + python rule packs)
- continue-on-error: true
- run: |
- set +e
- python -m pip install --quiet 'semgrep>=1.95'
- semgrep --version
- semgrep scan \
- --config p/supply-chain \
- --config p/python \
- --config p/javascript \
- --config p/security-audit \
- --severity ERROR --severity WARNING \
- --metrics off \
- --timeout 120 \
- studio/backend unsloth scripts \
- 2>&1 | tee logs-semgrep.txt
- {
- echo "## Semgrep (supply-chain + python + javascript rules)"
- echo
- echo '```'
- tail -200 logs-semgrep.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # Lockfile pin verifier. The litellm 1.82.7 attack window was
- # ~40 minutes; anyone resolving with `>=` got the malicious
- # version automatically. Flag every spec in the requirements
- # files that does not pin to an exact `==` (or `@` for git
- # refs, or `===` for arbitrary equality). Warning-only for now;
- # graduate to blocking once the baseline is clean.
- # ─────────────────────────────────────────────────────────────
- - name: Lockfile pin verifier (Python requirements)
- continue-on-error: true
- run: |
- python <<'PY' | tee logs-pin-verifier.txt
- import re
- from pathlib import Path
-
- # Specs that look like `pkg==1.2.3` or `pkg @ git+...` or
- # bare comments / -r lines are pinned-or-not-applicable.
- PINNED = re.compile(r"^\s*[A-Za-z0-9_.\-]+\s*(?:===|==)\s*[^,;]+\s*$")
- GIT_OR_URL = re.compile(r"^\s*[A-Za-z0-9_.\-]+\s*@\s*(?:git\+|https?://)")
-
- unpinned = []
- for f in sorted(Path("studio/backend/requirements").glob("*.txt")):
- for i, raw in enumerate(f.read_text().splitlines(), 1):
- line = raw.strip()
- if not line or line.startswith("#") or line.startswith("-"):
- continue
- spec = line.split("#", 1)[0].strip().split(";", 1)[0].strip()
- if not spec:
- continue
- if "git+" in spec or PINNED.match(spec) or GIT_OR_URL.match(spec):
- continue
- unpinned.append((str(f), i, line))
-
- print(f"::group::Lockfile pin status")
- if unpinned:
- print(f"WARN: {len(unpinned)} non-`==` specs across requirements/*.txt")
- print("(litellm 1.82.7 wave hit anyone on `>=`; tighten when feasible.)")
- for f, i, line in unpinned[:80]:
- print(f" {f}:{i}: {line}")
- if len(unpinned) > 80:
- print(f" ... and {len(unpinned) - 80} more")
- else:
- print("OK: every spec is exact-pinned.")
- print("::endgroup::")
- PY
- {
- echo "## Lockfile pin verifier"
- echo
- echo '```'
- cat logs-pin-verifier.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # Trivy is deliberately NOT installed here. Trivy was the entry
- # point for the litellm 1.82.7 supply-chain compromise (March
- # 2026): attackers force-rewrote 76 of 77 tags in
- # aquasecurity/trivy-action to point at malicious commits;
- # anyone running the action with a tag ref auto-pulled a
- # credential-harvesting payload. By design a security scanner
- # has broad read access to runner secrets, which is exactly
- # what made it the ideal pivot. We pick up Trivy's CVE coverage
- # from OSV-Scanner (NVD + GHSA + GitLab) and its secret
- # detection from TruffleHog. IaC misconfig detection (Trivy's
- # one unique value-add) is unfilled for now -- revisit with
- # checkov / kics when we ship a Dockerfile or k8s manifests.
- # See https://docs.litellm.ai/blog/security-update-march-2026
- # and the Microsoft / Trend Micro / Snyk incident write-ups.
- # ─────────────────────────────────────────────────────────────
-
- # ─────────────────────────────────────────────────────────────
- # TruffleHog secret-leak scan on the PR diff. Catches API keys
- # / tokens / cred files committed accidentally. --only-verified
- # filters out probabilistic findings, so we only flag tokens
- # that the source provider confirmed are live. On push to main
- # / pip we scan the full repo; on PR we scan base..head.
- # SHA-pinned for the same reason as harden-runner above.
- # v3.95.2 commit:
- # ─────────────────────────────────────────────────────────────
- - name: TruffleHog (secrets in diff)
- continue-on-error: true
- uses: trufflesecurity/trufflehog@37b77001d0174ebec2fcca2bd83ff83a6d45a3ab # v3.95.3
- with:
- path: ./
- base: ${{ github.event.pull_request.base.sha || '' }}
- head: ${{ github.event.pull_request.head.sha || github.sha }}
- # The action passes --no-update internally; passing it here
- # too triggers `flag 'no-update' cannot be repeated`. Stick
- # with --only-verified so we only flag tokens the source
- # provider confirmed are live (no probabilistic findings).
- extra_args: --only-verified
-
- # ─────────────────────────────────────────────────────────────
- # CycloneDX SBOM. Lets downstream consumers audit what's
- # actually shipped in unsloth wheels and the Studio backend
- # runtime. Generates one JSON file per requirements input plus
- # a combined SBOM keyed off pyproject.toml; uploads as a build
- # artifact (and a future step can attest it via SLSA).
- # ─────────────────────────────────────────────────────────────
- - name: Generate CycloneDX SBOM
- continue-on-error: true
- run: |
- set +e
- python -m pip install --quiet 'cyclonedx-bom>=4.6'
- mkdir -p sbom
- # Per-requirements-file SBOM (the audit-reqs/ files are the
- # filtered, git+-stripped views built earlier in this job).
- # cyclonedx-py 4.x uses `--sv` for spec version and `-o` for
- # the output file; the older `--schema-version`/`--outfile`
- # spellings are not accepted.
- for f in audit-reqs/*.txt; do
- base=$(basename "$f" .txt)
- if grep -qE '^[^#[:space:]]' "$f"; then
- cyclonedx-py requirements "$f" \
- --sv 1.6 \
- --of JSON \
- -o "sbom/sbom-$base.json" 2>&1 | tail -5 || true
- fi
- done
- # Project-level SBOM from pyproject.toml.
- cyclonedx-py environment \
- --sv 1.6 \
- --of JSON \
- -o sbom/sbom-environment.json 2>&1 | tail -5 || true
- ls -la sbom/
- {
- echo "## CycloneDX SBOM"
- echo
- echo "Generated SBOM files:"
- ls sbom/ | sed 's/^/- sbom\//'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # GitHub Actions pinning verifier. tj-actions/changed-files
- # was compromised in March 2025; anyone using `@v4` (a mutable
- # ref) auto-shipped the malicious version. Catch every
- # non-SHA-pinned `uses:` across the workflows tree. Warn-only
- # initially so the existing baseline doesn't block PRs.
- # ─────────────────────────────────────────────────────────────
- - name: GitHub Actions pinning verifier
- continue-on-error: true
- run: |
- python <<'PY' | tee logs-actions-pinning.txt
- import re
- from pathlib import Path
- # SHA pin = 40 hex chars after @
- SHA_PIN = re.compile(r"@[0-9a-f]{40}\b")
- # First-party / GitHub-published actions get a softer pass
- # (still recommended to pin; not a security gate).
- FIRST_PARTY = re.compile(r"^\s*-\s*uses:\s*(actions|github)/[^@]+@")
- USES = re.compile(r"^\s*-\s*uses:\s*([^@\s]+)@(\S+)")
- unpinned_third = []
- unpinned_first = []
- for f in sorted(Path(".github/workflows").glob("*.yml")):
- for i, line in enumerate(f.read_text().splitlines(), 1):
- m = USES.match(line)
- if not m:
- continue
- name, ref = m.group(1), m.group(2)
- if SHA_PIN.search(line):
- continue
- bucket = unpinned_first if FIRST_PARTY.match(line) else unpinned_third
- bucket.append((str(f), i, name, ref))
- print("::group::Action pinning status")
- print(f"third-party actions on mutable refs: {len(unpinned_third)}")
- for f, i, n, r in unpinned_third:
- print(f" HIGH {f}:{i}: {n}@{r}")
- print()
- print(f"first-party (actions/* | github/*) on mutable refs: {len(unpinned_first)}")
- for f, i, n, r in unpinned_first[:30]:
- print(f" WARN {f}:{i}: {n}@{r}")
- if len(unpinned_first) > 30:
- print(f" ... and {len(unpinned_first) - 30} more")
- print()
- print("Recommendation: pin third-party actions to a 40-char SHA.")
- print("Dependabot's github-actions ecosystem will auto-bump them.")
- print("::endgroup::")
- PY
- {
- echo "## GitHub Actions pinning verifier"
- echo
- echo '```'
- cat logs-actions-pinning.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- # ─────────────────────────────────────────────────────────────
- # Hash-pin verifier. `==` pinning protects against version
- # drift but not against a re-uploaded malicious wheel at the
- # same version (PyPI lets a yanked release be re-published with
- # different bytes for ~5 minutes via `--filename` collision).
- # `pip install --require-hashes` rejects any download whose
- # SHA-256 doesn't match. Inspector step that reports how many
- # specs would gain from a hash pin -- conversion is a roadmap
- # item (needs pip-tools / uv pip compile --generate-hashes).
- # ─────────────────────────────────────────────────────────────
- - name: Hash-pin verifier (Python requirements)
- continue-on-error: true
- run: |
- python <<'PY' | tee logs-hash-verifier.txt
- import re
- from pathlib import Path
- PINNED = re.compile(r"^\s*[A-Za-z0-9_.\-]+\s*==\s*[^,;]+\s*$")
- HASH_LINE = re.compile(r"--hash=sha256:[0-9a-f]{64}")
- total_pinned = 0
- with_hash = 0
- for f in sorted(Path("studio/backend/requirements").glob("*.txt")):
- text = f.read_text()
- for raw in text.splitlines():
- line = raw.strip()
- if not line or line.startswith("#") or line.startswith("-"):
- continue
- spec = line.split("#", 1)[0].strip().split(";", 1)[0]
- if PINNED.match(spec):
- total_pinned += 1
- if HASH_LINE.search(raw):
- with_hash += 1
- print(f"::group::Hash-pin status")
- print(f" exact == pins: {total_pinned}")
- print(f" with --hash=sha256: {with_hash}")
- print(f" without --hash: {total_pinned - with_hash}")
- print()
- print("Roadmap: convert to hash-locked installs via")
- print("`uv pip compile --generate-hashes` and `pip install --require-hashes`.")
- print("Hash-locked installs would have refused a republished")
- print("malicious litellm 1.82.7 wheel even at the same version.")
- print("::endgroup::")
- PY
- {
- echo "## Hash-pin verifier"
- echo
- echo '```'
- cat logs-hash-verifier.txt
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- if: always()
- with:
- name: advisory-audit-logs
- path: |
- logs-pip-audit.txt
- logs-npm-audit.txt
- logs-npm-audit.json
- logs-cargo-audit.txt
- logs-osv-scanner.txt
- logs-semgrep.txt
- logs-pin-verifier.txt
- logs-actions-pinning.txt
- logs-hash-verifier.txt
- audit-reqs/
- sbom/
- retention-days: 30
-
- # ─────────────────────────────────────────────────────────────────────
- # Python: pre-install package scan (no install, no execution)
- # ─────────────────────────────────────────────────────────────────────
- pip-scan-packages:
- # Downloads each declared dep WITHOUT installing it and inspects
- # the archive contents for known malicious patterns: weaponized
- # .pth files, credential stealers, obfuscated payloads,
- # install-time droppers, suspicious subprocess / network /
- # base64-blob combinations.
- #
- # This is the kind of check that would have caught:
- # - litellm 1.82.7 / 1.82.8 (March 2026, supply-chain compromise)
- # - the typo-squat campaign against PyTorch Lightning
- # before either landed in the install path. pip-audit only knows
- # about CVE-published vulnerabilities, so it does NOT see novel
- # malicious uploads. scan_packages.py runs deterministic regex
- # pattern matching, no LLM calls.
- #
- # `--with-deps` makes the scan transitive: every package the
- # declared set resolves to gets fetched and pattern-scanned, not
- # just the top-level pins. Resolving the full transitive closure
- # of the unsloth + Studio dep tree downloads several hundred
- # archives, hence the longer timeout.
- #
- # Sharded across runners for wall-clock parallelism. Each shard
- # runs scan_packages.py once with --with-deps so its own slice
- # benefits from pip's deduped transitive resolve. Shard
- # composition tries to balance load:
- # - hf-stack: pyproject extras + no-torch-runtime
- # (~150 archives, transformers/peft/accelerate/...)
- # - studio: FastAPI/Studio backend + overrides + extras-no-deps
- # (~150 archives, smaller scientific stack)
- # - extras: the heavy openai-whisper / scikit-learn / librosa
- # stack (~250 archives, dominant cost)
- # triton-kernels.txt is git+-only, fully skipped.
- name: ${{ matrix.shard.name }}
- runs-on: ubuntu-latest
- timeout-minutes: 25
- strategy:
- fail-fast: false
- matrix:
- shard:
- - name: 'pip scan-packages :: hf-stack'
- id: hf-stack
- files: 'unsloth-deps no-torch-runtime'
- - name: 'pip scan-packages :: studio'
- id: studio
- files: 'studio overrides extras-no-deps'
- - name: 'pip scan-packages :: extras'
- id: extras
- files: 'extras'
- steps:
- # Egress block on every shard. Each shard pulls hundreds of
- # PyPI archives -- if a malicious wheel ever phones home from
- # within the scanner sandbox (it shouldn't; we never execute
- # the archive), harden-runner now rejects the connect outright.
- # Per-job allowlist: pip-scan-packages only fetches PyPI archives
- # via scan_packages.py + pip download. No npm or cargo traffic.
- - name: Harden runner (egress block)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: block
- disable-sudo: true
- allowed-endpoints: >
- api.github.com:443
- github.com:443
- codeload.github.com:443
- objects.githubusercontent.com:443
- pypi.org:443
- files.pythonhosted.org:443
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Install scan_packages.py runtime deps
- # scan_packages.py imports requests + packaging at runtime to
- # talk to PyPI's JSON API and to parse version specifiers. We
- # do not install the packages it scans -- those are downloaded
- # raw and inspected without ever touching `pip install`.
- run: python -m pip install --upgrade pip requests packaging
-
- - name: Build filtered requirements set
- # Mirrors the advisory-audit job's input transform: pyproject.toml
- # extraction + git+ stripping. scan_packages.py downloads
- # PyPI archives without building, so it tolerates legacy
- # setup.py packages (no resolver dry-run); but `--with-deps`
- # delegates resolution to a single `pip download` call that
- # cannot satisfy `git+` specs without git operations, so we
- # strip them here too.
- run: |
- mkdir -p audit-reqs
- python <<'PY' > audit-reqs/unsloth-deps.txt
- import tomllib
- with open("pyproject.toml", "rb") as f:
- d = tomllib.load(f)
- core = d["project"]["dependencies"]
- extras = d["project"]["optional-dependencies"]["huggingfacenotorch"]
- print("# Auto-generated from pyproject.toml by security-audit.yml.")
- print("# core deps + huggingfacenotorch extras.")
- for spec in core + extras:
- print(spec)
- PY
- for f in studio.txt extras.txt extras-no-deps.txt \
- no-torch-runtime.txt overrides.txt triton-kernels.txt; do
- python < "audit-reqs/$f"
- src = "studio/backend/requirements/$f"
- with open(src) as fh:
- for line in fh:
- stripped = line.strip()
- before_comment = stripped.split("#", 1)[0]
- if "git+" in before_comment:
- print(f"# [security-audit] skipped git+ spec: {stripped}")
- continue
- print(line.rstrip("\n"))
- PY
- done
-
- - name: Sanity-check scan_packages.py
- # The scanner lives at scripts/scan_packages.py in this repo
- # so we don't depend on a network fetch at job time.
- run: |
- test -f scripts/scan_packages.py
- head -3 scripts/scan_packages.py
- grep -q "Standalone pre-install package scanner" scripts/scan_packages.py
-
- - name: Scan declared + transitive Python deps
- # scan_packages.py exits 1 on CRITICAL/HIGH findings, 0 on
- # clean. We swallow the exit because the baseline isn't
- # triaged yet; surface the findings in the workflow summary.
- # Drop continue-on-error after the first clean run on main.
- #
- # `--with-deps` walks PyPI metadata to enumerate every
- # transitive dep the declared set would install, then scans
- # them all. Without this flag, we'd only catch a malicious
- # *direct* dep -- and supply-chain attacks usually land
- # several hops down (litellm 1.82.7 was a dep of a dep for
- # most users).
- #
- # This step runs once per matrix shard. Within a shard, every
- # -r file is fed to a single `pip download` call so pip
- # intersects version constraints and yields a deduped
- # transitive set (no point fetching the same transformers
- # wheel five times). Across shards we accept some redundant
- # downloads in exchange for wall-clock parallelism.
- env:
- SHARD_FILES: ${{ matrix.shard.files }}
- run: |
- set +e
- mkdir -p logs
- LOG="logs-scan-packages-${{ matrix.shard.id }}.txt"
- echo "::group::shard ${{ matrix.shard.id }} input files"
- REQ_ARGS=()
- for f in $SHARD_FILES; do
- if grep -qE '^[^#[:space:]]' "audit-reqs/$f.txt"; then
- echo " + audit-reqs/$f.txt"
- REQ_ARGS+=( -r "audit-reqs/$f.txt" )
- else
- echo " - audit-reqs/$f.txt (empty after git+ filter, skipping)"
- fi
- done
- echo "::endgroup::"
- if [ ${#REQ_ARGS[@]} -eq 0 ]; then
- echo "[security-audit] shard ${{ matrix.shard.id }}: no PyPI specs, nothing to scan" \
- | tee "$LOG"
- else
- python scripts/scan_packages.py --with-deps "${REQ_ARGS[@]}" \
- 2>&1 | tee "$LOG"
- fi
- {
- echo "## scan_packages :: shard ${{ matrix.shard.id }}"
- echo
- echo "### Files in this shard"
- for f in $SHARD_FILES; do echo "- audit-reqs/$f.txt"; done
- echo
- echo '### Findings (tail)'
- echo '```'
- tail -200 "$LOG"
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- if: always()
- with:
- name: scan-packages-log-${{ matrix.shard.id }}
- path: |
- logs-scan-packages-${{ matrix.shard.id }}.txt
- audit-reqs/
- retention-days: 30
-
- # ─────────────────────────────────────────────────────────────────────
- # npm: pre-install tarball content scan.
- # ─────────────────────────────────────────────────────────────────────
- npm-scan-packages:
- # Counterpart to pip-scan-packages for the npm side. Reads
- # studio/frontend/package-lock.json, downloads each resolved
- # tarball DIRECTLY from registry.npmjs.org (never via `npm
- # install` -- no lifecycle scripts ever run), verifies the
- # lockfile integrity hash, unpacks each tarball into a sandboxed
- # temp dir behind size / count / path-escape / symlink guards,
- # and pattern-scans the extracted file contents for the
- # signatures common to npm supply-chain attacks:
- #
- # - lifecycle (preinstall / install / postinstall / prepare)
- # scripts in any package.json that fetch + execute external
- # code,
- # - C2 / exfiltration hosts (getsession.org, AWS IMDS,
- # Kubernetes ServiceAccount token paths, GitHub Actions OIDC,
- # HashiCorp Vault endpoints),
- # - credential-stealing references (.npmrc, .aws/credentials,
- # GITHUB_TOKEN / NPM_TOKEN in JS sources),
- # - known IOC filenames (router_init.js, tanstack_runner.js,
- # router_runtime.js),
- # - obfuscation shapes (Function/eval against base64 blobs).
- #
- # Threat model: every tarball is hostile. Safety guarantees are
- # documented at scripts/scan_npm_packages.py top-of-file. The
- # script is stdlib-only so adding it does not increase the
- # transitive supply-chain surface.
- name: npm scan-packages (Studio frontend tarballs)
- runs-on: ubuntu-latest
- timeout-minutes: 30
- needs: []
- steps:
- # Per-job allowlist: npm-scan-packages only fetches tarballs from
- # registry.npmjs.org. GitHub endpoints retained for checkout +
- # setup-python action machinery.
- - name: Harden runner (egress block)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: block
- disable-sudo: true
- allowed-endpoints: >
- api.github.com:443
- github.com:443
- codeload.github.com:443
- objects.githubusercontent.com:443
- registry.npmjs.org:443
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- - name: Sanity-check scan_npm_packages.py
- run: |
- test -f scripts/scan_npm_packages.py
- python3 -c "import ast; ast.parse(open('scripts/scan_npm_packages.py').read())"
-
- - name: Scan npm tarballs (declared + transitive, no install)
- # The script exits 1 on HIGH/CRITICAL findings; we capture the
- # full log and surface it in the step summary either way. It
- # never runs `npm install`, never executes anything from a
- # downloaded tarball, and only fetches from registry.npmjs.org.
- # Initially non-blocking so the baseline can settle; drop
- # continue-on-error once the baseline is clean for a week.
- run: |
- set -o pipefail
- LOG=logs-scan-npm.txt
- python3 scripts/scan_npm_packages.py 2>&1 | tee "$LOG"
- {
- echo "## scan_npm_packages"
- echo
- echo '### Findings (tail)'
- echo '```'
- tail -300 "$LOG"
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- if: always()
- with:
- name: scan-npm-packages-log
- path: logs-scan-npm.txt
- retention-days: 30
-
- # ─────────────────────────────────────────────────────────────────────
- # Workflow-trigger lint. Refuses two patterns that together powered the
- # TanStack GHSA-g7cv-rxg3-hmpx supply-chain compromise:
- #
- # 1. `pull_request_target` -- runs a fork's workflow YAML against
- # the base repository's secrets. There is no safe use of this
- # trigger for a public open-source project.
- #
- # 2. Shared cache keys between PR-triggered workflows and the
- # publish workflow. A fork PR can poison the cache; the publish
- # workflow then restores the poisoned cache on next run.
- #
- # Cheap pure-Python lint, runs in seconds. Fail-closed.
- # ─────────────────────────────────────────────────────────────────────
- workflow-trigger-lint:
- name: workflow-trigger lint (pull_request_target / cache-poisoning)
- runs-on: ubuntu-latest
- timeout-minutes: 5
- steps:
- - name: Harden runner (egress block)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: block
- disable-sudo: true
- allowed-endpoints: >
- api.github.com:443
- github.com:443
- codeload.github.com:443
- objects.githubusercontent.com:443
- pypi.org:443
- files.pythonhosted.org:443
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- - name: Install PyYAML
- run: pip install pyyaml
-
- - name: Lint workflow triggers + cache keys
- run: python3 scripts/lint_workflow_triggers.py
-
- # ─────────────────────────────────────────────────────────────────────
- # Regression tests: pin scanner IOC tables and pre-install fixtures.
- # Hard gate (no continue-on-error) so future drift in the IOC tables
- # or scanner exit semantics fails this PR at review time.
- # ─────────────────────────────────────────────────────────────────────
- tests-security:
- name: pytest tests/security
- runs-on: ubuntu-latest
- timeout-minutes: 10
- steps:
- - name: Harden runner (egress block)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: block
- disable-sudo: true
- allowed-endpoints: >
- api.github.com:443
- github.com:443
- codeload.github.com:443
- objects.githubusercontent.com:443
- pypi.org:443
- files.pythonhosted.org:443
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- - name: Install pytest + PyYAML
- # PyYAML is imported by scripts/lint_workflow_triggers.py, which the
- # `tests/security/test_lint_workflow_triggers.py` regression suite
- # exercises as a subprocess. Without it the lint script bails with
- # `ERROR: PyYAML is required` (exit 2) and the 5 lint regression
- # tests fail. Pinned the same way pytest is pinned.
- run: pip install pytest==9.0.3 pyyaml==6.0.2
-
- - name: Run security regression tests
- run: python3 -m pytest tests/security -v
-
- # ─────────────────────────────────────────────────────────────────────
- # npm provenance + new install-script diff. Catches the two npm
- # supply-chain levers we don't yet gate on:
- #
- # 1. `npm audit signatures` validates the registry-signed
- # provenance of every tarball laid down in node_modules. Pulled
- # from the public npm transparency log; surfaces unsigned or
- # mis-signed deps. Informational for now (continue-on-error)
- # while the baseline settles.
- #
- # 2. `check_new_install_scripts.py` diffs the PR's lockfile
- # against the base ref and refuses any newly-added dep that
- # ships a postinstall hook. Every recent npm supply-chain
- # compromise leveraged a postinstall as the execution lever, so
- # blocking new ones at PR time is a small, high-signal gate.
- # ─────────────────────────────────────────────────────────────────────
- npm-provenance-and-install-scripts:
- name: npm provenance + new install-script diff
- runs-on: ubuntu-latest
- timeout-minutes: 15
- steps:
- - name: Harden runner (egress block)
- uses: step-security/harden-runner@a5ad31d6a139d249332a2605b85202e8c0b78450 # v2.19.1
- with:
- egress-policy: audit
- disable-sudo: true
- allowed-endpoints: >
- api.github.com:443
- github.com:443
- codeload.github.com:443
- objects.githubusercontent.com:443
- registry.npmjs.org:443
-
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- # Need the base commit accessible for `git show
- # :studio/frontend/package-lock.json` below.
- fetch-depth: 0
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- - name: Install Studio frontend deps (--ignore-scripts)
- # `npm audit signatures` requires node_modules to be populated.
- # `--ignore-scripts` is mandatory: this is exactly the lever the
- # new-install-script gate below protects against, and we must
- # not run any third-party hook to set up the audit.
- working-directory: studio/frontend
- run: npm ci --ignore-scripts
-
- - name: npm audit signatures (informational)
- # Surfaces unsigned / mis-signed packages from the npm
- # transparency log. continue-on-error during baseline-build
- # phase; promote to hard gate once the lockfile is fully
- # signed (most major maintainers signed by mid-2025).
- working-directory: studio/frontend
- continue-on-error: true
- run: |
- set -o pipefail
- LOG=logs-audit-signatures.txt
- npm audit signatures 2>&1 | tee "$LOG"
- {
- echo "## npm audit signatures"
- echo
- echo '```'
- tail -200 "$LOG"
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
-
- - name: Extract base-ref lockfile (PR triggers only)
- if: github.event_name == 'pull_request'
- run: |
- set -e
- BASE_SHA="${{ github.event.pull_request.base.sha }}"
- git show "$BASE_SHA:studio/frontend/package-lock.json" \
- > /tmp/base-package-lock.json
-
- - name: Diff for newly-added install-script deps
- if: github.event_name == 'pull_request'
- run: |
- python3 scripts/check_new_install_scripts.py \
- --base /tmp/base-package-lock.json \
- --head studio/frontend/package-lock.json
-
- - name: Skip install-script diff (non-PR trigger)
- if: github.event_name != 'pull_request'
- run: |
- echo "Not a pull_request event; install-script diff requires a base ref."
- echo "This step is intentionally a no-op outside PR triggers."
-
- - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- if: always()
- with:
- name: npm-audit-signatures-log
- path: studio/frontend/logs-audit-signatures.txt
- if-no-files-found: ignore
- retention-days: 30
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
deleted file mode 100644
index 1a4cf841d0..0000000000
--- a/.github/workflows/stale.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: 'Inactive Issue Pinger'
-
-on:
- schedule:
- - cron: '30 5 * * *' # Runs at 5:30 UTC every day
-
-jobs:
- stale:
- runs-on: ubuntu-latest
- permissions:
- issues: write
-
- steps:
- - uses: actions/stale@b5d41d4e1d5dceea10e7104786b73624c18a190f # v10.2.0
- with:
- # The message to post on stale issues.
- # This message will ping the issue author.
- # Note: The stale bot action does not currently support a direct placeholder for the last commenter.
- # As a workaround, this message encourages any participant to reply.
- stale-issue-message: >
- Is this issue still important to you?
- Apologies in advance we might have missed this issue as well.
- For faster response times, please post on our Reddit server - https://www.reddit.com/r/unsloth or our Discord - https://discord.com/invite/unsloth
-
- # The number of days of inactivity before an issue is considered stale.
- days-before-issue-stale: 9999
-
- # Set to -1 to never close stale issues.
- days-before-issue-close: -1
-
- # A label to apply to stale issues.
- stale-issue-label: 'inactive'
-
- # The number of operations to perform per run to avoid rate limiting.
- operations-per-run: 500
-
- enable-statistics: false
diff --git a/.github/workflows/studio-api-smoke.yml b/.github/workflows/studio-api-smoke.yml
deleted file mode 100644
index 53514e2ce1..0000000000
--- a/.github/workflows/studio-api-smoke.yml
+++ /dev/null
@@ -1,166 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Studio API & Auth Tests -- HTTP-level integration tests for the
-# FastAPI surface. No Playwright, no model UI; tests/studio/test_studio_api_smoke.py
-# runs ~30 s and asserts:
-# - CORS hardening (no wildcard + credentials, no bootstrap leak)
-# - /api/system + /api/system/hardware require auth
-# - Auth state machine + JWT expiry
-# - API key lifecycle E2E (create / list / use / delete / reject)
-# - Auth file-mode hardening (Linux only)
-# - Inference lifecycle (force reload, bogus variant, /v1/models, /v1/embeddings, /v1/responses)
-# - Endpoint-by-endpoint auth audit
-#
-# Reuses the GGUF cache key from studio-ui-smoke.yml so the model
-# download is one cache-hit on the second job.
-
-name: Studio API CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.sh'
- - 'pyproject.toml'
- - 'tests/studio/**'
- - '.github/workflows/studio-api-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- api-smoke:
- name: Studio API & Auth Tests
- runs-on: ubuntu-latest
- timeout-minutes: 12
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18893'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux deps
- run: |
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends \
- libcurl4-openssl-dev libssl-dev jq
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- # Same key as studio-ui-smoke.yml so the two jobs share a
- # single GGUF download across CI.
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Install pyjwt for the JWT-expiry forge test
- run: pip install 'pyjwt>=2.6'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
-
- - name: Pass bootstrap password + rotated targets to the test
- # The test does its own bootstrap-login + rotation to exercise
- # the auth state machine; we just pre-mint two random rotated
- # passwords for it. Mask them so the log is clean.
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "::add-mask::$NEW2"
- echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV"
- echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
- - name: Run Studio API & Auth tests
- # The script is named WITHOUT a `test_` prefix so it isn't
- # auto-collected by pytest in Backend CI's `tests/` walk
- # (which doesn't set BASE_URL and would crash at import).
- env:
- BASE_URL: http://127.0.0.1:18893
- STUDIO_AUTH_DIR: /home/runner/.unsloth/studio/auth
- run: python tests/studio/studio_api_smoke.py
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Upload API smoke logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: studio-api-smoke-log
- path: |
- logs/install.log
- logs/studio.log
- retention-days: 7
diff --git a/.github/workflows/studio-backend-ci.yml b/.github/workflows/studio-backend-ci.yml
deleted file mode 100644
index 63eb70f7f1..0000000000
--- a/.github/workflows/studio-backend-ci.yml
+++ /dev/null
@@ -1,221 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Runs the existing studio/backend/tests/ suite (~860 tests, all CPU-friendly)
-# on every PR that touches the backend or unsloth library. Until this lands,
-# none of those tests run automatically. Verified locally on Python 3.13 with
-# the surgical exclusions below: 861 pass, 4 skipped.
-#
-# Exclusions:
-# - tests/test_studio_api.py: end-to-end against a live model + GGUF download,
-# too heavy for free runners. Run separately when GPU CI is available.
-# - -k 'not llama_cpp_load_progress_live': spawns a real llama.cpp process,
-# not appropriate for CPU-only runners.
-#
-# Two jobs:
-# - pytest matrix (3.10/3.11/3.12/3.13) over studio/backend/tests
-# - repo-cpu-tests: auto-discovered tests/ + state-isolated spoof files
-#
-# Whole-repo Python lint (syntax + ruff + debugger-leftover scan)
-# moved to the dedicated `Lint CI` workflow (.github/workflows/lint-ci.yml)
-# so it fires on every PR rather than only on studio/unsloth/tests
-# path changes.
-
-name: Backend CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'tests/**'
- - 'pyproject.toml'
- - '.github/workflows/studio-backend-ci.yml'
- push:
- branches: [main, pip]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- pytest:
- name: (Python ${{ matrix.python }})
- runs-on: ubuntu-latest
- timeout-minutes: 15
- strategy:
- fail-fast: false
- matrix:
- python: ['3.10', '3.11', '3.12', '3.13']
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '${{ matrix.python }}'
- cache: 'pip'
-
- - name: Install backend test dependencies (CPU only)
- run: |
- python -m pip install --upgrade pip
- # Studio's declared backend deps:
- pip install -r studio/backend/requirements/studio.txt
- # Extras that studio.txt does not list but the import chain needs
- # (python-multipart for FastAPI form/file uploads, sqlalchemy/cryptography
- # for the auth DB, yaml/jinja2 for utils.models.model_config, etc.):
- pip install \
- python-multipart aiofiles sqlalchemy cryptography \
- pyyaml jinja2 mammoth unpdf requests \
- 'numpy<3' pytest pytest-asyncio httpx
- # Torch CPU + transformers are required by a chunk of the backend test
- # suite (gpu_selection, kv_cache_estimation, utils). CPU-only torch
- # keeps the install ~250 MB / ~1 min on a clean runner.
- pip install --index-url https://download.pytorch.org/whl/cpu 'torch>=2.4,<2.11'
- pip install 'transformers>=4.51,<5.5'
-
- - name: Backend tests
- working-directory: studio/backend
- # Locally validated against this dep set: 831 passed, 5 skipped, 35 deselected.
- # Deselections (all environment-specific, would never pass on a GPU-less
- # `ubuntu-latest` runner regardless of code correctness):
- # - llama_cpp_load_progress_live: spawns a real llama.cpp process
- # - TestGpuAutoSelection / TestPreSpawnGpuResolution / TestPerGpuFitGuardAllCounts:
- # require live transformers config introspection on real GPUs
- # - TestTransformersIntrospection: same
- # - test_returns_cuda_when_cuda_available / test_calls_cuda_cache_when_cuda:
- # assume CUDA-capable GPU
- run: |
- python -m pytest tests/ -q --tb=short \
- --ignore=tests/test_studio_api.py \
- -k 'not llama_cpp_load_progress_live and not TestGpuAutoSelection and not TestPreSpawnGpuResolution and not TestPerGpuFitGuardAllCounts and not TestTransformersIntrospection and not test_returns_cuda_when_cuda_available and not test_calls_cuda_cache_when_cuda'
-
- repo-cpu-tests:
- # Auto-discover everything under tests/ that is not GPU-bound by
- # design. New tests added in covered directories are picked up
- # without a workflow edit. Locally validated: 760 passed, 1 skipped,
- # 23 deselected. tests/conftest.py (mirroring unsloth-zoo PR #624)
- # pre-loads unsloth_zoo.device_type and unsloth.device_type under a
- # mocked torch.cuda.is_available so the unsloth import chain
- # succeeds on CPU.
- name: Repo tests (CPU)
- runs-on: ubuntu-latest
- timeout-minutes: 15
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- # node + uv unlock ~60 tests that previously skipped on CI:
- # - 9 tests in test_chat_preset_builtin_invariants.py need node to
- # compile a tiny TS harness against the frontend chat sources.
- # - tests/python/* spawn fresh `uv venv`s to verify the no-torch
- # install path; they self-skip when uv is missing.
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - name: Install uv (for tests/python/* sandboxed venvs)
- run: pip install uv
-
- - name: Install deps (shared shape with backend pytest job)
- run: |
- python -m pip install --upgrade pip
- pip install -r studio/backend/requirements/studio.txt
- pip install \
- python-multipart aiofiles sqlalchemy cryptography \
- pyyaml jinja2 mammoth unpdf requests typer \
- 'numpy<3' pytest pytest-asyncio httpx
- # torchvision: unsloth_zoo.vision_utils imports it at module scope.
- pip install --index-url https://download.pytorch.org/whl/cpu \
- 'torch>=2.4,<2.11' 'torchvision<0.26'
- pip install 'transformers>=4.51,<5.5'
- # bitsandbytes: hard import in unsloth/models/_utils.py. Recent
- # versions ship a CPU build that imports cleanly on Linux.
- pip install 'bitsandbytes>=0.45'
- # unsloth.device_type imports unsloth_zoo.utils.Version at module
- # scope, so the conftest preload needs unsloth_zoo even though
- # it is an optional dep of unsloth.
- pip install 'unsloth_zoo>=2026.5.1'
- pip install -e . --no-deps
-
- - name: Repo tests (CPU, auto-discovered)
- env:
- # tests/python/* import install_python_stack from studio/.
- PYTHONPATH: ${{ github.workspace }}/studio
- # Skip lazy compilation work the unsloth import chain wants to
- # do at import time on a real GPU.
- UNSLOTH_COMPILE_DISABLE: '1'
- # --ignore: GPU-bound directories (qlora/saving need real weights;
- # tests/sh is the shell suite the next step handles; tests/utils
- # is a helpers folder); tests/vllm_compat + tests/version_compat
- # are dedicated multi-version drift canaries with their own job
- # in version-compat-ci.yml that installs the heavier dep set
- # (torchcodec, full transformers/peft/bnb pins) those tests need.
- # State-sensitive hardware-spoofing files run in isolation in the
- # next step because they mutate hardware.py module globals.
- # -m: honour markers from tests/python/conftest.py (`server` =
- # needs studio venv, `e2e` = needs network).
- # --deselect:
- # - test_model_registration / test_all_model_registration:
- # hit huggingface_hub for live model existence checks.
- # - test_autoconfig_works_with_no_torch_runtime / test_autoconfig_succeeds:
- # fail because no-torch-runtime.txt does not pin tokenizers
- # and the latest tokenizers (0.23.1) is incompatible with the
- # transformers it resolves to. Tracked separately; this is a
- # real bug in the no-torch install path, not a CI issue.
- run: |
- python -m pytest tests/ -q --tb=short \
- --ignore=tests/qlora \
- --ignore=tests/saving \
- --ignore=tests/utils \
- --ignore=tests/sh \
- --ignore=tests/studio/test_hardware_dispatch_matrix.py \
- --ignore=tests/studio/test_is_mlx_dispatch_gate.py \
- --ignore=tests/vllm_compat \
- --ignore=tests/version_compat \
- -m 'not server and not e2e' \
- --deselect tests/test_model_registry.py::test_model_registration \
- --deselect tests/test_model_registry.py::test_all_model_registration \
- --deselect 'tests/python/test_tokenizers_and_torch_constraint.py::TestE2ETokenizersFix::test_autoconfig_works_with_no_torch_runtime' \
- --deselect 'tests/python/test_tokenizers_and_torch_constraint.py::TestE2EFullNoTorchSandbox::test_autoconfig_succeeds'
-
- - name: Hardware-spoof tests (state-sensitive, run in isolation)
- env:
- PYTHONPATH: ${{ github.workspace }}/studio
- UNSLOTH_COMPILE_DISABLE: '1'
- # These two files mutate hardware.py module globals at runtime
- # via the spoof fixtures, which leaks state into any other test
- # that imports hardware. Run them in their own pytest invocation
- # so the leak does not cross file boundaries.
- run: |
- python -m pytest -q --tb=short \
- tests/studio/test_hardware_dispatch_matrix.py \
- tests/studio/test_is_mlx_dispatch_gate.py
-
- - name: Shell installer tests
- # Subset that does not depend on a writable / pristine install.sh
- # tree; test_install_host_defaults.sh checks install.ps1 layout
- # which has drifted (separate followup).
- run: |
- set -e
- for s in \
- tests/sh/test_get_torch_index_url.sh \
- tests/sh/test_mac_intel_compat.sh \
- tests/sh/test_tauri_install_exit_order.sh \
- tests/sh/test_torch_constraint.sh; do
- echo "::group::$s"
- bash "$s"
- echo "::endgroup::"
- done
-
diff --git a/.github/workflows/studio-frontend-ci.yml b/.github/workflows/studio-frontend-ci.yml
deleted file mode 100644
index 1270a57ef6..0000000000
--- a/.github/workflows/studio-frontend-ci.yml
+++ /dev/null
@@ -1,151 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Frontend PR gate: lockfile freshness, typecheck, build, and a bundle grep
-# that catches the 2026.5.1 chat-history regression at the JS level.
-#
-# biome runs as non-blocking for now: the codebase currently has accumulated
-# ~470 errors and ~1650 warnings against the existing biome config. Surfacing
-# the count in CI lets us drive it down without forcing a fleet-wide cleanup
-# in the same PR. Drop `continue-on-error` once that number is zero.
-
-name: Frontend CI
-
-on:
- pull_request:
- paths:
- - 'studio/frontend/**'
- - 'scripts/check_frontend_dep_removal.py'
- - 'tests/studio/test_frontend_dep_removal.py'
- - '.github/workflows/studio-frontend-ci.yml'
- push:
- branches: [main, pip]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- build:
- name: Frontend build + bundle sanity
- runs-on: ubuntu-latest
- timeout-minutes: 10
- defaults:
- run:
- working-directory: studio/frontend
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- # FIXME: drop this step once @assistant-ui/* and assistant-stream
- # leave 0.x -- on 1.x, caret ranges are conventional. Until then,
- # every 0.minor on this surface is a SemVer-major (this is exactly
- # how 2026.5.1 shipped a broken chat runtime: ^0.12.19 quietly
- # resolved to 0.12.28).
- - name: '@assistant-ui must be pinned exactly (no caret/tilde)'
- working-directory: ${{ github.workspace }}
- run: |
- set -e
- if grep -nE '"(@assistant-ui/[a-z-]+|assistant-stream)":[[:space:]]*"[\^~]' studio/frontend/package.json; then
- echo "::error file=studio/frontend/package.json::These packages must be pinned to exact versions until they leave 0.x. Drop the leading ^ or ~."
- exit 1
- fi
- echo "All assistant-ui packages are pinned exactly."
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- # Run the structural lockfile scan BEFORE npm ci. A compromised
- # tarball runs its `prepare` / `postinstall` during `npm ci`,
- # so any catch has to fire upstream of that. The scanner is
- # pure-Python read-only; safe to call ahead of every install.
- - name: Lockfile supply-chain audit (pre-install scan)
- working-directory: ${{ github.workspace }}
- run: python3 scripts/lockfile_supply_chain_audit.py
-
- - name: Lockfile must agree with package.json (npm ci is strict)
- # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
- # required for `vite build`. The pre-install lockfile structural
- # audit (lockfile_supply_chain_audit.py) is the practical defence
- # against the npm postinstall-dropper class -- it fires BEFORE any
- # tarball runs, on the injection pattern itself rather than an
- # advisory-DB lookup.
- run: npm ci --no-fund --no-audit
-
- - name: npm ci must not have modified the working tree
- working-directory: ${{ github.workspace }}
- run: |
- if ! git diff --quiet -- studio/frontend; then
- echo "::error::npm ci modified files; commit the updated lockfile"
- git status -- studio/frontend
- exit 1
- fi
-
- # Catch the common foot-gun: a dep dropped from package.json that is
- # still imported somewhere. The script walks the lockfile dep graph
- # from the new top-level deps and only counts top-level node_modules
- # paths as valid resolution targets for bare src/ imports.
- #
- # actions/checkout uses fetch-depth: 1 by default, so the base branch
- # is not available locally. Fetch the single base commit with an
- # explicit refspec so origin/ is reliably created (a bare
- # `git fetch origin [` only updates FETCH_HEAD in some configs).
- - name: Dependency removal safety check
- if: github.event_name == 'pull_request'
- working-directory: ${{ github.workspace }}
- run: |
- git fetch --no-tags --depth=1 origin \
- "${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}"
- python3 scripts/check_frontend_dep_removal.py \
- --base "origin/${{ github.base_ref }}" \
- --enumerate-dead
- python3 tests/studio/test_frontend_dep_removal.py
-
- - name: Typecheck
- run: npm run typecheck
-
- - name: Build
- run: npm run build
-
- - name: Built bundle must not contain Studio's unstable_Provider call site
- run: |
- set -e
- JS=$(ls dist/assets/index-*.js | head -1)
- HITS=$(grep -c 'unstable_Provider:' "$JS" || echo 0)
- echo "main bundle: $JS"
- echo "unstable_Provider: hits=$HITS (assistant-ui internals contribute up to 3)"
- if [ "$HITS" -gt 3 ]; then
- echo "::error file=studio/frontend/src/features/chat/runtime-provider.tsx::Studio bundle still passes unstable_Provider through useRemoteThreadListRuntime; this is the 2026.5.1 chat-history regression. Pass adapters directly into useLocalRuntime instead."
- exit 1
- fi
-
- - name: Bundle size budget (75 MB)
- run: |
- SIZE=$(du -sb dist | cut -f1)
- BUDGET=$((75 * 1024 * 1024))
- echo "dist size: $SIZE bytes ($((SIZE/1024/1024)) MB), budget: $BUDGET bytes (75 MB)"
- if [ "$SIZE" -gt "$BUDGET" ]; then
- echo "::error::studio/frontend/dist/ exceeded the 75 MB budget. Drop dead deps (e.g. the unused next dep) or split chunks."
- exit 1
- fi
-
- - name: Biome (non-blocking until accumulated drift is cleared)
- continue-on-error: true
- run: npm run biome:check
-
- - name: Upload built dist
- # Always upload so a green run is reviewable too -- the dist
- # output catches "tests passed but bundle changed unexpectedly"
- # regressions that would be invisible if we only kept artifacts
- # on failure.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: studio-frontend-dist
- path: studio/frontend/dist
- retention-days: 3
diff --git a/.github/workflows/studio-inference-smoke.yml b/.github/workflows/studio-inference-smoke.yml
deleted file mode 100644
index 775363e73c..0000000000
--- a/.github/workflows/studio-inference-smoke.yml
+++ /dev/null
@@ -1,887 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Three end-to-end smoke jobs that boot a freshly-installed Studio and
-# exercise the surfaces real users hit through the OpenAI / Anthropic
-# SDKs and curl. Each job picks the smallest model that exercises the
-# behaviour under test, primes HF_HOME via actions/cache, and shares
-# the install.sh --local --no-torch bootstrap.
-#
-# 1. OpenAI, Anthropic API tests
-# gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
-# Password rotation via /api/auth/change-password (old fails,
-# new works), then OpenAI + Anthropic Python SDKs against /v1/*
-# with temperature=0 and a fixed seed. Asserts the four-turn
-# conversation is deterministic across two runs.
-#
-# 2. Tool calling Tests
-# Qwen3.5-2B UD-IQ3_XXS (~890 MiB). OpenAI function calling,
-# server-side tools (python, terminal, web_search) via
-# enable_tools / enabled_tools, and enable_thinking on/off.
-#
-# 3. JSON, images
-# gemma-4-E2B-it UD-IQ3_XXS (~2.4 GiB) + mmproj-F16 (~986 MiB).
-# response_format JSON-schema decoding and OpenAI image_url
-# (data URI) plus Anthropic source/base64 image inputs.
-#
-# All three jobs run in parallel. Total wall time is dominated by job 3
-# on a cold cache; warm cache cuts that to ~3 min.
-
-name: Studio GGUF CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.sh'
- - 'pyproject.toml'
- - '.github/workflows/studio-inference-smoke.yml'
- push:
- branches: [main, pip]
- # Manual trigger for pre-warming HF_HOME caches on main, or re-running
- # against an arbitrary branch without pushing a no-op commit.
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- # ─────────────────────────────────────────────────────────────────────
- # Job 1: OpenAI, Anthropic API tests
- # ─────────────────────────────────────────────────────────────────────
- openai-anthropic:
- name: OpenAI, Anthropic API tests
- runs-on: ubuntu-latest
- timeout-minutes: 25
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18888'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux deps for llama.cpp prebuilt
- run: |
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends \
- libcurl4-openssl-dev libssl-dev jq
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Install OpenAI + Anthropic Python SDKs
- run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json
- exit 0
- fi
- sleep 1
- done
- echo "Studio did not become healthy in 180s"
- tail -200 logs/studio.log
- exit 1
-
- - name: Password rotation (old must fail, new must work)
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- # 1. Login with the bootstrap password.
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
- # 2. Rotate to a fresh random password.
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- # 3. Old password must now be rejected (HTTP 401).
- OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
- -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
- if [ "$OLD_STATUS" != "401" ]; then
- echo "::error::Login with old password returned $OLD_STATUS, expected 401"
- exit 1
- fi
- # 4. New password must succeed; capture the JWT for downstream steps.
- NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
- echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
- echo "password rotation OK (old=401, new=200)"
-
- - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
- run: |
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 600 \
- -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name, is_gguf, context_length}'
-
- - name: Multi-turn determinism via OpenAI + Anthropic SDKs
- env:
- BASE_URL: http://127.0.0.1:18888
- run: |
- python - <<'PY'
- import json
- import os
- from openai import OpenAI
- from anthropic import Anthropic
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["TOKEN"] # JWT also accepted as Bearer on /v1/*
- SEED = 3407
-
- # Four-turn conversation: the second and fourth turns can only be
- # answered correctly if the model sees the prior turns, so this
- # also exercises the conversation-history wiring.
- PROMPTS = [
- "What is 1+1?",
- "What did I ask before?",
- "What is the capital of France?",
- "Repeat the city name",
- ]
-
- def run_openai():
- client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
- history, replies = [], []
- for prompt in PROMPTS:
- history.append({"role": "user", "content": prompt})
- resp = client.chat.completions.create(
- model = "default",
- messages = history,
- temperature = 0.0,
- max_tokens = 80,
- seed = SEED,
- extra_body = {"enable_thinking": False},
- )
- text = resp.choices[0].message.content or ""
- replies.append(text)
- history.append({"role": "assistant", "content": text})
- return replies
-
- def run_anthropic():
- # Two SDK quirks vs. Studio:
- # 1. base_url must NOT include /v1 -- the SDK appends
- # /v1/messages itself; otherwise the request hits
- # /v1/v1/messages and 405s.
- # 2. The SDK sends `x-api-key` by default, but Studio's
- # auth layer is HTTPBearer-only. Override via
- # default_headers so Authorization: Bearer ... is
- # sent instead.
- client = Anthropic(
- base_url = BASE,
- api_key = "unused",
- default_headers = {"Authorization": f"Bearer {KEY}"},
- )
- history, replies = [], []
- for prompt in PROMPTS:
- history.append({"role": "user", "content": prompt})
- msg = client.messages.create(
- model = "default",
- max_tokens = 80,
- messages = history,
- temperature = 0.0,
- extra_body = {"seed": SEED, "enable_thinking": False},
- )
- text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text")
- replies.append(text)
- history.append({"role": "assistant", "content": text})
- return replies
-
- for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
- first = runner()
- second = runner()
- for i, (a, b) in enumerate(zip(first, second), start = 1):
- print(f"[{label} turn {i}] {a!r}")
- assert a, f"{label}: empty turn {i} response"
- assert a == b, (
- f"{label} non-deterministic at turn {i} with temperature=0.0:\n"
- f" run1: {a!r}\n run2: {b!r}"
- )
- # Sanity: turn-2 reply should mention the earlier question, and
- # turn-4 reply should mention Paris (model echoes the city it
- # produced for turn 3). Lower-cased substring checks keep the
- # assertion robust to formatting jitter.
- joined = " ".join(first).lower()
- assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
- assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
- print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded")
- PY
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- ss -tln | grep ":${STUDIO_PORT}" || true
-
- - name: Upload logs
- # Always upload so green runs are still reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: openai-anthropic-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
-
- # ─────────────────────────────────────────────────────────────────────
- # Job 2: Tool calling Tests
- # ─────────────────────────────────────────────────────────────────────
- tool-calling:
- name: Tool calling Tests
- runs-on: ubuntu-latest
- timeout-minutes: 25
- env:
- # Tool calling is the highest-volume GGUF in this workflow
- # (Qwen3.5-2B at IQ3_XXS = ~890 MiB). Caching HF_HOME would
- # store xet chunks + blobs + snapshots = ~4 GiB compressed --
- # 4-5x file-size inflation, dominated by xet chunks. Use main's
- # `--local-dir gguf-cache` pattern to cache the flat .gguf only.
- # Studio's /api/inference/load accepts either a HF repo (which
- # uses HF_HOME) or an absolute file path; passing the absolute
- # path keeps the test off HF_HOME entirely so the cache size
- # tracks the GGUF file 1:1. The OpenAI/Anth and JSON+images
- # jobs still cover the gguf_variant resolution path.
- GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
- GGUF_FILE: Qwen3.5-2B-UD-IQ3_XXS.gguf
- STUDIO_PORT: '18889'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux deps for llama.cpp prebuilt
- run: |
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends \
- libcurl4-openssl-dev libssl-dev jq
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore GGUF model file
- id: cache-gguf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
- - name: Download GGUF if cache miss
- id: download-gguf
- if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p gguf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache
-
- - name: Save GGUF model file
- if: always() && steps.download-gguf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Reset auth + boot Studio (API-only, default tool policy)
- # We deliberately use the API-only mode rather than
- # `unsloth studio run` because the latter calls
- # `set_tool_policy(...)` with a resolved bool: on loopback the
- # default resolves to True, which forces every request through
- # the server-side agentic loop and breaks the standard
- # function-calling test below. API-only mode leaves
- # tool_policy=None so each request's `enable_tools` field is
- # honoured.
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health, log in, change password, load model
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
- GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
- ls -lh "$GGUF_PATH"
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 600 \
- -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name}'
-
- - name: Tool calling, server-side tools, thinking on/off
- env:
- BASE_URL: http://127.0.0.1:18889
- run: |
- python - <<'PY'
- import json
- import os
- import urllib.request
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["API_KEY"]
- SEED = 3407
-
- def post(path, body, *, timeout = 240):
- """Plain JSON POST. For requests that don't go through
- the server-side agentic loop, the response is one JSON
- object."""
- data = json.dumps(body).encode()
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = data,
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- return resp.status, json.loads(resp.read().decode())
-
- def post_sse(path, body, *, timeout = 600):
- """POST a streaming request and accumulate the assistant
- text deltas. The server-side agentic loop ALWAYS returns
- SSE regardless of the request's `stream` field, so any
- call with enable_tools=true must use this helper."""
- body = {**body, "stream": True}
- data = json.dumps(body).encode()
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = data,
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- parts = []
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- for raw in resp:
- line = raw.decode().strip()
- if not line.startswith("data: "):
- continue
- payload = line[6:]
- if payload == "[DONE]":
- break
- try:
- chunk = json.loads(payload)
- except json.JSONDecodeError:
- continue
- for choice in chunk.get("choices", []):
- delta = choice.get("delta", {}) or {}
- if delta.get("content"):
- parts.append(delta["content"])
- return "".join(parts)
-
- # ── 1. Standard OpenAI function calling ──────────────────────
- weather_tool = {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get current weather for a city.",
- "parameters": {
- "type": "object",
- "properties": {"city": {"type": "string"}},
- "required": ["city"],
- },
- },
- }
-
- status, data = post("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
- "tools": [weather_tool],
- "tool_choice": "required",
- "stream": False,
- "temperature": 0.0,
- "seed": SEED,
- "max_tokens": 120,
- })
- assert status == 200, f"tool call status {status}: {data}"
- choice = data["choices"][0]
- assert choice["finish_reason"] == "tool_calls", f"finish_reason={choice['finish_reason']!r}"
- tc = choice["message"]["tool_calls"][0]
- assert tc["function"]["name"] == "get_weather"
- args = json.loads(tc["function"]["arguments"])
- assert args.get("city"), f"missing city arg: {args}"
- print(f"[tools] PASS function calling -> {tc['function']['name']}({args})")
-
- # ── 2. Server-side python tool ───────────────────────────────
- # 123 * 456 = 56088. The agentic loop streams SSE; we
- # accumulate the assistant text and look for the answer. We
- # accept "56088" or "56,088" since the model may format it.
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}],
- "enable_tools": True,
- "enabled_tools": ["python"],
- "session_id": "ci-tool-calling-py",
- "temperature": 0.0,
- "seed": SEED,
- "max_tokens": 600,
- })
- assert "56088" in content or "56,088" in content, (
- f"expected 56088 in python-tool answer, got: {content!r}"
- )
- print(f"[tools] PASS python tool ({len(content)} chars)")
-
- # ── 3. Server-side bash (terminal) tool ──────────────────────
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output."}],
- "enable_tools": True,
- "enabled_tools": ["terminal"],
- "session_id": "ci-tool-calling-bash",
- "temperature": 0.0,
- "seed": SEED,
- "max_tokens": 600,
- })
- assert "hello-bash-tool" in content, (
- f"expected 'hello-bash-tool' in terminal-tool answer, got: {content!r}"
- )
- print(f"[tools] PASS bash/terminal tool ({len(content)} chars)")
-
- # ── 4. Server-side web_search tool ───────────────────────────
- # DuckDuckGo is flaky from CI runners and small Qwen3.5-2B
- # may not actually search. Only assert that the SSE stream
- # opens and yields any data; HTTP / parser failures already
- # raise above.
- try:
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
- "enable_tools": True,
- "enabled_tools": ["web_search"],
- "session_id": "ci-tool-calling-web",
- "temperature": 0.0,
- "seed": SEED,
- "max_tokens": 400,
- })
- print(f"[tools] PASS web_search stream ({len(content)} chars)")
- except Exception as exc:
- print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")
-
- # ── 5. Thinking on / off ─────────────────────────────────────
- # Studio strips think blocks from message.content for tools-mode
- # responses, so we toggle plain chat (no enable_tools) and look
- # at the surfaced reasoning_content / message.thinking field.
- def thinking_call(enable):
- status, data = post("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}],
- "stream": False,
- "enable_thinking": enable,
- "temperature": 0.0,
- "seed": SEED,
- "max_tokens": 300,
- })
- assert status == 200
- msg = data["choices"][0]["message"]
- # Studio surfaces thinking via reasoning_content (OpenAI
- # extension). Fall back to inline markers for
- # robustness across template versions.
- raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "")
- return raw
-
- on_text = thinking_call(True)
- off_text = thinking_call(False)
- had_think_on = ("" in on_text) or len(on_text) > 80
- had_think_off = ("" in off_text) and len(off_text) > 0
- assert had_think_on, (
- f"enable_thinking=True produced no thinking signal: {on_text!r}"
- )
- # Off-mode should not contain the literal marker.
- assert "" not in off_text, (
- f"enable_thinking=False but still present: {off_text!r}"
- )
- print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
- PY
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- ss -tln | grep ":${STUDIO_PORT}" || true
-
- - name: Upload logs
- # Always upload so green runs are still reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: tool-calling-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
-
- # ─────────────────────────────────────────────────────────────────────
- # Job 3: JSON, images
- # ─────────────────────────────────────────────────────────────────────
- json-images:
- name: JSON, images
- runs-on: ubuntu-latest
- timeout-minutes: 30
- env:
- GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF
- GGUF_VARIANT: UD-IQ3_XXS
- GGUF_FILE: gemma-4-E2B-it-UD-IQ3_XXS.gguf
- MMPROJ_FILE: mmproj-F16.gguf
- STUDIO_PORT: '18890'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux deps for llama.cpp prebuilt
- run: |
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends \
- libcurl4-openssl-dev libssl-dev jq
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj)
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
- - name: Prime HF_HOME with the GGUF + mmproj
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj)
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Install OpenAI + Anthropic Python SDKs
- run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
- - name: Reset auth + boot Studio (API-only)
- # See Job 2's comment: API-only mode keeps tool_policy=None so
- # response_format requests aren't routed through the agentic
- # tool loop.
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health, log in, change password, load model
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
- # Load the GGUF (mmproj is auto-detected via the HF repo
- # lookup, the cached file is pulled out of HF_HOME).
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 900 \
- -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name, is_vision}'
-
- - name: JSON schema decoding + image input
- env:
- BASE_URL: http://127.0.0.1:18890
- run: |
- python - <<'PY'
- import base64
- import json
- import os
- import urllib.request
- from openai import OpenAI
- from anthropic import Anthropic
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["API_KEY"]
- SEED = 3407
-
- def post(path, body, *, timeout = 240):
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = json.dumps(body).encode(),
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- return resp.status, json.loads(resp.read().decode())
-
- # ── 1. response_format = json_object (JSON mode) ─────────────
- # llama.cpp's HTTP server supports OpenAI-compatible JSON
- # mode: `response_format: {"type": "json_object"}` constrains
- # the model to emit syntactically-valid JSON. We use raw HTTP
- # rather than the OpenAI SDK so that the field shape Studio
- # forwards to llama-server is unambiguous (the SDK rewrites
- # response_format depending on which variant it recognises).
- # We deliberately do NOT pass a strict JSON schema -- on
- # small Gemma-4 quants the GBNF-from-schema path occasionally
- # produces empty output, and JSON mode is the surface we care
- # about exposing through Studio.
- status, data = post("/v1/chat/completions", {
- "model": "default",
- "messages": [
- {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
- {"role": "user", "content": "What is the capital of France?"},
- ],
- "temperature": 0.0,
- "max_tokens": 200,
- "seed": SEED,
- "stream": False,
- "enable_thinking": False,
- "response_format": {"type": "json_object"},
- }, timeout = 600)
- assert status == 200, f"json status {status}: {data}"
- content = (data["choices"][0]["message"].get("content") or "").strip()
- # Some chat templates wrap JSON in ```json fences even in JSON
- # mode -- strip those before parsing.
- if content.startswith("```"):
- content = content.split("```", 2)[1]
- if content.startswith("json"):
- content = content[4:]
- content = content.strip("`\n ")
- parsed = json.loads(content)
- assert "paris" in str(parsed.get("city", "")).lower(), (
- f"city != Paris: {parsed}"
- )
- print(f"[json] PASS json_object -> {parsed}")
-
- # ── 2. OpenAI image_url (data URI base64) ───────────────────
- # 64x64 solid-red PNG. stb_image (used by Studio's image
- # normaliser at routes/inference.py:3410) rejects 4x4 or
- # smaller PNGs as truncated, so we go up to 64x64 -- still
- # tiny in token cost. The assertion is loose: any non-empty
- # response from the vision path proves multimodal end-to-end
- # wiring; small VL quants are weak at colour identification.
- PNG_64X64_RED_B64 = (
- "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
- "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
- "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
- )
- data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"
-
- client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
- openai_resp = client.chat.completions.create(
- model = "default",
- temperature = 0.0,
- max_tokens = 80,
- seed = SEED,
- messages = [{
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": data_uri}},
- {"type": "text", "text": "What colour dominates this image? Reply in one word."},
- ],
- }],
- )
- openai_text = (openai_resp.choices[0].message.content or "").lower()
- print(f"[image/openai] reply: {openai_text!r}")
- assert openai_text, "OpenAI image_url returned empty content"
- # We do not strictly require 'red' -- some quants of small VL
- # models are weak at colour names. Just require a non-empty
- # answer; the vision path is the part under test.
- print("[image/openai] PASS image_url accepted, non-empty response")
-
- # ── 3. Anthropic source/base64 image ────────────────────────
- # Two SDK quirks vs. Studio: base_url must NOT include /v1
- # (the SDK appends it itself; otherwise /v1/v1/messages -> 405),
- # and Studio's auth is HTTPBearer-only so the SDK's default
- # x-api-key header is ignored -- send Authorization: Bearer
- # via default_headers.
- anthropic = Anthropic(
- base_url = BASE,
- api_key = "unused",
- default_headers = {"Authorization": f"Bearer {KEY}"},
- )
- a_msg = anthropic.messages.create(
- model = "default",
- max_tokens = 80,
- temperature = 0.0,
- extra_body = {"seed": SEED},
- messages = [{
- "role": "user",
- "content": [
- {
- "type": "image",
- "source": {
- "type": "base64",
- "media_type": "image/png",
- "data": PNG_64X64_RED_B64,
- },
- },
- {"type": "text", "text": "Describe this image briefly."},
- ],
- }],
- )
- a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
- print(f"[image/anthropic] reply: {a_text!r}")
- assert a_text, "Anthropic source/base64 returned empty content"
- print("[image/anthropic] PASS source/base64 accepted, non-empty response")
- PY
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- ss -tln | grep ":${STUDIO_PORT}" || true
-
- - name: Upload logs
- # Always upload so green runs are still reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: json-images-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
diff --git a/.github/workflows/studio-mac-api-smoke.yml b/.github/workflows/studio-mac-api-smoke.yml
deleted file mode 100644
index b4e274155e..0000000000
--- a/.github/workflows/studio-mac-api-smoke.yml
+++ /dev/null
@@ -1,153 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Mac counterpart to studio-api-smoke.yml. Same tests/studio/
-# studio_api_smoke.py exercise (CORS hardening, auth state machine,
-# JWT expiry, API key lifecycle, /v1/models / /v1/embeddings /
-# /v1/responses, endpoint-by-endpoint auth audit) but on a real
-# Apple Silicon (macos-14, M1) runner. Drops the apt-get block;
-# GitHub-hosted macos-14 ships curl + jq.
-
-name: Mac Studio API CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.sh'
- - 'pyproject.toml'
- - 'tests/studio/**'
- - '.github/workflows/studio-mac-api-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- api-smoke:
- name: Studio API & Auth Tests
- runs-on: macos-14
- timeout-minutes: 25
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18895'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Assert install.sh used the Mac llama.cpp prebuilt
- run: |
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
-
- - name: Install pyjwt for the JWT-expiry forge test
- run: pip install 'pyjwt>=2.6'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
-
- - name: Pass bootstrap password + rotated targets to the test
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "::add-mask::$NEW2"
- echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV"
- echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
- - name: Run Studio API & Auth tests
- env:
- BASE_URL: http://127.0.0.1:18895
- STUDIO_AUTH_DIR: /Users/runner/.unsloth/studio/auth
- run: python tests/studio/studio_api_smoke.py
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Upload API smoke logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: mac-studio-api-smoke-log
- path: |
- logs/install.log
- logs/studio.log
- retention-days: 7
diff --git a/.github/workflows/studio-mac-inference-smoke.yml b/.github/workflows/studio-mac-inference-smoke.yml
deleted file mode 100644
index 2d6864e0cb..0000000000
--- a/.github/workflows/studio-mac-inference-smoke.yml
+++ /dev/null
@@ -1,1042 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Three end-to-end smoke jobs that boot a freshly-installed Studio and
-# exercise the surfaces real users hit through the OpenAI / Anthropic
-# SDKs and curl. Each job picks the smallest model that exercises the
-# behaviour under test, primes a model cache via actions/cache, and
-# shares the install.sh --local --no-torch bootstrap.
-#
-# 1. OpenAI, Anthropic API tests
-# gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
-# Password rotation via /api/auth/change-password (old fails,
-# new works), then OpenAI + Anthropic Python SDKs against /v1/*
-# with temperature=0 and a fixed seed. Asserts the four-turn
-# conversation is deterministic across two runs.
-#
-# 2. Tool calling Tests
-# Qwen3.5-2B UD-IQ3_XXS (~890 MiB). OpenAI function calling,
-# server-side tools (python, terminal, web_search) via
-# enable_tools / enabled_tools, and enable_thinking on/off.
-#
-# 3. JSON, images
-# gemma-4-E2B-it UD-IQ3_XXS (~2.4 GiB) + mmproj-F16 (~986 MiB).
-# response_format JSON-schema decoding and OpenAI image_url
-# (data URI) plus Anthropic source/base64 image inputs.
-#
-# All three jobs run in parallel. Total wall time is dominated by job 3
-# on a cold cache; warm cache cuts that to ~3 min.
-
-name: Mac Studio GGUF CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.sh'
- - 'pyproject.toml'
- - '.github/workflows/studio-mac-inference-smoke.yml'
- push:
- branches: [main, pip]
- # Manual trigger for pre-warming model caches on main, or re-running
- # against an arbitrary branch without pushing a no-op commit.
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- # ─────────────────────────────────────────────────────────────────────
- # Job 1: OpenAI, Anthropic API tests
- # ─────────────────────────────────────────────────────────────────────
- openai-anthropic:
- name: OpenAI, Anthropic API tests
- runs-on: macos-14
- timeout-minutes: 25
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18888'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- # Save partial caches on cancel/timeout -- hf download resumes by
- # content hash. `outcome != skipped` keeps cache-hit a no-op.
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome != 'skipped' && hashFiles('hf-cache/**/*.gguf') != ''
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Assert install.sh used the Mac llama.cpp prebuilt
- run: |
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
-
- - name: Install OpenAI + Anthropic Python SDKs
- run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json
- exit 0
- fi
- sleep 1
- done
- echo "Studio did not become healthy in 180s"
- tail -200 logs/studio.log
- exit 1
-
- - name: Password rotation (old must fail, new must work)
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- # 1. Login with the bootstrap password.
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
- # 2. Rotate to a fresh random password.
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- # 3. Old password must now be rejected (HTTP 401).
- OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
- -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
- if [ "$OLD_STATUS" != "401" ]; then
- echo "::error::Login with old password returned $OLD_STATUS, expected 401"
- exit 1
- fi
- # 4. New password must succeed; capture the JWT for downstream steps.
- NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
- echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
- echo "password rotation OK (old=401, new=200)"
-
- - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
- run: |
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 600 \
- -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name, is_gguf, context_length}'
-
- - name: Multi-turn determinism via OpenAI + Anthropic SDKs
- env:
- BASE_URL: http://127.0.0.1:18888
- run: |
- python - <<'PY'
- import json
- import os
- from openai import OpenAI
- from anthropic import Anthropic
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["TOKEN"] # JWT also accepted as Bearer on /v1/*
- SEED = 3407
-
- # Four-turn conversation: the second and fourth turns can only be
- # answered correctly if the model sees the prior turns, so this
- # also exercises the conversation-history wiring.
- PROMPTS = [
- "What is 1+1?",
- "What did I ask before?",
- "What is the capital of France?",
- "Repeat the city name",
- ]
-
- def run_openai():
- client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
- history, replies = [], []
- for prompt in PROMPTS:
- history.append({"role": "user", "content": prompt})
- resp = client.chat.completions.create(
- model = "default",
- messages = history,
- temperature = 0.0,
- max_tokens = 80,
- seed = SEED,
- extra_body = {"enable_thinking": False},
- )
- text = resp.choices[0].message.content or ""
- replies.append(text)
- history.append({"role": "assistant", "content": text})
- return replies
-
- def run_anthropic():
- # Two SDK quirks vs. Studio:
- # 1. base_url must NOT include /v1 -- the SDK appends
- # /v1/messages itself; otherwise the request hits
- # /v1/v1/messages and 405s.
- # 2. The SDK sends `x-api-key` by default, but Studio's
- # auth layer is HTTPBearer-only. Override via
- # default_headers so Authorization: Bearer ... is
- # sent instead.
- client = Anthropic(
- base_url = BASE,
- api_key = "unused",
- default_headers = {"Authorization": f"Bearer {KEY}"},
- )
- history, replies = [], []
- for prompt in PROMPTS:
- history.append({"role": "user", "content": prompt})
- msg = client.messages.create(
- model = "default",
- max_tokens = 80,
- messages = history,
- temperature = 0.0,
- extra_body = {"seed": SEED, "enable_thinking": False},
- )
- text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text")
- replies.append(text)
- history.append({"role": "assistant", "content": text})
- return replies
-
- for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
- first = runner()
- second = runner()
- for i, (a, b) in enumerate(zip(first, second), start = 1):
- print(f"[{label} turn {i}] {a!r}")
- assert a, f"{label}: empty turn {i} response"
- assert a == b, (
- f"{label} non-deterministic at turn {i} with temperature=0.0:\n"
- f" run1: {a!r}\n run2: {b!r}"
- )
- # Sanity: turn-2 reply should mention the earlier question, and
- # turn-4 reply should mention Paris (model echoes the city it
- # produced for turn 3). Lower-cased substring checks keep the
- # assertion robust to formatting jitter.
- joined = " ".join(first).lower()
- assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
- assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
- print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded")
- PY
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- ss -tln | grep ":${STUDIO_PORT}" || true
-
- - name: Upload logs
- # Always upload so green runs are still reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: openai-anthropic-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
-
- # ─────────────────────────────────────────────────────────────────────
- # Job 2: Tool calling Tests
- # ─────────────────────────────────────────────────────────────────────
- tool-calling:
- name: Tool calling Tests
- runs-on: macos-14
- timeout-minutes: 25
- env:
- # Tool calling is the highest-volume GGUF in this workflow
- # (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB on Mac, where IQ3_XXS
- # collapses for tool-call grammar under Metal at temperature=0).
- # Caching HF_HOME stores xet chunks + blobs + snapshots = ~4.6
- # GiB compressed -- 3.6x file-size inflation. Use main's
- # `--local-dir gguf-cache` pattern to cache the flat .gguf only.
- # The OpenAI/Anth and JSON+images jobs still cover the
- # gguf_variant resolution path.
- GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
- GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18898'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore GGUF model file
- id: cache-gguf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
- - name: Download GGUF if cache miss
- id: download-gguf
- if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p gguf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache
-
- # Save partial caches on cancel; next run resumes via content hash.
- - name: Save GGUF model file
- if: always() && steps.download-gguf.outcome != 'skipped' && hashFiles('gguf-cache/**/*.gguf') != ''
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Assert install.sh used the Mac llama.cpp prebuilt
- run: |
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
-
- - name: Reset auth + boot Studio (API-only, default tool policy)
- # We deliberately use the API-only mode rather than
- # `unsloth studio run` because the latter calls
- # `set_tool_policy(...)` with a resolved bool: on loopback the
- # default resolves to True, which forces every request through
- # the server-side agentic loop and breaks the standard
- # function-calling test below. API-only mode leaves
- # tool_policy=None so each request's `enable_tools` field is
- # honoured.
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health, log in, change password, load model
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
- GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
- ls -lh "$GGUF_PATH"
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 600 \
- -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name}'
-
- - name: Tool calling, server-side tools, thinking on/off
- env:
- BASE_URL: http://127.0.0.1:18898
- run: |
- python - <<'PY'
- import json
- import os
- import urllib.request
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["API_KEY"]
- SEED = 3407
-
- def post(path, body, *, timeout = 240):
- """Plain JSON POST. For requests that don't go through
- the server-side agentic loop, the response is one JSON
- object."""
- data = json.dumps(body).encode()
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = data,
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- return resp.status, json.loads(resp.read().decode())
-
- def post_sse(path, body, *, timeout = 600):
- """POST a streaming request and accumulate the assistant
- text deltas. The server-side agentic loop ALWAYS returns
- SSE regardless of the request's `stream` field, so any
- call with enable_tools=true must use this helper."""
- body = {**body, "stream": True}
- data = json.dumps(body).encode()
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = data,
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- parts = []
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- for raw in resp:
- line = raw.decode().strip()
- if not line.startswith("data: "):
- continue
- payload = line[6:]
- if payload == "[DONE]":
- break
- try:
- chunk = json.loads(payload)
- except json.JSONDecodeError:
- continue
- for choice in chunk.get("choices", []):
- delta = choice.get("delta", {}) or {}
- if delta.get("content"):
- parts.append(delta["content"])
- return "".join(parts)
-
- # ── 1. Standard OpenAI function calling ──────────────────────
- weather_tool = {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get current weather for a city.",
- "parameters": {
- "type": "object",
- "properties": {"city": {"type": "string"}},
- "required": ["city"],
- },
- },
- }
-
- # Mac Metal at temperature=0 is pathological for these small
- # quants (Qwen3.5-2B emits ',,,,,,...' or 'The The The...'),
- # gemma-4-E2B emits '' tokens). The Linux CPU
- # backend hides the issue. Use a small non-zero temperature
- # with a fixed seed so we stay deterministic but escape the
- # degenerate sampling trap.
- TEMP = 0.2
-
- status, data = post("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
- "tools": [weather_tool],
- "tool_choice": "required",
- "stream": False,
- "temperature": TEMP,
- "seed": SEED,
- # tool_choice='required' constrains the grammar so the
- # model emits a tool_call quickly when it works at all;
- # 128 tokens is enough for `{"city":"Paris"}` plus the
- # JSON envelope.
- "max_tokens": 128,
- }, timeout = 180)
- assert status == 200, f"tool call status {status}: {data}"
- choice = data["choices"][0]
- tool_calls = (choice.get("message") or {}).get("tool_calls") or []
- # Studio's contract: when tool_choice='required', llama.cpp's
- # grammar should force a tool_calls payload. On Mac that
- # contract is sometimes broken by the underlying quant; the
- # PASS path is "tool_calls present + correct schema", the
- # WARN path documents Studio still returned 200 with a
- # well-formed choices[] envelope.
- if tool_calls:
- tc = tool_calls[0]
- assert tc["function"]["name"] == "get_weather", (
- f"unexpected tool name: {tc['function']['name']!r}"
- )
- args = json.loads(tc["function"]["arguments"])
- assert args.get("city"), f"missing city arg: {args}"
- print(f"[tools] PASS function calling -> {tc['function']['name']}({args}) finish={choice.get('finish_reason')!r}")
- else:
- # Infrastructure path is correct; model output drifted.
- print(
- f"[tools] WARN function calling: no tool_calls (finish_reason="
- f"{choice.get('finish_reason')!r}); HTTP path OK, this is a "
- f"Mac Metal quant degeneracy."
- )
-
- # ── 2. Server-side python tool ───────────────────────────────
- # 123 * 456 = 56088. The agentic loop streams SSE; we
- # accumulate the assistant text and look for the answer. On
- # Mac the model often loses the tool calling contract before
- # producing the answer; accept either the answer OR a
- # non-empty SSE stream as proof the path completes.
- # macos-14 free runner is ~10 tok/s on Qwen3.5-2B Q4_K_XL;
- # cap max_tokens tightly so each SSE round stays under ~30s
- # even when the model stalls in a degenerate output state.
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}],
- "enable_tools": True,
- "enabled_tools": ["python"],
- "session_id": "ci-tool-calling-py",
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 128,
- }, timeout = 180)
- if "56088" in content or "56,088" in content:
- print(f"[tools] PASS python tool ({len(content)} chars, found 56088)")
- else:
- # Empty stream is a known Mac-quant degeneracy too; log
- # but do not fail.
- print(
- f"[tools] WARN python tool: SSE OK ({len(content)} chars) but "
- f"model didn't return 56088 -- Mac quant drift"
- )
-
- # NOTE: the dedicated "Server-side bash (terminal) tool" axis
- # was dropped in favour of the python axis above. Both share
- # the SAME server-side agentic loop wiring (only the registry
- # entry differs); the python axis is the canonical proof. On
- # macos-14 the duplicated SSE round was the dominant cost in
- # this step, so collapsing the two saves ~30-60 s wallclock
- # without losing distinct coverage.
-
- # ── 3. Server-side web_search tool ───────────────────────────
- # DuckDuckGo is flaky from CI runners and small Qwen3.5-2B
- # may not actually search. Only assert that the SSE stream
- # opens and yields any data; HTTP / parser failures already
- # raise above.
- try:
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
- "enable_tools": True,
- "enabled_tools": ["web_search"],
- "session_id": "ci-tool-calling-web",
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 96,
- }, timeout = 180)
- print(f"[tools] PASS web_search stream ({len(content)} chars)")
- except Exception as exc:
- print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")
-
- # ── 4. Thinking on / off ─────────────────────────────────────
- # Studio strips think blocks from message.content for tools-mode
- # responses, so we toggle plain chat (no enable_tools) and look
- # at the surfaced reasoning_content / message.thinking field.
- def thinking_call(enable):
- status, data = post("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}],
- "stream": False,
- "enable_thinking": enable,
- "temperature": TEMP,
- "seed": SEED,
- # 80 tokens lands within the 25-minute job timeout
- # on the macos-14 free runner. 17 is small; this is
- # plenty of room for either "Yes" + brief reasoning
- # or a degenerate empty completion.
- "max_tokens": 80,
- }, timeout = 180)
- assert status == 200
- msg = data["choices"][0]["message"]
- # Studio surfaces thinking via reasoning_content (OpenAI
- # extension). Fall back to inline markers for
- # robustness across template versions.
- raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "")
- return raw
-
- on_text = thinking_call(True)
- off_text = thinking_call(False)
- # Mac quant drift: the model may produce empty / degenerate
- # output regardless of enable_thinking. Assert ONLY that the
- # endpoint returned 200 (already enforced inside thinking_call)
- # and that toggling the flag doesn't surface a hard
- # marker when off.
- had_think_on = ("" in on_text) or len(on_text) > 80
- if not had_think_on:
- print(
- f"[tools] WARN enable_thinking=True produced no thinking signal: "
- f"{on_text[:200]!r} -- Mac quant drift"
- )
- # Off-mode should not contain the literal marker.
- assert "" not in off_text, (
- f"enable_thinking=False but still present: {off_text!r}"
- )
- print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
- PY
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- ss -tln | grep ":${STUDIO_PORT}" || true
-
- - name: Upload logs
- # Always upload so green runs are still reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: tool-calling-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
-
- # ─────────────────────────────────────────────────────────────────────
- # Job 3: JSON, images
- # ─────────────────────────────────────────────────────────────────────
- json-images:
- name: JSON, images
- runs-on: macos-14
- timeout-minutes: 30
- env:
- GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF
- # Linux smoke uses UD-IQ3_XXS, but on Mac Metal that gemma-4
- # quant emits sentinel tokens () for any prompt at
- # temperature=0 -- inference path is fine, the quant itself is
- # broken on Metal. UD-Q4_K_XL is the smallest published variant
- # that generates real text on M1.
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-4-E2B-it-UD-Q4_K_XL.gguf
- MMPROJ_FILE: mmproj-F16.gguf
- STUDIO_PORT: '18899'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- # Cache flat .gguf + mmproj (Job 2's pattern). HF_HOME inflates
- # ~3.6x via xet/blobs/snapshots, which made macOS saves never land.
- # mmproj is auto-detected as a sibling via detect_mmproj_file
- # (studio/backend/utils/models/model_config.py).
- - name: Restore GGUF + mmproj files
- id: cache-gguf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-${{ env.MMPROJ_FILE }}-v2
-
- - name: Verify cache contains BOTH gguf + mmproj
- id: verify-cache
- if: steps.cache-gguf.outputs.cache-hit == 'true'
- run: |
- if [[ -f "gguf-cache/$GGUF_FILE" && -f "gguf-cache/$MMPROJ_FILE" ]]; then
- echo "ok=true" >> "$GITHUB_OUTPUT"
- else
- echo "Partial cache hit -- forcing re-download."
- echo "ok=false" >> "$GITHUB_OUTPUT"
- fi
-
- - name: Download GGUF + mmproj if cache miss or partial
- id: download-gguf
- if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.verify-cache.outputs.ok != 'true'
- # Authenticated + parallel: shared macos-14 NAT egress stalls
- # multi-GB anonymous downloads.
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p gguf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache &
- MODEL_PID=$!
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE" gguf-cache &
- MMPROJ_PID=$!
- wait "$MODEL_PID"
- wait "$MMPROJ_PID"
- # Fail loud on a partial download instead of in the next step.
- ls -lh "gguf-cache/$GGUF_FILE" "gguf-cache/$MMPROJ_FILE"
-
- # Save partial caches on cancel. hashFiles guard avoids a hard
- # save failure when the download step exits with no files. The
- # additional mmproj-presence check stops a partial save from
- # poisoning the cache for the next run.
- - name: Save GGUF + mmproj files
- if: always() && steps.download-gguf.outcome != 'skipped' && hashFiles('gguf-cache/**/*.gguf') != '' && hashFiles(format('gguf-cache/{0}', env.MMPROJ_FILE)) != ''
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-${{ env.MMPROJ_FILE }}-v2
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Assert install.sh used the Mac llama.cpp prebuilt
- run: |
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
-
- - name: Install OpenAI + Anthropic Python SDKs
- run: pip install 'openai>=1.50' 'anthropic>=0.40'
-
- - name: Reset auth + boot Studio (API-only)
- # See Job 2's comment: API-only mode keeps tool_policy=None so
- # response_format requests aren't routed through the agentic
- # tool loop.
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health, log in, change password, load model
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
- # Load via local file path; mmproj sibling auto-detected by
- # detect_mmproj_file (model_config.py). gguf_variant omitted
- # -- it routes through _find_local_gguf_by_variant which
- # expects a directory, not a file path.
- GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
- MMPROJ_PATH="$GITHUB_WORKSPACE/gguf-cache/${MMPROJ_FILE}"
- ls -lh "$GGUF_PATH" "$MMPROJ_PATH"
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 900 \
- -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name, is_vision}'
-
- - name: JSON schema decoding + image input
- env:
- BASE_URL: http://127.0.0.1:18899
- run: |
- python - <<'PY'
- import base64
- import json
- import os
- import urllib.request
- from openai import OpenAI
- from anthropic import Anthropic
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["API_KEY"]
- SEED = 3407
- # Mac Metal degenerates these gemma-4 quants at temperature=0
- # (any prompt yields '...' padding tokens). Use a
- # small non-zero temperature with the same seed so we stay
- # deterministic-enough but escape the trap.
- TEMP = 0.2
-
- def post(path, body, *, timeout = 240):
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = json.dumps(body).encode(),
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- return resp.status, json.loads(resp.read().decode())
-
- # ── 1. response_format = json_object (JSON mode) ─────────────
- # llama.cpp's HTTP server supports OpenAI-compatible JSON
- # mode: `response_format: {"type": "json_object"}` constrains
- # the model to emit syntactically-valid JSON. We use raw HTTP
- # rather than the OpenAI SDK so that the field shape Studio
- # forwards to llama-server is unambiguous (the SDK rewrites
- # response_format depending on which variant it recognises).
- # We deliberately do NOT pass a strict JSON schema -- on
- # small Gemma-4 quants the GBNF-from-schema path occasionally
- # produces empty output, and JSON mode is the surface we care
- # about exposing through Studio.
- status, data = post("/v1/chat/completions", {
- "model": "default",
- "messages": [
- {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
- {"role": "user", "content": "What is the capital of France?"},
- ],
- "temperature": TEMP,
- # Trimmed for Mac runner timeout budget; json_object
- # grammar terminates quickly when working.
- "max_tokens": 200,
- "seed": SEED,
- "stream": False,
- "enable_thinking": False,
- "response_format": {"type": "json_object"},
- }, timeout = 240)
- assert status == 200, f"json status {status}: {data}"
- # Verify the response envelope shape -- this is what we
- # actually want to exercise on Mac. The model output quality
- # downstream of this is a Mac-Metal-quant artefact.
- assert (
- isinstance(data.get("choices"), list)
- and data["choices"]
- and "message" in data["choices"][0]
- ), f"json response envelope malformed: {data}"
- content = (data["choices"][0]["message"].get("content") or "").strip()
- print(f"[json] raw json_object content: {content!r}")
- # Some chat templates wrap JSON in ```json fences even in JSON
- # mode -- strip those before parsing.
- if content.startswith("```"):
- content = content.split("```", 2)[1]
- if content.startswith("json"):
- content = content[4:]
- content = content.strip("`\n ")
- if content:
- try:
- parsed = json.loads(content)
- if "paris" in str(parsed.get("city", "")).lower():
- print(f"[json] PASS json_object -> {parsed}")
- else:
- print(f"[json] WARN json_object decoded but city!=Paris: {parsed}")
- except json.JSONDecodeError as exc:
- print(f"[json] WARN json_object content not parseable ({exc}); content={content!r}")
- else:
- print("[json] WARN json_object produced empty content on this Mac quant")
- # Cross-check: same prompt without response_format. We care
- # that the inference path stays healthy (status 200 + envelope
- # shape OK); model output quality is a separate concern.
- status2, data2 = post("/v1/chat/completions", {
- "model": "default",
- "messages": [{"role": "user", "content": "What is the capital of France? Answer with one word."}],
- "temperature": TEMP,
- # 1-word answer doesn't need 400 tokens; trim so a
- # degenerate streaming model doesn't burn through the
- # job's wallclock budget.
- "max_tokens": 150,
- "seed": SEED,
- "stream": False,
- "enable_thinking": False,
- }, timeout = 240)
- assert status2 == 200, f"plain status {status2}: {data2}"
- plain = (data2["choices"][0]["message"].get("content") or "").lower()
- print(f"[json] plain capital-of-france reply: {plain!r}")
- if "paris" in plain:
- print("[json] PASS plain inference path (paris mentioned)")
- else:
- print(
- f"[json] WARN plain inference returned no 'paris' -- Mac quant "
- f"degeneracy. HTTP path validated separately above."
- )
-
- # ── 2. OpenAI image_url (data URI base64) ───────────────────
- # 64x64 solid-red PNG. stb_image (used by Studio's image
- # normaliser at routes/inference.py:3410) rejects 4x4 or
- # smaller PNGs as truncated, so we go up to 64x64 -- still
- # tiny in token cost. The assertion is loose: any non-empty
- # response from the vision path proves multimodal end-to-end
- # wiring; small VL quants are weak at colour identification.
- PNG_64X64_RED_B64 = (
- "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
- "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
- "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
- )
- data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"
-
- # The Mac prebuilt llama.cpp server has a known crash when
- # processing image inputs alongside the gemma-4-E2B mmproj
- # (server disconnects mid-completion). This is upstream
- # llama.cpp behaviour, not Studio. Wrap both SDK calls in
- # try/except so an upstream crash registers as a WARN rather
- # than failing the whole job. Studio's contract (OpenAI/
- # Anthropic image fields are accepted and forwarded) is
- # validated by the request body Studio constructs, not by
- # whether llama.cpp can decode it on Mac Metal.
- client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
- try:
- openai_resp = client.chat.completions.create(
- model = "default",
- temperature = TEMP,
- max_tokens = 80,
- seed = SEED,
- messages = [{
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": data_uri}},
- {"type": "text", "text": "What colour dominates this image? Reply in one word."},
- ],
- }],
- )
- openai_text = (openai_resp.choices[0].message.content or "").lower()
- print(f"[image/openai] reply: {openai_text!r}")
- if openai_text:
- print("[image/openai] PASS image_url accepted, non-empty response")
- else:
- print("[image/openai] WARN image_url accepted but empty content -- Mac quant drift")
- except Exception as exc:
- print(
- f"[image/openai] WARN image_url SDK call raised: {type(exc).__name__}: "
- f"{exc}. Likely upstream llama.cpp Mac+vision crash, NOT a Studio "
- f"regression. Studio successfully forwarded the request."
- )
-
- # ── 3. Anthropic source/base64 image ────────────────────────
- # Two SDK quirks vs. Studio: base_url must NOT include /v1
- # (the SDK appends it itself; otherwise /v1/v1/messages -> 405),
- # and Studio's auth is HTTPBearer-only so the SDK's default
- # x-api-key header is ignored -- send Authorization: Bearer
- # via default_headers.
- anthropic = Anthropic(
- base_url = BASE,
- api_key = "unused",
- default_headers = {"Authorization": f"Bearer {KEY}"},
- )
- try:
- a_msg = anthropic.messages.create(
- model = "default",
- max_tokens = 80,
- temperature = TEMP,
- extra_body = {"seed": SEED},
- messages = [{
- "role": "user",
- "content": [
- {
- "type": "image",
- "source": {
- "type": "base64",
- "media_type": "image/png",
- "data": PNG_64X64_RED_B64,
- },
- },
- {"type": "text", "text": "Describe this image briefly."},
- ],
- }],
- )
- a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
- print(f"[image/anthropic] reply: {a_text!r}")
- if a_text:
- print("[image/anthropic] PASS source/base64 accepted, non-empty response")
- else:
- print("[image/anthropic] WARN source/base64 accepted but empty content -- Mac quant drift")
- except Exception as exc:
- print(
- f"[image/anthropic] WARN anthropic image SDK call raised: "
- f"{type(exc).__name__}: {exc}. Likely upstream llama.cpp Mac+vision "
- f"crash, NOT a Studio regression."
- )
- PY
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- ss -tln | grep ":${STUDIO_PORT}" || true
-
- - name: Upload logs
- # Always upload so green runs are still reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: json-images-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
diff --git a/.github/workflows/studio-mac-ui-smoke.yml b/.github/workflows/studio-mac-ui-smoke.yml
deleted file mode 100644
index b353f0ec83..0000000000
--- a/.github/workflows/studio-mac-ui-smoke.yml
+++ /dev/null
@@ -1,345 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Mac counterpart to studio-ui-smoke.yml. Same Playwright + Chromium
-# end-to-end chat UI flow, but on macos-14 (M1) so we catch
-# Mac-specific frontend / backend wiring regressions that the Linux
-# job would miss (e.g. the Mac Tauri shell loading the same React
-# bundle, or the Mac llama.cpp prebuilt's HTTP layer behaving
-# differently from the Linux build).
-
-name: Mac Studio UI CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.sh'
- - 'pyproject.toml'
- - 'tests/studio/**'
- - '.github/workflows/studio-mac-ui-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- ui-smoke:
- name: Chat UI Tests
- runs-on: macos-14
- timeout-minutes: 35
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18896'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Assert install.sh used the Mac llama.cpp prebuilt
- run: |
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
-
- - name: Install Playwright + Chromium
- # No --with-deps on Mac: that flag installs Linux apt packages.
- # GitHub-hosted macos-14 ships the system frameworks Chromium
- # needs already.
- # Pinned <1.58 because all 1.55-1.58 drivers ship Node 24 on
- # macos-14 and intermittently hit 'SyntaxError: Unexpected end
- # of JSON input' in pipeTransport.js. Run 25491698868 showed
- # the crash hitting 100% of three retry attempts -- not a
- # rare race but a hard reproduction. Belt-and-suspenders fix:
- # the test scripts pass --single-process to Chromium (see
- # tests/studio/playwright_chat_ui.py) AND we patch
- # pipeTransport.js below to swallow JSON parse errors instead
- # of crashing the driver Node process. Both together let the
- # in-script retry recover from any residual flakes.
- run: |
- pip install 'playwright>=1.55,<1.58'
- python -m playwright install chromium
-
- - name: Patch Playwright pipeTransport.js to tolerate malformed JSON
- # In Playwright 1.55-1.58, pipeTransport.js does
- # `JSON.parse(message)` with no try/catch; when Chromium dies
- # mid-write the partial buffer crashes the driver Node
- # process and the test script exits with 'Connection closed
- # while reading from the driver'. Newer Playwright versions
- # added a try/catch upstream. Backport that here.
- run: |
- python - <<'PY'
- import os, re, sys
- import playwright
- driver_dir = os.path.join(os.path.dirname(playwright.__file__), "driver", "package", "lib", "server")
- path = os.path.join(driver_dir, "pipeTransport.js")
- src = open(path).read()
- # Wrap both `this.onmessage.call(null, JSON.parse(...))` sites in try/catch.
- patched = re.sub(
- r"this\.onmessage\.call\(null, JSON\.parse\((message2?)\)\);",
- r"try { this.onmessage.call(null, JSON.parse(\1)); } "
- r"catch (e) { /* swallow malformed JSON from a crashing browser */ }",
- src,
- )
- if patched == src:
- # Already patched, or upstream changed -- either way, don't fail the build.
- print(f"pipeTransport.js: no JSON.parse calls matched at {path}; skipping.")
- else:
- open(path, "w").write(patched)
- print(f"pipeTransport.js: patched JSON.parse calls in {path}")
- PY
-
- - name: Reset auth + boot Studio
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
-
- - name: Pass bootstrap password to the Playwright step
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "::add-mask::$NEW2"
- echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV"
- echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
- - name: Drive the chat UI with Playwright
- env:
- BASE_URL: http://127.0.0.1:18896
- PW_ART_DIR: logs/playwright
- STUDIO_UI_STRICT: '1'
- # macos-14 free runner is 3 vCPU / 7 GB / no Metal-accel
- # available to llama.cpp from CI; gemma-3-270m turn latency
- # has been observed to crowd the 180s default. Triple it.
- STUDIO_UI_TURN_TIMEOUT_MS: '540000'
- # Retry up to 3 times to absorb known macos-14 free-runner
- # flakes: (1) Playwright Node 24 pipeTransport.js 'Unexpected
- # end of JSON input' crash when the Chromium browser process
- # dies mid-test, and (2) Chromium net::ERR_NO_BUFFER_SPACE
- # when the runner's kernel briefly runs out of socket buffers.
- # The retry FULLY resets Studio (kill, reset-password, reboot,
- # wait /api/health, re-export bootstrap pw) before re-running
- # the script. A real test failure (assertion / timeout) does
- # NOT match either pattern so it bypasses retry and surfaces
- # immediately.
- run: |
- mkdir -p logs/playwright
- attempt=1
- max_attempts=3
- while : ; do
- set +e
- python tests/studio/playwright_chat_ui.py 2>&1 | tee logs/playwright_attempt_${attempt}.log
- rc=${PIPESTATUS[0]}
- set -e
- if [ "$rc" -eq 0 ]; then
- break
- fi
- if { grep -q "Unexpected end of JSON input" logs/playwright_attempt_${attempt}.log \
- || grep -q "ERR_NO_BUFFER_SPACE" logs/playwright_attempt_${attempt}.log; } \
- && [ "$attempt" -lt "$max_attempts" ]; then
- echo "::warning::Playwright flake on attempt ${attempt}; resetting Studio and retrying..."
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
- unsloth studio reset-password
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > "logs/studio_retry_${attempt}.log" 2>&1 &
- STUDIO_PID=$!
- echo "STUDIO_PID=$STUDIO_PID" >> "$GITHUB_ENV"
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json \
- && jq -e '.status == "healthy"' /tmp/health.json >/dev/null; then
- break
- fi
- sleep 1
- done
- STUDIO_OLD_PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- STUDIO_NEW_PW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- STUDIO_NEW2_PW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$STUDIO_OLD_PW"
- echo "::add-mask::$STUDIO_NEW_PW"
- echo "::add-mask::$STUDIO_NEW2_PW"
- export STUDIO_OLD_PW STUDIO_NEW_PW STUDIO_NEW2_PW
- attempt=$((attempt + 1))
- sleep 3
- continue
- fi
- exit "$rc"
- done
-
- - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders)
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Reset auth + boot Studio for extra UI tests (port 18897)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \
- > logs/studio_extra.log 2>&1 &
- echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health on 18897
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json; then
- jq -e '.status == "healthy"' /tmp/health2.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health2.json
-
- - name: Pass bootstrap pw for extra UI test
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
- - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright
- env:
- BASE_URL: http://127.0.0.1:18897
- STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }}
- STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }}
- PW_ART_DIR: logs/playwright_extra
- STUDIO_UI_STRICT: '1'
- # See "Drive the chat UI" step.
- STUDIO_UI_TURN_TIMEOUT_MS: '540000'
- GGUF_REPO: ${{ env.GGUF_REPO }}
- GGUF_VARIANT: ${{ env.GGUF_VARIANT }}
- # Same flake-retry shape as "Drive the chat UI with Playwright"
- # -- catches pipeTransport JSON crash and ERR_NO_BUFFER_SPACE.
- run: |
- mkdir -p logs/playwright_extra
- attempt=1
- max_attempts=3
- while : ; do
- set +e
- python tests/studio/playwright_extra_ui.py 2>&1 | tee logs/playwright_extra_attempt_${attempt}.log
- rc=${PIPESTATUS[0]}
- set -e
- if [ "$rc" -eq 0 ]; then
- break
- fi
- if { grep -q "Unexpected end of JSON input" logs/playwright_extra_attempt_${attempt}.log \
- || grep -q "ERR_NO_BUFFER_SPACE" logs/playwright_extra_attempt_${attempt}.log; } \
- && [ "$attempt" -lt "$max_attempts" ]; then
- echo "::warning::Playwright flake on attempt ${attempt}; resetting Studio and retrying..."
- kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
- sleep 2
- unsloth studio reset-password
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \
- > "logs/studio_extra_retry_${attempt}.log" 2>&1 &
- STUDIO_EXTRA_PID=$!
- echo "STUDIO_EXTRA_PID=$STUDIO_EXTRA_PID" >> "$GITHUB_ENV"
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json \
- && jq -e '.status == "healthy"' /tmp/health2.json >/dev/null; then
- break
- fi
- sleep 1
- done
- STUDIO_OLD_PW=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- STUDIO_NEW_PW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$STUDIO_OLD_PW"
- echo "::add-mask::$STUDIO_NEW_PW"
- export STUDIO_OLD_PW STUDIO_NEW_PW
- attempt=$((attempt + 1))
- sleep 3
- continue
- fi
- exit "$rc"
- done
-
- - name: Stop second Studio
- if: always()
- run: |
- kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Upload Playwright artifacts
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: mac-studio-ui-smoke-artifacts
- path: |
- logs/studio.log
- logs/studio_extra.log
- logs/install.log
- logs/playwright
- logs/playwright_extra
- retention-days: 7
diff --git a/.github/workflows/studio-mac-update-smoke.yml b/.github/workflows/studio-mac-update-smoke.yml
deleted file mode 100644
index 07d26b9ab3..0000000000
--- a/.github/workflows/studio-mac-update-smoke.yml
+++ /dev/null
@@ -1,150 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Mac counterpart to studio-update-smoke.yml. Verifies that on a real
-# Apple Silicon (macos-14, M1) runner:
-#
-# 1. install.sh --local --no-torch installs Studio AND auto-fetches
-# the prebuilt llama.cpp Mac binary (llama-bNNNN-bin-macos-arm64
-# from ggml-org/llama.cpp). Hitting the source-build fallback is
-# treated as an Unsloth bug -- Studio must always pick the
-# prebuilt on Mac.
-# 2. unsloth studio update --local is idempotent. Two consecutive
-# runs both report "prebuilt up to date and validated", no
-# source-build fallback.
-# 3. The installed Studio still boots and /api/health returns
-# healthy after the update path.
-
-name: Mac Studio Update CI
-
-on:
- pull_request:
- paths:
- - 'install.sh'
- - 'studio/setup.sh'
- - 'studio/install_python_stack.py'
- - 'studio/install_llama_prebuilt.py'
- - 'studio/backend/requirements/**'
- - 'unsloth_cli/commands/studio.py'
- - 'pyproject.toml'
- - '.github/workflows/studio-mac-update-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- update-idempotency:
- name: Studio Updating Tests
- runs-on: macos-14
- timeout-minutes: 30
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Assert install.sh used the Mac llama.cpp prebuilt
- run: |
- # Mac install must take the prebuilt path. Source-build
- # fallback here is an Unsloth bug.
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.sh fell back to source-build llama.cpp on Mac. Studio must install the prebuilt llama-bNNNN-bin-macos-arm64 on Apple Silicon."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if ! grep -qE "prebuilt installed and validated|prebuilt up to date and validated|bin-macos-arm64" logs/install.log; then
- echo "::error::no Mac prebuilt llama.cpp marker in install.log."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- echo "install.sh installed the Mac prebuilt llama.cpp"
-
- - name: First update should be a no-op (prebuilt already validated)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -o pipefail
- unsloth studio update --local 2>&1 | tee logs/update.log
- if grep -q "falling back to source build" logs/update.log; then
- echo "::error::studio update fell back to source-build llama.cpp on Mac."
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
- exit 1
- fi
- if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then
- echo "::error::no prebuilt up-to-date marker in update.log."
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
- exit 1
- fi
- echo "update path took the prebuilt fast path"
-
- - name: Second update must also be a no-op
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -o pipefail
- unsloth studio update --local 2>&1 | tee logs/update2.log
- grep -q "falling back to source build" logs/update2.log && {
- echo "::error::second update fell back to source build on Mac"
- tail -60 logs/update2.log; exit 1; } || true
- grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log
- echo "second update was clean"
-
- - name: Boot Studio briefly to confirm the install is still usable
- run: |
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \
- > logs/studio.log 2>&1 &
- PID=$!
- HEALTHY=""
- for i in $(seq 1 60); do
- if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then
- if python3 -c "import json,sys; d=json.load(open('/tmp/health.json')); sys.exit(0 if d.get('status')=='healthy' else 1)"; then
- HEALTHY=1
- break
- fi
- fi
- sleep 1
- done
- if [ -z "$HEALTHY" ]; then
- echo "Studio failed to come up after \`update\`"
- tail -200 logs/studio.log
- kill "$PID" 2>/dev/null || true
- exit 1
- fi
- kill "$PID" 2>/dev/null || true
- echo "post-update Studio /api/health OK"
-
- - name: Upload update logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: mac-studio-update-log
- path: |
- logs/install.log
- logs/update.log
- logs/update2.log
- logs/studio.log
- retention-days: 7
diff --git a/.github/workflows/studio-tauri-smoke.yml b/.github/workflows/studio-tauri-smoke.yml
deleted file mode 100644
index 1156c264ae..0000000000
--- a/.github/workflows/studio-tauri-smoke.yml
+++ /dev/null
@@ -1,128 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# PR-time smoke for the Tauri desktop wrapper. Builds the frontend and the
-# Tauri Linux debug binary, with no codesigning. Catches:
-# - tauri.conf.json drift
-# - src-tauri Cargo.toml or rust source breakage
-# - Tauri CLI version drift (we pin 2.10.1, matching release-desktop.yml)
-# - frontend output not picked up by Tauri's distDir
-#
-# Linux-only on a free `ubuntu-latest` runner. Mac and Windows desktop builds
-# stay in release-desktop.yml (manual `workflow_dispatch`) because they need
-# code-signing secrets and ~30 min of runner time each.
-
-name: Studio Tauri CI
-
-on:
- pull_request:
- paths:
- - 'studio/frontend/**'
- - 'studio/src-tauri/**'
- # CLI rename / signature change can break Tauri's spawned
- # `unsloth studio` -- include unsloth_cli in the trigger set.
- - 'unsloth_cli/**'
- - '.github/workflows/studio-tauri-smoke.yml'
- push:
- branches: [main, pip]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- linux-debug-build:
- name: Tauri Linux debug build (no codesign)
- runs-on: ubuntu-22.04
- timeout-minutes: 25
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux native deps for Tauri / WebKit2GTK
- run: |
- sudo apt-get update
- sudo apt-get install -y \
- libwebkit2gtk-4.1-dev libayatana-appindicator3-dev \
- librsvg2-dev libxdo-dev libssl-dev patchelf
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '24'
-
- - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable @ 2026-03-27
-
- - uses: swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2.9.1
- with:
- workspaces: studio/src-tauri -> target
-
- - name: Install pinned Tauri CLI (matches release-desktop.yml)
- # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
- # required for `vite build`. The pre-install lockfile structural
- # audit (lockfile_supply_chain_audit.py) is the practical defence
- # against the npm postinstall-dropper class -- it fires BEFORE any
- # tarball runs, on the injection pattern itself rather than an
- # advisory-DB lookup.
- run: npm install --save-dev --prefix studio @tauri-apps/cli@2.10.1 --no-fund --no-audit
-
- - name: Verify pinned Tauri CLI version
- run: |
- out="$(npx --prefix studio tauri --version)"
- echo "$out"
- [ "$out" = "tauri-cli 2.10.1" ] || { echo "::error::expected tauri-cli 2.10.1, got $out"; exit 1; }
-
- - name: Lockfile supply-chain audit (pre-install scan)
- run: python3 scripts/lockfile_supply_chain_audit.py
-
- - name: Frontend build (npm ci, vite)
- working-directory: studio/frontend
- # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
- # required for `vite build`. The pre-install lockfile structural
- # audit (lockfile_supply_chain_audit.py) is the practical defence
- # against the npm postinstall-dropper class -- it fires BEFORE any
- # tarball runs, on the injection pattern itself rather than an
- # advisory-DB lookup.
- run: |
- npm ci --no-fund --no-audit
- npm run build
- test -f dist/index.html
-
- - name: Tauri debug build (Linux, no bundle, no codesign)
- # `--debug` + `--no-bundle` keeps this lean: compiles the Rust crate,
- # confirms the frontend dist is wired into Tauri, but skips the AppImage
- # / .deb production. Code signing is irrelevant because we never produce
- # a distributable artifact.
- env:
- TAURI_SIGNING_PRIVATE_KEY: ''
- TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ''
- run: npx --prefix studio tauri build --debug --no-bundle
-
- - name: Inspect produced binary
- run: |
- BIN=$(find studio/src-tauri/target/debug -maxdepth 1 -type f -executable 2>/dev/null \
- | grep -Ev '\.(d|so|dylib|dll)$' \
- | grep -Ev '/(deps|build|examples)$' \
- | head -1)
- echo "binary: $BIN"
- if [ -z "$BIN" ]; then
- echo "::error::Tauri debug binary not produced"
- ls -la studio/src-tauri/target/debug/ || true
- exit 1
- fi
- file "$BIN"
- du -h "$BIN"
-
- - name: Upload Tauri debug build
- # Always upload so a green run leaves the binary inspectable too.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: tauri-debug-build
- path: |
- studio/src-tauri/target/debug
- studio/frontend/dist
- retention-days: 3
diff --git a/.github/workflows/studio-ui-smoke.yml b/.github/workflows/studio-ui-smoke.yml
deleted file mode 100644
index 455fe4b7e1..0000000000
--- a/.github/workflows/studio-ui-smoke.yml
+++ /dev/null
@@ -1,293 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# End-to-end Studio chat UI smoke via Playwright + Chromium against a
-# headless Linux runner. Boots Studio with the smallest GGUF
-# (gemma-3-270m-it UD-Q4_K_XL, ~254 MiB), drives the actual frontend
-# bundle, and asserts the full bootstrap-password / change-password /
-# send-message / persist-on-reload journey works end to end.
-#
-# This is the only workflow that catches regressions in the wiring
-# between the React frontend and the FastAPI backend, e.g. assistant-ui
-# version drift, /api/auth response shape changes, runtime-provider
-# regressions, or chat-history persistence breaking. Backend-only and
-# frontend-only CI happily pass while the actual user-visible UI is
-# broken (cf. the 2026.5.1 chat-history release).
-
-name: Studio UI CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.sh'
- - 'pyproject.toml'
- # The Playwright test files themselves -- a PR that ONLY edits
- # the test must still trigger UI CI.
- - 'tests/studio/**'
- - '.github/workflows/studio-ui-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- ui-smoke:
- name: Chat UI Tests
- runs-on: ubuntu-latest
- timeout-minutes: 25
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18892'
- HF_HOME: ${{ github.workspace }}/hf-cache
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux deps
- run: |
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends \
- libcurl4-openssl-dev libssl-dev jq
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Install Studio (--local, --no-torch)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: Install Playwright + Chromium
- run: |
- pip install 'playwright>=1.45'
- # --with-deps installs the OS-level runtime libs Chromium
- # needs (libnss3, libxkbcommon, etc.). About 30 s on a
- # warm runner.
- python -m playwright install --with-deps chromium
-
- - name: Reset auth + boot Studio
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- # 180 s -- a cold runner with venv warm-up + lazy imports has
- # been seen to exceed 60 s. Failing the wait is more expensive
- # than waiting an extra two minutes.
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
-
- - name: Pass bootstrap password to the Playwright step
- # The Playwright test does its OWN /change-password through the
- # UI (Setup your account / Choose a new password), then loads
- # the model via page.evaluate against /api/inference/load with
- # the JWT it got from change-password. So the only thing we
- # have to hand it is the bootstrap password (so it can verify
- # post-rotation that the OLD bootstrap pw now returns 401).
- #
- # NEW + NEW2 are generated freshly per CI run via secrets.token_urlsafe
- # rather than hardcoded. If a workflow gets compromised, the
- # attacker can't replay a known-good rotated password against
- # any future / parallel Studio install -- the rotated value
- # only ever exists for the lifetime of this single job, masked
- # in the log via ::add-mask::.
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "::add-mask::$NEW2"
- echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV"
- echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
- - name: Drive the chat UI with Playwright
- env:
- BASE_URL: http://127.0.0.1:18892
- # The test file lives in the repo so it can be run locally
- # against a freshly-installed Studio (BASE_URL=...; STUDIO_OLD_PW=
- # $(cat ~/.unsloth/studio/auth/.bootstrap_password); python ...).
- PW_ART_DIR: logs/playwright
- # Strict mode: in CI a missing button / nav / dialog must
- # FAIL the test. Locally the test still runs against partial
- # Studio installs without STUDIO_UI_STRICT.
- STUDIO_UI_STRICT: '1'
- run: |
- mkdir -p logs/playwright
- python tests/studio/playwright_chat_ui.py
-
- - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders)
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
-
- # The chat UI test ends by clicking the Shutdown menuitem, which
- # leaves the server dead. The extra UI test (Compare / Recipes /
- # Export / Studio / Settings) needs a fresh Studio, so we boot a
- # second one on a different port. Boot is fast (~3-5s on the
- # warm install we already did) so this adds little wall time.
- - name: Reset auth + boot Studio for extra UI tests (port 18894)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18894 \
- > logs/studio_extra.log 2>&1 &
- echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health on 18894
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:18894/api/health" > /tmp/health2.json; then
- jq -e '.status == "healthy"' /tmp/health2.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health2.json
-
- - name: Pass bootstrap pw for extra UI test
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
- - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright
- env:
- BASE_URL: http://127.0.0.1:18894
- STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }}
- STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }}
- PW_ART_DIR: logs/playwright_extra
- STUDIO_UI_STRICT: '1'
- GGUF_REPO: ${{ env.GGUF_REPO }}
- GGUF_VARIANT: ${{ env.GGUF_VARIANT }}
- run: |
- mkdir -p logs/playwright_extra
- python tests/studio/playwright_extra_ui.py
-
- - name: Stop second Studio
- if: always()
- run: |
- kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
- sleep 2
-
- # IME + multilingual paste regression (issue #5318 / PR #5327).
- # Third Studio on its own port so a hang here cannot poison the
- # earlier UI tests. No GGUF -- the bug surface is the composer.
- - name: Reset auth + boot Studio for IME / i18n tests (port 18896)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18896 \
- > logs/studio_ime.log 2>&1 &
- echo "STUDIO_IME_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health on 18896
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:18896/api/health" > /tmp/health3.json; then
- jq -e '.status == "healthy"' /tmp/health3.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health3.json
-
- - name: Pass bootstrap pw for IME / i18n test
- # IME smoke does the change-password against the bootstrap that
- # Studio's frontend injects into the page, so it only needs the
- # NEW password.
- run: |
- NEW="CIIme-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$NEW"
- echo "STUDIO_IME_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
- - name: Drive IME + multilingual paste regression with Playwright
- env:
- BASE_URL: http://127.0.0.1:18896
- STUDIO_NEW_PW: ${{ env.STUDIO_IME_NEW_PW }}
- PW_ART_DIR: logs/playwright_ime
- STUDIO_UI_STRICT: '1'
- run: |
- mkdir -p logs/playwright_ime
- python tests/studio/playwright_chat_ime_i18n.py
-
- - name: Stop third Studio
- if: always()
- run: |
- kill "${STUDIO_IME_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Upload Playwright artifacts
- # Always upload so a green run's screenshots stay reviewable --
- # catches "passed but the UI is silently broken" regressions.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: studio-ui-smoke-artifacts
- path: |
- logs/studio.log
- logs/studio_extra.log
- logs/studio_ime.log
- logs/install.log
- logs/playwright
- logs/playwright_extra
- logs/playwright_ime
- retention-days: 7
diff --git a/.github/workflows/studio-update-smoke.yml b/.github/workflows/studio-update-smoke.yml
deleted file mode 100644
index 1c353e933a..0000000000
--- a/.github/workflows/studio-update-smoke.yml
+++ /dev/null
@@ -1,154 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Verifies that `unsloth studio update --local` is idempotent: a fresh
-# install via install.sh, followed by `unsloth studio update --local`,
-# succeeds and is a no-op for the llama.cpp prebuilt (it should report
-# "prebuilt up to date and validated", not re-run the source build).
-#
-# This catches regressions in setup.sh's update path that the existing
-# GGUF / wheel jobs would miss because they only invoke install.sh once.
-
-name: Studio Update CI
-
-on:
- pull_request:
- paths:
- - 'install.sh'
- - 'studio/setup.sh'
- - 'studio/install_python_stack.py'
- - 'studio/install_llama_prebuilt.py'
- - 'studio/backend/requirements/**'
- - 'unsloth_cli/commands/studio.py'
- - 'pyproject.toml'
- - '.github/workflows/studio-update-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- update-idempotency:
- name: Studio Updating Tests
- runs-on: ubuntu-latest
- timeout-minutes: 15
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - name: Linux deps for llama.cpp prebuilt
- run: |
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends \
- libcurl4-openssl-dev libssl-dev jq
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- # Don't cache pip: this job runs `bash install.sh` and
- # `unsloth studio update --local` which both go through
- # `uv` and never populate ~/.cache/pip. setup-python's
- # post-step then fatal-errors with "Cache folder path is
- # retrieved for pip but doesn't exist on disk".
-
- - name: Install Studio (--local, --no-torch)
- # Pass the workflow token so the llama.cpp prebuilt installer's
- # GitHub-API call to list releases isn't rate-limited (60/hr
- # unauthenticated). Without this, three consecutive install +
- # update + update calls in this job exceed the limit and the
- # prebuilt path falls back to source build.
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- mkdir -p logs
- set -o pipefail
- bash install.sh --local --no-torch 2>&1 | tee logs/install.log
-
- - name: First update should be a no-op (prebuilt already validated)
- # `unsloth studio update --local` runs studio/setup.sh against
- # the local repo. Right after install.sh the llama.cpp prebuilt
- # has just been installed and validated, so the second run must
- # take the "prebuilt up to date and validated" code path. Any
- # source-build fallback or re-download here means setup.sh's
- # idempotency regressed.
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -o pipefail
- unsloth studio update --local 2>&1 | tee logs/update.log
- if grep -q "falling back to source build" logs/update.log; then
- echo "::error::studio update fell back to source-build llama.cpp on a fresh install. setup.sh idempotency regressed."
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
- exit 1
- fi
- if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then
- echo "::error::no prebuilt up-to-date marker in update.log. Did setup.sh skip the prebuilt path on update?"
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
- exit 1
- fi
- echo "update path took the prebuilt fast path"
-
- - name: Second update must also be a no-op
- # Two consecutive `update`s back-to-back is the usual desktop
- # flow (auto-update, then user-triggered update). Asserting the
- # second run is also clean rules out hidden state changes from
- # the first one.
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -o pipefail
- unsloth studio update --local 2>&1 | tee logs/update2.log
- grep -q "falling back to source build" logs/update2.log && {
- echo "::error::second update fell back to source build"
- tail -60 logs/update2.log; exit 1; } || true
- grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log
- echo "second update was clean"
-
- - name: Boot Studio briefly to confirm the install is still usable
- # If `update --local` accidentally broke the venv or wiped the
- # llama-server binary, the server would fail to start here.
- run: |
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \
- > logs/studio.log 2>&1 &
- PID=$!
- for i in $(seq 1 60); do
- if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json
- break
- fi
- sleep 1
- done
- if ! jq -e '.status == "healthy"' /tmp/health.json 2>/dev/null; then
- echo "Studio failed to come up after `update`"
- tail -200 logs/studio.log
- kill "$PID" 2>/dev/null || true
- exit 1
- fi
- kill "$PID" 2>/dev/null || true
- echo "post-update Studio /api/health OK"
-
- - name: Upload update logs
- # Always upload so a green run still leaves the install + two
- # update logs reviewable.
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: studio-update-log
- path: |
- logs/install.log
- logs/update.log
- logs/update2.log
- logs/studio.log
- retention-days: 7
diff --git a/.github/workflows/studio-windows-api-smoke.yml b/.github/workflows/studio-windows-api-smoke.yml
deleted file mode 100644
index 1d12ea6f90..0000000000
--- a/.github/workflows/studio-windows-api-smoke.yml
+++ /dev/null
@@ -1,246 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Windows counterpart to studio-api-smoke.yml / studio-mac-api-smoke.yml.
-# Same tests/studio/studio_api_smoke.py exercise (CORS hardening, auth
-# state machine, JWT expiry, API key lifecycle, /v1/models /
-# /v1/embeddings / /v1/responses, endpoint-by-endpoint auth audit) but
-# on the FREE windows-latest runner. The file-mode hardening section
-# (Section 6) is Linux-only and short-circuits on non-POSIX; the rest
-# is platform-portable.
-
-name: Windows Studio API CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.ps1'
- - 'pyproject.toml'
- - 'tests/studio/**'
- - '.github/workflows/studio-windows-api-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- api-smoke:
- name: Studio API & Auth Tests
- runs-on: windows-latest
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18895'
- HF_HOME: ${{ github.workspace }}/hf-cache
- # Force UTF-8 for stdio (Windows defaults to cp1252; hf
- # download prints a "✓" checkmark and crashes otherwise).
- PYTHONIOENCODING: utf-8
- PYTHONUTF8: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
- shell: pwsh
- # See studio-windows-update-smoke.yml for the full rationale.
- # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
- # reinstall, and Defender's real-time scan dominates the
- # frontend / uv-pip-extract steps.
- run: |
- $ProgressPreference = 'SilentlyContinue'
- Write-Host "npm version before upgrade: $(npm -v)"
- npm install -g 'npm@^11' 2>&1 | Out-Host
- Write-Host "npm version after upgrade: $(npm -v)"
- # NOTE: do NOT pre-create these directories. See
- # studio-windows-update-smoke.yml for the full rationale --
- # creating an empty studio/frontend/dist trips setup.ps1's
- # mtime-based staleness check into "frontend up to date, skip
- # rebuild" and Studio boots with an empty dist directory.
- # Add-MpPreference accepts paths that do not yet exist.
- foreach ($p in @(
- "$env:USERPROFILE\.unsloth",
- "$env:USERPROFILE\AppData\Local\uv",
- "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
- "$env:GITHUB_WORKSPACE\studio\frontend\dist"
- )) {
- try {
- Add-MpPreference -ExclusionPath $p -ErrorAction Stop
- Write-Host "Defender exclusion added: $p"
- } catch {
- Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
- }
- }
-
- - name: Install Studio (--local, --no-torch)
- shell: pwsh
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- New-Item -ItemType Directory -Force -Path logs | Out-Null
- # *>&1 captures Write-Host (Information stream) output;
- # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
- # and validated" via Write-Host, and we grep for that.
- $ProgressPreference = 'SilentlyContinue'
- & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
- - name: Assert install.ps1 used the Windows llama.cpp prebuilt
- run: |
- # Filesystem-based check (setup.ps1's stream output isn't
- # captured back through this parent step's pipeline; see
- # studio-windows-ui-smoke.yml for full explanation).
- LLAMA_DIR=~/.unsloth/llama.cpp
- INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
- BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if [ ! -f "$INFO" ]; then
- echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
- ls -la "$LLAMA_DIR" || true
- exit 1
- fi
- if [ ! -f "$BIN" ]; then
- echo "::error::no llama-server.exe at $BIN."
- ls -la "$LLAMA_DIR/build/bin" || true
- exit 1
- fi
- echo "install.ps1 installed the Windows prebuilt llama.cpp:"
- cat "$INFO"
-
- - name: Add Studio shim to GITHUB_PATH
- # install.ps1's User-PATH update doesn't propagate to a
- # running Git Bash session; export the shim dir so the
- # next `unsloth ...` invocation finds it.
- run: |
- SHIM_DIR=~/.unsloth/studio/bin
- if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
- echo "::error::unsloth.exe shim not found at $SHIM_DIR"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
- - name: Patch Studio venv with full typer / pydantic dep trees
- # Belt-and-suspenders: install.ps1's --no-deps install of
- # no-torch-runtime.txt drops typer's and pydantic's runtime
- # deps unless explicitly pinned. Re-install the ones whose
- # deps don't pull torch.
- run: |
- STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
- if [ ! -f "$STUDIO_PY" ]; then
- echo "::error::Studio venv python not at $STUDIO_PY"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
- - name: Install pyjwt for the JWT-expiry forge test
- run: python -m pip install 'pyjwt>=2.6'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
-
- - name: Pass bootstrap password + rotated targets to the test
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- NEW2="ApiSmoke-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "::add-mask::$NEW2"
- echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV"
- echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
- - name: Run Studio API & Auth tests
- # Do NOT pin STUDIO_AUTH_DIR here. The Mac/Linux mirrors
- # hardcode runner-specific paths (/Users/runner/...,
- # /home/runner/...), but on Windows the path is
- # C:\Users\runneradmin\.unsloth\studio\auth and varies by
- # runner image. studio_api_smoke.py defaults to
- # Path.home()/".unsloth"/"studio"/"auth" when the env is
- # unset, which is correct on every OS.
- env:
- BASE_URL: http://127.0.0.1:18895
- run: python tests/studio/studio_api_smoke.py
-
- - name: Stop Studio
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Upload API smoke logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: windows-studio-api-smoke-log
- path: |
- logs/install.log
- logs/studio.log
- retention-days: 7
diff --git a/.github/workflows/studio-windows-inference-smoke.yml b/.github/workflows/studio-windows-inference-smoke.yml
deleted file mode 100644
index 01bf4127a7..0000000000
--- a/.github/workflows/studio-windows-inference-smoke.yml
+++ /dev/null
@@ -1,1167 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Three end-to-end smoke jobs that boot a freshly-installed Studio and
-# exercise the surfaces real users hit through the OpenAI / Anthropic
-# SDKs and curl, on the FREE windows-latest runner. Each job picks the
-# smallest model that exercises the behaviour under test, primes
-# HF_HOME via actions/cache, and shares the install.ps1 --local
-# --no-torch bootstrap.
-#
-# 1. OpenAI, Anthropic API tests
-# gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
-# 2. Tool calling Tests
-# Qwen3.5-2B UD-Q4_K_XL (~890 MiB).
-# 3. JSON, images
-# gemma-4-E2B-it UD-Q4_K_XL + mmproj-F16 (~3.4 GiB total).
-# Within the 14 GB windows-latest SSD budget.
-
-name: Windows Studio GGUF CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.ps1'
- - 'pyproject.toml'
- - '.github/workflows/studio-windows-inference-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- # ─────────────────────────────────────────────────────────────────────
- # Job 1: OpenAI, Anthropic API tests
- # ─────────────────────────────────────────────────────────────────────
- openai-anthropic:
- name: OpenAI, Anthropic API tests
- runs-on: windows-latest
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18888'
- HF_HOME: ${{ github.workspace }}/hf-cache
- # Force UTF-8 for stdio (Windows defaults to cp1252; hf
- # download / Studio CLI print "✓" checkmarks and crash
- # otherwise).
- PYTHONIOENCODING: utf-8
- PYTHONUTF8: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- # Split restore + save (rather than the one-step actions/cache) so a
- # transient restore-side failure does not kill the whole job. v5 has a
- # known flake where it logs "Cache hit for: " and then exits
- # non-zero without actually extracting the archive (see
- # actions/cache#1621 and github community discussion #163260).
- # continue-on-error on restore masks that failure so the Prime step
- # below can re-download from HF and the job keeps running. Save then
- # populates the cache key on a real miss only; cache keys are
- # immutable, so a corrupted cached entry persists until the -v1
- # suffix below is bumped.
- - name: Restore HF_HOME cache for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- # Run on a real cache miss AND on the silent-restore-failure mode
- # described above (outcome != success).
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME cache for ${{ env.GGUF_REPO }}
- # Only write a fresh cache entry when we actually rebuilt the
- # directory (Prime ran and succeeded). Skipping when Prime is
- # skipped avoids "already exists" save warnings on the happy path.
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
- shell: pwsh
- # See studio-windows-update-smoke.yml for the full rationale.
- # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
- # reinstall, and Defender's real-time scan dominates the
- # frontend / uv-pip-extract steps.
- run: |
- $ProgressPreference = 'SilentlyContinue'
- Write-Host "npm version before upgrade: $(npm -v)"
- npm install -g 'npm@^11' 2>&1 | Out-Host
- Write-Host "npm version after upgrade: $(npm -v)"
- # NOTE: do NOT pre-create these directories. See
- # studio-windows-update-smoke.yml for the full rationale --
- # creating an empty studio/frontend/dist trips setup.ps1's
- # mtime-based staleness check into "frontend up to date, skip
- # rebuild" and Studio boots with an empty dist directory.
- # Add-MpPreference accepts paths that do not yet exist.
- foreach ($p in @(
- "$env:USERPROFILE\.unsloth",
- "$env:USERPROFILE\AppData\Local\uv",
- "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
- "$env:GITHUB_WORKSPACE\studio\frontend\dist"
- )) {
- try {
- Add-MpPreference -ExclusionPath $p -ErrorAction Stop
- Write-Host "Defender exclusion added: $p"
- } catch {
- Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
- }
- }
-
- - name: Install Studio (--local, --no-torch)
- shell: pwsh
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- New-Item -ItemType Directory -Force -Path logs | Out-Null
- # *>&1 captures Write-Host (Information stream) output;
- # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
- # and validated" via Write-Host, and we grep for that.
- $ProgressPreference = 'SilentlyContinue'
- & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
- - name: Assert install.ps1 used the Windows llama.cpp prebuilt
- run: |
- # Filesystem check; setup.ps1's stream output isn't captured.
- LLAMA_DIR=~/.unsloth/llama.cpp
- INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
- BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if [ ! -f "$INFO" ]; then
- echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
- ls -la "$LLAMA_DIR" || true
- exit 1
- fi
- if [ ! -f "$BIN" ]; then
- echo "::error::no llama-server.exe at $BIN."
- ls -la "$LLAMA_DIR/build/bin" || true
- exit 1
- fi
- echo "install.ps1 installed the Windows prebuilt llama.cpp:"
- cat "$INFO"
-
- - name: Add Studio shim to GITHUB_PATH
- run: |
- SHIM_DIR=~/.unsloth/studio/bin
- if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
- echo "::error::unsloth.exe shim not found at $SHIM_DIR"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
- - name: Patch Studio venv with full typer / pydantic dep trees
- # Belt-and-suspenders: install.ps1's --no-deps install of
- # no-torch-runtime.txt drops typer's and pydantic's runtime
- # deps unless explicitly pinned. Re-install the ones whose
- # deps don't pull torch.
- run: |
- STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
- if [ ! -f "$STUDIO_PY" ]; then
- echo "::error::Studio venv python not at $STUDIO_PY"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
- - name: Install OpenAI + Anthropic Python SDKs
- run: python -m pip install 'openai>=1.50' 'anthropic>=0.40'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json
- exit 0
- fi
- sleep 1
- done
- echo "Studio did not become healthy in 180s"
- tail -200 logs/studio.log
- exit 1
-
- - name: Password rotation (old must fail, new must work)
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
- -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
- if [ "$OLD_STATUS" != "401" ]; then
- echo "::error::Login with old password returned $OLD_STATUS, expected 401"
- exit 1
- fi
- NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- [ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
- echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
- echo "password rotation OK (old=401, new=200)"
-
- - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
- run: |
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 600 \
- -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name, is_gguf, context_length}'
-
- - name: Multi-turn determinism via OpenAI + Anthropic SDKs
- env:
- BASE_URL: http://127.0.0.1:18888
- run: |
- python - <<'PY'
- import json
- import os
- from openai import OpenAI
- from anthropic import Anthropic
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["TOKEN"]
- SEED = 3407
-
- PROMPTS = [
- "What is 1+1?",
- "What did I ask before?",
- "What is the capital of France?",
- "Repeat the city name",
- ]
-
- def run_openai():
- client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
- history, replies = [], []
- for prompt in PROMPTS:
- history.append({"role": "user", "content": prompt})
- resp = client.chat.completions.create(
- model = "default",
- messages = history,
- temperature = 0.0,
- max_tokens = 80,
- seed = SEED,
- extra_body = {"enable_thinking": False},
- )
- text = resp.choices[0].message.content or ""
- replies.append(text)
- history.append({"role": "assistant", "content": text})
- return replies
-
- def run_anthropic():
- client = Anthropic(
- base_url = BASE,
- api_key = "unused",
- default_headers = {"Authorization": f"Bearer {KEY}"},
- )
- history, replies = [], []
- for prompt in PROMPTS:
- history.append({"role": "user", "content": prompt})
- msg = client.messages.create(
- model = "default",
- max_tokens = 80,
- messages = history,
- temperature = 0.0,
- extra_body = {"seed": SEED, "enable_thinking": False},
- )
- text = "".join(b.text for b in msg.content if getattr(b, "type", None) == "text")
- replies.append(text)
- history.append({"role": "assistant", "content": text})
- return replies
-
- for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
- first = runner()
- second = runner()
- for i, (a, b) in enumerate(zip(first, second), start = 1):
- print(f"[{label} turn {i}] {a!r}")
- assert a, f"{label}: empty turn {i} response"
- assert a == b, (
- f"{label} non-deterministic at turn {i} with temperature=0.0:\n"
- f" run1: {a!r}\n run2: {b!r}"
- )
- joined = " ".join(first).lower()
- assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
- assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
- print(f"[{label}] OK -- 4 turns, run1 == run2, history grounded")
- PY
-
- - name: Stop Studio
- if: always()
- # Run as cmd so we are not running through the Git Bash shell;
- # Git Bash on windows-latest has been observed to exit 143
- # (SIGTERM) from any inline kill/sleep block, masking a green
- # test run. The runner reclaims the Studio child process at
- # job end either way, so just emit a marker and exit 0.
- shell: cmd
- run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
-
- - name: Upload logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: windows-openai-anthropic-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
-
- # ─────────────────────────────────────────────────────────────────────
- # Job 2: Tool calling Tests
- # ─────────────────────────────────────────────────────────────────────
- tool-calling:
- name: Tool calling Tests
- runs-on: windows-latest
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- env:
- # Tool calling is the highest-volume GGUF in this workflow
- # (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB). The previous HF_HOME
- # cache stored xet chunks + blobs + snapshots = ~4.7 GiB --
- # 3.7x file-size inflation, dominating the post-step upload
- # (211 s on first run; subsequent runs hit the cache, but the
- # one-time cost recurs every time the cache key bumps). Use
- # main's `--local-dir gguf-cache` pattern: cache the flat .gguf
- # only, pass an absolute path to Studio's /api/inference/load.
- # The OpenAI/Anth and JSON+images jobs still cover the
- # gguf_variant resolution path.
- GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
- GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18898'
- # Force UTF-8 for stdio (Windows defaults to cp1252; hf
- # download / Studio CLI print "✓" checkmarks and crash
- # otherwise).
- PYTHONIOENCODING: utf-8
- PYTHONUTF8: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- # Split restore + save so a transient restore-side failure does not
- # kill the whole job. See the matching block in the tool-calling job
- # above for the full rationale (actions/cache#1621).
- - name: Restore GGUF model cache
- id: cache-gguf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
- - name: Download GGUF if cache miss
- id: download-gguf
- if: steps.cache-gguf.outputs.cache-hit != 'true' || steps.cache-gguf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p gguf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache
-
- - name: Save GGUF model cache
- if: always() && steps.download-gguf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: gguf-cache
- key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1
-
- - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
- shell: pwsh
- # See studio-windows-update-smoke.yml for the full rationale.
- # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
- # reinstall, and Defender's real-time scan dominates the
- # frontend / uv-pip-extract steps.
- run: |
- $ProgressPreference = 'SilentlyContinue'
- Write-Host "npm version before upgrade: $(npm -v)"
- npm install -g 'npm@^11' 2>&1 | Out-Host
- Write-Host "npm version after upgrade: $(npm -v)"
- # NOTE: do NOT pre-create these directories. See
- # studio-windows-update-smoke.yml for the full rationale --
- # creating an empty studio/frontend/dist trips setup.ps1's
- # mtime-based staleness check into "frontend up to date, skip
- # rebuild" and Studio boots with an empty dist directory.
- # Add-MpPreference accepts paths that do not yet exist.
- foreach ($p in @(
- "$env:USERPROFILE\.unsloth",
- "$env:USERPROFILE\AppData\Local\uv",
- "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
- "$env:GITHUB_WORKSPACE\studio\frontend\dist"
- )) {
- try {
- Add-MpPreference -ExclusionPath $p -ErrorAction Stop
- Write-Host "Defender exclusion added: $p"
- } catch {
- Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
- }
- }
-
- - name: Install Studio (--local, --no-torch)
- shell: pwsh
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- New-Item -ItemType Directory -Force -Path logs | Out-Null
- # *>&1 captures Write-Host (Information stream) output;
- # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
- # and validated" via Write-Host, and we grep for that.
- $ProgressPreference = 'SilentlyContinue'
- & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
- - name: Assert install.ps1 used the Windows llama.cpp prebuilt
- run: |
- # Filesystem check; setup.ps1's stream output isn't captured.
- LLAMA_DIR=~/.unsloth/llama.cpp
- INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
- BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if [ ! -f "$INFO" ]; then
- echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
- ls -la "$LLAMA_DIR" || true
- exit 1
- fi
- if [ ! -f "$BIN" ]; then
- echo "::error::no llama-server.exe at $BIN."
- ls -la "$LLAMA_DIR/build/bin" || true
- exit 1
- fi
- echo "install.ps1 installed the Windows prebuilt llama.cpp:"
- cat "$INFO"
-
- - name: Add Studio shim to GITHUB_PATH
- run: |
- SHIM_DIR=~/.unsloth/studio/bin
- if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
- echo "::error::unsloth.exe shim not found at $SHIM_DIR"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
- - name: Patch Studio venv with full typer / pydantic dep trees
- # Belt-and-suspenders: install.ps1's --no-deps install of
- # no-torch-runtime.txt drops typer's and pydantic's runtime
- # deps unless explicitly pinned. Re-install the ones whose
- # deps don't pull torch.
- run: |
- STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
- if [ ! -f "$STUDIO_PY" ]; then
- echo "::error::Studio venv python not at $STUDIO_PY"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
- - name: Reset auth + boot Studio (API-only, default tool policy)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health, log in, change password, load model
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
- # GITHUB_WORKSPACE on windows-latest is a Windows path with
- # backslashes ("D:\a\unsloth\unsloth"). Bash handles it as a
- # raw string, but we cannot embed `\a` etc. in JSON without
- # JSON-string-escaping every backslash. Replace `\` with `/`
- # via bash parameter expansion -- pathlib.Path on Windows
- # accepts forward slashes natively, so Studio's loader sees
- # a normal path.
- GGUF_PATH="${GITHUB_WORKSPACE//\\//}/gguf-cache/${GGUF_FILE}"
- ls -lh "$GGUF_PATH"
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 600 \
- -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name}'
-
- - name: Tool calling, server-side tools, thinking on/off
- env:
- BASE_URL: http://127.0.0.1:18898
- run: |
- python - <<'PY'
- import json
- import os
- import urllib.request
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["API_KEY"]
- SEED = 3407
- # Same temperature shim as the Mac job. Small Qwen3.5-2B
- # quants can degenerate at temperature=0; a small non-zero
- # temperature with a fixed seed keeps the test deterministic
- # while escaping the trap.
- TEMP = 0.2
-
- def post(path, body, *, timeout = 240):
- data = json.dumps(body).encode()
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = data,
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- return resp.status, json.loads(resp.read().decode())
-
- def post_sse(path, body, *, timeout = 600):
- body = {**body, "stream": True}
- data = json.dumps(body).encode()
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = data,
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- parts = []
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- for raw in resp:
- line = raw.decode().strip()
- if not line.startswith("data: "):
- continue
- payload = line[6:]
- if payload == "[DONE]":
- break
- try:
- chunk = json.loads(payload)
- except json.JSONDecodeError:
- continue
- for choice in chunk.get("choices", []):
- delta = choice.get("delta", {}) or {}
- if delta.get("content"):
- parts.append(delta["content"])
- return "".join(parts)
-
- # ── 1. Standard OpenAI function calling ──────────────────────
- weather_tool = {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get current weather for a city.",
- "parameters": {
- "type": "object",
- "properties": {"city": {"type": "string"}},
- "required": ["city"],
- },
- },
- }
-
- status, data = post("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
- "tools": [weather_tool],
- "tool_choice": "required",
- "stream": False,
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 600,
- })
- assert status == 200, f"tool call status {status}: {data}"
- choice = data["choices"][0]
- tool_calls = (choice.get("message") or {}).get("tool_calls") or []
- if tool_calls:
- tc = tool_calls[0]
- assert tc["function"]["name"] == "get_weather", (
- f"unexpected tool name: {tc['function']['name']!r}"
- )
- args = json.loads(tc["function"]["arguments"])
- assert args.get("city"), f"missing city arg: {args}"
- print(f"[tools] PASS function calling -> {tc['function']['name']}({args}) finish={choice.get('finish_reason')!r}")
- else:
- print(
- f"[tools] WARN function calling: no tool_calls (finish_reason="
- f"{choice.get('finish_reason')!r}); HTTP path OK, model output drift."
- )
-
- # ── 2. Server-side python tool ───────────────────────────────
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "What is 123 * 456? Use the python tool to compute it and tell me the number."}],
- "enable_tools": True,
- "enabled_tools": ["python"],
- "session_id": "ci-tool-calling-py",
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 600,
- })
- if "56088" in content or "56,088" in content:
- print(f"[tools] PASS python tool ({len(content)} chars, found 56088)")
- else:
- assert content, "python tool: SSE stream empty"
- print(
- f"[tools] WARN python tool: SSE OK ({len(content)} chars) but "
- f"model didn't return 56088 -- model output drift"
- )
-
- # ── 3. Server-side bash (terminal) tool ──────────────────────
- # On Windows the terminal tool resolves to the system shell
- # (cmd.exe wrapper) and `echo hello-bash-tool` works the same
- # way it does on POSIX. The model still has to choose to
- # invoke the tool; assert non-empty SSE if it doesn't.
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output."}],
- "enable_tools": True,
- "enabled_tools": ["terminal"],
- "session_id": "ci-tool-calling-bash",
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 600,
- })
- if "hello-bash-tool" in content:
- print(f"[tools] PASS terminal tool ({len(content)} chars)")
- else:
- assert content, "terminal tool: SSE stream empty"
- print(
- f"[tools] WARN terminal tool: SSE OK ({len(content)} chars) but "
- f"model didn't echo 'hello-bash-tool' -- model output drift"
- )
-
- # ── 4. Server-side web_search tool ───────────────────────────
- # DuckDuckGo can be flaky from CI runners; only assert that
- # the SSE stream opens and yields any data.
- try:
- content = post_sse("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
- "enable_tools": True,
- "enabled_tools": ["web_search"],
- "session_id": "ci-tool-calling-web",
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 400,
- })
- print(f"[tools] PASS web_search stream ({len(content)} chars)")
- except Exception as exc:
- print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")
-
- # ── 5. Thinking on / off ─────────────────────────────────────
- def thinking_call(enable):
- status, data = post("/v1/chat/completions", {
- "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}],
- "stream": False,
- "enable_thinking": enable,
- "temperature": TEMP,
- "seed": SEED,
- "max_tokens": 300,
- })
- assert status == 200
- msg = data["choices"][0]["message"]
- raw = (msg.get("content") or "") + (msg.get("reasoning_content") or "")
- return raw
-
- on_text = thinking_call(True)
- off_text = thinking_call(False)
- had_think_on = ("" in on_text) or len(on_text) > 80
- if not had_think_on:
- print(
- f"[tools] WARN enable_thinking=True produced no thinking signal: "
- f"{on_text[:200]!r}"
- )
- assert "" not in off_text, (
- f"enable_thinking=False but still present: {off_text!r}"
- )
- print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
- PY
-
- - name: Stop Studio
- if: always()
- # Run as cmd so we are not running through the Git Bash shell;
- # Git Bash on windows-latest has been observed to exit 143
- # (SIGTERM) from any inline kill/sleep block, masking a green
- # test run. The runner reclaims the Studio child process at
- # job end either way, so just emit a marker and exit 0.
- shell: cmd
- run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
-
- - name: Upload logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: windows-tool-calling-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
-
- # ─────────────────────────────────────────────────────────────────────
- # Job 3: JSON, images
- # ─────────────────────────────────────────────────────────────────────
- json-images:
- name: JSON, images
- runs-on: windows-latest
- timeout-minutes: 35
- defaults:
- run:
- shell: bash
- env:
- GGUF_REPO: unsloth/gemma-4-E2B-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-4-E2B-it-UD-Q4_K_XL.gguf
- MMPROJ_FILE: mmproj-F16.gguf
- STUDIO_PORT: '18899'
- HF_HOME: ${{ github.workspace }}/hf-cache
- # Force UTF-8 for stdio (Windows defaults to cp1252; hf
- # download / Studio CLI print "✓" checkmarks and crash
- # otherwise).
- PYTHONIOENCODING: utf-8
- PYTHONUTF8: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- # Split restore + save so a transient restore-side failure does not
- # kill the whole job. See the matching block in the tool-calling job
- # for the full rationale (actions/cache#1621). This is the block that
- # actually broke in run 25713577488: "Cache hit for: " was
- # logged, the step exited non-zero in ~0.3 s without extracting the
- # 3.4 GiB archive, and steps 6-15 were skipped.
- - name: Restore HF_HOME cache for ${{ env.GGUF_REPO }} (model + mmproj)
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
- - name: Prime HF_HOME with the GGUF + mmproj
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE"
-
- - name: Save HF_HOME cache for ${{ env.GGUF_REPO }} (model + mmproj)
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v1
-
- - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
- shell: pwsh
- # See studio-windows-update-smoke.yml for the full rationale.
- # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
- # reinstall, and Defender's real-time scan dominates the
- # frontend / uv-pip-extract steps.
- run: |
- $ProgressPreference = 'SilentlyContinue'
- Write-Host "npm version before upgrade: $(npm -v)"
- npm install -g 'npm@^11' 2>&1 | Out-Host
- Write-Host "npm version after upgrade: $(npm -v)"
- # NOTE: do NOT pre-create these directories. See
- # studio-windows-update-smoke.yml for the full rationale --
- # creating an empty studio/frontend/dist trips setup.ps1's
- # mtime-based staleness check into "frontend up to date, skip
- # rebuild" and Studio boots with an empty dist directory.
- # Add-MpPreference accepts paths that do not yet exist.
- foreach ($p in @(
- "$env:USERPROFILE\.unsloth",
- "$env:USERPROFILE\AppData\Local\uv",
- "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
- "$env:GITHUB_WORKSPACE\studio\frontend\dist"
- )) {
- try {
- Add-MpPreference -ExclusionPath $p -ErrorAction Stop
- Write-Host "Defender exclusion added: $p"
- } catch {
- Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
- }
- }
-
- - name: Install Studio (--local, --no-torch)
- shell: pwsh
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- New-Item -ItemType Directory -Force -Path logs | Out-Null
- # *>&1 captures Write-Host (Information stream) output;
- # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
- # and validated" via Write-Host, and we grep for that.
- $ProgressPreference = 'SilentlyContinue'
- & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
- - name: Assert install.ps1 used the Windows llama.cpp prebuilt
- run: |
- # Filesystem check; setup.ps1's stream output isn't captured.
- LLAMA_DIR=~/.unsloth/llama.cpp
- INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
- BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if [ ! -f "$INFO" ]; then
- echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
- ls -la "$LLAMA_DIR" || true
- exit 1
- fi
- if [ ! -f "$BIN" ]; then
- echo "::error::no llama-server.exe at $BIN."
- ls -la "$LLAMA_DIR/build/bin" || true
- exit 1
- fi
- echo "install.ps1 installed the Windows prebuilt llama.cpp:"
- cat "$INFO"
-
- - name: Add Studio shim to GITHUB_PATH
- run: |
- SHIM_DIR=~/.unsloth/studio/bin
- if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
- echo "::error::unsloth.exe shim not found at $SHIM_DIR"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
- - name: Patch Studio venv with full typer / pydantic dep trees
- # Belt-and-suspenders: install.ps1's --no-deps install of
- # no-torch-runtime.txt drops typer's and pydantic's runtime
- # deps unless explicitly pinned. Re-install the ones whose
- # deps don't pull torch.
- run: |
- STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
- if [ ! -f "$STUDIO_PY" ]; then
- echo "::error::Studio venv python not at $STUDIO_PY"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
- - name: Install OpenAI + Anthropic Python SDKs
- run: python -m pip install 'openai>=1.50' 'anthropic>=0.40'
-
- - name: Reset auth + boot Studio (API-only)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health, log in, change password, load model
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
- -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
- -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
- TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
- -H 'content-type: application/json' \
- -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
- echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
- curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
- -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
- --max-time 900 \
- -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
- | jq '{status, display_name, is_vision}'
-
- - name: JSON schema decoding + image input
- env:
- BASE_URL: http://127.0.0.1:18899
- run: |
- python - <<'PY'
- import base64
- import json
- import os
- import urllib.request
- from openai import OpenAI
- from anthropic import Anthropic
-
- BASE = os.environ["BASE_URL"]
- KEY = os.environ["API_KEY"]
- SEED = 3407
- TEMP = 0.2
-
- def post(path, body, *, timeout = 240):
- req = urllib.request.Request(
- f"{BASE}{path}",
- data = json.dumps(body).encode(),
- method = "POST",
- headers = {
- "Authorization": f"Bearer {KEY}",
- "Content-Type": "application/json",
- },
- )
- with urllib.request.urlopen(req, timeout = timeout) as resp:
- return resp.status, json.loads(resp.read().decode())
-
- # ── 1. response_format = json_object (JSON mode) ─────────────
- status, data = post("/v1/chat/completions", {
- "model": "default",
- "messages": [
- {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
- {"role": "user", "content": "What is the capital of France?"},
- ],
- "temperature": TEMP,
- "max_tokens": 600,
- "seed": SEED,
- "stream": False,
- "enable_thinking": False,
- "response_format": {"type": "json_object"},
- }, timeout = 600)
- assert status == 200, f"json status {status}: {data}"
- assert (
- isinstance(data.get("choices"), list)
- and data["choices"]
- and "message" in data["choices"][0]
- ), f"json response envelope malformed: {data}"
- content = (data["choices"][0]["message"].get("content") or "").strip()
- print(f"[json] raw json_object content: {content!r}")
- if content.startswith("```"):
- content = content.split("```", 2)[1]
- if content.startswith("json"):
- content = content[4:]
- content = content.strip("`\n ")
- if content:
- try:
- parsed = json.loads(content)
- if "paris" in str(parsed.get("city", "")).lower():
- print(f"[json] PASS json_object -> {parsed}")
- else:
- print(f"[json] WARN json_object decoded but city!=Paris: {parsed}")
- except json.JSONDecodeError as exc:
- print(f"[json] WARN json_object content not parseable ({exc}); content={content!r}")
- else:
- print("[json] WARN json_object produced empty content")
-
- status2, data2 = post("/v1/chat/completions", {
- "model": "default",
- "messages": [{"role": "user", "content": "What is the capital of France? Answer with one word."}],
- "temperature": TEMP,
- "max_tokens": 400,
- "seed": SEED,
- "stream": False,
- "enable_thinking": False,
- }, timeout = 600)
- assert status2 == 200, f"plain status {status2}: {data2}"
- plain = (data2["choices"][0]["message"].get("content") or "").lower()
- print(f"[json] plain capital-of-france reply: {plain!r}")
- if "paris" in plain:
- print("[json] PASS plain inference path (paris mentioned)")
- else:
- print(
- f"[json] WARN plain inference returned no 'paris' -- "
- f"model output drift. HTTP path validated separately above."
- )
-
- # ── 2. OpenAI image_url (data URI base64) ───────────────────
- PNG_64X64_RED_B64 = (
- "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
- "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
- "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
- )
- data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"
-
- # On Windows + the gemma-4-E2B mmproj, llama.cpp's vision
- # path runs on CPU (no Metal involvement). The wrapper is
- # kept for resilience but the vision path is expected to
- # work on Windows; an exception here is a real regression.
- client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
- try:
- openai_resp = client.chat.completions.create(
- model = "default",
- temperature = TEMP,
- max_tokens = 80,
- seed = SEED,
- messages = [{
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": data_uri}},
- {"type": "text", "text": "What colour dominates this image? Reply in one word."},
- ],
- }],
- )
- openai_text = (openai_resp.choices[0].message.content or "").lower()
- print(f"[image/openai] reply: {openai_text!r}")
- if openai_text:
- print("[image/openai] PASS image_url accepted, non-empty response")
- else:
- print("[image/openai] WARN image_url accepted but empty content")
- except Exception as exc:
- print(
- f"[image/openai] WARN image_url SDK call raised: {type(exc).__name__}: "
- f"{exc}. Studio successfully forwarded the request; failure here is "
- f"upstream llama.cpp vision behaviour."
- )
-
- # ── 3. Anthropic source/base64 image ────────────────────────
- anthropic = Anthropic(
- base_url = BASE,
- api_key = "unused",
- default_headers = {"Authorization": f"Bearer {KEY}"},
- )
- try:
- a_msg = anthropic.messages.create(
- model = "default",
- max_tokens = 80,
- temperature = TEMP,
- extra_body = {"seed": SEED},
- messages = [{
- "role": "user",
- "content": [
- {
- "type": "image",
- "source": {
- "type": "base64",
- "media_type": "image/png",
- "data": PNG_64X64_RED_B64,
- },
- },
- {"type": "text", "text": "Describe this image briefly."},
- ],
- }],
- )
- a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
- print(f"[image/anthropic] reply: {a_text!r}")
- if a_text:
- print("[image/anthropic] PASS source/base64 accepted, non-empty response")
- else:
- print("[image/anthropic] WARN source/base64 accepted but empty content")
- except Exception as exc:
- print(
- f"[image/anthropic] WARN anthropic image SDK call raised: "
- f"{type(exc).__name__}: {exc}. Likely upstream llama.cpp vision "
- f"behaviour, NOT a Studio regression."
- )
- PY
-
- - name: Stop Studio
- if: always()
- # Run as cmd so we are not running through the Git Bash shell;
- # Git Bash on windows-latest has been observed to exit 143
- # (SIGTERM) from any inline kill/sleep block, masking a green
- # test run. The runner reclaims the Studio child process at
- # job end either way, so just emit a marker and exit 0.
- shell: cmd
- run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
-
- - name: Upload logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: windows-json-images-log
- path: |
- logs/studio.log
- logs/install.log
- retention-days: 7
diff --git a/.github/workflows/studio-windows-ui-smoke.yml b/.github/workflows/studio-windows-ui-smoke.yml
deleted file mode 100644
index e5ab9f8ab7..0000000000
--- a/.github/workflows/studio-windows-ui-smoke.yml
+++ /dev/null
@@ -1,342 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Windows counterpart to studio-ui-smoke.yml / studio-mac-ui-smoke.yml.
-# Same Playwright + Chromium end-to-end chat UI flow + extra UI flow,
-# but on the FREE windows-latest runner so we catch Windows-specific
-# regressions in the install path (install.ps1), the Studio CLI's
-# Windows process-management branches, and the llama.cpp prebuilt's
-# Windows HTTP layer.
-
-name: Windows Studio UI CI
-
-on:
- pull_request:
- paths:
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - 'install.ps1'
- - 'pyproject.toml'
- - 'tests/studio/**'
- - '.github/workflows/studio-windows-ui-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- ui-smoke:
- name: Chat UI Tests
- runs-on: windows-latest
- timeout-minutes: 45
- # Default every step's shell to Git Bash. windows-latest's default
- # shell is pwsh; without this each curl / heredoc / `kill $PID`
- # step would need its own `shell: bash`. Steps that genuinely
- # need PowerShell (install.ps1 invocation) override per-step.
- defaults:
- run:
- shell: bash
- env:
- GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
- GGUF_VARIANT: UD-Q4_K_XL
- GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
- STUDIO_PORT: '18896'
- HF_HOME: ${{ github.workspace }}/hf-cache
- # Force UTF-8 for stdio so Python tools (hf download, Studio
- # CLI, etc.) can print Unicode characters like the success
- # checkmark "✓". Windows defaults to cp1252 / charmap and
- # any tool that prints "OK ✓" hits a UnicodeEncodeError.
- PYTHONIOENCODING: utf-8
- PYTHONUTF8: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
- # No `cache: 'npm'`. setup-node's npm cache restore silently
- # aborts the entire job on Windows runners when the npm cache
- # path (`C:\npm\cache` per `npm config get cache`) doesn't yet
- # exist on a fresh runner -- the step exits without an error
- # message and every following step gets skipped. See
- # npm/cli#7308. The frontend `npm ci` is fast enough without
- # the cache that the reliability gain is worth the ~30s.
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- # No `cache: 'pip'`. install.ps1 / setup.ps1 use uv and
- # never populate ~/.cache/pip; setup-python's post-step
- # then fatal-errors with "Cache folder path is retrieved
- # for pip but doesn't exist on disk".
-
- - name: Restore HF_HOME for ${{ env.GGUF_REPO }}
- id: cache-hf
- uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- continue-on-error: true
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Prime HF_HOME with the GGUF
- id: prime-hf
- if: steps.cache-hf.outputs.cache-hit != 'true' || steps.cache-hf.outcome != 'success'
- env:
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
- run: |
- python -m pip install --upgrade huggingface_hub
- mkdir -p hf-cache
- bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
-
- - name: Save HF_HOME for ${{ env.GGUF_REPO }}
- if: always() && steps.prime-hf.outcome == 'success'
- uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
- with:
- path: hf-cache
- key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v1
-
- - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
- shell: pwsh
- # See studio-windows-update-smoke.yml for the full rationale.
- # tl;dr: setup.ps1 needs npm >=11 to skip a 35 s winget Node
- # reinstall, and Defender's real-time scan dominates the
- # frontend / uv-pip-extract steps.
- run: |
- $ProgressPreference = 'SilentlyContinue'
- Write-Host "npm version before upgrade: $(npm -v)"
- npm install -g 'npm@^11' 2>&1 | Out-Host
- Write-Host "npm version after upgrade: $(npm -v)"
- # NOTE: do NOT pre-create these directories. See
- # studio-windows-update-smoke.yml for the full rationale --
- # creating an empty studio/frontend/dist trips setup.ps1's
- # mtime-based staleness check into "frontend up to date, skip
- # rebuild" and Studio boots with an empty dist directory.
- # Add-MpPreference accepts paths that do not yet exist.
- foreach ($p in @(
- "$env:USERPROFILE\.unsloth",
- "$env:USERPROFILE\AppData\Local\uv",
- "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
- "$env:GITHUB_WORKSPACE\studio\frontend\dist"
- )) {
- try {
- Add-MpPreference -ExclusionPath $p -ErrorAction Stop
- Write-Host "Defender exclusion added: $p"
- } catch {
- Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
- }
- }
-
- - name: Install Studio (--local, --no-torch)
- # install.ps1 is the supported Windows installer. install.sh
- # has no Windows branch (apt-get / brew calls). The PS1
- # script's `Install-UnslothStudio @args` line at the bottom
- # forwards `--local --no-torch` correctly.
- shell: pwsh
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- New-Item -ItemType Directory -Force -Path logs | Out-Null
- # *>&1 redirects ALL PowerShell streams (stdout, stderr,
- # warning, verbose, debug, information) into the success
- # stream so Tee-Object captures everything. install.ps1
- # and setup.ps1 emit step/substep markers via Write-Host
- # which lands on the Information stream (PS 5+); without
- # the wildcard redirect, those markers (including
- # "prebuilt installed and validated") never reach
- # logs/install.log and the post-step grep asserter fails.
- $ProgressPreference = 'SilentlyContinue'
- & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
- - name: Assert install.ps1 used the Windows llama.cpp prebuilt
- run: |
- # install.ps1's setup.ps1 child writes "prebuilt installed
- # and validated" to its own console host -- that output
- # does NOT come back through this parent step's stdout
- # pipeline (no matter how aggressively we redirect: *>&1,
- # tee, etc.). Verify the install via the filesystem
- # instead. setup.ps1 writes UNSLOTH_PREBUILT_INFO.json
- # next to the install dir on success, and lays the
- # binaries under build/bin/Release/ on Windows.
- STUDIO_HOME=~/.unsloth/studio
- LLAMA_DIR=~/.unsloth/llama.cpp
- INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
- BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
- # Source-build fallback grep stays as a fast bail-out.
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if [ ! -f "$INFO" ]; then
- echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO; setup.ps1 didn't install the prebuilt."
- ls -la "$LLAMA_DIR" || true
- exit 1
- fi
- if [ ! -f "$BIN" ]; then
- echo "::error::no llama-server.exe at $BIN; prebuilt extraction incomplete."
- ls -la "$LLAMA_DIR/build/bin" || true
- ls -la "$LLAMA_DIR/build/bin/Release" || true
- exit 1
- fi
- echo "install.ps1 installed the Windows prebuilt llama.cpp:"
- cat "$INFO"
-
- - name: Add Studio shim to GITHUB_PATH
- # install.ps1 puts unsloth.exe at $StudioHome\bin\unsloth.exe
- # and adds that dir to the User PATH via the Windows registry.
- # Registry-level PATH updates don't propagate to a running
- # Git Bash session, so the next step's `unsloth ...` invocation
- # would hit "command not found". Re-export the shim dir to
- # GITHUB_PATH so every subsequent step in this job sees it.
- run: |
- SHIM_DIR=~/.unsloth/studio/bin
- if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
- echo "::error::unsloth.exe shim not found at $SHIM_DIR"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- # GITHUB_PATH wants Windows-style paths; convert via cygpath.
- cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
- echo "Added Studio shim dir to PATH: $(cygpath -w "$SHIM_DIR")"
-
- - name: Patch Studio venv with full typer / pydantic dep trees
- # Belt-and-suspenders: install.ps1's --no-deps install of
- # no-torch-runtime.txt drops typer's and pydantic's runtime
- # deps unless explicitly pinned. Re-install the ones whose
- # deps don't pull torch.
- run: |
- STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
- if [ ! -f "$STUDIO_PY" ]; then
- echo "::error::Studio venv python not at $STUDIO_PY"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
- - name: Install Playwright + Chromium
- # No --with-deps on Windows: that flag installs Linux apt
- # packages. windows-latest ships the system frameworks
- # Chromium needs (Edge / WebView2) already.
- run: |
- python -m pip install 'playwright>=1.45'
- python -m playwright install chromium
-
- - name: Reset auth + boot Studio
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
- > logs/studio.log 2>&1 &
- echo "STUDIO_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
- jq -e '.status == "healthy"' /tmp/health.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health.json
-
- - name: Pass bootstrap password to the Playwright step
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- NEW2="CIUi-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "::add-mask::$NEW2"
- echo "STUDIO_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_NEW_PW=$NEW" >> "$GITHUB_ENV"
- echo "STUDIO_NEW2_PW=$NEW2" >> "$GITHUB_ENV"
-
- - name: Drive the chat UI with Playwright
- env:
- BASE_URL: http://127.0.0.1:18896
- PW_ART_DIR: logs/playwright
- STUDIO_UI_STRICT: '1'
- # windows-latest free runner is 4 vCPU / 16 GB; gemma-3-
- # 270m turn latency under llama-server's CPU backend can
- # crowd the 180s default (slower than ubuntu-latest on
- # the same model). Keep the same generous budget the Mac
- # job uses.
- STUDIO_UI_TURN_TIMEOUT_MS: '540000'
- run: |
- mkdir -p logs/playwright
- python tests/studio/playwright_chat_ui.py
-
- - name: Stop Studio (chat-ui ends with Shutdown click; this is belt-and-suspenders)
- if: always()
- run: |
- kill "${STUDIO_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Reset auth + boot Studio for extra UI tests (port 18897)
- run: |
- unsloth studio reset-password
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18897 \
- > logs/studio_extra.log 2>&1 &
- echo "STUDIO_EXTRA_PID=$!" >> "$GITHUB_ENV"
-
- - name: Wait for /api/health on 18897
- run: |
- for i in $(seq 1 180); do
- if curl -fs "http://127.0.0.1:18897/api/health" > /tmp/health2.json; then
- jq -e '.status == "healthy"' /tmp/health2.json && break
- fi
- sleep 1
- done
- jq -e '.status == "healthy"' /tmp/health2.json
-
- - name: Pass bootstrap pw for extra UI test
- run: |
- OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
- NEW="CIUiExtra-$(python -c 'import secrets; print(secrets.token_urlsafe(16))')"
- echo "::add-mask::$OLD"
- echo "::add-mask::$NEW"
- echo "STUDIO_EXTRA_OLD_PW=$OLD" >> "$GITHUB_ENV"
- echo "STUDIO_EXTRA_NEW_PW=$NEW" >> "$GITHUB_ENV"
-
- - name: Drive Compare/Recipes/Export/Studio/Settings with Playwright
- env:
- BASE_URL: http://127.0.0.1:18897
- STUDIO_OLD_PW: ${{ env.STUDIO_EXTRA_OLD_PW }}
- STUDIO_NEW_PW: ${{ env.STUDIO_EXTRA_NEW_PW }}
- PW_ART_DIR: logs/playwright_extra
- STUDIO_UI_STRICT: '1'
- STUDIO_UI_TURN_TIMEOUT_MS: '540000'
- GGUF_REPO: ${{ env.GGUF_REPO }}
- GGUF_VARIANT: ${{ env.GGUF_VARIANT }}
- run: |
- mkdir -p logs/playwright_extra
- python tests/studio/playwright_extra_ui.py
-
- - name: Stop second Studio
- if: always()
- run: |
- kill "${STUDIO_EXTRA_PID}" 2>/dev/null || true
- sleep 2
-
- - name: Upload Playwright artifacts
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: windows-studio-ui-smoke-artifacts
- path: |
- logs/studio.log
- logs/studio_extra.log
- logs/install.log
- logs/playwright
- logs/playwright_extra
- retention-days: 7
diff --git a/.github/workflows/studio-windows-update-smoke.yml b/.github/workflows/studio-windows-update-smoke.yml
deleted file mode 100644
index 157874d404..0000000000
--- a/.github/workflows/studio-windows-update-smoke.yml
+++ /dev/null
@@ -1,279 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Windows counterpart to studio-update-smoke.yml /
-# studio-mac-update-smoke.yml. Verifies that on the FREE
-# windows-latest runner:
-#
-# 1. install.ps1 --local --no-torch installs Studio AND auto-fetches
-# the prebuilt llama.cpp Windows binary (llama-bNNNN-bin-win-cpu-
-# x64 from ggml-org/llama.cpp). Hitting the source-build fallback
-# is treated as an Unsloth bug -- Studio must always pick the
-# prebuilt on Windows.
-# 2. unsloth studio update --local is idempotent. Two consecutive
-# runs both report "prebuilt up to date and validated", no
-# source-build fallback. The CLI's _find_setup_script picks
-# setup.ps1 on Windows automatically.
-# 3. The installed Studio still boots and /api/health returns
-# healthy after the update path.
-
-name: Windows Studio Update CI
-
-on:
- pull_request:
- paths:
- - 'install.ps1'
- - 'studio/setup.ps1'
- - 'studio/setup.bat'
- - 'studio/install_python_stack.py'
- - 'studio/install_llama_prebuilt.py'
- - 'studio/backend/requirements/**'
- - 'unsloth_cli/commands/studio.py'
- - 'pyproject.toml'
- - '.github/workflows/studio-windows-update-smoke.yml'
- push:
- branches: [main, pip]
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- update-idempotency:
- name: Studio Updating Tests
- runs-on: windows-latest
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- env:
- # Force UTF-8 for stdio (Windows defaults to cp1252; hf
- # download / Studio CLI print "✓" checkmarks and crash
- # otherwise).
- PYTHONIOENCODING: utf-8
- PYTHONUTF8: '1'
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- # Don't cache pip: install.ps1 + setup.ps1 go through uv
- # and never populate ~/.cache/pip; setup-python's post-step
- # then fatal-errors with "Cache folder path is retrieved
- # for pip but doesn't exist on disk".
-
- - name: Pre-install Windows tweaks (npm 11 + Defender exclusions)
- shell: pwsh
- # Two surgical fixes against measured Windows-only install
- # waste (vs Mac/Linux on the same SHA):
- #
- # (1) npm. setup.ps1 line 1109-1145 requires Node 22.12+ (or
- # 20.19+ / 23+) AND npm >=11 because Vite 8 needs both.
- # actions/setup-node@v4 with `node-version: '22'` lands
- # Node 22.22.2 + the npm 10.9.7 it bundles, so the npm
- # check fails and setup.ps1 falls through to the
- # "winget install Node.js LTS" branch -- a ~35 s reinstall
- # of Node we don't need. `npm install -g npm@^11` updates
- # the bundled npm in-place in ~5 s, which makes setup.ps1
- # short-circuit on the existing Node.
- #
- # (2) Defender. windows-latest's real-time scan opens / hashes
- # every file Studio writes during install (Vite output =
- # thousands of small chunks, uv pip = wheel-extraction =
- # thousands of small files). The latency dominates the
- # 200 s frontend build and the 90 s deps install. Adding
- # ExclusionPath entries for the directories the install
- # writes to drops per-file open latency from ~ms to ~us.
- # Add-MpPreference needs admin; the runneradmin user has
- # it, but wrap in try/catch so a permission flake leaves
- # the install otherwise unaffected.
- run: |
- $ProgressPreference = 'SilentlyContinue'
- Write-Host "npm version before upgrade: $(npm -v)"
- npm install -g 'npm@^11' 2>&1 | Out-Host
- Write-Host "npm version after upgrade: $(npm -v)"
- # NOTE: do NOT pre-create these directories before adding the
- # exclusion -- creating an empty studio/frontend/dist trips
- # setup.ps1 line 1281-1296's mtime-based "is the frontend
- # stale?" check into "up to date, skip rebuild", because the
- # newly-created dist's mtime is younger than every source
- # file. Studio then boots with an empty dist and 500s on
- # GET / with FileNotFoundError: dist\index.html. See run
- # 25546676715 / job 74984469728.
- # Add-MpPreference accepts paths that do not yet exist; the
- # exclusion is registered and applies when the path
- # materialises.
- foreach ($p in @(
- "$env:USERPROFILE\.unsloth",
- "$env:USERPROFILE\AppData\Local\uv",
- "$env:GITHUB_WORKSPACE\studio\frontend\node_modules",
- "$env:GITHUB_WORKSPACE\studio\frontend\dist"
- )) {
- try {
- Add-MpPreference -ExclusionPath $p -ErrorAction Stop
- Write-Host "Defender exclusion added: $p"
- } catch {
- Write-Host "Defender exclusion skipped ($($_.Exception.Message)): $p"
- }
- }
-
- - name: Install Studio (--local, --no-torch)
- shell: pwsh
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- New-Item -ItemType Directory -Force -Path logs | Out-Null
- # *>&1 captures Write-Host (Information stream) output;
- # plain 2>&1 does not. setup.ps1 emits "prebuilt installed
- # and validated" via Write-Host, and we grep for that.
- $ProgressPreference = 'SilentlyContinue'
- & ./install.ps1 --local --no-torch *>&1 | Tee-Object -FilePath logs/install.log
-
- - name: Assert install.ps1 used the Windows llama.cpp prebuilt
- run: |
- # Filesystem-based check (setup.ps1's stream output isn't
- # captured back through the parent pipeline).
- LLAMA_DIR=~/.unsloth/llama.cpp
- INFO="$LLAMA_DIR/UNSLOTH_PREBUILT_INFO.json"
- BIN="$LLAMA_DIR/build/bin/Release/llama-server.exe"
- if grep -q "falling back to source build" logs/install.log; then
- echo "::error::install.ps1 fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/install.log | tail -60
- exit 1
- fi
- if [ ! -f "$INFO" ]; then
- echo "::error::no UNSLOTH_PREBUILT_INFO.json at $INFO."
- ls -la "$LLAMA_DIR" || true
- exit 1
- fi
- if [ ! -f "$BIN" ]; then
- echo "::error::no llama-server.exe at $BIN."
- ls -la "$LLAMA_DIR/build/bin" || true
- exit 1
- fi
- echo "install.ps1 installed the Windows prebuilt llama.cpp:"
- cat "$INFO"
-
- - name: Add Studio shim to GITHUB_PATH
- run: |
- SHIM_DIR=~/.unsloth/studio/bin
- if [ ! -f "$SHIM_DIR/unsloth.exe" ]; then
- echo "::error::unsloth.exe shim not found at $SHIM_DIR"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- cygpath -w "$SHIM_DIR" >> "$GITHUB_PATH"
-
- - name: Patch Studio venv with full typer / pydantic dep trees
- # install.ps1 runs `uv pip install --no-deps -r
- # no-torch-runtime.txt` to keep torch out of transitive
- # resolution from accelerate/peft/trl. That also drops
- # typer's and pydantic's runtime deps unless they're
- # explicitly pinned in no-torch-runtime.txt. We pin the
- # known ones (click, shellingham, annotated-doc, rich,
- # pydantic-core, annotated-types, typing-inspection, ...)
- # but typer / pydantic minor versions can introduce new
- # transitive deps that are NOT in our pin list.
- #
- # Belt-and-suspenders: re-install typer + pydantic +
- # huggingface_hub WITH their deps into the Studio venv.
- # `pip install --upgrade` only adds missing packages; it
- # never down-shifts an installed version. Cannot pull
- # torch (none of typer / pydantic / huggingface_hub depend
- # on it).
- run: |
- STUDIO_PY=~/.unsloth/studio/unsloth_studio/Scripts/python.exe
- if [ ! -f "$STUDIO_PY" ]; then
- echo "::error::Studio venv python not at $STUDIO_PY"
- ls -la ~/.unsloth/studio/ || true
- exit 1
- fi
- "$STUDIO_PY" -m pip install --upgrade typer pydantic huggingface_hub
-
- - name: First update should be a no-op (prebuilt already validated)
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -o pipefail
- unsloth studio update --local 2>&1 | tee logs/update.log
- if grep -q "falling back to source build" logs/update.log; then
- echo "::error::studio update fell back to source-build llama.cpp on Windows."
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
- exit 1
- fi
- if ! grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update.log; then
- echo "::error::no prebuilt up-to-date marker in update.log."
- grep -E "llama-prebuilt|llama.cpp" logs/update.log | tail -60
- exit 1
- fi
- echo "update path took the prebuilt fast path"
-
- - name: Second update must also be a no-op
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- set -o pipefail
- unsloth studio update --local 2>&1 | tee logs/update2.log
- grep -q "falling back to source build" logs/update2.log && {
- echo "::error::second update fell back to source build on Windows"
- tail -60 logs/update2.log; exit 1; } || true
- grep -qE "prebuilt up to date and validated|prebuilt installed and validated" logs/update2.log
- echo "second update was clean"
-
- - name: Boot Studio briefly to confirm the install is still usable
- run: |
- mkdir -p logs
- UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p 18891 \
- > logs/studio.log 2>&1 &
- PID=$!
- HEALTHY=""
- # Use jq (a Git Bash builtin) instead of `python -c
- # open('/tmp/health.json')` to read the saved health
- # response. Bash on windows-latest is MSYS Git Bash, which
- # resolves `/tmp/...` against the MSYS root, while the
- # python interpreter is Windows-native and resolves it
- # against the current drive's root. The two paths don't
- # agree, so python never finds the file curl just wrote.
- # jq reads through MSYS, so the path matches. Mirrors what
- # studio-windows-api-smoke.yml and the other Windows smoke
- # workflows already do.
- for i in $(seq 1 60); do
- if curl -fs http://127.0.0.1:18891/api/health > /tmp/health.json; then
- if jq -e '.status == "healthy"' /tmp/health.json >/dev/null; then
- HEALTHY=1
- break
- fi
- fi
- sleep 1
- done
- if [ -z "$HEALTHY" ]; then
- echo "Studio failed to come up after \`update\`"
- tail -200 logs/studio.log
- kill "$PID" 2>/dev/null || true
- exit 1
- fi
- kill "$PID" 2>/dev/null || true
- echo "post-update Studio /api/health OK"
-
- - name: Upload update logs
- if: always()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: windows-studio-update-log
- path: |
- logs/install.log
- logs/update.log
- logs/update2.log
- logs/studio.log
- retention-days: 7
diff --git a/.github/workflows/version-compat-ci.yml b/.github/workflows/version-compat-ci.yml
deleted file mode 100644
index 599b53df1d..0000000000
--- a/.github/workflows/version-compat-ci.yml
+++ /dev/null
@@ -1,312 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-#
-# Cross-version compat canary for the four upstream packages whose
-# release cadence regularly breaks unsloth + unsloth-zoo:
-#
-# 1. vLLM (LoRA worker manager, BnB loader, cumem allocator)
-# 2. TRL / GRPO (trainer source rewriters in unsloth.models.rl*)
-# 3. PEFT (LoraConfig, get_peft_model, LoraLayer, bnb integration)
-# 4. sentence-transformers (Transformer/Pooling/Normalize, Trainer)
-# 5. bitsandbytes (Linear4bit, dequantize_4bit)
-#
-# Strategy: GitHub raw-fetch + symbol grep against every tracked
-# version (no pip install, CPU-only). When upstream renames a symbol
-# we depend on, the matching test fails BEFORE a user hits it. The
-# `main` branch entries give us a few-day lead on PyPI releases.
-#
-# Cross-references:
-# tests/vllm_compat/test_vllm_pinned_symbols.py (vLLM symbols)
-# tests/version_compat/test_trl_grpo_pinned_symbols.py
-# tests/version_compat/test_peft_pinned_symbols.py
-# tests/version_compat/test_sentence_transformers_pinned_symbols.py
-# tests/version_compat/test_bitsandbytes_pinned_symbols.py
-
-name: Version Compat CI
-
-on:
- pull_request:
- # Trigger on any unsloth source change, not just the three previously
- # named files. The symbol-existence tests verify that EVERY pinned
- # upstream reference in unsloth still resolves; a new
- # `from peft.foo import Bar` added in unsloth/kernels/whatever.py
- # is just as much a compat regression risk as one added in
- # unsloth/models/rl.py.
- paths:
- - 'unsloth/**'
- - 'tests/vllm_compat/**'
- - 'tests/version_compat/**'
- - 'pyproject.toml'
- - '.github/workflows/version-compat-ci.yml'
- schedule:
- # Daily 06:43 UTC. Catches upstream PyPI releases roughly within
- # 24 h. Off the :00 / :30 fleet-collision spots.
- - cron: '43 6 * * *'
- workflow_dispatch:
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- vllm-pinned-symbols:
- name: vLLM pinned-symbol matrix (≥ 0.9.0 + main)
- runs-on: ubuntu-latest
- timeout-minutes: 12
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest only
- # The test fetches from raw.githubusercontent.com and greps
- # source. No pip install of vllm / torch / transformers is
- # needed — that's the whole point of this canary.
- run: |
- python -m pip install --upgrade pip
- pip install 'pytest>=8'
- - name: Run vllm-compat suite
- env:
- # Authenticated requests get a 5000-req/h quota on raw
- # fetches; unauthenticated is 60/h and trips on the matrix.
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- python -m pytest tests/vllm_compat/test_vllm_pinned_symbols.py -v --tb=short
-
- trl-grpo-pinned-symbols:
- name: TRL / GRPO pinned-symbol matrix
- runs-on: ubuntu-latest
- timeout-minutes: 10
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest only
- run: |
- python -m pip install --upgrade pip
- pip install 'pytest>=8'
- - name: Run trl-compat suite
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- # PYTHONPATH=. so `from tests.version_compat._fetch import …`
- # works without an editable install of unsloth itself.
- PYTHONPATH=. python -m pytest \
- tests/version_compat/test_trl_grpo_pinned_symbols.py \
- -v --tb=short
-
- peft-pinned-symbols:
- name: PEFT pinned-symbol matrix (pyproject window + main)
- runs-on: ubuntu-latest
- timeout-minutes: 8
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest only
- run: |
- python -m pip install --upgrade pip
- pip install 'pytest>=8'
- - name: Run peft-compat suite
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- PYTHONPATH=. python -m pytest \
- tests/version_compat/test_peft_pinned_symbols.py \
- tests/version_compat/test_unsloth_zoo_save_merged_pinned_symbols.py \
- -v --tb=short
-
- st-pinned-symbols:
- name: sentence-transformers pinned-symbol matrix
- runs-on: ubuntu-latest
- timeout-minutes: 8
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest only
- run: |
- python -m pip install --upgrade pip
- pip install 'pytest>=8'
- - name: Run sentence-transformers compat suite
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- PYTHONPATH=. python -m pytest \
- tests/version_compat/test_sentence_transformers_pinned_symbols.py \
- -v --tb=short
-
- bitsandbytes-pinned-symbols:
- name: bitsandbytes pinned-symbol matrix
- runs-on: ubuntu-latest
- timeout-minutes: 8
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest only
- run: |
- python -m pip install --upgrade pip
- pip install 'pytest>=8'
- - name: Run bitsandbytes compat suite
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- PYTHONPATH=. python -m pytest \
- tests/version_compat/test_bitsandbytes_pinned_symbols.py \
- -v --tb=short
-
- transformers-pinned-symbols:
- name: transformers pinned-symbol matrix (4.57.6 + 5.x + main)
- runs-on: ubuntu-latest
- timeout-minutes: 12
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest only
- run: |
- python -m pip install --upgrade pip
- pip install 'pytest>=8'
- - name: Run transformers compat suite
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- PYTHONPATH=. python -m pytest \
- tests/version_compat/test_transformers_pinned_symbols.py \
- -v --tb=short
-
- # Optional second layer: actually `pip install` ONE representative
- # version of each package and verify unsloth + unsloth-zoo modules
- # import on it under the existing CUDA spoof. CPU-only, runs on
- # ubuntu-latest. Catches the small set of breakages that the static
- # symbol check misses (e.g. import-time side effects).
- zoo-imports-under-spoof:
- name: unsloth_zoo vllm/grpo/peft/st modules import under CUDA spoof
- runs-on: ubuntu-latest
- timeout-minutes: 15
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- path: unsloth
- - name: Clone unsloth-zoo @ main
- run: |
- # github.com occasionally 500s on the git fetch; retry so a
- # single upstream blip does not fail CI.
- for attempt in 1 2 3; do
- rm -rf "$RUNNER_TEMP/unsloth-zoo"
- if git clone --depth=1 https://github.com/unslothai/unsloth-zoo \
- "$RUNNER_TEMP/unsloth-zoo"; then
- break
- fi
- if [ "$attempt" -eq 3 ]; then
- echo "::error::git clone unsloth-zoo failed after 3 attempts"
- exit 1
- fi
- delay=$((5 * attempt))
- echo "::warning::clone failed (attempt $attempt/3), retrying in ${delay}s..."
- sleep "$delay"
- done
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install CPU torch + supported pkg pins
- run: |
- python -m pip install --upgrade pip
- # CPU torch (vllm/peft/st all depend on it).
- pip install --index-url https://download.pytorch.org/whl/cpu \
- 'torch>=2.4,<2.11' 'torchvision<0.26' 'torchcodec<0.10'
- # torchcodec is a hard requirement on transformers 5.x:
- # transformers/audio_utils.py:55 does
- # `importlib.metadata.version("torchcodec")` UNCONDITIONALLY,
- # which raises PackageNotFoundError on a CPU runner that
- # otherwise has no audio path -- and that error trickles up
- # through every `import unsloth_zoo.` because
- # unsloth-zoo's vision_utils transitively pulls
- # transformers.processing_utils (-> audio_utils). The 0.10
- # cap mirrors the torch 2.10 / torchvision 0.26 ABI window
- # we already pin above.
- # Ladder of supported floor versions per pyproject.toml.
- pip install \
- 'transformers>=4.56,<5.6' 'trl>=0.22,<0.26' \
- 'peft>=0.18.0' 'sentence-transformers>=5.0' \
- 'accelerate>=1.0' 'datasets>=3.4,<5' \
- 'bitsandbytes>=0.45.5' \
- sentencepiece protobuf safetensors numpy 'pytest>=8' \
- 'huggingface_hub>=0.34' tqdm packaging psutil triton Pillow
- # Editable-install both repos so the test imports the
- # checkouts (not whatever stale PyPI version pip resolved).
- pip install --no-deps -e "$RUNNER_TEMP/unsloth-zoo"
- pip install --no-deps -e ./unsloth
- - name: Run vllm_compat zoo-imports tests under spoof
- env:
- UNSLOTH_IS_PRESENT: '1'
- UNSLOTH_COMPILE_DISABLE: '1'
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python
- run: |
- cd unsloth
- # tests/vllm_compat/test_unsloth_zoo_imports.py: narrow vllm/grpo
- # import gates (5 tests).
- # tests/vllm_compat/test_extended_module_imports.py: full sweep
- # of unsloth_zoo + unsloth.models.* modules + RL dispatch
- # table population + FastModel API surface under spoof
- # (~30 tests). Catches transformers / peft / bnb symbol pin
- # drift at module-top BEFORE any runtime call.
- PYTHONPATH=. python -m pytest \
- tests/vllm_compat/test_unsloth_zoo_imports.py \
- tests/vllm_compat/test_extended_module_imports.py \
- -v --tb=short
-
- # Daily-only: same suites but with --strict on importable upstream
- # tags. Schedule-only so PR jobs stay fast; cron tolerates a flake.
- daily-fresh-fetch:
- name: daily fresh-fetch sweep (cron only)
- if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- runs-on: ubuntu-latest
- timeout-minutes: 20
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
- cache: 'pip'
- - name: Install pytest
- run: pip install 'pytest>=8'
- - name: Run all version-compat suites in one process (no cache)
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- PYTHONPATH=. python -m pytest \
- tests/vllm_compat/test_vllm_pinned_symbols.py \
- tests/version_compat/ \
- -v --tb=short
diff --git a/.github/workflows/wheel-smoke.yml b/.github/workflows/wheel-smoke.yml
deleted file mode 100644
index 3de3c33ca2..0000000000
--- a/.github/workflows/wheel-smoke.yml
+++ /dev/null
@@ -1,136 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-only
-# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
-
-# Builds the PyPI wheel from the PR branch, then verifies the built wheel
-# actually contains what we expect to ship and does NOT contain the broken
-# Studio bundle that 2026.5.1 published. This is the single workflow that
-# would have blocked the 2026.5.1 release before twine upload.
-#
-# Verified locally end-to-end against this branch:
-# - python -m build produces unsloth--py3-none-any.whl in 13s
-# - wheel content sanity passes:
-# lockfile shipped, frontend dist shipped,
-# no node_modules in wheel, no bun.lock in wheel,
-# main bundle has unstable_Provider hits=1 (assistant-ui internals only).
-# - Studio backend imports cleanly from the installed wheel with the
-# lightweight dep set below.
-
-name: Wheel CI
-
-on:
- pull_request:
- paths:
- - 'pyproject.toml'
- - 'studio/**'
- - 'unsloth/**'
- - 'unsloth_cli/**'
- - '.github/workflows/wheel-smoke.yml'
- push:
- branches: [main, pip]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- wheel:
- name: Wheel build + content sanity + import smoke
- runs-on: ubuntu-latest
- timeout-minutes: 15
- steps:
- - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- with:
- persist-credentials: false
-
- - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
- with:
- node-version: '22'
-
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
- with:
- python-version: '3.12'
-
- - name: Lockfile supply-chain audit (pre-install scan)
- run: python3 scripts/lockfile_supply_chain_audit.py
-
- - name: Build frontend
- # Lifecycle scripts (esbuild native-binary postinstall, etc.) are
- # required for `vite build`. The pre-install lockfile structural
- # audit (lockfile_supply_chain_audit.py) is the practical defence
- # against the npm postinstall-dropper class -- it fires BEFORE any
- # tarball runs, on the injection pattern itself rather than an
- # advisory-DB lookup.
- run: |
- cd studio/frontend
- npm ci --no-fund --no-audit
- npm run build
-
- - name: Build wheel + sdist
- run: |
- python -m pip install --upgrade pip build
- rm -rf dist build ./*.egg-info
- python -m build
-
- - name: Wheel content sanity
- run: |
- python - <<'PY'
- import zipfile, glob, sys
- w = glob.glob("dist/unsloth-*.whl")
- if not w:
- print("FAIL: no wheel produced"); sys.exit(2)
- w = w[0]
- print(f"wheel: {w}")
- with zipfile.ZipFile(w) as z:
- n = z.namelist()
- checks = {
- "lockfile shipped": any(s.endswith("studio/frontend/package-lock.json") for s in n),
- "frontend dist shipped": any(s.endswith("studio/frontend/dist/index.html") for s in n),
- "no node_modules": not any("studio/frontend/node_modules/" in s for s in n),
- "no bun.lock": not any(s.endswith("studio/frontend/bun.lock") for s in n),
- }
- js = [s for s in n
- if "studio/frontend/dist/assets/" in s
- and s.endswith(".js")
- and "/index-" in s]
- if not js:
- print("FAIL: no main bundle index-*.js in wheel"); sys.exit(2)
- data = z.read(js[0]).decode("utf-8", "replace")
- hits = data.count("unstable_Provider:")
- print(f"main bundle: {js[0]}")
- print(f"unstable_Provider hits: {hits} (>=4 indicates 2026.5.1 regression)")
- checks["bundle has no Studio unstable_Provider call site"] = (hits < 4)
-
- print()
- for k, v in checks.items():
- print(f" [{'PASS' if v else 'FAIL'}] {k}")
- sys.exit(0 if all(checks.values()) else 1)
- PY
-
- - name: Studio backend import smoke
- # Imports `studio.backend.main:app` from the freshly-installed wheel in
- # a clean venv. This catches the class of bug that 2026.5.1 shipped with:
- # frontend dist missing, package-lock.json missing, or the wheel's Python
- # source tree broken in a way that surfaces only at app construction time.
- run: |
- python -m venv /tmp/v
- /tmp/v/bin/pip install --upgrade pip
- /tmp/v/bin/pip install -r studio/backend/requirements/studio.txt
- /tmp/v/bin/pip install \
- python-multipart aiofiles sqlalchemy cryptography \
- pyyaml jinja2 mammoth unpdf requests \
- 'numpy<3'
- /tmp/v/bin/pip install --no-deps dist/unsloth-*.whl
- # Run from /tmp so Python imports the installed package, not the source tree.
- cd /tmp
- /tmp/v/bin/python -c "from studio.backend.main import app; print('Studio backend OK:', app.title)"
-
- - name: Upload wheel on failure
- if: failure()
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: unsloth-wheel
- path: dist/
- retention-days: 7
diff --git a/studio/backend/core/inference/chat_template_helpers.py b/studio/backend/core/inference/chat_template_helpers.py
new file mode 100644
index 0000000000..833a714ee4
--- /dev/null
+++ b/studio/backend/core/inference/chat_template_helpers.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Dependency-light wrapper around tokenizer.apply_chat_template with a
+kwarg fallback for templates that reject reasoning/tools args.
+"""
+
+from typing import Optional
+
+
+def apply_chat_template_for_generation(
+ tokenizer,
+ messages: list,
+ *,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
+) -> str:
+ """Render the chat prompt. Try richest kwargs first; drop one
+ group at a time on TypeError. Jinja / missing-variable errors
+ propagate."""
+ reasoning_kwargs: dict = {}
+ if enable_thinking is not None:
+ reasoning_kwargs["enable_thinking"] = enable_thinking
+ if reasoning_effort is not None:
+ reasoning_kwargs["reasoning_effort"] = reasoning_effort
+ if preserve_thinking is not None:
+ reasoning_kwargs["preserve_thinking"] = preserve_thinking
+
+ attempts: list[dict] = []
+ if tools and reasoning_kwargs:
+ attempts.append({"tools": tools, **reasoning_kwargs})
+ if tools:
+ attempts.append({"tools": tools})
+ if reasoning_kwargs:
+ attempts.append(dict(reasoning_kwargs))
+ attempts.append({})
+
+ last_exc: Optional[Exception] = None
+ for kwargs in attempts:
+ try:
+ return tokenizer.apply_chat_template(
+ messages,
+ tokenize = False,
+ add_generation_prompt = True,
+ **kwargs,
+ )
+ except TypeError as e:
+ last_exc = e
+ continue
+ except Exception as e:
+ last_exc = e
+ break
+ if last_exc is not None:
+ raise last_exc
+ raise RuntimeError(
+ "apply_chat_template_for_generation: no attempt produced a result"
+ )
diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index 4c140013a0..e1620f5ca3 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -839,6 +839,74 @@ def generate_with_adapter_control(
cancel_event = cancel_event, _adapter_state = use_adapter, **gen_kwargs
)
+ def generate_chat_completion_with_tools(
+ self,
+ messages: list,
+ tools: list,
+ system_prompt: str = "",
+ temperature: float = 0.7,
+ top_p: float = 0.9,
+ top_k: int = 40,
+ min_p: float = 0.0,
+ max_new_tokens: int = 2048,
+ repetition_penalty: float = 1.0,
+ cancel_event = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
+ max_tool_iterations: int = 25,
+ auto_heal_tool_calls: bool = True,
+ tool_call_timeout: int = 300,
+ session_id: Optional[str] = None,
+ ):
+ """Run an agentic tool loop on top of ``generate_chat_response``.
+
+ Yields the same event-dict protocol used by the GGUF path so
+ the route layer can stream both backends through one helper.
+ Each event is one of:
+
+ * ``{"type": "status", "text": ...}``
+ * ``{"type": "content", "text": cumulative_text}``
+ * ``{"type": "tool_start", "tool_name", "tool_call_id", "arguments"}``
+ * ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}``
+ """
+ from core.inference.safetensors_agentic import run_safetensors_tool_loop
+ from core.inference.tools import execute_tool
+
+ def _single_turn(conv: list):
+ # conv already has the system message -- avoid double-prepend.
+ yield from self._generate_chat_response_inner(
+ messages = conv,
+ system_prompt = "",
+ temperature = temperature,
+ top_p = top_p,
+ top_k = top_k,
+ min_p = min_p,
+ max_new_tokens = max_new_tokens,
+ repetition_penalty = repetition_penalty,
+ cancel_event = cancel_event,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
+ )
+
+ initial = list(messages)
+ if system_prompt:
+ initial = [{"role": "system", "content": system_prompt}] + initial
+
+ yield from run_safetensors_tool_loop(
+ single_turn = _single_turn,
+ messages = initial,
+ tools = tools,
+ execute_tool = execute_tool,
+ cancel_event = cancel_event,
+ auto_heal_tool_calls = auto_heal_tool_calls,
+ max_tool_iterations = max_tool_iterations,
+ tool_call_timeout = tool_call_timeout,
+ session_id = session_id,
+ )
+
def generate_chat_response(
self,
messages: list,
@@ -851,10 +919,20 @@ def generate_chat_response(
max_new_tokens: int = 256,
repetition_penalty: float = 1.0,
cancel_event = None,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""
Generate response for text or vision models.
The generation lock is acquired by the background generation thread.
+
+ ``tools`` / ``enable_thinking`` / ``reasoning_effort`` /
+ ``preserve_thinking`` are forwarded into
+ ``tokenizer.apply_chat_template`` so templates that understand
+ these kwargs (Qwen3, Llama 3.1+, gpt-oss harmony, ...) advertise
+ the tool schemas and reasoning controls to the model.
"""
yield from self._generate_chat_response_inner(
messages = messages,
@@ -867,6 +945,10 @@ def generate_chat_response(
max_new_tokens = max_new_tokens,
repetition_penalty = repetition_penalty,
cancel_event = cancel_event,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
)
def _generate_chat_response_inner(
@@ -882,6 +964,10 @@ def _generate_chat_response_inner(
repetition_penalty: float = 1.0,
cancel_event = None,
_adapter_state = None,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""
Inner generation logic. Called by both generate_chat_response
@@ -981,8 +1067,13 @@ def _generate_chat_response_inner(
f"Please use a model that includes a chat template, or manually set "
f"one via tokenizer.chat_template before inference."
)
- formatted_prompt = tokenizer.apply_chat_template(
- template_messages, tokenize = False, add_generation_prompt = True
+ formatted_prompt = self._apply_chat_template_for_generation(
+ tokenizer,
+ template_messages,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
)
logger.debug(f"Formatted prompt: {formatted_prompt[:200]}...")
except Exception as e:
@@ -1319,20 +1410,9 @@ def generate_whisper_response(
def _is_gpt_oss_model(self, model_name: str = None) -> bool:
"""Check if the given (or active) model uses the gpt-oss harmony protocol."""
- name = (model_name or self.active_model_name or "").lower()
- try:
- from utils.datasets import MODEL_TO_TEMPLATE_MAPPER
+ from utils.datasets import is_gpt_oss_model_name
- # Exact match
- if MODEL_TO_TEMPLATE_MAPPER.get(name) == "gpt-oss":
- return True
- # Partial match (e.g. name-bnb-4bit variants)
- for key, tmpl in MODEL_TO_TEMPLATE_MAPPER.items():
- if tmpl == "gpt-oss" and (key in name or name in key):
- return True
- except Exception:
- pass
- return "gpt-oss" in name
+ return is_gpt_oss_model_name(model_name or self.active_model_name or "")
def generate_stream(
self,
@@ -1715,6 +1795,34 @@ def __call__(
"Patched RepetitionPenaltyLogitsProcessor with 64-token window for OuteTTS"
)
+ def _apply_chat_template_for_generation(
+ self,
+ tokenizer,
+ messages: list,
+ *,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
+ ) -> str:
+ """Render the chat prompt, peeling kwargs the template does not
+ understand. Delegates to the dependency-light helper module so
+ the fallback chain can be unit-tested without pulling unsloth /
+ torch into the test sandbox.
+ """
+ from core.inference.chat_template_helpers import (
+ apply_chat_template_for_generation,
+ )
+
+ return apply_chat_template_for_generation(
+ tokenizer,
+ messages,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
+ )
+
def format_chat_prompt(self, messages: list, system_prompt: str = None) -> str:
if not self.active_model_name or self.active_model_name not in self.models:
logger.error("No active model available")
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 3682f1dbbb..0bf1b5afef 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -23,15 +23,39 @@
import threading
import time
from pathlib import Path
-from typing import Generator, List, Optional
+from typing import Generator, Iterable, List, Optional
from urllib.parse import urlparse
import httpx
+from core.tool_healing import (
+ _TC_END_TAG_RE,
+ _TC_FUNC_CLOSE_RE,
+ _TC_FUNC_START_RE,
+ _TC_JSON_START_RE,
+ _TC_PARAM_CLOSE_RE,
+ _TC_PARAM_START_RE,
+ _TOOL_ALL_PATS,
+ _TOOL_CLOSED_PATS,
+ parse_tool_calls_from_text,
+)
+
+# Stripping and signal-marker constants come from the multi-format
+# parser so Llama-3 / Mistral / Gemma 4 emissions are also detected
+# in the BUFFERING state machine and stripped from the assistant
+# stream. Pre-PR-5615 we used the legacy two-format helper which
+# only knew / bool:
+ """Quick DNS check. Runs on a daemon thread so concurrent sockets
+ in the same process are not affected by socket.setdefaulttimeout."""
+ result: list[Optional[bool]] = [None]
+
+ def _probe() -> None:
+ try:
+ socket.gethostbyname(host)
+ result[0] = False
+ except Exception:
+ result[0] = True
+
+ t = threading.Thread(target = _probe, daemon = True)
+ t.start()
+ t.join(timeout)
+ # Thread still running -> resolver wedged -> treat as dead.
+ return True if result[0] is None else result[0]
+
+
+@contextlib.contextmanager
+def _hf_offline_if_dns_dead():
+ """Set HF_HUB_OFFLINE for the body of this block only when DNS to
+ huggingface.co fails. Restores the env on exit so a transient
+ resolver hiccup at the start of one load can't quarantine the whole
+ process. Respects an explicit user setting (no-op if already set)."""
+ if "HF_HUB_OFFLINE" in os.environ:
+ yield False
+ return
+ if not _probe_dns_dead():
+ yield False
+ return
+
+ transformers_was_set = "TRANSFORMERS_OFFLINE" in os.environ
+ os.environ["HF_HUB_OFFLINE"] = "1"
+ if not transformers_was_set:
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
+ logger.warning("huggingface.co unreachable; using local HF cache for this load.")
+ try:
+ yield True
+ finally:
+ os.environ.pop("HF_HUB_OFFLINE", None)
+ if not transformers_was_set:
+ os.environ.pop("TRANSFORMERS_OFFLINE", None)
+
+
def _swa_cache_path() -> Path:
home = os.environ.get("UNSLOTH_STUDIO_HOME") or os.environ.get("STUDIO_HOME")
base = Path(home) if home else Path.home() / ".unsloth" / "studio"
@@ -317,25 +386,6 @@ def _extract_model_size_b(model_id: str):
return extract_model_size_b(model_id)
-# ── Pre-compiled patterns for tool XML stripping ─────────────
-_TOOL_CLOSED_PATS = [
- re.compile(r".*?", re.DOTALL),
- re.compile(r".*?", re.DOTALL),
-]
-_TOOL_ALL_PATS = _TOOL_CLOSED_PATS + [
- re.compile(r".*$", re.DOTALL),
- re.compile(r".*$", re.DOTALL),
-]
-
-# ── Pre-compiled patterns for tool-call XML parsing ──────────
-_TC_JSON_START_RE = re.compile(r"\s*\{")
-_TC_FUNC_START_RE = re.compile(r"\s*")
-_TC_END_TAG_RE = re.compile(r"")
-_TC_FUNC_CLOSE_RE = re.compile(r"\s*\s*$")
-_TC_PARAM_START_RE = re.compile(r"\s*")
-_TC_PARAM_CLOSE_RE = re.compile(r"\s*\s*$")
-
-
_TOOL_TEMPLATE_MARKERS = (
"{%- if tools %}",
"{%- if tools -%}",
@@ -345,6 +395,15 @@ def _extract_model_size_b(model_id: str):
"'role' == 'tool'",
'message.role == "tool"',
"message.role == 'tool'",
+ # DeepSeek-style: subscripted access + tool_calls field checks.
+ # DeepSeek's chat template has no top-level ``{% if tools %}`` block
+ # and uses ``message['role'] == 'tool'`` plus ``message['tool_calls']
+ # is defined`` to gate the emission.
+ "message['role'] == 'tool'",
+ 'message["role"] == "tool"',
+ "message['tool_calls']",
+ 'message["tool_calls"]',
+ "tool_calls is defined",
)
@@ -414,6 +473,149 @@ def detect_reasoning_flags(
return flags
+def _is_mtp_model_name(
+ model_identifier: Optional[str],
+ gguf_path: Optional[str] = None,
+) -> bool:
+ """Name-based MTP detector. Fallback for the metadata signal."""
+ for cand in (model_identifier, Path(gguf_path).name if gguf_path else None):
+ if cand and "-mtp" in cand.lower():
+ return True
+ return False
+
+
+def _extra_args_set_spec_type(extra_args: Optional[Iterable[str]]) -> bool:
+ """User passed --spec-type / --spec-default? llama-server takes a
+ single --spec-type (comma-separated to chain), so suppress
+ auto-emit when this is true."""
+ if not extra_args:
+ return False
+ for raw in extra_args:
+ tok = str(raw)
+ if not tok.startswith("--"):
+ continue
+ flag = tok.split("=", 1)[0]
+ if flag in ("--spec-type", "--spec-default"):
+ return True
+ return False
+
+
+def _build_ngram_mod_flags(
+ caps: Optional[dict],
+ n_match: int = 24,
+ n_min: int = 48,
+ n_max: int = 64,
+) -> list[str]:
+ """Emit the right ngram-mod knob flags for the running llama-server.
+
+ Post-rename builds expose ``--spec-ngram-mod-n-{match,min,max}``;
+ pre-rename builds expose the legacy ``--spec-ngram-size-n`` /
+ ``--draft-min`` / ``--draft-max``. ``caps`` comes from
+ ``probe_server_capabilities``; ``ngram_mod_flavor`` tells us which
+ set is real (vs a removal-stub entry). Returns ``[]`` when neither
+ set is available so the caller can drop ngram-mod entirely.
+ """
+ flavor = caps.get("ngram_mod_flavor") if caps else None
+ if flavor == "new":
+ return [
+ "--spec-ngram-mod-n-match",
+ str(n_match),
+ "--spec-ngram-mod-n-min",
+ str(n_min),
+ "--spec-ngram-mod-n-max",
+ str(n_max),
+ ]
+ if flavor == "legacy":
+ # Legacy llama.cpp before the spec arg rename: same knobs lived
+ # under --spec-ngram-size-n (lookup length) and the generic
+ # --draft-min / --draft-max (ngram size N range).
+ return [
+ "--spec-ngram-size-n",
+ str(n_match),
+ "--draft-min",
+ str(n_min),
+ "--draft-max",
+ str(n_max),
+ ]
+ return []
+
+
+# Canonical Speculative Decoding modes exposed by the Studio chat UI.
+# The dropdown renders five options (auto, mtp, ngram, mtp+ngram, off);
+# the load API also accepts legacy values that the original Switch and
+# external callers emit (default, draft-mtp, ngram-mod, ngram-simple).
+_CANONICAL_SPEC_MODES = {"auto", "mtp", "ngram", "mtp+ngram", "off", "ngram-simple"}
+_LEGACY_SPEC_MODE_MAP = {
+ "default": "auto",
+ "draft-mtp": "mtp",
+ "ngram-mod": "ngram",
+}
+
+
+def _canonicalize_spec_mode(value):
+ """Map any accepted ``speculative_type`` input onto a canonical mode.
+
+ Returns one of ``auto``, ``mtp``, ``ngram``, ``mtp+ngram``, ``off``,
+ ``ngram-simple``, or ``None`` (callers treat ``None`` as ``auto``).
+ Unknown strings collapse to ``auto`` so a stale UI value or typo
+ falls back to the safe platform-aware path.
+ """
+ if value is None:
+ return None
+ if not isinstance(value, str):
+ return None
+ stripped = value.strip().lower()
+ if not stripped:
+ return None
+ if stripped in _CANONICAL_SPEC_MODES:
+ return stripped
+ if stripped in _LEGACY_SPEC_MODE_MAP:
+ return _LEGACY_SPEC_MODE_MAP[stripped]
+ # llama.cpp comma-chains are emitted by old persisted state e.g.
+ # "ngram-mod,draft-mtp"; collapse the most common one explicitly.
+ pieces = [p.strip() for p in stripped.split(",") if p.strip()]
+ has_mtp = any(p in ("mtp", "draft-mtp") for p in pieces)
+ has_ngram = any(p in ("ngram", "ngram-mod") for p in pieces)
+ if has_mtp and has_ngram:
+ return "mtp+ngram"
+ if has_mtp:
+ return "mtp"
+ if has_ngram:
+ return "ngram"
+ return "auto"
+
+
+def _backfill_usage_from_timings(usage, timings):
+ """Synthesize ``usage`` from llama-server's ``timings`` when the
+ OpenAI-style usage block is missing or reports zero tokens.
+
+ The Studio chat UI computes generation t/s from
+ ``meta.usage.completion_tokens / totalStreamTime``. llama-server
+ always populates ``timings.predicted_n`` (true decoded count) and
+ ``timings.prompt_n``, but the ``usage`` field on the final SSE chunk
+ can be absent or zero on some server builds / streaming
+ configurations, which makes the UI fall back to wall-clock t/s and
+ dilute speculative-decoding speedups.
+ """
+ if not timings:
+ return usage
+ if usage and usage.get("completion_tokens"):
+ return usage
+ predicted_n = timings.get("predicted_n")
+ prompt_n = timings.get("prompt_n")
+ if predicted_n is None and prompt_n is None:
+ return usage
+ out = dict(usage or {})
+ if not out.get("completion_tokens") and predicted_n is not None:
+ out["completion_tokens"] = predicted_n
+ if not out.get("prompt_tokens") and prompt_n is not None:
+ out["prompt_tokens"] = prompt_n
+ out["total_tokens"] = int(out.get("prompt_tokens") or 0) + int(
+ out.get("completion_tokens") or 0
+ )
+ return out
+
+
class LlamaCppBackend:
"""
Manages a llama-server subprocess for GGUF model inference.
@@ -448,6 +650,15 @@ def __init__(self):
self._cache_type_kv: Optional[str] = None
self._reasoning_default: bool = True
self._speculative_type: Optional[str] = None
+ # Canonical UI-facing mode the user requested: one of
+ # ``auto``/``mtp``/``ngram``/``mtp+ngram``/``off``/``ngram-simple``.
+ # Round-tripped through the status API so the dropdown reflects
+ # the picked mode rather than the resolved internal flag set
+ # (auto on a 27B MTP GGUF resolves to draft-mtp but the dropdown
+ # should still read "Auto").
+ self._requested_spec_mode: Optional[str] = None
+ # User-supplied --spec-draft-n-max override (None = platform default).
+ self._spec_draft_n_max: Optional[int] = None
# KV-cache estimation fields (populated by _read_gguf_metadata)
self._n_layers: Optional[int] = None
self._n_kv_heads: Optional[int] = None
@@ -469,6 +680,8 @@ def __init__(self):
# Last N layers reuse KV from earlier layers and don't allocate
# their own cache (Gemma 3n / Gemma 4: .attention.shared_kv_layers).
self._shared_kv_layers: Optional[int] = None
+ # MTP head count (llama.cpp #22673); >0 enables --spec-type draft-mtp.
+ self._nextn_predict_layers: Optional[int] = None
self._lock = threading.Lock()
# Wraps load_model() end-to-end so concurrent loads serialise
# and never coexist as two llama-server processes (#5401).
@@ -483,6 +696,9 @@ def __init__(self):
self._requested_n_ctx: int = 0
self._stdout_lines: list[str] = []
self._stdout_thread: Optional[threading.Thread] = None
+ # llama-server tee log (see _drain_stdout / _kill_process).
+ self._llama_log_fh = None
+ self._llama_log_path: Optional[Path] = None
self._cancel_event = threading.Event()
self._api_key: Optional[str] = None
@@ -723,6 +939,17 @@ def cache_type_kv(self) -> Optional[str]:
def speculative_type(self) -> Optional[str]:
return self._speculative_type
+ @property
+ def requested_spec_mode(self) -> Optional[str]:
+ """Canonical UI-facing mode the user requested (see field doc)."""
+ return self._requested_spec_mode
+
+ @property
+ def spec_draft_n_max(self) -> Optional[int]:
+ """User --spec-draft-n-max override active on the load, or None
+ when the platform default (6 GPU / 3 CPU) is in effect."""
+ return self._spec_draft_n_max
+
# ── Binary discovery ──────────────────────────────────────────
@staticmethod
@@ -839,6 +1066,156 @@ def _find_llama_server_binary() -> Optional[str]:
return None
+ # ── llama-server capability probe ─────────────────────────────
+
+ # Cached on (path, mtime); `unsloth studio update` bumps mtime.
+ _capability_cache: dict[tuple[str, int], dict[str, object]] = {}
+
+ @classmethod
+ def probe_server_capabilities(
+ cls, binary: Optional[str] = None
+ ) -> dict[str, object]:
+ """Parse `llama-server --help` for feature flags. Returns
+ {found, mtp_token, supports_mtp, ngram_mod_flavor,
+ supports_ngram_mod, spec_draft_n_max_flag}.
+
+ ``ngram_mod_flavor`` is ``"new"`` when the binary exposes the
+ post-rename ``--spec-ngram-mod-n-match / -n-min / -n-max`` as
+ real args, ``"legacy"`` when only the pre-rename
+ ``--spec-ngram-size-n / --draft-min / --draft-max`` are real
+ (the rename ships with stub removal entries for the legacy
+ names; we tell stubs apart by the "argument has been removed"
+ description), or ``None`` if neither set is usable.
+
+ ``spec_draft_n_max_flag`` is the actual flag name the binary
+ accepts: ``--spec-draft-n-max`` on post-rename builds, or
+ ``--draft-max`` on legacy. ``None`` means n_max cannot be set.
+ """
+ bin_path = binary or cls._find_llama_server_binary()
+ if not bin_path or not Path(bin_path).is_file():
+ return {
+ "found": False,
+ "mtp_token": None,
+ "supports_mtp": False,
+ "ngram_mod_flavor": None,
+ "supports_ngram_mod": False,
+ "spec_draft_n_max_flag": None,
+ }
+ try:
+ mtime = int(Path(bin_path).stat().st_mtime)
+ except OSError:
+ mtime = 0
+ cache_key = (bin_path, mtime)
+ cached = cls._capability_cache.get(cache_key)
+ if cached is not None:
+ return cached
+
+ mtp_token: Optional[str] = None
+ ngram_mod_flavor: Optional[str] = None
+ spec_draft_n_max_flag: Optional[str] = None
+ try:
+ result = subprocess.run(
+ [bin_path, "--help"],
+ capture_output = True,
+ text = True,
+ timeout = 10,
+ check = False,
+ )
+ help_text = (result.stdout or "") + "\n" + (result.stderr or "")
+ # Split into per-flag blocks: each --flag line plus its
+ # indented continuation lines, so the "argument has been
+ # removed" description sits with its flag.
+ blocks: dict[str, str] = {}
+ current_flags: list[str] = []
+ current_desc: list[str] = []
+ for line in help_text.splitlines():
+ stripped = line.strip()
+ if stripped.startswith("-") and not line.startswith(" "):
+ # New flag line; flush previous.
+ if current_flags:
+ desc = " ".join(current_desc)
+ for f in current_flags:
+ blocks[f] = desc
+ current_flags = []
+ current_desc = [stripped]
+ # Extract long-form flag tokens from the DECLARATION
+ # prefix only (comma-separated aliases). Stop at the
+ # first token that isn't itself a flag, so flag
+ # references inside descriptions are ignored.
+ for tok in re.split(r"[,\s]+", stripped):
+ if tok.startswith("--") and re.match(
+ r"--[A-Za-z][A-Za-z0-9_-]*$", tok
+ ):
+ current_flags.append(tok)
+ elif tok.startswith("-") and len(tok) > 1:
+ # short alias like -fa; keep scanning aliases.
+ continue
+ else:
+ # First non-flag token marks end of decl.
+ break
+ else:
+ current_desc.append(stripped)
+ if current_flags:
+ desc = " ".join(current_desc)
+ for f in current_flags:
+ blocks[f] = desc
+
+ def _is_real(flag: str) -> bool:
+ """True if the flag exists AND is not a removal stub."""
+ desc = blocks.get(flag)
+ if desc is None:
+ return False
+ return "argument has been removed" not in desc
+
+ # MTP token detection from --spec-type line.
+ spec_line = ""
+ for line in help_text.splitlines():
+ if "--spec-type" in line:
+ spec_line = line
+ break
+ # PR #22673 used draft-mtp; later renamed to mtp.
+ if "draft-mtp" in spec_line:
+ mtp_token = "draft-mtp"
+ elif re.search(r"[|,\[]mtp[|,\]]", spec_line):
+ mtp_token = "mtp"
+
+ # ngram-mod flag flavor. Post-rename builds advertise both
+ # the new args (real) and the legacy ones (stubs); pre-rename
+ # builds only have the legacy ones as real.
+ new_ngram_real = (
+ _is_real("--spec-ngram-mod-n-match")
+ and _is_real("--spec-ngram-mod-n-min")
+ and _is_real("--spec-ngram-mod-n-max")
+ )
+ legacy_ngram_real = (
+ _is_real("--spec-ngram-size-n")
+ and _is_real("--draft-max")
+ and _is_real("--draft-min")
+ )
+ if new_ngram_real:
+ ngram_mod_flavor = "new"
+ elif legacy_ngram_real:
+ ngram_mod_flavor = "legacy"
+
+ # n_max flag: prefer post-rename, fall back to legacy.
+ if _is_real("--spec-draft-n-max"):
+ spec_draft_n_max_flag = "--spec-draft-n-max"
+ elif _is_real("--draft-max"):
+ spec_draft_n_max_flag = "--draft-max"
+ except (OSError, subprocess.SubprocessError) as exc:
+ logger.debug(f"llama-server --help probe failed: {exc}")
+
+ info = {
+ "found": True,
+ "mtp_token": mtp_token,
+ "supports_mtp": mtp_token is not None,
+ "ngram_mod_flavor": ngram_mod_flavor,
+ "supports_ngram_mod": ngram_mod_flavor is not None,
+ "spec_draft_n_max_flag": spec_draft_n_max_flag,
+ }
+ cls._capability_cache[cache_key] = info
+ return info
+
# ── GPU allocation ────────────────────────────────────────────
@staticmethod
@@ -1055,6 +1432,26 @@ def _add(path: Path) -> None:
_add(site_packages / "torch" / "lib")
return out
+ @staticmethod
+ def _build_windows_path_dirs(
+ binary_dir: str, prefix: str, cuda_path: str
+ ) -> list[str]:
+ """Ordered PATH entries the win32 branch of start_llama_server
+ prepends so llama-server.exe resolves cudart / cublas DLLs:
+ binary_dir, pip nvidia wheels, CUDA_PATH/bin, CUDA_PATH/bin/x64.
+ Extracted so test_windows_gpu_detection_mock asserts against
+ production logic, not a hand-copy. #5106."""
+ path_dirs = [binary_dir]
+ path_dirs.extend(LlamaCppBackend._windows_pip_nvidia_dll_dirs(prefix))
+ if cuda_path:
+ cuda_bin = os.path.join(cuda_path, "bin")
+ if os.path.isdir(cuda_bin):
+ path_dirs.append(cuda_bin)
+ cuda_bin_x64 = os.path.join(cuda_path, "bin", "x64")
+ if os.path.isdir(cuda_bin_x64):
+ path_dirs.append(cuda_bin_x64)
+ return path_dirs
+
@staticmethod
def _select_gpus(
model_size_bytes: int,
@@ -1319,6 +1716,7 @@ def _fit_context_to_vram(
kv_unified: bool = True,
ctx_checkpoints: int = 0,
kv_on_gpu: bool = True,
+ mtp_engaged: bool = False,
) -> int:
"""Return the largest context length that fits in GPU VRAM.
@@ -1332,6 +1730,12 @@ def _fit_context_to_vram(
the KV cache lives in CPU RAM and doesn't compete with weights
for VRAM; the requested context is honored verbatim. The other
keyword args mirror ``_estimate_kv_cache_bytes``.
+
+ ``mtp_engaged`` reserves extra VRAM for the MTP draft model's
+ KV cache + compute graph buffers. llama.cpp's MTP path keeps a
+ secondary cache sized off the target's KV; on tight VRAM tiers
+ (e.g. 32 GB) auto-fit at native context would otherwise spill
+ and force llama-server into a slower partial-offload path.
"""
if not self._can_estimate_kv():
logger.debug(
@@ -1352,7 +1756,9 @@ def _fit_context_to_vram(
ctx_checkpoints = ctx_checkpoints,
)
- budget_bytes = available_mib * 1024 * 1024 * 0.90
+ # MTP needs a tighter budget; drop from 0.90 to 0.85.
+ budget_frac = 0.85 if mtp_engaged else 0.90
+ budget_bytes = available_mib * 1024 * 1024 * budget_frac
model_footprint = model_size_bytes
# Check if requested context already fits
@@ -1462,6 +1868,11 @@ def _drain_stdout(self):
This prevents a pipe-buffer deadlock on Windows where the default
pipe buffer is only ~4 KB. Without draining, llama-server blocks
on writes and never becomes healthy.
+
+ Each line is also teed to ``self._llama_log_fh`` when set so a
+ post-mortem (especially in CI) has the full subprocess output
+ even if the crash predates the drain-thread join in
+ ``_wait_for_health``.
"""
try:
for line in self._process.stdout:
@@ -1469,6 +1880,14 @@ def _drain_stdout(self):
if line:
self._stdout_lines.append(line)
logger.debug(f"[llama-server] {line}")
+ fh = getattr(self, "_llama_log_fh", None)
+ if fh is not None:
+ try:
+ fh.write(line + "\n")
+ fh.flush()
+ except (ValueError, OSError):
+ # Log file closed under us; tee silently.
+ pass
except (ValueError, OSError):
# Pipe closed — process is terminating
pass
@@ -1557,6 +1976,7 @@ def _read_gguf_metadata(self, gguf_path: str) -> None:
self._ssm_inner_size = None
self._ssm_state_size = None
self._shared_kv_layers = None
+ self._nextn_predict_layers = None
try:
WANTED = {
@@ -1639,6 +2059,7 @@ def _read_gguf_metadata(self, gguf_path: str) -> None:
f"{arch}.attention.shared_kv_layers": "shared_kv_layers",
f"{arch}.ssm.inner_size": "ssm_inner_size",
f"{arch}.ssm.state_size": "ssm_state_size",
+ f"{arch}.nextn_predict_layers": "nextn_predict_layers",
}
elif key == "tokenizer.chat_template":
self._chat_template = val_s
@@ -1804,6 +2225,55 @@ def _download_gguf(
except Exception as e:
logger.warning(f"Could not list repo files: {e}")
+ # Offline: resolve variant -> filename from the local HF cache.
+ # The heuristic below assumes filenames echo the repo name,
+ # which breaks for e.g. Qwen3.6-27B-MTP-GGUF (no "MTP" in file).
+ # Match against the rel path (not just basename) so subdir
+ # layouts like ``BF16/foo.gguf`` are findable.
+ if not gguf_filename:
+ try:
+ from utils.models.model_config import _iter_hf_cache_snapshots
+
+ boundary = re.compile(
+ r"(? %s from local HF cache",
+ hf_variant,
+ gguf_filename,
+ )
+ break
+ except Exception as e:
+ logger.debug(f"Offline cache lookup for variant failed: {e}")
+
if not gguf_filename:
repo_name = hf_repo.split("/")[-1].replace("-GGUF", "")
gguf_filename = f"{repo_name}-{hf_variant}.gguf"
@@ -1811,8 +2281,6 @@ def _download_gguf(
# Check disk space and fall back to a smaller variant if needed
all_gguf_files = [gguf_filename] + gguf_extra_shards
try:
- import os
-
from huggingface_hub import get_paths_info, try_to_load_from_cache
path_infos = list(get_paths_info(hf_repo, all_gguf_files, token = hf_token))
@@ -1946,24 +2414,50 @@ def _download_mmproj(
Prefers mmproj-F16.gguf, falls back to any mmproj*.gguf file.
Returns the local path, or None if no mmproj file exists.
"""
- try:
- from huggingface_hub import hf_hub_download, list_repo_files
- files = list_repo_files(hf_repo, token = hf_token)
+ def _pick_mmproj(candidates: list[str]) -> Optional[str]:
mmproj_files = sorted(
- f for f in files if f.endswith(".gguf") and "mmproj" in f.lower()
+ f
+ for f in candidates
+ if f.lower().endswith(".gguf") and "mmproj" in Path(f).name.lower()
)
if not mmproj_files:
return None
-
- # Prefer F16 variant
- target = None
for f in mmproj_files:
if f.lower().endswith("-f16.gguf"):
- target = f
- break
- if target is None:
- target = mmproj_files[0]
+ return f
+ return mmproj_files[0]
+
+ target: Optional[str] = None
+ try:
+ from huggingface_hub import list_repo_files
+
+ target = _pick_mmproj(list_repo_files(hf_repo, token = hf_token))
+ except Exception as e:
+ logger.debug(f"Could not list repo files for mmproj: {e}")
+
+ # Offline: resolve mmproj from the local HF cache snapshot, same
+ # shape as _download_gguf's offline fallback above.
+ if target is None:
+ try:
+ from utils.models.model_config import _iter_hf_cache_snapshots
+
+ for snap in _iter_hf_cache_snapshots(hf_repo):
+ rel_files = [
+ p.relative_to(snap).as_posix() for p in snap.rglob("*.gguf")
+ ]
+ target = _pick_mmproj(rel_files)
+ if target is not None:
+ logger.info("Resolved mmproj %s from local HF cache", target)
+ break
+ except Exception as e:
+ logger.debug(f"Offline cache lookup for mmproj failed: {e}")
+
+ if target is None:
+ return None
+
+ try:
+ from huggingface_hub import hf_hub_download
logger.info(f"Downloading mmproj: {hf_repo}/{target}")
local_path = hf_hub_download(
@@ -1976,6 +2470,35 @@ def _download_mmproj(
logger.warning(f"Could not download mmproj: {e}")
return None
+ def _resolve_launch_mmproj_path(
+ self,
+ *,
+ model_path: str,
+ mmproj_path: Optional[str],
+ ) -> Optional[str]:
+ """Return mmproj_path iff it exists on disk AND matches the model family.
+
+ Returns None if mmproj_path is None, missing on disk, or family-mismatched.
+ """
+ if not mmproj_path:
+ return None
+
+ mmproj = Path(mmproj_path)
+ if not mmproj.is_file():
+ logger.warning(f"mmproj file not found: {mmproj_path}")
+ return None
+
+ from utils.models.model_config import mmproj_matches_model_family
+
+ if not mmproj_matches_model_family(model_path, str(mmproj)):
+ logger.warning(
+ f"mmproj does not match model family: model={Path(model_path).name} "
+ f"mmproj={mmproj.name}"
+ )
+ return None
+
+ return str(mmproj)
+
# ── Lifecycle ─────────────────────────────────────────────────
def load_model(
@@ -1996,6 +2519,7 @@ def load_model(
chat_template_override: Optional[str] = None,
cache_type_kv: Optional[str] = None,
speculative_type: Optional[str] = None,
+ spec_draft_n_max: Optional[int] = None,
n_threads: Optional[int] = None,
n_gpu_layers: Optional[int] = None, # Accepted for caller compat, unused
n_parallel: int = 1,
@@ -2027,6 +2551,7 @@ def load_model(
n_ctx = n_ctx,
cache_type_kv = cache_type_kv,
speculative_type = speculative_type,
+ spec_draft_n_max = spec_draft_n_max,
chat_template_override = chat_template_override,
extra_args = extra_args,
is_vision = is_vision,
@@ -2052,18 +2577,22 @@ def load_model(
)
# ── Phase 2: download (NO lock held, so cancel can proceed) ──
+ # Scope HF_HUB_OFFLINE to the download block only when DNS is
+ # dead; cleanup runs even on exception so a transient hiccup
+ # at the start of one load cannot quarantine future loads.
if hf_repo:
- model_path = self._download_gguf(
- hf_repo = hf_repo,
- hf_variant = hf_variant,
- hf_token = hf_token,
- )
- # Auto-download mmproj for vision models
- if is_vision and not mmproj_path:
- mmproj_path = self._download_mmproj(
+ with _hf_offline_if_dns_dead():
+ model_path = self._download_gguf(
hf_repo = hf_repo,
+ hf_variant = hf_variant,
hf_token = hf_token,
)
+ # Auto-download mmproj for vision models
+ if is_vision and not mmproj_path:
+ mmproj_path = self._download_mmproj(
+ hf_repo = hf_repo,
+ hf_token = hf_token,
+ )
elif gguf_path:
if not Path(gguf_path).is_file():
raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
@@ -2115,6 +2644,35 @@ def load_model(
# GPU/VRAM-fit logic below may shrink this if hardware is limited.
max_available_ctx = self._context_length or effective_ctx
+ # Will MTP engage on this load? If so, the auto-fit
+ # budget needs to reserve extra VRAM for the draft
+ # model's KV cache + compute graph. Mirrors the
+ # canonical-mode resolver in _build_speculative_flags:
+ # forced mtp / mtp+ngram always engage; auto only
+ # engages on an MTP GGUF >= 3B (sub-3B auto falls
+ # back to ngram-mod which doesn't need headroom);
+ # ngram / ngram-simple / off never engage MTP.
+ _mtp_canonical = _canonicalize_spec_mode(speculative_type)
+ _mtp_effective = _mtp_canonical or "auto"
+ _mtp_size_for_fit = _extract_model_size_b(model_identifier)
+ _mtp_sub_3b_for_fit = (
+ _mtp_size_for_fit is not None and _mtp_size_for_fit < 3.0
+ )
+ _mtp_will_engage = bool(
+ not _extra_args_set_spec_type(extra_args)
+ and (
+ _mtp_effective in ("mtp", "mtp+ngram")
+ or (
+ _mtp_effective == "auto"
+ and (
+ bool(self._nextn_predict_layers)
+ or _is_mtp_model_name(model_identifier, model_path)
+ )
+ and not _mtp_sub_3b_for_fit
+ )
+ )
+ )
+
# Auto-cap context to fit in GPU VRAM and select GPUs.
#
# Two policies depending on whether the user set n_ctx:
@@ -2150,6 +2708,7 @@ def load_model(
model_size,
cache_type_kv,
n_parallel = n_parallel,
+ mtp_engaged = _mtp_will_engage,
)
kv = self._estimate_kv_cache_bytes(
capped, cache_type_kv, n_parallel = n_parallel
@@ -2201,6 +2760,7 @@ def load_model(
model_size,
cache_type_kv,
n_parallel = n_parallel,
+ mtp_engaged = _mtp_will_engage,
)
kv = self._estimate_kv_cache_bytes(
capped, cache_type_kv, n_parallel = n_parallel
@@ -2275,6 +2835,20 @@ def load_model(
gpu_indices, use_fit = None, True
effective_ctx = n_ctx # fall back to original
+ launch_mmproj_path = self._resolve_launch_mmproj_path(
+ model_path = model_path,
+ mmproj_path = mmproj_path,
+ )
+ # Need both a resolved mmproj AND the config vision flag; a stray
+ # mmproj passing the family-name heuristic must not flip a non-VLM
+ # GGUF into vision mode.
+ effective_is_vision = bool(launch_mmproj_path) and bool(is_vision)
+ if is_vision and not effective_is_vision:
+ logger.warning(
+ "Vision-capable GGUF loaded without a usable mmproj; "
+ "image input will be disabled for this session"
+ )
+
cmd = [
binary,
"-m",
@@ -2345,43 +2919,27 @@ def load_model(
# Qwen3-235B offloaded | 12 t/s | 21 t/s | 1.8x
# gpt-oss-120b repeat (92% accept)| 181 t/s | 814 t/s | 4.5x
#
- # Params from llama.cpp docs (docs/speculative.md):
- # --spec-ngram-size-n 24 (small n not recommended)
- # --draft-min 48 --draft-max 64 (MoEs need long drafts;
- # dense models can reduce these)
+ # Params from llama.cpp server README:
+ # --spec-ngram-mod-n-match 24 (lookup length)
+ # --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
+ # (MoEs need long drafts; dense models can reduce these)
# ref: https://github.com/ggml-org/llama.cpp/blob/master/docs/speculative.md
# ref: https://github.com/ggml-org/llama.cpp/pull/19164
# ref: https://github.com/ggml-org/llama.cpp/pull/18471
- # ``"default"`` -> let llama-server pick a sensible spec
- # config via ``--spec-default``. Explicit type names are
- # passed through with the manual draft tuning we've shipped
- # historically so power users keep their overrides.
- _valid_spec_types = {"ngram-simple", "ngram-mod"}
- normalized_spec = (
- speculative_type.lower().strip() if speculative_type else None
+ # draft-mtp: MTP heads on Unsloth's *-MTP GGUFs
+ # (llama.cpp #22673). Auto-enabled via nextn_predict_layers,
+ # fallback to -MTP in name. GPU: MTP-only. CPU/Mac: chain
+ # with ngram-mod. See unsloth.ai/docs/models/qwen3.6#mtp-guide.
+ spec_flags = self._build_speculative_flags(
+ speculative_type = speculative_type,
+ spec_draft_n_max = spec_draft_n_max,
+ extra_args = extra_args,
+ model_identifier = model_identifier,
+ model_path = model_path,
+ gpus = bool(gpus),
+ binary = binary,
)
- if normalized_spec and normalized_spec != "off" and not is_vision:
- if normalized_spec == "default":
- cmd.append("--spec-default")
- self._speculative_type = "default"
- elif normalized_spec in _valid_spec_types:
- cmd.extend(["--spec-type", normalized_spec])
- if normalized_spec == "ngram-mod":
- cmd.extend(
- [
- "--spec-ngram-size-n",
- "24",
- "--draft-min",
- "48",
- "--draft-max",
- "64",
- ]
- )
- self._speculative_type = normalized_spec
- else:
- self._speculative_type = None
- else:
- self._speculative_type = None
+ cmd.extend(spec_flags)
# Apply custom chat template override if provided
self._chat_template_override = chat_template_override
@@ -2435,24 +2993,9 @@ def load_model(
)
logger.info(f"Reasoning model: {reasoning_kw} by default")
- if mmproj_path:
- if not Path(mmproj_path).is_file():
- logger.warning(f"mmproj file not found: {mmproj_path}")
- else:
- # #5347 guard for paths that bypass detect_mmproj_file.
- from utils.models.model_config import (
- mmproj_matches_model_family,
- )
-
- if not mmproj_matches_model_family(model_path, mmproj_path):
- logger.warning(
- f"Skipping mmproj with mismatched family: "
- f"model={Path(model_path).name}, "
- f"mmproj={Path(mmproj_path).name}"
- )
- else:
- cmd.extend(["--mmproj", mmproj_path])
- logger.info(f"Using mmproj for vision: {mmproj_path}")
+ if launch_mmproj_path and effective_is_vision:
+ cmd.extend(["--mmproj", launch_mmproj_path])
+ logger.info(f"Using mmproj for vision: {launch_mmproj_path}")
# Option C: add --api-key for direct client access when enabled
import os as _os
@@ -2493,23 +3036,12 @@ def load_model(
binary_dir = str(Path(binary).parent)
if sys.platform == "win32":
- # CUDA DLLs (cudart64_X.dll, cublas64_X.dll, etc.) must
- # be on PATH. Order: binary_dir, torch's pip-installed
- # nvidia wheels, then a system CUDA toolkit. Pip wheels
- # are the canonical source per Studio's install design
- # (mirrors the Linux LD_LIBRARY_PATH block below) and
- # CUDA_PATH covers users with a system toolkit. #5106.
- path_dirs = [binary_dir]
- path_dirs.extend(self._windows_pip_nvidia_dll_dirs(sys.prefix))
- cuda_path = os.environ.get("CUDA_PATH", "")
- if cuda_path:
- cuda_bin = os.path.join(cuda_path, "bin")
- if os.path.isdir(cuda_bin):
- path_dirs.append(cuda_bin)
- # Some CUDA installs put DLLs in bin\x64
- cuda_bin_x64 = os.path.join(cuda_path, "bin", "x64")
- if os.path.isdir(cuda_bin_x64):
- path_dirs.append(cuda_bin_x64)
+ # See _build_windows_path_dirs for ordering. #5106.
+ path_dirs = self._build_windows_path_dirs(
+ binary_dir,
+ sys.prefix,
+ os.environ.get("CUDA_PATH", ""),
+ )
existing_path = env.get("PATH", "")
env["PATH"] = ";".join(path_dirs) + ";" + existing_path
else:
@@ -2603,6 +3135,30 @@ def load_model(
self._kill_process()
self._stdout_lines = []
+ # Tee llama-server output to a dedicated log file so a
+ # post-mortem in CI (or after a remote-debug session)
+ # has the full subprocess trail even when the parent
+ # only stored the last 50 lines. Path lives under the
+ # studio home so it ships in the same place all other
+ # Studio logs live.
+ self._llama_log_fh = None
+ try:
+ log_dir = _swa_cache_path().parent / "logs" / "llama-server"
+ log_dir.mkdir(parents = True, exist_ok = True)
+ self._llama_log_path = (
+ log_dir / f"llama-{int(time.time())}-port-{self._port}.log"
+ )
+ self._llama_log_fh = open(
+ self._llama_log_path,
+ "w",
+ encoding = "utf-8",
+ buffering = 1,
+ )
+ logger.info(f"llama-server stdout/stderr -> {self._llama_log_path}")
+ except OSError as e:
+ # Best-effort; never block the load on logging.
+ logger.debug(f"Could not open llama-server log file: {e}")
+ self._llama_log_path = None
self._process = subprocess.Popen(
cmd,
stdout = subprocess.PIPE,
@@ -2637,7 +3193,7 @@ def load_model(
self._hf_variant = None
else:
self._hf_variant = None
- self._is_vision = is_vision
+ self._is_vision = effective_is_vision
self._model_identifier = model_identifier
# Store the effective (possibly capped) context separately.
@@ -2715,6 +3271,220 @@ def load_model(
)
return True
+ def _build_speculative_flags(
+ self,
+ *,
+ speculative_type: Optional[str],
+ spec_draft_n_max: Optional[int],
+ extra_args: Optional[List[str]],
+ model_identifier: str,
+ model_path: Optional[str],
+ gpus: bool,
+ binary: Optional[str],
+ ) -> List[str]:
+ """Return the llama-server flag list for the requested spec mode.
+
+ Side effects: sets ``self._speculative_type`` (resolved internal
+ emit), ``self._requested_spec_mode`` (canonical UI mode for the
+ status round-trip), and ``self._spec_draft_n_max`` (user override
+ only; None when the platform default applies).
+
+ Speculative decoding (n-gram self-speculation, zero VRAM cost):
+ ngram-mod uses a ~16 MB shared hash pool, constant memory /
+ complexity, variable draft lengths. Helps most when the model
+ repeats existing text (code refactor, summarisation, reasoning).
+ For general chat with low repetition, overhead is ~5 ms.
+
+ Benchmarks from upstream llama.cpp speculative-decoding PRs:
+ Scenario | Without | With | Speedup
+ gpt-oss-120b code refactor | 181 t/s | 446 t/s | 2.5x
+ Qwen3-235B offloaded | 12 t/s | 21 t/s | 1.8x
+ gpt-oss-120b repeat (92% accept)| 181 t/s | 814 t/s | 4.5x
+
+ Sub-3B dense MTP regresses vs spec-off because the draft head's
+ per-token cost exceeds the acceptance savings at this scale.
+ Q4_K_XL clean bench (each prompt once after an unrelated warmup)
+ on B200 + x86 CPU:
+ 0.8B GPU: draft-mtp n=2 = 0.58x vs OFF; ngram-only = 1.10x
+ 2B GPU: draft-mtp n=2 = 0.82x vs OFF; OFF or ngram = 1.00x
+ 0.8B CPU: chained n=2 = 0.86x vs OFF; ngram-only = 1.19x
+ 2B CPU: chained n=2 = 0.83x vs OFF; ngram-only = 1.01x
+ 4B+ GPU/CPU: spec on is a net win (1.08x-1.46x).
+ Auto falls back to ngram-mod (zero-VRAM, near-zero idle cost on
+ diverse content); forced MTP variants engage anyway and just log
+ a warning per the user's choice.
+ """
+ flags: List[str] = []
+ # Reset; emit branches re-set on the resolved emission.
+ self._spec_draft_n_max = None
+ self._speculative_type = None
+
+ # Canonical UI-facing requested mode: auto / mtp / ngram /
+ # mtp+ngram / off / ngram-simple. Legacy values are mapped via
+ # _canonicalize_spec_mode (default->auto, draft-mtp->mtp,
+ # ngram-mod->ngram, "ngram-mod,draft-mtp"->mtp+ngram).
+ canonical_mode = _canonicalize_spec_mode(speculative_type)
+ is_mtp_model = bool(self._nextn_predict_layers) or (
+ _is_mtp_model_name(model_identifier, model_path)
+ )
+ user_owns_spec_type = _extra_args_set_spec_type(extra_args)
+ _mtp_size_b = _extract_model_size_b(model_identifier)
+ _mtp_too_small = _mtp_size_b is not None and _mtp_size_b < 3.0
+
+ if user_owns_spec_type:
+ # User --spec-type in extra_args wins outright; suppress
+ # auto-emit so we don't emit a duplicate / conflicting
+ # spec block. Record requested mode as None.
+ self._requested_spec_mode = None
+ return flags
+
+ effective_mode = canonical_mode or "auto"
+ self._requested_spec_mode = effective_mode
+
+ def _resolved_draft_n_max() -> int:
+ # User override wins; else platform default (the B200 / x86
+ # clean-sweep sweet spot from PR #5582 is n=2 GPU, n=3 CPU;
+ # raising past 3 starts to regress on essay-style
+ # low-acceptance prompts).
+ if spec_draft_n_max is not None:
+ n = int(spec_draft_n_max)
+ self._spec_draft_n_max = n
+ return n
+ return 2 if gpus else 3
+
+ def _emit_mtp(*, chain_ngram: bool) -> bool:
+ """Append --spec-type mtp[/draft-mtp][,ngram-mod] + n-max."""
+ caps = self.probe_server_capabilities(binary)
+ mtp_token = caps.get("mtp_token") if caps else None
+ if not mtp_token:
+ logger.warning(
+ "Requested MTP speculative decoding but "
+ "llama-server lacks --spec-type mtp/draft-mtp; "
+ "run `unsloth studio update`. Loading without "
+ "speculative decoding."
+ )
+ return False
+ draft_n_max = _resolved_draft_n_max()
+ n_max_flag = caps.get("spec_draft_n_max_flag") or "--spec-draft-n-max"
+ if chain_ngram:
+ ngram_knobs = _build_ngram_mod_flags(caps)
+ if ngram_knobs:
+ spec_value = f"ngram-mod,{mtp_token}"
+ else:
+ logger.warning(
+ "llama-server lacks ngram-mod tuning "
+ "flags; loading MTP only (no ngram chain)"
+ )
+ spec_value = mtp_token
+ flags.extend(
+ [
+ "--spec-type",
+ spec_value,
+ n_max_flag,
+ str(draft_n_max),
+ ]
+ )
+ flags.extend(ngram_knobs)
+ else:
+ flags.extend(
+ [
+ "--spec-type",
+ mtp_token,
+ n_max_flag,
+ str(draft_n_max),
+ ]
+ )
+ self._speculative_type = "draft-mtp"
+ chain_label = "chained ngram-mod" if chain_ngram else "MTP-only"
+ logger.info(f"Spec decoding: {mtp_token} ({chain_label})")
+ return True
+
+ def _emit_ngram_mod() -> bool:
+ """Append --spec-type ngram-mod + flag-set knobs."""
+ ngram_caps = self.probe_server_capabilities(binary)
+ ngram_knobs = _build_ngram_mod_flags(ngram_caps)
+ flags.extend(["--spec-type", "ngram-mod"])
+ if not ngram_knobs:
+ logger.warning(
+ "llama-server lacks ngram-mod tuning "
+ "flags; loading without --spec-ngram-mod-* knobs"
+ )
+ flags.extend(ngram_knobs)
+ self._speculative_type = "ngram-mod"
+ logger.info("Spec decoding: ngram-mod")
+ return True
+
+ if effective_mode == "off":
+ return flags # nothing to emit
+ if effective_mode == "ngram-simple":
+ flags.extend(["--spec-type", "ngram-simple"])
+ self._speculative_type = "ngram-simple"
+ return flags
+ if effective_mode == "ngram":
+ _emit_ngram_mod()
+ return flags
+ if effective_mode == "mtp":
+ if _mtp_too_small:
+ logger.warning(
+ f"Forcing MTP on a {_mtp_size_b:.1f}B model; "
+ "the bench shows draft-mtp regresses below 3B. "
+ "Engaging anyway (user override)."
+ )
+ elif not is_mtp_model:
+ logger.warning(
+ "Forcing MTP on a non-MTP GGUF; llama-server may "
+ "fall back to spec-off if no nextn head is present. "
+ "Engaging anyway (user override)."
+ )
+ _emit_mtp(chain_ngram = False)
+ return flags
+ if effective_mode == "mtp+ngram":
+ if _mtp_too_small:
+ logger.warning(
+ f"Forcing MTP+Ngram on a {_mtp_size_b:.1f}B model; "
+ "the bench shows the chain regresses below 3B. "
+ "Engaging anyway (user override)."
+ )
+ elif not is_mtp_model:
+ logger.warning(
+ "Forcing MTP+Ngram on a non-MTP GGUF; llama-server "
+ "may fall back to ngram-only if no nextn head is "
+ "present. Engaging anyway (user override)."
+ )
+ _emit_mtp(chain_ngram = True)
+ return flags
+
+ # effective_mode == "auto": today's promotion path. llama.cpp
+ # #22673: MTP is compatible with mmproj, so there's no vision gate.
+ if is_mtp_model and not _mtp_too_small:
+ # GPU: MTP-only. CPU/Mac: chain ngram-mod + MTP.
+ _emit_mtp(chain_ngram = not gpus)
+ elif is_mtp_model and _mtp_too_small:
+ # Sub-3B fallback: drop the MTP draft head, keep ngram-mod
+ # when the binary supports it.
+ _small_caps = self.probe_server_capabilities(binary)
+ if _small_caps.get("supports_ngram_mod"):
+ logger.info(
+ f"MTP GGUF detected but model size {_mtp_size_b:.1f}B "
+ "is below the 3B speedup threshold; using ngram-mod "
+ "only (zero-VRAM, no draft head). Override via "
+ "--spec-type or the Studio Speculative Decoding "
+ "dropdown."
+ )
+ _emit_ngram_mod()
+ else:
+ logger.info(
+ f"MTP GGUF detected but model size {_mtp_size_b:.1f}B "
+ "is below the 3B speedup threshold and the bundled "
+ "llama-server does not advertise ngram-mod; "
+ "auto-disabling speculative decoding."
+ )
+ else:
+ # Non-MTP model: let llama-server choose its default strategy.
+ flags.append("--spec-default")
+ self._speculative_type = "default"
+ return flags
+
def _already_in_target_state(
self,
*,
@@ -2727,6 +3497,7 @@ def _already_in_target_state(
extra_args: Optional[List[str]],
is_vision: bool,
gguf_path: Optional[str] = None,
+ spec_draft_n_max: Optional[int] = None,
) -> bool:
"""True iff the live server already satisfies these load kwargs.
@@ -2763,16 +3534,28 @@ def _norm(value):
if _norm(self._cache_type_kv) != _norm(cache_type_kv):
return False
- # Vision GGUFs silently drop speculative decoding in
- # load_model (the spec gate is "not is_vision"); treat the
- # request's value as "off" so a vision load with
- # speculative_type="default" still matches.
- if self._is_vision or is_vision:
- req_spec = "off"
+ # Compare on the canonical UI-facing mode the user requested.
+ # When extra_args carries --spec-type, the route-layer code paths
+ # bypass the dropdown anyway and the backend stores
+ # _requested_spec_mode = None; the request mirrors that by
+ # canonicalising to None.
+ if _extra_args_set_spec_type(extra_args):
+ req_mode = None
else:
- req_spec = _norm(speculative_type) or "off"
- backend_spec = _norm(self._speculative_type) or "off"
- if req_spec != backend_spec:
+ req_mode = _canonicalize_spec_mode(speculative_type) or "auto"
+ backend_mode = self._requested_spec_mode
+ if req_mode != backend_mode:
+ return False
+
+ # spec_draft_n_max only matters when an MTP variant is actually
+ # engaged. Compare on the resolved spec rather than the requested
+ # mode so an Auto request that auto-promoted to draft-mtp under
+ # the hood still bounces a reload when the user changes n_max.
+ if (
+ self._speculative_type == "draft-mtp"
+ and spec_draft_n_max is not None
+ and int(spec_draft_n_max) != (self._spec_draft_n_max or 0)
+ ):
return False
if (self._chat_template_override or None) != (chat_template_override or None):
@@ -2841,6 +3624,8 @@ def unload_model(self) -> bool:
self._supports_tools = False
self._cache_type_kv = None
self._speculative_type = None
+ self._requested_spec_mode = None
+ self._spec_draft_n_max = None
self._n_layers = None
self._n_kv_heads = None
self._n_kv_heads_by_layer = None
@@ -2858,6 +3643,7 @@ def unload_model(self) -> bool:
self._ssm_inner_size = None
self._ssm_state_size = None
self._shared_kv_layers = None
+ self._nextn_predict_layers = None
# Clean up temp chat template file
if hasattr(self, "_chat_template_file") and self._chat_template_file:
try:
@@ -2899,6 +3685,13 @@ def _kill_process(self):
if self._stdout_thread is not None:
self._stdout_thread.join(timeout = 2)
self._stdout_thread = None
+ fh = getattr(self, "_llama_log_fh", None)
+ if fh is not None:
+ try:
+ fh.close()
+ except Exception:
+ pass
+ self._llama_log_fh = None
@staticmethod
def _kill_orphaned_servers():
@@ -3110,7 +3903,17 @@ def _wait_for_health(self, timeout: float = 120.0, interval: float = 0.5) -> boo
resp = httpx.get(url, timeout = 2.0)
if resp.status_code == 200:
return True
- except (httpx.ConnectError, httpx.TimeoutException):
+ except (
+ httpx.ConnectError,
+ httpx.TimeoutException,
+ # ReadError covers TCP RST mid-read while llama-server is
+ # still binding the port (Windows: WinError 10054). The
+ # crash-detection branch above catches a real exit; this
+ # one keeps a transient socket close from masking it.
+ httpx.ReadError,
+ httpx.RemoteProtocolError,
+ httpx.WriteError,
+ ):
pass
time.sleep(interval)
@@ -3122,128 +3925,9 @@ def _wait_for_health(self, timeout: float = 120.0, interval: float = 0.5) -> boo
@staticmethod
def _parse_tool_calls_from_text(content: str) -> list[dict]:
- """
- Parse tool calls from XML markup in content text.
-
- Handles formats like:
- {"name":"web_search","arguments":{"query":"..."}}
- ...
- Closing tags (, , ) are all optional
- since models frequently omit them.
- """
- tool_calls = []
-
- # Pattern 1: JSON inside tags.
- # Use balanced-brace extraction that skips braces inside JSON strings.
- for m in _TC_JSON_START_RE.finditer(content):
- brace_start = m.end() - 1 # position of the opening {
- depth, i = 0, brace_start
- in_string = False
- while i < len(content):
- ch = content[i]
- if in_string:
- if ch == "\\" and i + 1 < len(content):
- i += 2 # skip escaped character
- continue
- if ch == '"':
- in_string = False
- elif ch == '"':
- in_string = True
- elif ch == "{":
- depth += 1
- elif ch == "}":
- depth -= 1
- if depth == 0:
- break
- i += 1
- if depth == 0:
- json_str = content[brace_start : i + 1]
- try:
- obj = json.loads(json_str)
- tc = {
- "id": f"call_{len(tool_calls)}",
- "type": "function",
- "function": {
- "name": obj.get("name", ""),
- "arguments": obj.get("arguments", {}),
- },
- }
- if isinstance(tc["function"]["arguments"], dict):
- tc["function"]["arguments"] = json.dumps(
- tc["function"]["arguments"]
- )
- tool_calls.append(tc)
- except (json.JSONDecodeError, ValueError):
- pass
-
- # Pattern 2: XML-style value
- # All closing tags optional -- models frequently omit ,
- # ], and/or .
- if not tool_calls:
- # Step 1: Find all positions and extract their bodies.
- # Body boundary: use only or next as a boundary because
- # code parameter values can contain that literal string.
- # After extracting, we trim a trailing if present.
- func_starts = list(_TC_FUNC_START_RE.finditer(content))
- for idx, fm in enumerate(func_starts):
- func_name = fm.group(1)
- body_start = fm.end()
- # Hard boundaries: next
- next_func = (
- func_starts[idx + 1].start()
- if idx + 1 < len(func_starts)
- else len(content)
- )
- end_tag = _TC_END_TAG_RE.search(content[body_start:])
- if end_tag:
- body_end = body_start + end_tag.start()
- else:
- body_end = len(content)
- body_end = min(body_end, next_func)
- body = content[body_start:body_end]
- # Trim trailing if present (it's the real closing tag)
- body = _TC_FUNC_CLOSE_RE.sub("", body)
-
- # Step 2: Extract parameters from body.
- # For single-parameter functions (the common case: code, command,
- # query), use body end as the only boundary to avoid false matches
- # on inside code strings.
- arguments = {}
- param_starts = list(_TC_PARAM_START_RE.finditer(body))
- if len(param_starts) == 1:
- # Single parameter: value is everything from after the tag
- # to end of body, trimming any trailing .
- pm = param_starts[0]
- val = body[pm.end() :]
- val = _TC_PARAM_CLOSE_RE.sub("", val)
- arguments[pm.group(1)] = val.strip()
- else:
- for pidx, pm in enumerate(param_starts):
- param_name = pm.group(1)
- val_start = pm.end()
- # Value ends at next if present
- val = _TC_PARAM_CLOSE_RE.sub("", val)
- arguments[param_name] = val.strip()
-
- tc = {
- "id": f"call_{len(tool_calls)}",
- "type": "function",
- "function": {
- "name": func_name,
- "arguments": json.dumps(arguments),
- },
- }
- tool_calls.append(tc)
-
- return tool_calls
+ """Thin wrapper around the shared parser in tool_call_parser
+ so safetensors and llama_cpp pick up the same fixes."""
+ return _shared_parse_tool_calls_from_text(content)
@staticmethod
def _build_openai_messages(
@@ -3564,6 +4248,9 @@ def generate_chat_completion(
if _stream_done:
break # exit outer for
if _metadata_usage or _metadata_timings:
+ _metadata_usage = _backfill_usage_from_timings(
+ _metadata_usage, _metadata_timings
+ )
yield {
"type": "metadata",
"usage": _metadata_usage,
@@ -3622,17 +4309,14 @@ def generate_chat_completion_with_tools(
def _strip_tool_markup(text: str, *, final: bool = False) -> str:
if not auto_heal_tool_calls:
return text
- patterns = _TOOL_ALL_PATS if final else _TOOL_CLOSED_PATS
- for pat in patterns:
- text = pat.sub("", text)
- return text.strip() if final else text
-
- # XML prefixes that signal a tool call in content.
- # Empty when auto_heal is disabled so the buffer never
- # speculatively holds content for XML detection.
- _TOOL_XML_SIGNALS = (
- ("", ", Qwen3.5 , Mistral [TOOL_CALLS], Gemma 4 <|tool_call>.
+ _TOOL_XML_SIGNALS = _SHARED_TOOL_XML_SIGNALS if auto_heal_tool_calls else ()
_MAX_BUFFER_CHARS = 32
# ── Duplicate tool-call detection ────────────────────────
@@ -4016,7 +4700,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
}
)
# Accumulate tokens and timing from this iteration
- _fu_r = _iter_usage or {}
+ _fu_r = (
+ _backfill_usage_from_timings(_iter_usage, _iter_timings)
+ or {}
+ )
_accumulated_completion_tokens += _fu_r.get(
"completion_tokens", 0
)
@@ -4028,7 +4715,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
# Content was already streamed. Yield metadata.
yield {"type": "status", "text": ""}
- _fu = _iter_usage or {}
+ _fu = (
+ _backfill_usage_from_timings(_iter_usage, _iter_timings)
+ or {}
+ )
_fc = _fu.get("completion_tokens", 0)
_fp = _fu.get("prompt_tokens", 0)
_tc = _fc + _accumulated_completion_tokens
@@ -4118,7 +4808,10 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
)
if content_accum:
yield {"type": "content", "text": content_accum}
- _fu = _iter_usage or {}
+ _fu = (
+ _backfill_usage_from_timings(_iter_usage, _iter_timings)
+ or {}
+ )
_fc = _fu.get("completion_tokens", 0)
_fp = _fu.get("prompt_tokens", 0)
_tc = _fc + _accumulated_completion_tokens
@@ -4152,9 +4845,9 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
return
# ── Execute tool calls ──
- _accumulated_completion_tokens += (_iter_usage or {}).get(
- "completion_tokens", 0
- )
+ _accumulated_completion_tokens += (
+ _backfill_usage_from_timings(_iter_usage, _iter_timings) or {}
+ ).get("completion_tokens", 0)
_it = _iter_timings or {}
_accumulated_predicted_ms += _it.get("predicted_ms", 0)
_accumulated_predicted_n += _it.get("predicted_n", 0)
@@ -4174,7 +4867,17 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
arguments = json.loads(raw_args)
except (json.JSONDecodeError, ValueError):
if auto_heal_tool_calls:
- arguments = {"query": raw_args}
+ # Per-tool canonical heal key so a bare
+ # string emission still runs the right
+ # tool: ``code`` for python, ``command``
+ # for terminal, ``query`` for everything
+ # else (e.g. web_search). Mirrors
+ # safetensors_agentic._CANONICAL_HEAL_ARG.
+ _heal_key = {
+ "python": "code",
+ "terminal": "command",
+ }.get(tool_name, "query")
+ arguments = {_heal_key: raw_args}
else:
arguments = {"raw": raw_args}
else:
diff --git a/studio/backend/core/inference/mlx_inference.py b/studio/backend/core/inference/mlx_inference.py
index e7bce2d33e..716e4c27a2 100644
--- a/studio/backend/core/inference/mlx_inference.py
+++ b/studio/backend/core/inference/mlx_inference.py
@@ -157,10 +157,59 @@ def load_model(
"audio_type": None,
"has_audio_input": False,
}
+ # Capture chat_template_info so the worker IPC reply can ship
+ # it back to the parent and the route layer classifies
+ # capabilities the same way as the transformers / GGUF paths.
+ self._populate_chat_template_info(model_name)
logger.info("Model %s loaded successfully", model_name)
return True
+ def _populate_chat_template_info(self, model_name: str) -> None:
+ """Mirror InferenceBackend._load_chat_template_info for MLX.
+
+ Stores ``chat_template_info`` on ``self.models[model_name]``
+ with the resolved ``tokenizer.chat_template`` so
+ ``_detect_safetensors_features`` (route layer) sees the same
+ template the model actually uses."""
+ entry = self.models.get(model_name)
+ if not entry:
+ return
+ tok = entry.get("tokenizer")
+ if tok is None:
+ proc = entry.get("processor")
+ tok = getattr(proc, "tokenizer", None) if proc else None
+ info = {
+ "has_template": False,
+ "template": None,
+ "format_type": "generic",
+ "special_tokens": {},
+ "template_name": None,
+ }
+ try:
+ tpl = getattr(tok, "chat_template", None)
+ if tpl:
+ info["has_template"] = True
+ info["template"] = tpl
+ lower = tpl.lower()
+ if "start_header_id" in lower and "end_header_id" in lower:
+ info["format_type"] = "llama3"
+ elif "[inst]" in lower and "[/inst]" in lower:
+ info["format_type"] = "mistral"
+ elif "<|im_start|>" in lower and "<|im_end|>" in lower:
+ info["format_type"] = "chatml"
+ else:
+ info["format_type"] = "custom"
+ special = {}
+ for attr in ("bos_token", "eos_token", "pad_token"):
+ val = getattr(tok, attr, None)
+ if val:
+ special[attr] = val
+ info["special_tokens"] = special
+ except Exception as exc:
+ logger.warning("MLX chat_template_info capture failed: %s", exc)
+ entry["chat_template_info"] = info
+
def unload_model(self, model_name: str) -> bool:
import mlx.core as mx
import gc
@@ -197,6 +246,14 @@ def generate_chat_response(
max_new_tokens = 256,
repetition_penalty = 1.0,
cancel_event = None,
+ # Reasoning / tool kwargs forwarded by the route + worker -- the
+ # MLX path renders the template via apply_chat_template_for_
+ # generation so these are honoured the same way as the
+ # transformers path.
+ tools = None,
+ enable_thinking = None,
+ reasoning_effort = None,
+ preserve_thinking = None,
) -> Generator[str, None, None]:
if self._model is None:
raise RuntimeError("No model loaded")
@@ -239,6 +296,10 @@ def generate_chat_response(
max_new_tokens,
repetition_penalty,
cancel_event,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
)
else:
yield from self._generate_text(
@@ -250,6 +311,10 @@ def generate_chat_response(
max_new_tokens,
repetition_penalty,
cancel_event,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
)
def _generate_text(
@@ -262,14 +327,26 @@ def _generate_text(
max_new_tokens,
repetition_penalty,
cancel_event,
+ *,
+ tools = None,
+ enable_thinking = None,
+ reasoning_effort = None,
+ preserve_thinking = None,
):
from mlx_lm import stream_generate
from mlx_lm.sample_utils import make_sampler, make_logits_processors
- prompt = self._tokenizer.apply_chat_template(
+ from core.inference.chat_template_helpers import (
+ apply_chat_template_for_generation,
+ )
+
+ prompt = apply_chat_template_for_generation(
+ self._tokenizer,
messages,
- tokenize = False,
- add_generation_prompt = True,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
)
if prompt is None:
raise RuntimeError(
@@ -343,20 +420,38 @@ def _generate_vlm(
max_new_tokens,
repetition_penalty,
cancel_event,
+ *,
+ tools = None,
+ enable_thinking = None,
+ reasoning_effort = None,
+ preserve_thinking = None,
):
from mlx_vlm import stream_generate as vlm_stream
- # Apply chat template
- chat_fn = getattr(self._processor, "apply_chat_template", None)
+ from core.inference.chat_template_helpers import (
+ apply_chat_template_for_generation,
+ )
+
+ # Pick the chat-template-aware caller: processors that expose
+ # their own apply_chat_template + chat_template attr (e.g.
+ # Qwen2.5-VL) use it directly; otherwise fall back to the
+ # nested tokenizer.
+ chat_target = self._processor
if (
- chat_fn is None
+ getattr(self._processor, "apply_chat_template", None) is None
or not hasattr(self._processor, "chat_template")
or self._processor.chat_template is None
):
- tok = getattr(self._processor, "tokenizer", self._processor)
- chat_fn = tok.apply_chat_template
+ chat_target = getattr(self._processor, "tokenizer", self._processor)
- prompt = chat_fn(messages, tokenize = False, add_generation_prompt = True)
+ prompt = apply_chat_template_for_generation(
+ chat_target,
+ messages,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
+ )
# For VLM: always use mlx_vlm's stream_generate which handles
# pixel_values properly (passes None for text-only, image for VLM)
diff --git a/studio/backend/core/inference/orchestrator.py b/studio/backend/core/inference/orchestrator.py
index 5562820f49..7e7d7026f6 100644
--- a/studio/backend/core/inference/orchestrator.py
+++ b/studio/backend/core/inference/orchestrator.py
@@ -449,6 +449,10 @@ def _generate_dispatched(
repetition_penalty: float = 1.0,
cancel_event = None,
use_adapter = None,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""Dispatched generation — sends command without holding _gen_lock.
@@ -494,6 +498,14 @@ def _generate_dispatched(
if use_adapter is not None:
cmd["use_adapter"] = use_adapter
+ if tools is not None:
+ cmd["tools"] = tools
+ if enable_thinking is not None:
+ cmd["enable_thinking"] = enable_thinking
+ if reasoning_effort is not None:
+ cmd["reasoning_effort"] = reasoning_effort
+ if preserve_thinking is not None:
+ cmd["preserve_thinking"] = preserve_thinking
# Create mailbox BEFORE sending command
mailbox: queue.Queue = queue.Queue()
@@ -695,6 +707,13 @@ def load_model(
"audio_type": model_info.get("audio_type"),
"has_audio_input": model_info.get("has_audio_input", False),
}
+ # Mirror chat_template_info so routes can classify
+ # capabilities without re-entering the subprocess.
+ _tpl_info = model_info.get("chat_template_info")
+ if isinstance(_tpl_info, dict):
+ self.models[self.active_model_name]["chat_template_info"] = (
+ _tpl_info
+ )
self.loading_models.discard(model_name)
logger.info(
"Model '%s' loaded successfully in subprocess", model_name
@@ -770,8 +789,18 @@ def generate_chat_response(
max_new_tokens: int = 256,
repetition_penalty: float = 1.0,
cancel_event = None,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
- """Generate response, streaming tokens from subprocess."""
+ """Generate response, streaming tokens from subprocess.
+
+ Optional ``tools`` / ``enable_thinking`` / ``reasoning_effort`` /
+ ``preserve_thinking`` kwargs are forwarded into the worker so
+ ``tokenizer.apply_chat_template`` can render tool schemas and
+ reasoning controls when the template understands them.
+ """
yield from self._generate_inner(
messages = messages,
system_prompt = system_prompt,
@@ -784,6 +813,88 @@ def generate_chat_response(
repetition_penalty = repetition_penalty,
cancel_event = cancel_event,
use_adapter = None,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
+ )
+
+ def generate_chat_completion_with_tools(
+ self,
+ messages: list,
+ tools: list,
+ system_prompt: str = "",
+ temperature: float = 0.7,
+ top_p: float = 0.9,
+ top_k: int = 40,
+ min_p: float = 0.0,
+ max_tokens: Optional[int] = None,
+ repetition_penalty: float = 1.0,
+ cancel_event = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
+ max_tool_iterations: int = 25,
+ auto_heal_tool_calls: bool = True,
+ tool_call_timeout: int = 300,
+ session_id: Optional[str] = None,
+ use_adapter: Optional[Union[bool, str]] = None,
+ **_unused,
+ ):
+ """Run the safetensors agentic tool loop in this (parent)
+ process, calling the worker for each generation turn.
+
+ Yields the same event dicts as the GGUF tool loop so the route
+ layer can stream both backends through one helper. See
+ ``safetensors_agentic.run_safetensors_tool_loop`` for the
+ event protocol.
+ """
+ from core.inference.safetensors_agentic import run_safetensors_tool_loop
+ from core.inference.tools import execute_tool
+
+ max_new_tokens = max_tokens if max_tokens and max_tokens > 0 else 2048
+
+ def _single_turn(conv: list):
+ # ``conv`` already carries any system message because the
+ # loop appends to a list seeded with system+user above.
+ common_kwargs = dict(
+ messages = conv,
+ system_prompt = "",
+ image = None,
+ temperature = temperature,
+ top_p = top_p,
+ top_k = top_k,
+ min_p = min_p,
+ max_new_tokens = max_new_tokens,
+ repetition_penalty = repetition_penalty,
+ cancel_event = cancel_event,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
+ )
+ if use_adapter is not None:
+ yield from self.generate_with_adapter_control(
+ use_adapter = use_adapter,
+ **common_kwargs,
+ )
+ else:
+ yield from self.generate_chat_response(**common_kwargs)
+
+ initial = list(messages)
+ if system_prompt:
+ initial = [{"role": "system", "content": system_prompt}] + initial
+
+ yield from run_safetensors_tool_loop(
+ single_turn = _single_turn,
+ messages = initial,
+ tools = tools,
+ execute_tool = execute_tool,
+ cancel_event = cancel_event,
+ auto_heal_tool_calls = auto_heal_tool_calls,
+ max_tool_iterations = max_tool_iterations,
+ tool_call_timeout = tool_call_timeout,
+ session_id = session_id,
)
def generate_with_adapter_control(
@@ -817,6 +928,10 @@ def _generate_inner(
repetition_penalty: float = 1.0,
cancel_event = None,
use_adapter = None,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""Inner generation logic — sends command to subprocess, yields tokens.
@@ -853,6 +968,10 @@ def _generate_inner(
repetition_penalty = repetition_penalty,
cancel_event = cancel_event,
use_adapter = use_adapter,
+ tools = tools,
+ enable_thinking = enable_thinking,
+ reasoning_effort = reasoning_effort,
+ preserve_thinking = preserve_thinking,
)
def _generate_locked(
@@ -868,6 +987,10 @@ def _generate_locked(
repetition_penalty: float = 1.0,
cancel_event = None,
use_adapter = None,
+ tools: Optional[list] = None,
+ enable_thinking: Optional[bool] = None,
+ reasoning_effort: Optional[str] = None,
+ preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""Actual generation logic — must be called under _gen_lock."""
request_id = str(uuid.uuid4())
@@ -893,6 +1016,16 @@ def _generate_locked(
if use_adapter is not None:
cmd["use_adapter"] = use_adapter
+ # Only forward template kwargs the caller actually set so older
+ # workers that ignore unknown keys still work.
+ if tools is not None:
+ cmd["tools"] = tools
+ if enable_thinking is not None:
+ cmd["enable_thinking"] = enable_thinking
+ if reasoning_effort is not None:
+ cmd["reasoning_effort"] = reasoning_effort
+ if preserve_thinking is not None:
+ cmd["preserve_thinking"] = preserve_thinking
try:
self._send_cmd(cmd)
@@ -1200,6 +1333,13 @@ def check_vision_model_compatibility(self) -> bool:
return self.models[self.active_model_name].get("is_vision", False)
return False
+ def _is_gpt_oss_model(self, model_name: str = None) -> bool:
+ """Parent-side gpt-oss detection so the safetensors route can run
+ the same guard without an IPC round-trip to the subprocess."""
+ from utils.datasets import is_gpt_oss_model_name
+
+ return is_gpt_oss_model_name(model_name or self.active_model_name or "")
+
# ========== GLOBAL INSTANCE ==========
_inference_backend = None
diff --git a/studio/backend/core/inference/safetensors_agentic.py b/studio/backend/core/inference/safetensors_agentic.py
new file mode 100644
index 0000000000..f70421b584
--- /dev/null
+++ b/studio/backend/core/inference/safetensors_agentic.py
@@ -0,0 +1,460 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Safetensors/transformers agentic tool loop.
+
+Wraps a single-turn cumulative-text generator (the existing
+``InferenceOrchestrator.generate_chat_response`` pipeline that streams
+from a worker subprocess) with the tool-calling, thinking-block,
+status, and metadata event protocol used by the GGUF path. Keeps the
+front-end SSE shape identical across backends so the chat UI does not
+care which engine actually ran the model.
+
+The GGUF path lives in ``llama_cpp.py`` and talks to llama-server's
+structured ``delta.tool_calls`` directly. Native transformers has no
+such structured channel, so this loop parses tool calls from the
+cumulative text and dispatches them via ``core.inference.tools``.
+"""
+
+import json
+import re
+import threading
+from typing import Callable, Generator, Optional
+from urllib.parse import urlparse
+
+from loggers import get_logger
+
+from core.inference.tool_call_parser import (
+ BUDGET_EXHAUSTED_NUDGE,
+ DUPLICATE_CALL_NUDGE,
+ TOOL_ERROR_NUDGE,
+ TOOL_ERROR_PREFIXES,
+ TOOL_XML_SIGNALS,
+ has_tool_signal,
+ parse_tool_calls_from_text,
+ strip_tool_markup,
+)
+
+
+logger = get_logger(__name__)
+
+
+# Buffer cap while waiting to disambiguate a possible tool-call prefix.
+_MAX_BUFFER_CHARS = 32
+
+# Forward-looking intent signals that indicate the model is describing
+# what it *will* do rather than giving a final answer. Mirrors the GGUF
+# path so safetensors / MLX nudge the model to act when it stalls on
+# planning instead of calling a tool. Excludes "I can", "I should",
+# "I want to", "let's" which appear in direct answers / explanations.
+_INTENT_SIGNAL = re.compile(
+ r"(?i)("
+ # Direct intent: "I'll", "I will", "Let me", "I am going to".
+ r"\b(i['’](ll|m going to|m gonna)|i am (going to|gonna)|i will|i shall|let me|allow me)\b"
+ r"|"
+ # Step / plan framing: "First", "Step 1:", "Here's my plan".
+ r"\b(?:first\b|step \d+:?|here['’]?s (?:my |the |a )?(?:plan|approach))"
+ r"|"
+ # "Now I" / "Next I" patterns.
+ r"\b(?:now i|next i)\b"
+ r")"
+)
+_MAX_REPROMPTS = 3
+_REPROMPT_MAX_CHARS = 2000
+_REPROMPT_INSTRUCTION = (
+ "STOP. Do NOT write code or explain. You MUST call a tool NOW. "
+ "Call web_search or python immediately."
+)
+
+
+def _status_for_tool(tool_name: str, arguments: dict) -> str:
+ """Return a human-readable status line matching the GGUF path."""
+ if tool_name == "web_search":
+ url = (arguments.get("url") or "").strip()
+ if url:
+ parsed = urlparse(url)
+ if parsed.scheme in ("http", "https") and parsed.hostname:
+ host = parsed.hostname
+ if host.startswith("www."):
+ host = host[4:]
+ return f"Reading: {host}"
+ return "Reading page..."
+ query = arguments.get("query", "")
+ return f"Searching: {query}"
+ if tool_name == "python":
+ preview = (arguments.get("code") or "").strip().split("\n")[0][:60]
+ return f"Running Python: {preview}" if preview else "Running Python..."
+ if tool_name == "terminal":
+ preview = (arguments.get("command") or "")[:60]
+ return f"Running: {preview}" if preview else "Running command..."
+ return f"Calling: {tool_name}"
+
+
+_CANONICAL_HEAL_ARG = {"python": "code", "terminal": "command"}
+
+
+def _coerce_arguments(raw_args, *, heal: bool, tool_name: str = "") -> dict:
+ """Normalise tool ``arguments`` to a dict.
+
+ Some templates emit a JSON string, others a bare query string. With
+ ``heal=True`` we accept a bare string as ``{: ...}``
+ so a Hermes-style call without proper JSON still runs the tool. The
+ canonical key is picked per tool: ``code`` for python, ``command``
+ for terminal, ``query`` for everything else (e.g. web_search).
+ """
+ if isinstance(raw_args, dict):
+ return raw_args
+ if isinstance(raw_args, str):
+ try:
+ parsed = json.loads(raw_args)
+ if isinstance(parsed, dict):
+ return parsed
+ except (json.JSONDecodeError, ValueError):
+ pass
+ if heal:
+ key = _CANONICAL_HEAL_ARG.get(tool_name, "query")
+ return {key: raw_args}
+ return {"raw": raw_args}
+ return {}
+
+
+def run_safetensors_tool_loop(
+ *,
+ single_turn: Callable[[list], Generator[str, None, None]],
+ messages: list[dict],
+ tools: list[dict],
+ execute_tool: Callable[..., str],
+ cancel_event: Optional[threading.Event] = None,
+ auto_heal_tool_calls: bool = True,
+ max_tool_iterations: int = 25,
+ tool_call_timeout: int = 300,
+ session_id: Optional[str] = None,
+) -> Generator[dict, None, None]:
+ """Drive an agentic tool loop on top of a cumulative-text generator.
+
+ ``single_turn(messages)`` must yield cumulative assistant text
+ (each yield is a snapshot including all previously emitted tokens).
+ The loop:
+
+ * Buffers the leading characters of every turn so it can decide
+ whether the model is about to emit a tool call. Plain content
+ starts streaming as soon as the buffer rules it out.
+ * On detecting ```` or `` 0 else 0
+ for iteration in range(max_tool_iterations + _extra_iters + 1):
+ if cancel_event is not None and cancel_event.is_set():
+ return
+
+ detect_state = _state_buffering
+ content_buffer = ""
+ content_accum = ""
+ cumulative_display = ""
+ last_emitted = ""
+
+ gen = single_turn(conversation)
+ prev_cumulative = ""
+
+ for cumulative in gen:
+ if cancel_event is not None and cancel_event.is_set():
+ return
+
+ if not isinstance(cumulative, str):
+ continue # defensive: pipeline only yields strings
+
+ delta = cumulative[len(prev_cumulative) :]
+ prev_cumulative = cumulative
+ if not delta:
+ continue
+ content_accum += delta
+
+ if detect_state == _state_draining:
+ continue
+
+ if detect_state == _state_streaming:
+ candidate = cumulative_display + delta
+ signal_pos = -1
+ for sig in TOOL_XML_SIGNALS:
+ p = candidate.find(sig)
+ if p >= 0 and (signal_pos < 0 or p < signal_pos):
+ signal_pos = p
+ if signal_pos >= 0:
+ before_tool = candidate[:signal_pos]
+ cleaned_before = strip_tool_markup(before_tool)
+ if len(cleaned_before) > len(last_emitted):
+ last_emitted = cleaned_before
+ yield {"type": "content", "text": cleaned_before}
+ cumulative_display = candidate
+ detect_state = _state_draining
+ continue
+ cumulative_display = candidate
+ cleaned = strip_tool_markup(cumulative_display)
+ if len(cleaned) > len(last_emitted):
+ last_emitted = cleaned
+ yield {"type": "content", "text": cleaned}
+ continue
+
+ # BUFFERING: hold until we know it is not a tool call.
+ content_buffer += delta
+ stripped = content_buffer.lstrip()
+ if not stripped:
+ continue
+
+ is_match = False
+ is_prefix = False
+ for sig in TOOL_XML_SIGNALS:
+ if stripped.startswith(sig):
+ is_match = True
+ break
+ if sig.startswith(stripped):
+ is_prefix = True
+ break
+
+ if is_match:
+ detect_state = _state_draining
+ elif is_prefix and len(stripped) < _MAX_BUFFER_CHARS:
+ continue
+ else:
+ detect_state = _state_streaming
+ cumulative_display += content_buffer
+ cleaned = strip_tool_markup(cumulative_display)
+ if len(cleaned) > len(last_emitted):
+ last_emitted = cleaned
+ yield {"type": "content", "text": cleaned}
+
+ # Stream finished -- resolve what we collected.
+ if cancel_event is not None and cancel_event.is_set():
+ return
+
+ if detect_state == _state_buffering:
+ # Buffer never resolved -- tool XML or plain content.
+ stripped = content_buffer.lstrip()
+ if stripped and has_tool_signal(stripped):
+ detect_state = _state_draining
+ else:
+ # Emit the buffered content, then fall through to the
+ # STREAMING block so the intent re-prompt + safety-net
+ # parser still get a chance. Without this, a short
+ # intent emission like "Let me search." that never
+ # exits BUFFERING would silently terminate the loop.
+ if content_buffer:
+ cumulative_display += content_buffer
+ cleaned = strip_tool_markup(cumulative_display, final = True)
+ if len(cleaned) > len(last_emitted):
+ last_emitted = cleaned
+ yield {"type": "content", "text": cleaned}
+ detect_state = _state_streaming
+
+ if detect_state == _state_streaming:
+ # No tool detected mid-stream -- check for late tool XML.
+ safety_tc = None
+ if has_tool_signal(content_accum):
+ safety_tc = parse_tool_calls_from_text(
+ content_accum,
+ id_offset = next_call_id,
+ )
+ if not safety_tc:
+ # Re-prompt on plan-without-action: if the model
+ # described what it intends to do but did not call a
+ # tool, nudge it to act. Mirrors the GGUF path. Only
+ # fires on responses that signal intent / planning --
+ # direct answers like "4" or "Hello!" don't trigger.
+ _stripped = content_accum.strip()
+ if (
+ tools
+ and reprompt_count < _MAX_REPROMPTS
+ and 0 < len(_stripped) < _REPROMPT_MAX_CHARS
+ and _INTENT_SIGNAL.search(_stripped)
+ and not final_attempt_done
+ ):
+ reprompt_count += 1
+ logger.info(
+ "Safetensors re-prompt %d/%d: model planned without "
+ "calling tools (%d chars)",
+ reprompt_count,
+ _MAX_REPROMPTS,
+ len(_stripped),
+ )
+ conversation.append(
+ {"role": "assistant", "content": _stripped}
+ )
+ conversation.append(
+ {"role": "user", "content": _REPROMPT_INSTRUCTION}
+ )
+ yield {"type": "status", "text": ""}
+ continue
+
+ # Final answer: streaming already emitted content.
+ # Skip a final=True re-strip so literal ""
+ # in prose survives when no real tool call parsed.
+ yield {"type": "status", "text": ""}
+ return
+ tool_calls = safety_tc
+ content_text = strip_tool_markup(content_accum, final = True)
+ logger.info(
+ "Safetensors safety net: parsed %d tool call(s) from streamed content",
+ len(tool_calls),
+ )
+ else:
+ # DRAINING: parse tool calls out of full content.
+ tool_calls = parse_tool_calls_from_text(
+ content_accum,
+ id_offset = next_call_id,
+ )
+ if not tool_calls and auto_heal_tool_calls:
+ # Parser found nothing -- surface raw content so any
+ # literal "" prose is preserved.
+ if content_accum:
+ yield {"type": "content", "text": content_accum}
+ yield {"type": "status", "text": ""}
+ return
+ content_text = strip_tool_markup(content_accum, final = True)
+
+ if final_attempt_done:
+ # Final-answer turn re-called a tool -- stop the loop.
+ if content_text:
+ yield {"type": "content", "text": content_text}
+ yield {"type": "status", "text": ""}
+ return
+
+ assistant_msg: dict = {"role": "assistant", "content": content_text}
+ if tool_calls:
+ assistant_msg["tool_calls"] = tool_calls
+ next_call_id += len(tool_calls)
+ conversation.append(assistant_msg)
+
+ for tc in tool_calls or []:
+ func = tc.get("function", {}) or {}
+ tool_name = func.get("name", "") or ""
+ arguments = _coerce_arguments(
+ func.get("arguments", {}),
+ heal = auto_heal_tool_calls,
+ tool_name = tool_name,
+ )
+
+ yield {"type": "status", "text": _status_for_tool(tool_name, arguments)}
+ yield {
+ "type": "tool_start",
+ "tool_name": tool_name,
+ "tool_call_id": tc.get("id", ""),
+ "arguments": arguments,
+ }
+
+ tc_key = tool_name + str(arguments)
+ if allowed_tool_names and tool_name not in allowed_tool_names:
+ result = (
+ f"Error: tool '{tool_name}' is not enabled for this "
+ "request. Use one of the enabled tools or provide a "
+ "final answer."
+ )
+ else:
+ already_ran_ok = any(
+ k == tc_key and not err for k, err in tool_call_history
+ )
+ if already_ran_ok:
+ result = DUPLICATE_CALL_NUDGE
+ else:
+ eff_timeout = (
+ None if tool_call_timeout >= 9999 else tool_call_timeout
+ )
+ try:
+ result = execute_tool(
+ tool_name,
+ arguments,
+ cancel_event = cancel_event,
+ timeout = eff_timeout,
+ session_id = session_id,
+ )
+ except Exception as exc:
+ logger.exception("Tool %s raised: %s", tool_name, exc)
+ result = f"Error: tool raised an exception: {exc}"
+
+ yield {
+ "type": "tool_end",
+ "tool_name": tool_name,
+ "tool_call_id": tc.get("id", ""),
+ "result": result,
+ }
+
+ is_error = isinstance(result, str) and result.lstrip().startswith(
+ TOOL_ERROR_PREFIXES
+ )
+ tool_call_history.append((tc_key, is_error))
+
+ # Strip frontend image sentinel from the model's view.
+ # Cut at the first occurrence so leading and consecutive
+ # sentinels are both removed.
+ result_for_model = result
+ if isinstance(result_for_model, str) and "__IMAGES__:" in result_for_model:
+ result_for_model = result_for_model.split("__IMAGES__:", 1)[0].rstrip()
+ if is_error:
+ result_for_model = result_for_model + TOOL_ERROR_NUDGE
+
+ tool_msg: dict = {
+ "role": "tool",
+ "name": tool_name,
+ "content": result_for_model,
+ }
+ tool_call_id = tc.get("id")
+ if tool_call_id:
+ tool_msg["tool_call_id"] = tool_call_id
+ conversation.append(tool_msg)
+
+ # Clear the status badge before the next turn.
+ yield {"type": "status", "text": ""}
+
+ # Budget tracked against the caller-requested cap, ignoring
+ # the re-prompt slots so a stalling model still gets a final
+ # answer attempt. Tool-call iterations executed = iteration -
+ # reprompt_count.
+ _tool_iters_done = iteration + 1 - reprompt_count
+ if _tool_iters_done >= max_tool_iterations and not final_attempt_done:
+ # Budget exhausted; nudge a final plain answer.
+ final_attempt_done = True
+ conversation.append(
+ {
+ "role": "user",
+ "content": BUDGET_EXHAUSTED_NUDGE,
+ }
+ )
+
+ yield {"type": "status", "text": ""}
diff --git a/studio/backend/core/inference/tool_call_parser.py b/studio/backend/core/inference/tool_call_parser.py
new file mode 100644
index 0000000000..f505bc8ad6
--- /dev/null
+++ b/studio/backend/core/inference/tool_call_parser.py
@@ -0,0 +1,1127 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Backend-neutral tool-call parser shared by GGUF, safetensors, and MLX.
+
+Covers the emission formats so the safetensors + MLX agentic loop sees
+the same call shape llama-server normalises for GGUF:
+
+ - ``{json}`` (Qwen / Hermes)
+ - ``v`` (Qwen3.5 xml)
+ - ``<|python_tag|>NAME.call(k="v", ...)`` (Llama-3 built-in tools)
+ - ``<|python_tag|>{"name":..., "parameters":...}`` (Llama-3 custom)
+ - ``{"name":..., "parameters":...}`` (Llama-3.2 bare JSON)
+ - ``[TOOL_CALLS] [{...}, ...]`` (Mistral v0.3 / Nemo / Small)
+ - ``[TOOL_CALLS]name{json}`` (Mistral v11+ / Magistral)
+ - ``[TOOL_CALLS]name[ARGS]{json}`` (Ministral / Mistral Large 3)
+ - ``<|tool_call>call:NAME{k:<|"|>v<|"|>}`` (Gemma 4)
+ - ``<|tool▁calls▁begin|>...function<|tool▁sep|>NAME\\n``\\`\\`\\`json\\n{...}\\n\\`\\`\\`...`` (DeepSeek R1)
+ - ``<|tool▁calls▁begin|>...<|tool▁call▁begin|>NAME<|tool▁sep|>{json}<|tool▁call▁end|>...`` (DeepSeek V3 / V3.1)
+ - ``NAME\\nk\\nv...`` (GLM 4.5 / 4.6 / 4.7)
+ - ``<|tool_calls_section_begin|>...<|tool_call_begin|>functions.NAME:IDX<|tool_call_argument_begin|>{json}<|tool_call_end|>...`` (Kimi K2)
+
+Closing tags / brackets are tolerated when missing because models
+frequently truncate them mid-stream.
+"""
+
+import json
+import re
+from typing import Any
+
+
+# ── Streaming-buffer signal markers ─────────────────────────────────
+
+
+# Prefixes the safetensors / MLX streaming buffer watches for to gate
+# in-progress text. When ANY of these appear in the cumulative text,
+# the state machine switches from STREAMING to DRAINING so we don't
+# leak partial markup to the user before we can parse it.
+TOOL_XML_SIGNALS = (
+ "",
+ "",
+ "[TOOL_CALLS]",
+ "<|tool_call>",
+ # DeepSeek R1 / V3 / V3.1 (full-width pipes + lower-one-eighth-block).
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>",
+ # Alternative DeepSeek openers llama.cpp also recognises -- some
+ # checkpoints emit ASCII underscores, others a short form.
+ "<|tool_calls_begin|>",
+ "<|tool▁calls|>",
+ # Kimi K2 / Moonshot section start.
+ "<|tool_calls_section_begin|>",
+ "<|tool_call_begin|>",
+)
+
+
+# ── Strip patterns for ``strip_tool_markup`` ────────────────────────
+
+
+# _TOOL_CLOSED_PATS: closed pairs only (used during streaming so
+# in-progress XML stays buffered). _TOOL_ALL_PATS: also matches trailing
+# unclosed runs so truncated tails don't leak markup at end-of-turn.
+_TOOL_CLOSED_PATS = [
+ re.compile(r".*?", re.DOTALL),
+ re.compile(r".*?", re.DOTALL),
+ re.compile(r"<\|tool_call>.*?", re.DOTALL),
+ re.compile(r"\[TOOL_CALLS\]\s*\[.*?\](?:\s*)?", re.DOTALL),
+ # Mistral v11+ ``[TOOL_CALLS]name{json}`` (may chain), close at ``}``.
+ re.compile(r"\[TOOL_CALLS\]\s*[\w\.\-]+\s*(?:\[ARGS\])?\s*\{.*?\}", re.DOTALL),
+ # DeepSeek R1 / V3 / V3.1: full envelope ``<|tool▁calls▁begin|>...<|tool▁calls▁end|>``.
+ re.compile(r"<|tool[▁_]calls[▁_]begin|>.*?<|tool▁calls▁end|>", re.DOTALL),
+ # Kimi K2: ``<|tool_calls_section_begin|>...<|tool_calls_section_end|>``.
+ re.compile(r"<\|tool_calls_section_begin\|>.*?<\|tool_calls_section_end\|>", re.DOTALL),
+]
+_TOOL_ALL_PATS = _TOOL_CLOSED_PATS + [
+ re.compile(r".*$", re.DOTALL),
+ re.compile(r".*$", re.DOTALL),
+ re.compile(r"<\|tool_call>.*$", re.DOTALL),
+ re.compile(r"\[TOOL_CALLS\].*$", re.DOTALL),
+ re.compile(r"<\|python_tag\|>.*$", re.DOTALL),
+ # DeepSeek envelopes truncated mid-stream.
+ re.compile(r"<|tool[▁_]calls[▁_]begin|>.*$", re.DOTALL),
+ re.compile(r"<|tool▁call▁begin|>.*$", re.DOTALL),
+ # Kimi K2 envelope truncated.
+ re.compile(r"<\|tool_calls_section_begin\|>.*$", re.DOTALL),
+ re.compile(r"<\|tool_call_begin\|>.*$", re.DOTALL),
+]
+
+
+# ── Nudges + error-result prefixes ──────────────────────────────────
+
+
+TOOL_ERROR_PREFIXES = (
+ "Error",
+ "Search failed",
+ "Execution error",
+ "Blocked:",
+ "Exit code",
+ "Failed to fetch",
+ "Failed to resolve",
+ "No query provided",
+)
+
+DUPLICATE_CALL_NUDGE = (
+ "You already made this exact call. Do not repeat the same tool "
+ "call. Try a different approach: fetch a URL from previous "
+ "results, use Python to process data you already have, or "
+ "provide your final answer now."
+)
+
+TOOL_ERROR_NUDGE = (
+ "\n\nThe tool call encountered an issue. Please try a different "
+ "approach or rephrase your request."
+)
+
+BUDGET_EXHAUSTED_NUDGE = (
+ "You have used all available tool calls. Based on everything you "
+ "have found so far, provide your final answer now. Do not call "
+ "any more tools."
+)
+
+
+# ── Format-specific regexes ─────────────────────────────────────────
+
+
+# Qwen / Hermes {json}
+_TC_JSON_START_RE = re.compile(r"\s*\{")
+# Qwen3.5 / Hermes XML form v
+_TC_FUNC_START_RE = re.compile(r"\s*")
+_TC_END_TAG_RE = re.compile(r"")
+_TC_FUNC_CLOSE_RE = re.compile(r"\s*\s*$")
+_TC_PARAM_START_RE = re.compile(r"\s*")
+_TC_PARAM_CLOSE_RE = re.compile(r"\s*\s*$")
+
+# Llama-3 <|python_tag|>NAME.call(...)
+_LLAMA3_PYTHON_TAG = "<|python_tag|>"
+_LLAMA3_PY_CALL_RE = re.compile(
+ r"<\|python_tag\|>\s*([\w\.\-]+)\s*\.\s*call\s*\(",
+)
+_LLAMA3_KV_RE = re.compile(
+ r"""(\w+)\s*=\s*(?:"((?:\\.|[^"\\])*)"|(-?\d+(?:\.\d+)?)|(true|false|null))""",
+ re.VERBOSE,
+)
+
+# Mistral [TOOL_CALLS] trigger. v11+ chains multiple triggers, each
+# followed by a bare name then either ``{json}`` (Magistral) or
+# ``[ARGS]{json}`` (Ministral / Mistral Large 3).
+_MISTRAL_TRIGGER = "[TOOL_CALLS]"
+_MISTRAL_ARGS_MARKER = "[ARGS]"
+_MISTRAL_V11_NAME_RE = re.compile(r"\s*([\w\.\-]+)\s*")
+
+# DeepSeek R1 / V3 / V3.1 markers (full-width pipe U+FF5C, lower-
+# one-eighth-block U+2581). llama.cpp accepts five variants of the
+# outer block-open; we mirror its tolerance.
+_DEEPSEEK_BEGIN_RE = re.compile(
+ r"<|(?:tool▁calls▁begin|tool_calls_begin|tool calls begin|tool\\_calls\\_begin|tool▁calls)|>"
+)
+_DEEPSEEK_END = "<|tool▁calls▁end|>"
+_DEEPSEEK_CALL_BEGIN = "<|tool▁call▁begin|>"
+_DEEPSEEK_SEP = "<|tool▁sep|>"
+_DEEPSEEK_CALL_END = "<|tool▁call▁end|>"
+# R1 specifically wraps the args in a Markdown ```json ... ``` fence and
+# prefixes the call with the literal ``function`` token; V3 / V3.1 do
+# not. Detect R1 by the presence of ``function<|tool▁sep|>`` followed
+# by ``\n```json``.
+_DEEPSEEK_R1_FUNC_RE = re.compile(
+ r"(?:" + re.escape(_DEEPSEEK_CALL_BEGIN) + r")?function"
+ + re.escape(_DEEPSEEK_SEP) + r"([^\n]+)\n```json\n",
+)
+_DEEPSEEK_R1_CLOSE_RE = re.compile(
+ r"```[\s\r\n]*" + re.escape(_DEEPSEEK_CALL_END)
+)
+_DEEPSEEK_V3_FUNC_RE = re.compile(
+ r"(?:" + re.escape(_DEEPSEEK_CALL_BEGIN) + r")?([^\n<]+?)"
+ + re.escape(_DEEPSEEK_SEP),
+)
+
+# GLM 4.5 / 4.6 / 4.7 markers. Body is ``NAME\nK\n
+# V...`` per chat_template.jinja; strings are
+# raw, non-strings are JSON-encoded.
+_GLM_TC_OPEN_RE = re.compile(r"\s*([^\n<{][^\n<]*)\n")
+_GLM_TC_CLOSE = ""
+_GLM_ARG_PAIR_RE = re.compile(
+ r"(.*?)\s*(.*?)",
+ re.DOTALL,
+)
+
+# Kimi K2 / Moonshot markers (ASCII pipes, NOT full-width). The name
+# arrives as ``functions.NAME:IDX`` between ``<|tool_call_begin|>`` and
+# ``<|tool_call_argument_begin|>``. Strip the prefix and suffix to
+# recover the bare name.
+_KIMI_SECTION_BEGIN = "<|tool_calls_section_begin|>"
+_KIMI_SECTION_END = "<|tool_calls_section_end|>"
+_KIMI_CALL_BEGIN = "<|tool_call_begin|>"
+_KIMI_ARG_BEGIN = "<|tool_call_argument_begin|>"
+_KIMI_CALL_END = "<|tool_call_end|>"
+_KIMI_ID_RE = re.compile(r"^(?:functions\.)?([\w\.\-]+)(?::(\d+))?$")
+
+# Gemma 4 <|tool_call>call:NAME{...}. ``<|"|>`` wraps strings.
+_GEMMA_TC_RE = re.compile(r"<\|tool_call>\s*call\s*:\s*([\w\.\-]+)\s*\{")
+_GEMMA_STR_BEGIN = '<|"|>'
+_GEMMA_STR_END = '<|"|>'
+_GEMMA_TC_END = ""
+
+
+# ── Public API ──────────────────────────────────────────────────────
+
+
+def strip_tool_markup(text: str, *, final: bool = False) -> str:
+ """Strip tool-call markup from streamed text.
+
+ ``final=False`` only removes closed pairs so in-progress markup
+ stays buffered. ``final=True`` also removes trailing unclosed runs
+ and trims the result.
+ """
+ pats = _TOOL_ALL_PATS if final else _TOOL_CLOSED_PATS
+ for pat in pats:
+ text = pat.sub("", text)
+ return text.strip() if final else text
+
+
+def has_tool_signal(text: str) -> bool:
+ """True if ``text`` contains any known tool-call signal."""
+ return any(s in text for s in TOOL_XML_SIGNALS)
+
+
+def parse_tool_calls_from_text(content: str, *, id_offset: int = 0) -> list[dict]:
+ """Parse OpenAI-format ``tool_calls`` from model text.
+
+ Returns ``[{"id", "type", "function": {"name", "arguments"}}]``
+ where ``arguments`` is always a JSON string. Tries each known
+ emission format in turn; returns as soon as one yields calls so
+ we never double-count.
+ """
+ # DeepSeek R1 / V3 / V3.1 ``<|tool▁calls▁begin|>...<|tool▁calls▁end|>``.
+ # Run early -- the full-width markers cannot collide with any
+ # other family, and R1's code-fence body would fail Qwen's
+ # JSON-start regex anyway.
+ calls = _parse_deepseek_tool_calls(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Kimi K2 ``<|tool_calls_section_begin|>...<|tool_calls_section_end|>``.
+ # Markers cannot collide with any other family.
+ calls = _parse_kimi_tool_calls(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Qwen / Hermes {json}
+ calls = _parse_tool_call_json(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # GLM 4.5 / 4.6 / 4.7 ``NAME\nK
+ # V...``. Marker collides with
+ # Qwen's ````, but Qwen requires ``\s*{`` after the tag
+ # while GLM emits a bare name then ``\n``, so Qwen returns no calls
+ # before we get here. Running GLM AFTER Qwen also keeps Qwen
+ # behaviour unchanged on real Qwen emissions.
+ calls = _parse_glm_tool_calls(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Qwen3.5 / Hermes v
+ calls = _parse_function_xml(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Llama-3 <|python_tag|>...
+ calls = _parse_llama3_python_tag(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Mistral [TOOL_CALLS]...
+ calls = _parse_mistral_tool_calls(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Gemma 4 <|tool_call>...
+ calls = _parse_gemma_tool_calls(content, id_offset = id_offset)
+ if calls:
+ return calls
+
+ # Llama-3.2 bare JSON ``{"name":..., "parameters":...}`` (no tag).
+ # Strict: only fires when stripped content STARTS with ``{`` and
+ # parses as ``{name: str, parameters|arguments: dict}``. Keeps
+ # plain assistant prose unaffected.
+ return _parse_llama3_bare_json(content, id_offset = id_offset)
+
+
+# ── Per-format parsers ──────────────────────────────────────────────
+
+
+def _parse_tool_call_json(content: str, *, id_offset: int) -> list[dict]:
+ out: list[dict] = []
+ for m in _TC_JSON_START_RE.finditer(content):
+ brace_start = m.end() - 1
+ end = _balanced_brace_end(content, brace_start)
+ if end is None:
+ continue
+ try:
+ obj = json.loads(content[brace_start : end + 1])
+ except (json.JSONDecodeError, ValueError):
+ continue
+ name = obj.get("name", "")
+ args = obj.get("arguments", {})
+ if isinstance(args, dict):
+ args_str = json.dumps(args)
+ elif isinstance(args, str):
+ args_str = args
+ else:
+ args_str = json.dumps({"value": args})
+ if not name:
+ continue
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": name, "arguments": args_str},
+ }
+ )
+ return out
+
+
+def _parse_function_xml(content: str, *, id_offset: int) -> list[dict]:
+ out: list[dict] = []
+ func_starts = list(_TC_FUNC_START_RE.finditer(content))
+ for idx, fm in enumerate(func_starts):
+ func_name = fm.group(1)
+ body_start = fm.end()
+ next_func = (
+ func_starts[idx + 1].start() if idx + 1 < len(func_starts) else len(content)
+ )
+ end_tag = _TC_END_TAG_RE.search(content[body_start:])
+ if end_tag:
+ body_end = body_start + end_tag.start()
+ else:
+ body_end = len(content)
+ body_end = min(body_end, next_func)
+ body = _TC_FUNC_CLOSE_RE.sub("", content[body_start:body_end])
+
+ args: dict = {}
+ param_starts = list(_TC_PARAM_START_RE.finditer(body))
+ if len(param_starts) == 1:
+ pm = param_starts[0]
+ val = _TC_PARAM_CLOSE_RE.sub("", body[pm.end() :])
+ args[pm.group(1)] = val.strip()
+ else:
+ for pidx, pm in enumerate(param_starts):
+ val_start = pm.end()
+ next_param = (
+ param_starts[pidx + 1].start()
+ if pidx + 1 < len(param_starts)
+ else len(body)
+ )
+ val = _TC_PARAM_CLOSE_RE.sub("", body[val_start:next_param])
+ args[pm.group(1)] = val.strip()
+
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": func_name, "arguments": json.dumps(args)},
+ }
+ )
+ return out
+
+
+def _parse_llama3_python_tag(content: str, *, id_offset: int) -> list[dict]:
+ """Llama-3 emission shapes:
+ <|python_tag|>NAME.call(arg="v", ...) (built-in tools)
+ <|python_tag|>{"name":"NAME", "parameters":{...}} (custom tools)
+ <|python_tag|>{"name":...}; {"name":...} (multi-call, ``; `` sep)
+ Accepts both ``parameters`` and ``arguments`` keys per Llama 3.1/3.2.
+ """
+ out: list[dict] = []
+ if _LLAMA3_PYTHON_TAG not in content:
+ return out
+
+ # 1. NAME.call(...) built-in form.
+ for m in _LLAMA3_PY_CALL_RE.finditer(content):
+ name = m.group(1)
+ i = m.end()
+ depth = 1
+ in_string = False
+ esc = False
+ while i < len(content) and depth > 0:
+ ch = content[i]
+ if in_string:
+ if esc:
+ esc = False
+ elif ch == "\\":
+ esc = True
+ elif ch == '"':
+ in_string = False
+ else:
+ if ch == '"':
+ in_string = True
+ elif ch == "(":
+ depth += 1
+ elif ch == ")":
+ depth -= 1
+ if depth == 0:
+ break
+ i += 1
+ body = content[m.end() : i]
+ args: dict[str, Any] = {}
+ for kv in _LLAMA3_KV_RE.finditer(body):
+ k = kv.group(1)
+ if kv.group(2) is not None:
+ try:
+ args[k] = bytes(kv.group(2), "utf-8").decode("unicode_escape")
+ except (UnicodeDecodeError, ValueError):
+ args[k] = kv.group(2)
+ elif kv.group(3) is not None:
+ v = kv.group(3)
+ args[k] = float(v) if "." in v else int(v)
+ elif kv.group(4) is not None:
+ args[k] = {"true": True, "false": False, "null": None}[kv.group(4)]
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": name, "arguments": json.dumps(args)},
+ }
+ )
+
+ # 2. <|python_tag|>{"name":..., "parameters":...} JSON form. Use a
+ # streaming JSON decoder (raw_decode) so we can peel multiple
+ # objects out of the same emission (separated by ``; `` per
+ # Llama 3 template).
+ if not out:
+ decoder = json.JSONDecoder()
+ idx = content.find(_LLAMA3_PYTHON_TAG)
+ while idx >= 0:
+ search_from = idx + len(_LLAMA3_PYTHON_TAG)
+ # Scan all `{` from this trigger; raw_decode jumps the
+ # cursor past each parsed object, but if a `{` falls
+ # inside an already-decoded object we skip it.
+ cursor = search_from
+ while cursor < len(content):
+ brace = content.find("{", cursor)
+ if brace < 0:
+ break
+ # Stop if we've hit the next <|python_tag|>.
+ next_tag = content.find(_LLAMA3_PYTHON_TAG, search_from, brace)
+ if next_tag >= 0:
+ break
+ try:
+ obj, end_offset = decoder.raw_decode(content[brace:])
+ except (json.JSONDecodeError, ValueError):
+ cursor = brace + 1
+ continue
+ if not isinstance(obj, dict):
+ cursor = brace + end_offset
+ continue
+ name = obj.get("name") or obj.get("function") or ""
+ args = (
+ obj.get("parameters")
+ if "parameters" in obj
+ else obj.get("arguments", {})
+ )
+ if isinstance(args, dict):
+ args_str = json.dumps(args)
+ elif isinstance(args, str):
+ args_str = args
+ else:
+ args_str = json.dumps({"value": args})
+ if name:
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": name, "arguments": args_str},
+ }
+ )
+ cursor = brace + end_offset
+ idx = content.find(_LLAMA3_PYTHON_TAG, cursor)
+ return out
+
+
+def _parse_llama3_bare_json(content: str, *, id_offset: int) -> list[dict]:
+ """Llama-3.2 ``custom_tools`` shape -- bare JSON ``{"name":...,
+ "parameters":{...}}`` emitted directly, no ``<|python_tag|>``.
+
+ Strict to avoid firing on tool-message echoes:
+
+ * Content must start with ``{`` once whitespace and any leading
+ ``<|begin_of_text|>`` / ``<|eot_id|>`` etc. sentinels are stripped.
+ * Object must have ``name`` (non-empty str) plus a dict in
+ ``parameters`` or ``arguments``.
+ * Loops via ``raw_decode`` to peel multiple ``;``-separated calls.
+ """
+ out: list[dict] = []
+ stripped = content.lstrip()
+ # Strip leading Llama-3 sentinel tokens that sometimes precede the
+ # JSON (``<|eot_id|>`` from the prior turn, ``<|start_header_id|>``).
+ for sentinel in (
+ "<|begin_of_text|>",
+ "<|eot_id|>",
+ "<|start_header_id|>",
+ "<|end_header_id|>",
+ "<|eom_id|>",
+ ):
+ stripped = stripped.lstrip()
+ if stripped.startswith(sentinel):
+ stripped = stripped[len(sentinel) :]
+ stripped = stripped.lstrip()
+ if not stripped.startswith("{"):
+ return out
+
+ decoder = json.JSONDecoder()
+ cursor = 0
+ n = len(stripped)
+ while cursor < n:
+ # Skip whitespace and Llama 3 inter-call separator ``;``.
+ while cursor < n and stripped[cursor] in " \t\n\r;":
+ cursor += 1
+ if cursor >= n or stripped[cursor] != "{":
+ break
+ try:
+ obj, end_offset = decoder.raw_decode(stripped[cursor:])
+ except (json.JSONDecodeError, ValueError):
+ break
+ if not isinstance(obj, dict):
+ break
+ name = obj.get("name") or obj.get("function") or ""
+ if not isinstance(name, str) or not name:
+ break
+ if "parameters" in obj:
+ args = obj.get("parameters")
+ elif "arguments" in obj:
+ args = obj.get("arguments")
+ else:
+ break
+ if isinstance(args, dict):
+ args_str = json.dumps(args)
+ elif isinstance(args, str):
+ args_str = args
+ else:
+ break
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": name, "arguments": args_str},
+ }
+ )
+ cursor += end_offset
+ return out
+
+
+def _parse_mistral_tool_calls(content: str, *, id_offset: int) -> list[dict]:
+ """Mistral emissions covered:
+ Pre-v11 array: ``[TOOL_CALLS] [{"name":..., "arguments":...}, ...]``
+ Pre-v11 single: ``[TOOL_CALLS]{"name":..., "arguments":...}``
+ v11+ single: ``[TOOL_CALLS]name{json_args}``
+ v11+ parallel: ``[TOOL_CALLS]a{...}[TOOL_CALLS]b{...}``
+ v11+ w/ [ARGS]: ``[TOOL_CALLS]name[ARGS]{json_args}`` (Ministral / Large 3)
+ """
+ out: list[dict] = []
+ idx = content.find(_MISTRAL_TRIGGER)
+ if idx < 0:
+ return out
+
+ # Decide whether the FIRST occurrence is array / single-object
+ # (pre-v11) or v11+ bare-name. Skip whitespace, peek at next char.
+ j = idx + len(_MISTRAL_TRIGGER)
+ k = j
+ while k < len(content) and content[k] in " \t\n\r":
+ k += 1
+ if k >= len(content):
+ return out
+
+ if content[k] == "[":
+ return _parse_mistral_array(content, k, id_offset)
+
+ if content[k] == "{":
+ # Could be pre-v11 single object ``{"name": ...}`` or a JSON
+ # blob immediately following the trigger (rare). Try parsing
+ # as an object that exposes ``name``; if not, fall through to
+ # v11+ handling so we don't drop emission silently.
+ end = _balanced_brace_end(content, k)
+ if end is not None:
+ try:
+ obj = json.loads(content[k : end + 1])
+ if isinstance(obj, dict) and obj.get("name"):
+ _consume_mistral_call(content[k : end + 1], out, id_offset)
+ return out
+ except (json.JSONDecodeError, ValueError):
+ pass
+
+ # v11+ path: walk every ``[TOOL_CALLS]`` and parse ``name{json}``
+ # or ``name[ARGS]{json}`` after each trigger.
+ pos = idx
+ while pos >= 0:
+ cur = pos + len(_MISTRAL_TRIGGER)
+ nm = _MISTRAL_V11_NAME_RE.match(content, cur)
+ if not nm:
+ pos = content.find(_MISTRAL_TRIGGER, cur)
+ continue
+ name = nm.group(1)
+ after_name = nm.end()
+ # Optional ``[ARGS]`` marker.
+ if content.startswith(_MISTRAL_ARGS_MARKER, after_name):
+ after_name += len(_MISTRAL_ARGS_MARKER)
+ while after_name < len(content) and content[after_name] in " \t\n\r":
+ after_name += 1
+ if after_name >= len(content) or content[after_name] != "{":
+ pos = content.find(_MISTRAL_TRIGGER, cur)
+ continue
+ end = _balanced_brace_end(content, after_name)
+ if end is None:
+ break
+ try:
+ args = json.loads(content[after_name : end + 1])
+ except (json.JSONDecodeError, ValueError):
+ pos = content.find(_MISTRAL_TRIGGER, end + 1)
+ continue
+ if not isinstance(args, dict):
+ pos = content.find(_MISTRAL_TRIGGER, end + 1)
+ continue
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {
+ "name": name,
+ "arguments": json.dumps(args),
+ },
+ }
+ )
+ pos = content.find(_MISTRAL_TRIGGER, end + 1)
+ return out
+
+
+def _parse_mistral_array(content: str, start: int, id_offset: int) -> list[dict]:
+ """Parse pre-v11 ``[TOOL_CALLS] [{...}, ...]`` JSON array form."""
+ out: list[dict] = []
+ j = start
+ depth = 0
+ in_string = False
+ esc = False
+ while j < len(content):
+ ch = content[j]
+ if in_string:
+ if esc:
+ esc = False
+ elif ch == "\\":
+ esc = True
+ elif ch == '"':
+ in_string = False
+ else:
+ if ch == '"':
+ in_string = True
+ elif ch == "[":
+ depth += 1
+ elif ch == "]":
+ depth -= 1
+ if depth == 0:
+ break
+ j += 1
+ body = content[start : j + 1] if depth == 0 else content[start:]
+
+ try:
+ arr = json.loads(body)
+ if isinstance(arr, list):
+ for obj in arr:
+ if isinstance(obj, dict):
+ _consume_mistral_call(json.dumps(obj), out, id_offset)
+ return out
+ except (json.JSONDecodeError, ValueError):
+ pass
+
+ # Healing path: walk objects manually for unclosed array.
+ for m in re.finditer(r"\{", body):
+ end = _balanced_brace_end(body, m.start())
+ if end is None:
+ continue
+ _consume_mistral_call(body[m.start() : end + 1], out, id_offset)
+ return out
+
+
+def _consume_mistral_call(obj_text: str, out: list[dict], id_offset: int) -> None:
+ try:
+ obj = json.loads(obj_text)
+ except (json.JSONDecodeError, ValueError):
+ return
+ if not isinstance(obj, dict):
+ return
+ name = obj.get("name") or ""
+ args = obj.get("arguments") or {}
+ if isinstance(args, dict):
+ args_str = json.dumps(args)
+ elif isinstance(args, str):
+ args_str = args
+ else:
+ args_str = json.dumps({"value": args})
+ if name:
+ out.append(
+ {
+ "id": obj.get("id") or f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": name, "arguments": args_str},
+ }
+ )
+
+
+def _parse_gemma_tool_calls(content: str, *, id_offset: int) -> list[dict]:
+ """Gemma 4: <|tool_call>call:NAME{k:<|"|>v<|"|>, ...}."""
+ out: list[dict] = []
+ for m in _GEMMA_TC_RE.finditer(content):
+ name = m.group(1)
+ body_start = m.end() - 1
+ end_marker = content.find(_GEMMA_TC_END, body_start)
+ scan_end = end_marker if end_marker >= 0 else len(content)
+ end = _gemma_balanced_brace_end(content, body_start, scan_end)
+ if end is None:
+ continue
+ body = content[body_start + 1 : end]
+ try:
+ args = _gemma_parse_mapping_body(body)
+ except Exception:
+ args = {}
+ out.append(
+ {
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {"name": name, "arguments": json.dumps(args)},
+ }
+ )
+ return out
+
+
+# ── Brace-balancing helpers ─────────────────────────────────────────
+
+
+def _balanced_brace_end(text: str, brace_pos: int) -> int | None:
+ """Index of `}` matching `{` at ``brace_pos`` -- ignores `{` `}`
+ inside JSON strings. Returns None if unmatched."""
+ if brace_pos >= len(text) or text[brace_pos] != "{":
+ return None
+ depth = 0
+ in_string = False
+ esc = False
+ i = brace_pos
+ while i < len(text):
+ ch = text[i]
+ if in_string:
+ if esc:
+ esc = False
+ elif ch == "\\":
+ esc = True
+ elif ch == '"':
+ in_string = False
+ else:
+ if ch == '"':
+ in_string = True
+ elif ch == "{":
+ depth += 1
+ elif ch == "}":
+ depth -= 1
+ if depth == 0:
+ return i
+ i += 1
+ return None
+
+
+def _gemma_balanced_brace_end(text: str, brace_pos: int, hard_stop: int) -> int | None:
+ """Same as ``_balanced_brace_end`` but respects Gemma ``<|"|>``
+ string runs and matches `{`/`[` symmetrically."""
+ if brace_pos >= len(text) or text[brace_pos] != "{":
+ return None
+ depth = 0
+ i = brace_pos
+ while i < hard_stop:
+ if text.startswith(_GEMMA_STR_BEGIN, i):
+ close = text.find(_GEMMA_STR_END, i + len(_GEMMA_STR_BEGIN))
+ if close < 0:
+ return None
+ i = close + len(_GEMMA_STR_END)
+ continue
+ ch = text[i]
+ if ch == "{" or ch == "[":
+ depth += 1
+ elif ch == "}" or ch == "]":
+ depth -= 1
+ if depth == 0:
+ return i
+ i += 1
+ return None
+
+
+def _gemma_parse_value(text: str, i: int):
+ """Parse one Gemma argument value starting at ``i``. Returns
+ ``(value, next_index)``."""
+ if text.startswith(_GEMMA_STR_BEGIN, i):
+ close = text.find(_GEMMA_STR_END, i + len(_GEMMA_STR_BEGIN))
+ if close < 0:
+ return text[i + len(_GEMMA_STR_BEGIN) :], len(text)
+ return text[i + len(_GEMMA_STR_BEGIN) : close], close + len(_GEMMA_STR_END)
+ if text[i] == "{":
+ end = _gemma_balanced_brace_end(text, i, len(text))
+ if end is None:
+ return {}, len(text)
+ return _gemma_parse_mapping_body(text[i + 1 : end]), end + 1
+ if text[i] == "[":
+ j, depth = i, 0
+ while j < len(text):
+ if text.startswith(_GEMMA_STR_BEGIN, j):
+ k = text.find(_GEMMA_STR_END, j + len(_GEMMA_STR_BEGIN))
+ if k < 0:
+ j = len(text)
+ break
+ j = k + len(_GEMMA_STR_END)
+ continue
+ ch = text[j]
+ if ch == "[":
+ depth += 1
+ elif ch == "]":
+ depth -= 1
+ if depth == 0:
+ break
+ j += 1
+ body = text[i + 1 : j]
+ items: list[Any] = []
+ k = 0
+ while k < len(body):
+ if body[k] in " \t\n\r,":
+ k += 1
+ continue
+ v, k = _gemma_parse_value(body, k)
+ items.append(v)
+ return items, j + 1
+ # Primitive: number, true/false/null, or bare identifier (rare).
+ end = i
+ while (
+ end < len(text)
+ and text[end] not in ",}]"
+ and not text.startswith(_GEMMA_STR_BEGIN, end)
+ ):
+ end += 1
+ raw = text[i:end].strip()
+ if raw == "true":
+ return True, end
+ if raw == "false":
+ return False, end
+ if raw == "null":
+ return None, end
+ try:
+ return int(raw), end
+ except ValueError:
+ pass
+ try:
+ return float(raw), end
+ except ValueError:
+ pass
+ return raw, end
+
+
+def _gemma_parse_mapping_body(body: str) -> dict[str, Any]:
+ """Parse content between `{` and `}` for a Gemma argument mapping."""
+ out: dict[str, Any] = {}
+ i = 0
+ n = len(body)
+ while i < n:
+ while i < n and body[i] in " \t\n\r,":
+ i += 1
+ if i >= n:
+ break
+ if body.startswith(_GEMMA_STR_BEGIN, i):
+ close = body.find(_GEMMA_STR_END, i + len(_GEMMA_STR_BEGIN))
+ if close < 0:
+ break
+ key = body[i + len(_GEMMA_STR_BEGIN) : close]
+ i = close + len(_GEMMA_STR_END)
+ else:
+ kstart = i
+ while i < n and body[i] != ":":
+ i += 1
+ key = body[kstart:i].strip()
+ while i < n and body[i] in " \t\n\r":
+ i += 1
+ if i < n and body[i] == ":":
+ i += 1
+ while i < n and body[i] in " \t\n\r":
+ i += 1
+ if i >= n:
+ out[key] = None
+ break
+ v, i = _gemma_parse_value(body, i)
+ out[key] = v
+ return out
+
+
+# ── DeepSeek R1 / V3 / V3.1 ─────────────────────────────────────────
+
+
+def _parse_deepseek_tool_calls(content: str, *, id_offset: int) -> list[dict]:
+ """DeepSeek emissions:
+ R1: ``<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\\n``\\`\\`\\`json\\n{...}\\n\\`\\`\\`<|tool▁call▁end|>...``
+ V3.x: ``<|tool▁calls▁begin|><|tool▁call▁begin|>NAME<|tool▁sep|>{json}<|tool▁call▁end|>...``
+
+ Mirrors llama.cpp's common_chat_parse_deepseek_r1 / _v3_1 in
+ chat-parser.cpp lines 801-879 and vLLM's deepseekv3 /
+ deepseekv31 tool_parsers. Tolerates four outer-marker variants
+ that real checkpoints emit.
+ """
+ out: list[dict] = []
+ begin = _DEEPSEEK_BEGIN_RE.search(content)
+ if not begin:
+ return out
+ scan_start = begin.end()
+ end_pos = content.find(_DEEPSEEK_END, scan_start)
+ scan_end = end_pos if end_pos >= 0 else len(content)
+ body = content[scan_start:scan_end]
+
+ # R1 path first: ``function<|tool▁sep|>NAME\n```json\n{...}\n```<|tool▁call▁end|>``.
+ pos = 0
+ while pos < len(body):
+ m = _DEEPSEEK_R1_FUNC_RE.search(body, pos)
+ if not m:
+ break
+ name = m.group(1).strip()
+ json_start = m.end()
+ # Walk a balanced ``{`` even if the trailing fence is truncated.
+ if json_start >= len(body) or body[json_start] != "{":
+ pos = m.end()
+ continue
+ brace_end = _balanced_brace_end(body, json_start)
+ if brace_end is None:
+ break
+ try:
+ args = json.loads(body[json_start : brace_end + 1])
+ except (json.JSONDecodeError, ValueError):
+ pos = brace_end + 1
+ continue
+ if not isinstance(args, dict):
+ pos = brace_end + 1
+ continue
+ if name:
+ out.append({
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {
+ "name": name,
+ "arguments": json.dumps(args),
+ },
+ })
+ # Move past the closing fence + ``<|tool▁call▁end|>``.
+ close_m = _DEEPSEEK_R1_CLOSE_RE.search(body, brace_end + 1)
+ pos = close_m.end() if close_m else brace_end + 1
+ if out:
+ return out
+
+ # V3 / V3.1 path: name then bare JSON.
+ pos = 0
+ while pos < len(body):
+ m = _DEEPSEEK_V3_FUNC_RE.search(body, pos)
+ if not m:
+ break
+ name = m.group(1).strip()
+ json_start = m.end()
+ # Skip any whitespace before the JSON.
+ while json_start < len(body) and body[json_start] in " \t\n\r":
+ json_start += 1
+ if json_start >= len(body) or body[json_start] != "{":
+ pos = m.end()
+ continue
+ brace_end = _balanced_brace_end(body, json_start)
+ if brace_end is None:
+ break
+ try:
+ args = json.loads(body[json_start : brace_end + 1])
+ except (json.JSONDecodeError, ValueError):
+ pos = brace_end + 1
+ continue
+ if not isinstance(args, dict):
+ pos = brace_end + 1
+ continue
+ if name:
+ out.append({
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {
+ "name": name,
+ "arguments": json.dumps(args),
+ },
+ })
+ # Skip past optional ``<|tool▁call▁end|>``.
+ next_end = body.find(_DEEPSEEK_CALL_END, brace_end + 1)
+ pos = next_end + len(_DEEPSEEK_CALL_END) if next_end >= 0 else brace_end + 1
+ return out
+
+
+# ── GLM 4.5 / 4.6 / 4.7 ─────────────────────────────────────────────
+
+
+def _parse_glm_tool_calls(content: str, *, id_offset: int) -> list[dict]:
+ """GLM 4.x emission:
+ ``NAME\\nK1\\nV1...
+ Kn\\nVn\\n``
+
+ Strings come through raw; non-string args are JSON-encoded per the
+ template's ``{{ v | tojson(ensure_ascii=False) if v is not string
+ else v }}`` rule. Multi-call is back-to-back ``...
+ `` blocks with no outer envelope. Mirrors llama.cpp's
+ common_chat_parse_glm_4_5 (chat-parser.cpp:1040-1052) and vLLM's
+ glm4_moe_tool_parser.
+ """
+ out: list[dict] = []
+ pos = 0
+ while pos < len(content):
+ m = _GLM_TC_OPEN_RE.search(content, pos)
+ if not m:
+ break
+ name = m.group(1).strip()
+ body_start = m.end()
+ close = content.find(_GLM_TC_CLOSE, body_start)
+ body_end = close if close >= 0 else len(content)
+ body = content[body_start:body_end]
+
+ args: dict[str, Any] = {}
+ for pair in _GLM_ARG_PAIR_RE.finditer(body):
+ key = pair.group(1).strip()
+ raw_val = pair.group(2).strip()
+ # JSON literal first, then ast-literal fallback, else raw string.
+ try:
+ args[key] = json.loads(raw_val)
+ continue
+ except (json.JSONDecodeError, ValueError):
+ pass
+ try:
+ import ast as _ast
+ args[key] = _ast.literal_eval(raw_val)
+ continue
+ except (ValueError, SyntaxError):
+ pass
+ args[key] = raw_val
+
+ if name:
+ out.append({
+ "id": f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {
+ "name": name,
+ "arguments": json.dumps(args),
+ },
+ })
+ pos = close + len(_GLM_TC_CLOSE) if close >= 0 else len(content)
+ return out
+
+
+# ── Kimi K2 / Moonshot ──────────────────────────────────────────────
+
+
+def _parse_kimi_tool_calls(content: str, *, id_offset: int) -> list[dict]:
+ """Kimi K2 emission:
+ ``<|tool_calls_section_begin|>
+ <|tool_call_begin|>functions.NAME:IDX<|tool_call_argument_begin|>{json}<|tool_call_end|>
+ ...
+ <|tool_calls_section_end|>``
+
+ Name arrives as ``functions.NAME:IDX``. Strip the ``functions.``
+ prefix and ``:N`` suffix to recover the bare name. The full id
+ string is preserved as ``tool_calls[i].id`` so the conversation
+ replay round-trips the exact form the model emitted (vLLM and
+ SGLang both do this).
+ """
+ out: list[dict] = []
+ section_start = content.find(_KIMI_SECTION_BEGIN)
+ if section_start < 0:
+ return out
+ scan_start = section_start + len(_KIMI_SECTION_BEGIN)
+ section_end = content.find(_KIMI_SECTION_END, scan_start)
+ scan_end = section_end if section_end >= 0 else len(content)
+ body = content[scan_start:scan_end]
+
+ pos = 0
+ while pos < len(body):
+ call_start = body.find(_KIMI_CALL_BEGIN, pos)
+ if call_start < 0:
+ break
+ id_start = call_start + len(_KIMI_CALL_BEGIN)
+ arg_begin = body.find(_KIMI_ARG_BEGIN, id_start)
+ if arg_begin < 0:
+ break
+ full_id = body[id_start:arg_begin].strip()
+ m = _KIMI_ID_RE.match(full_id)
+ if m:
+ name = m.group(1).split(".")[-1]
+ else:
+ name = full_id.split(":")[0].split(".")[-1]
+ json_start = arg_begin + len(_KIMI_ARG_BEGIN)
+ # Walk a balanced brace so streaming truncation that drops the
+ # trailing ``<|tool_call_end|>`` still surfaces a call.
+ # Skip whitespace before the ``{``.
+ while json_start < len(body) and body[json_start] in " \t\n\r":
+ json_start += 1
+ if json_start >= len(body) or body[json_start] != "{":
+ pos = arg_begin + len(_KIMI_ARG_BEGIN)
+ continue
+ brace_end = _balanced_brace_end(body, json_start)
+ if brace_end is None:
+ break
+ try:
+ args = json.loads(body[json_start : brace_end + 1])
+ except (json.JSONDecodeError, ValueError):
+ pos = brace_end + 1
+ continue
+ if not isinstance(args, dict):
+ pos = brace_end + 1
+ continue
+ if name:
+ out.append({
+ "id": full_id or f"call_{id_offset + len(out)}",
+ "type": "function",
+ "function": {
+ "name": name,
+ "arguments": json.dumps(args),
+ },
+ })
+ call_end = body.find(_KIMI_CALL_END, brace_end + 1)
+ pos = call_end + len(_KIMI_CALL_END) if call_end >= 0 else brace_end + 1
+ return out
diff --git a/studio/backend/core/inference/worker.py b/studio/backend/core/inference/worker.py
index 085a1ab899..20a7d2d16c 100644
--- a/studio/backend/core/inference/worker.py
+++ b/studio/backend/core/inference/worker.py
@@ -346,6 +346,26 @@ def _handle_load(backend, config: dict, resp_queue: Any) -> None:
"audio_type": getattr(mc, "audio_type", None),
"has_audio_input": getattr(mc, "has_audio_input", False),
}
+ # Forward chat_template_info so the parent can classify
+ # capabilities without re-entering the subprocess.
+ try:
+ _bm = getattr(backend, "models", {}) or {}
+ _entry = (
+ _bm.get(mc.identifier)
+ or _bm.get(getattr(backend, "active_model_name", None))
+ or {}
+ )
+ _tpl_info = _entry.get("chat_template_info")
+ if isinstance(_tpl_info, dict):
+ model_info["chat_template_info"] = {
+ "has_template": bool(_tpl_info.get("has_template", False)),
+ "template": _tpl_info.get("template"),
+ "format_type": _tpl_info.get("format_type", "generic"),
+ "template_name": _tpl_info.get("template_name"),
+ "special_tokens": _tpl_info.get("special_tokens", {}) or {},
+ }
+ except Exception as _tpl_exc:
+ logger.warning("chat_template_info forward failed: %s", _tpl_exc)
_send_response(
resp_queue,
{
@@ -416,6 +436,18 @@ def _handle_generate(
"cancel_event": cancel_event,
}
+ # Optional template/tool plumbing: only forward keys that are
+ # actually present so the backend signature can evolve without
+ # breaking older command payloads.
+ for opt_key in (
+ "tools",
+ "enable_thinking",
+ "reasoning_effort",
+ "preserve_thinking",
+ ):
+ if opt_key in cmd:
+ gen_kwargs[opt_key] = cmd[opt_key]
+
# Choose generation path
use_adapter = cmd.get("use_adapter")
if use_adapter is not None:
diff --git a/studio/backend/core/tool_healing.py b/studio/backend/core/tool_healing.py
new file mode 100644
index 0000000000..bb61965764
--- /dev/null
+++ b/studio/backend/core/tool_healing.py
@@ -0,0 +1,173 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Tool-call XML parsing and stripping helpers.
+
+Extracted verbatim from studio/backend/core/inference/llama_cpp.py so that
+external inference servers (llama-server wrappers, llama-swap, custom
+shims) can reuse the same logic without importing the inference
+orchestrator, structlog, httpx, or the rest of the studio backend.
+
+The regexes and function bodies are byte-for-byte identical to the
+original inline implementation in llama_cpp.py. Any change made here must
+preserve that equivalence; tests/python/test_tool_healing_extraction_is_exact.py
+verifies it with AST comparison.
+"""
+
+import json
+import re
+
+# Pre-compiled patterns for tool XML stripping.
+_TOOL_CLOSED_PATS = [
+ re.compile(r".*?", re.DOTALL),
+ re.compile(r".*?", re.DOTALL),
+]
+_TOOL_ALL_PATS = _TOOL_CLOSED_PATS + [
+ re.compile(r".*$", re.DOTALL),
+ re.compile(r".*$", re.DOTALL),
+]
+
+# Pre-compiled patterns for tool-call XML parsing.
+_TC_JSON_START_RE = re.compile(r"\s*\{")
+_TC_FUNC_START_RE = re.compile(r"\s*")
+_TC_END_TAG_RE = re.compile(r"")
+_TC_FUNC_CLOSE_RE = re.compile(r"\s*\s*$")
+_TC_PARAM_START_RE = re.compile(r"\s*")
+_TC_PARAM_CLOSE_RE = re.compile(r"\s*\s*$")
+
+
+def parse_tool_calls_from_text(content: str) -> list[dict]:
+ """
+ Parse tool calls from XML markup in content text.
+
+ Handles formats like:
+ {"name":"web_search","arguments":{"query":"..."}}
+ ...
+ Closing tags (, , ) are all optional
+ since models frequently omit them.
+ """
+ tool_calls = []
+
+ # Pattern 1: JSON inside tags.
+ # Use balanced-brace extraction that skips braces inside JSON strings.
+ for m in _TC_JSON_START_RE.finditer(content):
+ brace_start = m.end() - 1 # position of the opening {
+ depth, i = 0, brace_start
+ in_string = False
+ while i < len(content):
+ ch = content[i]
+ if in_string:
+ if ch == "\\" and i + 1 < len(content):
+ i += 2 # skip escaped character
+ continue
+ if ch == '"':
+ in_string = False
+ elif ch == '"':
+ in_string = True
+ elif ch == "{":
+ depth += 1
+ elif ch == "}":
+ depth -= 1
+ if depth == 0:
+ break
+ i += 1
+ if depth == 0:
+ json_str = content[brace_start : i + 1]
+ try:
+ obj = json.loads(json_str)
+ tc = {
+ "id": f"call_{len(tool_calls)}",
+ "type": "function",
+ "function": {
+ "name": obj.get("name", ""),
+ "arguments": obj.get("arguments", {}),
+ },
+ }
+ if isinstance(tc["function"]["arguments"], dict):
+ tc["function"]["arguments"] = json.dumps(
+ tc["function"]["arguments"]
+ )
+ tool_calls.append(tc)
+ except (json.JSONDecodeError, ValueError):
+ pass
+
+ # Pattern 2: XML-style value
+ # All closing tags optional -- models frequently omit ,
+ # , and/or .
+ if not tool_calls:
+ # Step 1: Find all positions and extract their bodies.
+ # Body boundary: use only or next as a boundary because
+ # code parameter values can contain that literal string.
+ # After extracting, we trim a trailing if present.
+ func_starts = list(_TC_FUNC_START_RE.finditer(content))
+ for idx, fm in enumerate(func_starts):
+ func_name = fm.group(1)
+ body_start = fm.end()
+ # Hard boundaries: next
+ next_func = (
+ func_starts[idx + 1].start()
+ if idx + 1 < len(func_starts)
+ else len(content)
+ )
+ end_tag = _TC_END_TAG_RE.search(content[body_start:])
+ if end_tag:
+ body_end = body_start + end_tag.start()
+ else:
+ body_end = len(content)
+ body_end = min(body_end, next_func)
+ body = content[body_start:body_end]
+ # Trim trailing if present (it's the real closing tag)
+ body = _TC_FUNC_CLOSE_RE.sub("", body)
+
+ # Step 2: Extract parameters from body.
+ # For single-parameter functions (the common case: code, command,
+ # query), use body end as the only boundary to avoid false matches
+ # on inside code strings.
+ arguments = {}
+ param_starts = list(_TC_PARAM_START_RE.finditer(body))
+ if len(param_starts) == 1:
+ # Single parameter: value is everything from after the tag
+ # to end of body, trimming any trailing .
+ pm = param_starts[0]
+ val = body[pm.end() :]
+ val = _TC_PARAM_CLOSE_RE.sub("", val)
+ arguments[pm.group(1)] = val.strip()
+ else:
+ for pidx, pm in enumerate(param_starts):
+ param_name = pm.group(1)
+ val_start = pm.end()
+ # Value ends at next if present
+ val = _TC_PARAM_CLOSE_RE.sub("", val)
+ arguments[param_name] = val.strip()
+
+ tc = {
+ "id": f"call_{len(tool_calls)}",
+ "type": "function",
+ "function": {
+ "name": func_name,
+ "arguments": json.dumps(arguments),
+ },
+ }
+ tool_calls.append(tc)
+ return tool_calls
+
+
+def strip_tool_call_markup(text: str, *, final: bool = False) -> str:
+ """Strip tool-call XML markup from text.
+
+ When ``final`` is False, only fully closed tool-call blocks are removed.
+ When ``final`` is True, trailing incomplete tool-call blocks are removed
+ too, and the result is stripped of surrounding whitespace.
+ """
+ patterns = _TOOL_ALL_PATS if final else _TOOL_CLOSED_PATS
+ for pat in patterns:
+ text = pat.sub("", text)
+ return text.strip() if final else text
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 6d05be2310..5436d51f0d 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -117,6 +117,7 @@ def _friendly_error(exc: Exception) -> str:
LlamaCppBackend,
_DEFAULT_MAX_TOKENS_FLOOR,
_DEFAULT_T_MAX_PREDICT_MS,
+ _hf_offline_if_dns_dead,
detect_reasoning_flags,
)
from core.inference.llama_server_args import (
@@ -142,6 +143,7 @@ def _friendly_error(exc: Exception) -> str:
LlamaCppBackend,
_DEFAULT_MAX_TOKENS_FLOOR,
_DEFAULT_T_MAX_PREDICT_MS,
+ _hf_offline_if_dns_dead,
detect_reasoning_flags,
)
from core.inference.llama_server_args import (
@@ -231,6 +233,57 @@ def _friendly_error(exc: Exception) -> str:
studio_router = APIRouter()
+def _detect_safetensors_features(backend, chat_template: Optional[str]) -> dict:
+ """Classify reasoning/tool capabilities via the GGUF classifier so
+ flags match across backends. gpt-oss is overridden because Harmony
+ routes reasoning and tools through tokenizer channels, not template
+ markup."""
+ model_id = getattr(backend, "active_model_name", None)
+ flags = (
+ detect_reasoning_flags(
+ chat_template,
+ model_identifier = model_id,
+ log_source = "safetensors",
+ )
+ if chat_template
+ else {
+ "supports_reasoning": False,
+ "reasoning_style": "enable_thinking",
+ "reasoning_always_on": False,
+ "supports_preserve_thinking": False,
+ "supports_tools": False,
+ }
+ )
+ # Our safetensors loop only parses {json}
+ # and .... Llama uses <|python_tag|>,
+ # Mistral uses [TOOL_CALLS]; advertising tools for those would
+ # enable a pill the parser cannot honour. GGUF is unaffected --
+ # llama-server normalises every format into structured deltas.
+ if (
+ flags.get("supports_tools")
+ and chat_template
+ and "" not in chat_template
+ and " XML this loop parses).
+ try:
+ if hasattr(backend, "_is_gpt_oss_model") and backend._is_gpt_oss_model():
+ flags["supports_reasoning"] = True
+ flags["reasoning_style"] = "reasoning_effort"
+ flags["supports_tools"] = False
+ except Exception:
+ logger.debug("gpt_oss_check_failed", exc_info = True)
+ return flags
+
+
def _effective_enable_tools(payload) -> Optional[bool]:
"""Resolve `payload.enable_tools` against the process-level tool policy.
@@ -581,6 +634,7 @@ async def load_model(
reasoning_style = llama_backend.reasoning_style,
reasoning_always_on = llama_backend.reasoning_always_on,
supports_preserve_thinking = llama_backend.supports_preserve_thinking,
+ supports_tools = llama_backend.supports_tools,
chat_template = llama_backend.chat_template,
speculative_type = llama_backend.speculative_type,
)
@@ -602,21 +656,10 @@ async def load_model(
logger.warning(
f"Could not retrieve chat template for {backend.active_model_name}: {e}"
)
- # Non-GGUF: only advertise reasoning for gpt-oss Harmony,
- # which emits reasoning via channels at the tokenizer level.
- # Template-level chat_template_kwargs (enable_thinking /
- # preserve_thinking / tools) are not yet forwarded through
- # the transformers generation path, so avoid advertising
- # controls the server cannot honour outside GGUF.
- _sf_supports_reasoning = False
- _sf_reasoning_style = "enable_thinking"
- if hasattr(backend, "_is_gpt_oss_model"):
- try:
- if backend._is_gpt_oss_model():
- _sf_supports_reasoning = True
- _sf_reasoning_style = "reasoning_effort"
- except Exception:
- pass
+ # Classify via the same path as GGUF.
+ _sf_flags = _detect_safetensors_features(backend, _chat_template)
+ _sf_supports_reasoning = _sf_flags["supports_reasoning"]
+ _sf_reasoning_style = _sf_flags["reasoning_style"]
return LoadResponse(
status = "already_loaded",
model = model_log_label
@@ -637,19 +680,21 @@ async def load_model(
),
supports_reasoning = _sf_supports_reasoning,
reasoning_style = _sf_reasoning_style,
- reasoning_always_on = False,
- supports_preserve_thinking = False,
- supports_tools = False,
+ reasoning_always_on = _sf_flags["reasoning_always_on"],
+ supports_preserve_thinking = _sf_flags["supports_preserve_thinking"],
+ supports_tools = _sf_flags["supports_tools"],
chat_template = _chat_template,
)
- # Create config using clean factory method
- # is_lora is auto-detected from adapter_config.json on disk/HF
- config = ModelConfig.from_identifier(
- model_id = model_identifier,
- hf_token = request.hf_token,
- gguf_variant = request.gguf_variant,
- )
+ # is_lora auto-detected from adapter_config.json on disk/HF.
+ # DNS-probe wrap so offline loads skip 30-60s of soft-failed
+ # network checks before the worker starts.
+ with _hf_offline_if_dns_dead():
+ config = ModelConfig.from_identifier(
+ model_id = model_identifier,
+ hf_token = request.hf_token,
+ gguf_variant = request.gguf_variant,
+ )
if not config:
raise HTTPException(
@@ -822,7 +867,7 @@ async def load_model(
display_name = model_log_label
if native_grant_backed
else config.display_name,
- is_vision = config.is_vision,
+ is_vision = llama_backend.is_vision,
is_lora = False,
is_gguf = True,
is_audio = _gguf_is_audio,
@@ -964,19 +1009,8 @@ async def load_model(
except Exception:
pass
- # Non-GGUF: gpt-oss Harmony surfaces reasoning via tokenizer-level
- # channels; other safetensors reasoning/tools/preserve-thinking
- # knobs are not forwarded to tokenizer.apply_chat_template yet, so
- # we only advertise support for the Harmony case here.
- _sf_supports_reasoning = False
- _sf_reasoning_style = "enable_thinking"
- if hasattr(backend, "_is_gpt_oss_model"):
- try:
- if backend._is_gpt_oss_model():
- _sf_supports_reasoning = True
- _sf_reasoning_style = "reasoning_effort"
- except Exception:
- pass
+ # Classify reasoning/tool flags via the GGUF sniffer.
+ _sf_flags = _detect_safetensors_features(backend, _chat_template)
return LoadResponse(
status = "loaded",
@@ -994,11 +1028,11 @@ async def load_model(
requires_trust_remote_code = bool(
inference_config.get("trust_remote_code", False)
),
- supports_reasoning = _sf_supports_reasoning,
- reasoning_style = _sf_reasoning_style,
- reasoning_always_on = False,
- supports_preserve_thinking = False,
- supports_tools = False,
+ supports_reasoning = _sf_flags["supports_reasoning"],
+ reasoning_style = _sf_flags["reasoning_style"],
+ reasoning_always_on = _sf_flags["reasoning_always_on"],
+ supports_preserve_thinking = _sf_flags["supports_preserve_thinking"],
+ supports_tools = _sf_flags["supports_tools"],
chat_template = _chat_template,
)
@@ -1282,6 +1316,24 @@ async def get_status(
try:
llama_backend = get_llama_cpp_backend()
+ # MTP probe + freshness check (both cached). Drive the UI banner.
+ try:
+ _bin = type(llama_backend)._find_llama_server_binary()
+ _caps = type(llama_backend).probe_server_capabilities(_bin)
+ _supports_mtp = bool(_caps.get("supports_mtp", False))
+ except Exception:
+ _bin = None
+ _supports_mtp = True # fail open
+ try:
+ from utils.llama_cpp_freshness import check_prebuilt_freshness
+
+ _freshness = check_prebuilt_freshness(_bin)
+ except Exception:
+ _freshness = {}
+ _stale = bool(_freshness.get("stale"))
+ _installed_tag = _freshness.get("installed_tag")
+ _latest_tag = _freshness.get("latest_tag")
+
# If a GGUF model is loaded via llama-server, report that
if llama_backend.is_loaded:
_model_id = llama_backend.model_identifier
@@ -1324,6 +1376,10 @@ async def get_status(
cache_type_kv = llama_backend.cache_type_kv,
chat_template_override = llama_backend.chat_template_override,
speculative_type = llama_backend.speculative_type,
+ llama_cpp_supports_mtp = _supports_mtp,
+ llama_cpp_prebuilt_stale = _stale,
+ llama_cpp_installed_tag = _installed_tag,
+ llama_cpp_latest_tag = _latest_tag,
)
# Otherwise, report Unsloth backend status
@@ -1347,18 +1403,8 @@ async def get_status(
else None
)
- # Non-GGUF: only gpt-oss Harmony is wired through the transformers
- # generation path. Other template-level reasoning / tool kwargs
- # are not yet forwarded, so we do not advertise them here.
- supports_reasoning = False
- reasoning_style = "enable_thinking"
- if backend.active_model_name and hasattr(backend, "_is_gpt_oss_model"):
- try:
- if backend._is_gpt_oss_model():
- supports_reasoning = True
- reasoning_style = "reasoning_effort"
- except Exception:
- pass
+ # Non-GGUF: classify from the loaded template.
+ _sf_flags = _detect_safetensors_features(backend, chat_template)
inference_config = (
load_inference_config(backend.active_model_name)
if backend.active_model_name
@@ -1378,12 +1424,16 @@ async def get_status(
requires_trust_remote_code = bool(
(inference_config or {}).get("trust_remote_code", False)
),
- supports_reasoning = supports_reasoning,
- reasoning_style = reasoning_style,
- reasoning_always_on = False,
- supports_preserve_thinking = False,
- supports_tools = False,
+ supports_reasoning = _sf_flags["supports_reasoning"],
+ reasoning_style = _sf_flags["reasoning_style"],
+ reasoning_always_on = _sf_flags["reasoning_always_on"],
+ supports_preserve_thinking = _sf_flags["supports_preserve_thinking"],
+ supports_tools = _sf_flags["supports_tools"],
chat_template = chat_template,
+ llama_cpp_supports_mtp = _supports_mtp,
+ llama_cpp_prebuilt_stale = _stale,
+ llama_cpp_installed_tag = _installed_tag,
+ llama_cpp_latest_tag = _latest_tag,
)
except Exception as e:
@@ -2739,6 +2789,300 @@ async def gguf_stream_chunks():
except Exception as e:
raise HTTPException(status_code = 400, detail = f"Failed to decode image: {e}")
+ # Classify capability flags from the loaded template.
+ _sf_model_info = backend.models.get(backend.active_model_name, {})
+ _sf_tpl = (_sf_model_info.get("chat_template_info") or {}).get("template")
+ _sf_features = _detect_safetensors_features(backend, _sf_tpl)
+
+ cancel_event = threading.Event()
+ completion_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
+ created = int(time.time())
+
+ # ── Safetensors tool-calling path ─────────────────────────
+ # Mirrors the GGUF agentic loop's event shape. Disabled for
+ # vision turns (untested overlap with image render slot) and
+ # for gpt-oss (Harmony uses dedicated channels, not
+ # XML -- gpt-oss tools still work via the GGUF path).
+ _sf_is_gptoss = False
+ try:
+ _sf_is_gptoss = bool(
+ hasattr(backend, "_is_gpt_oss_model") and backend._is_gpt_oss_model()
+ )
+ except Exception:
+ _sf_is_gptoss = False
+
+ _sf_tool_budget = (
+ payload.max_tool_calls_per_message
+ if payload.max_tool_calls_per_message is not None
+ else 25
+ )
+
+ _sf_use_tools = (
+ _effective_enable_tools(payload)
+ and _sf_features.get("supports_tools", False)
+ and image is None
+ and not _sf_is_gptoss
+ and _sf_tool_budget > 0
+ )
+
+ if _sf_use_tools:
+ from core.inference.tools import ALL_TOOLS
+
+ if payload.enabled_tools is not None:
+ _sf_tools_to_use = [
+ t for t in ALL_TOOLS if t["function"]["name"] in payload.enabled_tools
+ ]
+ else:
+ _sf_tools_to_use = ALL_TOOLS
+
+ _sf_tool_names = {t["function"]["name"] for t in _sf_tools_to_use}
+ _sf_has_web = "web_search" in _sf_tool_names
+ _sf_has_code = "python" in _sf_tool_names or "terminal" in _sf_tool_names
+
+ _sf_date_line = f"The current date is {_date.today().isoformat()}."
+ _sf_model_size_b = _extract_model_size_b(model_name)
+ _sf_is_small_model = _sf_model_size_b is not None and _sf_model_size_b < 9
+
+ if _sf_is_small_model:
+ _sf_web_tips = "Do not repeat the same search query."
+ else:
+ _sf_web_tips = (
+ "When you search and find a relevant URL in the results, "
+ "fetch its full content by calling web_search with the url parameter. "
+ "Do not repeat the same search query. If a search returns "
+ "no useful results, try rephrasing or fetching a result URL directly."
+ )
+ _sf_code_tips = (
+ "Use code execution for math, calculations, data processing, "
+ "or to parse and analyze information from tool results."
+ )
+
+ if _sf_has_web and _sf_has_code:
+ _sf_nudge = (
+ _sf_date_line + " "
+ "You have access to tools. When appropriate, prefer using "
+ "tools rather than answering from memory. "
+ + _sf_web_tips
+ + " "
+ + _sf_code_tips
+ )
+ elif _sf_has_code:
+ _sf_nudge = (
+ _sf_date_line + " "
+ "You have access to tools. When appropriate, prefer using "
+ "code execution rather than answering from memory. " + _sf_code_tips
+ )
+ elif _sf_has_web:
+ _sf_nudge = (
+ _sf_date_line + " "
+ "You have access to tools. When appropriate, prefer using "
+ "web search for up-to-date or uncertain factual "
+ "information rather than answering from memory. " + _sf_web_tips
+ )
+ else:
+ _sf_nudge = ""
+
+ _sf_system_prompt = system_prompt
+ if _sf_nudge:
+ _sf_nudge += _TOOL_ACTION_NUDGE
+ if _sf_system_prompt:
+ _sf_system_prompt = _sf_system_prompt.rstrip() + "\n\n" + _sf_nudge
+ else:
+ _sf_system_prompt = _sf_nudge
+
+ # Strip stale tool-call XML from prior assistant turns.
+ _sf_chat_messages = []
+ for _msg in chat_messages:
+ if _msg.get("role") == "assistant" and isinstance(_msg.get("content"), str):
+ _sf_chat_messages.append(
+ {
+ **_msg,
+ "content": _TOOL_XML_RE.sub("", _msg["content"]).strip(),
+ }
+ )
+ else:
+ _sf_chat_messages.append(_msg)
+
+ def sf_generate_with_tools():
+ return backend.generate_chat_completion_with_tools(
+ messages = _sf_chat_messages,
+ tools = _sf_tools_to_use,
+ system_prompt = _sf_system_prompt or "",
+ temperature = payload.temperature,
+ top_p = payload.top_p,
+ top_k = payload.top_k,
+ min_p = payload.min_p,
+ max_tokens = payload.max_tokens,
+ repetition_penalty = payload.repetition_penalty,
+ cancel_event = cancel_event,
+ enable_thinking = payload.enable_thinking,
+ reasoning_effort = payload.reasoning_effort,
+ preserve_thinking = payload.preserve_thinking,
+ auto_heal_tool_calls = payload.auto_heal_tool_calls
+ if payload.auto_heal_tool_calls is not None
+ else True,
+ max_tool_iterations = _sf_tool_budget,
+ tool_call_timeout = payload.tool_call_timeout
+ if payload.tool_call_timeout is not None
+ else 300,
+ session_id = payload.session_id,
+ use_adapter = payload.use_adapter,
+ )
+
+ _sf_tool_sentinel = object()
+ _sf_cancel_keys = (payload.cancel_id, payload.session_id, completion_id)
+ _sf_tracker = _TrackedCancel(cancel_event, *_sf_cancel_keys)
+ _sf_tracker.__enter__()
+
+ async def sf_tool_stream():
+ try:
+ first_chunk = ChatCompletionChunk(
+ id = completion_id,
+ created = created,
+ model = model_name,
+ choices = [
+ ChunkChoice(
+ delta = ChoiceDelta(role = "assistant"),
+ finish_reason = None,
+ )
+ ],
+ )
+ yield f"data: {first_chunk.model_dump_json(exclude_none = True)}\n\n"
+
+ gen = sf_generate_with_tools()
+ prev_text = ""
+ while True:
+ if cancel_event.is_set():
+ backend.reset_generation_state()
+ break
+ if await request.is_disconnected():
+ cancel_event.set()
+ backend.reset_generation_state()
+ return
+
+ event = await asyncio.to_thread(next, gen, _sf_tool_sentinel)
+ if event is _sf_tool_sentinel:
+ break
+
+ if event["type"] == "status":
+ if not event["text"]:
+ prev_text = ""
+ status_data = json.dumps(
+ {
+ "type": "tool_status",
+ "content": event["text"],
+ }
+ )
+ yield f"data: {status_data}\n\n"
+ continue
+
+ if event["type"] in ("tool_start", "tool_end"):
+ if event["type"] == "tool_start":
+ prev_text = ""
+ yield f"data: {json.dumps(event)}\n\n"
+ continue
+
+ # Diff cumulative cleaned text against last snapshot.
+ raw_cumulative = event.get("text", "")
+ clean_cumulative = _TOOL_XML_RE.sub("", raw_cumulative)
+ new_text = clean_cumulative[len(prev_text) :]
+ prev_text = clean_cumulative
+ if not new_text:
+ continue
+ chunk = ChatCompletionChunk(
+ id = completion_id,
+ created = created,
+ model = model_name,
+ choices = [
+ ChunkChoice(
+ delta = ChoiceDelta(content = new_text),
+ finish_reason = None,
+ )
+ ],
+ )
+ yield f"data: {chunk.model_dump_json(exclude_none = True)}\n\n"
+
+ final_chunk = ChatCompletionChunk(
+ id = completion_id,
+ created = created,
+ model = model_name,
+ choices = [
+ ChunkChoice(
+ delta = ChoiceDelta(),
+ finish_reason = "stop",
+ )
+ ],
+ )
+ yield f"data: {final_chunk.model_dump_json(exclude_none = True)}\n\n"
+ yield "data: [DONE]\n\n"
+
+ except asyncio.CancelledError:
+ cancel_event.set()
+ backend.reset_generation_state()
+ raise
+ except Exception:
+ backend.reset_generation_state()
+ # Generic wire message; full trace stays in the log
+ # (CWE-209: transformers/torch errors may leak paths).
+ logger.exception("safetensors tool stream error")
+ error_chunk = {
+ "error": {
+ "message": "An internal error occurred.",
+ "type": "server_error",
+ },
+ }
+ yield f"data: {json.dumps(error_chunk)}\n\n"
+ finally:
+ _sf_tracker.__exit__(None, None, None)
+
+ if payload.stream:
+ return StreamingResponse(
+ sf_tool_stream(),
+ media_type = "text/event-stream",
+ headers = {
+ "Cache-Control": "no-cache",
+ "Connection": "keep-alive",
+ "X-Accel-Buffering": "no",
+ },
+ )
+
+ # Non-streaming JSON: drain the loop, build one ChatCompletion.
+ try:
+
+ def _drain_to_text():
+ full_text = ""
+ gen = sf_generate_with_tools()
+ for event in gen:
+ if cancel_event.is_set():
+ break
+ if event.get("type") == "content":
+ full_text = _TOOL_XML_RE.sub("", event.get("text", ""))
+ return full_text
+
+ content_text = await asyncio.to_thread(_drain_to_text)
+ response = ChatCompletion(
+ id = completion_id,
+ created = created,
+ model = model_name,
+ choices = [
+ CompletionChoice(
+ message = CompletionMessage(content = content_text),
+ finish_reason = "stop",
+ )
+ ],
+ )
+ return JSONResponse(content = response.model_dump())
+ except Exception:
+ backend.reset_generation_state()
+ # CWE-209: generic detail; full trace in log.
+ logger.exception("safetensors tool completion error")
+ raise HTTPException(
+ status_code = 500,
+ detail = "An internal error occurred.",
+ )
+ finally:
+ _sf_tracker.__exit__(None, None, None)
+
# Shared generation kwargs
gen_kwargs = dict(
messages = chat_messages,
@@ -2751,9 +3095,14 @@ async def gguf_stream_chunks():
max_new_tokens = payload.max_tokens or 2048,
repetition_penalty = payload.repetition_penalty,
)
-
- # Choose generation path (adapter-controlled or standard)
- cancel_event = threading.Event()
+ # Forward reasoning kwargs; the worker/template wrapper peels off
+ # any the template doesn't accept.
+ if payload.enable_thinking is not None:
+ gen_kwargs["enable_thinking"] = payload.enable_thinking
+ if payload.reasoning_effort is not None:
+ gen_kwargs["reasoning_effort"] = payload.reasoning_effort
+ if payload.preserve_thinking is not None:
+ gen_kwargs["preserve_thinking"] = payload.preserve_thinking
if payload.use_adapter is not None:
@@ -2770,9 +3119,6 @@ def generate():
cancel_event = cancel_event, **gen_kwargs
)
- completion_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
- created = int(time.time())
-
# ── Streaming response ────────────────────────────────────────
if payload.stream:
_cancel_keys = (payload.cancel_id, payload.session_id, completion_id)
@@ -3545,6 +3891,17 @@ async def _responses_stream(
),
)
+ # Direct pass-through bypasses the openai_chat_completions image gate.
+ if not llama_backend.is_vision and any(
+ isinstance(m.content, list)
+ and any(isinstance(p, ImageContentPart) for p in m.content)
+ for m in messages
+ ):
+ raise HTTPException(
+ status_code = 400,
+ detail = "Image provided but current GGUF model does not support vision.",
+ )
+
body = _build_openai_passthrough_body(
chat_req, backend_ctx = llama_backend.context_length
)
diff --git a/studio/backend/tests/test_mlx_inference_backend.py b/studio/backend/tests/test_mlx_inference_backend.py
index ce447bdd1f..38d3abb80c 100644
--- a/studio/backend/tests/test_mlx_inference_backend.py
+++ b/studio/backend/tests/test_mlx_inference_backend.py
@@ -158,3 +158,92 @@ def _native_vlm_load(*_args, **_kwargs):
assert backend._is_vlm is True
assert isinstance(backend._processor, _DummyProcessor)
assert isinstance(backend._tokenizer, _DummyTokenizer)
+
+
+# Regression: MLXInferenceBackend.generate_chat_response must accept the
+# four template kwargs (tools / enable_thinking / reasoning_effort /
+# preserve_thinking) so the route layer can forward what the user
+# toggled in the UI. The previous signature raised
+# "got an unexpected keyword argument 'tools'" on Mac.
+
+
+def test_mlx_generate_chat_response_accepts_template_kwargs():
+ import inspect
+ from core.inference.mlx_inference import MLXInferenceBackend
+
+ sig = inspect.signature(MLXInferenceBackend.generate_chat_response)
+ params = sig.parameters
+ for name in ("tools", "enable_thinking", "reasoning_effort", "preserve_thinking"):
+ assert name in params, (
+ f"MLX.generate_chat_response is missing the {name!r} kwarg; "
+ "the route layer forwards this and a missing kwarg raises "
+ "TypeError on Mac"
+ )
+ assert params[name].default is None, (
+ f"{name!r} must default to None so existing callers stay valid"
+ )
+
+
+def test_mlx_generate_text_forwards_kwargs_into_template_helper(monkeypatch):
+ """The Mac text path must route through apply_chat_template_for_
+ generation so reasoning / tool kwargs reach the tokenizer."""
+ _install_fake_mlx(monkeypatch)
+ from core.inference.mlx_inference import MLXInferenceBackend
+
+ captured = {}
+
+ def _fake_apply(tokenizer, messages, **kwargs):
+ captured["tokenizer"] = tokenizer
+ captured["messages"] = messages
+ captured["kwargs"] = kwargs
+ return ""
+
+ monkeypatch.setattr(
+ "core.inference.chat_template_helpers."
+ "apply_chat_template_for_generation",
+ _fake_apply,
+ raising = True,
+ )
+
+ # mlx_lm.stream_generate yields response objects with .token; make a
+ # one-token generator so _generate_text returns without touching the
+ # real stack.
+ import types as _types
+ mlx_lm_pkg = _types.ModuleType("mlx_lm")
+ mlx_lm_sample = _types.ModuleType("mlx_lm.sample_utils")
+ mlx_lm_sample.make_sampler = lambda **_kw: object()
+ mlx_lm_sample.make_logits_processors = lambda **_kw: None
+
+ class _Resp:
+ def __init__(self, tok): self.token = tok
+
+ def _stream_generate(_model, _tokenizer, **_kw):
+ yield _Resp(1)
+ mlx_lm_pkg.stream_generate = _stream_generate
+ monkeypatch.setitem(sys.modules, "mlx_lm", mlx_lm_pkg)
+ monkeypatch.setitem(sys.modules, "mlx_lm.sample_utils", mlx_lm_sample)
+
+ class _Tok:
+ chat_template = "x"
+ def decode(self, ids, skip_special_tokens = False):
+ return "hi"
+
+ backend = MLXInferenceBackend()
+ backend._model = object()
+ backend._tokenizer = _Tok()
+ backend._is_vlm = False
+
+ out = list(backend.generate_chat_response(
+ messages = [{"role": "user", "content": "ping"}],
+ tools = [{"function": {"name": "web_search"}}],
+ enable_thinking = True,
+ reasoning_effort = "medium",
+ preserve_thinking = True,
+ max_new_tokens = 1,
+ ))
+ assert out == ["hi"]
+ # The kwargs the user toggled must reach the chat-template helper.
+ assert captured["kwargs"]["tools"] == [{"function": {"name": "web_search"}}]
+ assert captured["kwargs"]["enable_thinking"] is True
+ assert captured["kwargs"]["reasoning_effort"] == "medium"
+ assert captured["kwargs"]["preserve_thinking"] is True
diff --git a/studio/backend/tests/test_safetensors_capability_advertise.py b/studio/backend/tests/test_safetensors_capability_advertise.py
new file mode 100644
index 0000000000..c3ee5b9ff1
--- /dev/null
+++ b/studio/backend/tests/test_safetensors_capability_advertise.py
@@ -0,0 +1,451 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Capability advertisement contract: classifier honesty, worker→
+orchestrator IPC hop, and route-layer end-to-end. Pure helpers + fakes;
+no torch / transformers import.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+_backend_root = Path(__file__).resolve().parent.parent
+if str(_backend_root) not in sys.path:
+ sys.path.insert(0, str(_backend_root))
+
+
+# Qwen3 snippet covering tools, enable_thinking, preserve_thinking.
+QWEN3_TEMPLATE = """
+{%- if tools %}
+ {{- '<|im_start|>system\\nFor each function call, return a json object'
+ ' wrapped inside tags.\\n' }}
+ {%- for tool in tools %}
+ {{- tool | tojson }}
+ {%- endfor %}
+{%- endif %}
+{%- for message in messages %}
+ {%- if message.role == 'tool' %}
+ {{- '<|im_start|>tool\\n' + message.content + '<|im_end|>\\n' }}
+ {%- endif %}
+{%- endfor %}
+{%- if enable_thinking is defined and enable_thinking %}
+ {{- '' }}
+{%- endif %}
+{%- if preserve_thinking %}
+ {{- assistant.reasoning_content }}
+{%- endif %}
+"""
+
+
+GPT_OSS_TEMPLATE = """
+<|start|>system<|message|>You are gpt-oss.
+reasoning_effort: {{ reasoning_effort }}
+<|end|>
+"""
+
+
+PLAIN_TEMPLATE = """
+{%- for message in messages %}
+ {{- message.role + ': ' + message.content + '\\n' }}
+{%- endfor %}
+"""
+
+
+# ── Tests: classifier honesty ────────────────────────────────────────
+
+
+def test_detect_reasoning_flags_qwen3_supports_tools_and_reasoning():
+ from core.inference.llama_cpp import detect_reasoning_flags
+
+ flags = detect_reasoning_flags(QWEN3_TEMPLATE, "unsloth/Qwen3-0.6B")
+ assert flags["supports_tools"] is True
+ assert flags["supports_reasoning"] is True
+ assert flags["reasoning_style"] == "enable_thinking"
+ assert flags["supports_preserve_thinking"] is True
+ assert flags["reasoning_always_on"] is False
+
+
+def test_detect_reasoning_flags_plain_template_all_false():
+ from core.inference.llama_cpp import detect_reasoning_flags
+
+ flags = detect_reasoning_flags(PLAIN_TEMPLATE, "some/PlainChat")
+ assert flags["supports_tools"] is False
+ assert flags["supports_reasoning"] is False
+ assert flags["supports_preserve_thinking"] is False
+ assert flags["reasoning_always_on"] is False
+
+
+def test_detect_reasoning_flags_none_template_returns_all_false():
+ from core.inference.llama_cpp import detect_reasoning_flags
+
+ flags = detect_reasoning_flags(None)
+ assert flags["supports_tools"] is False
+ assert flags["supports_reasoning"] is False
+ assert flags["supports_preserve_thinking"] is False
+ assert flags["reasoning_always_on"] is False
+ assert flags["reasoning_style"] == "enable_thinking"
+
+
+def test_detect_safetensors_features_passes_template_through_to_classifier():
+ """Route wrapper forwards a real template to the inner classifier."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(active_model_name = "unsloth/Qwen3-0.6B")
+ flags = _detect_safetensors_features(backend, QWEN3_TEMPLATE)
+ assert flags["supports_tools"] is True
+ assert flags["supports_reasoning"] is True
+
+
+def test_detect_safetensors_features_none_template_returns_all_false():
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(active_model_name = "unsloth/Qwen3-0.6B")
+ flags = _detect_safetensors_features(backend, None)
+ assert flags == {
+ "supports_reasoning": False,
+ "reasoning_style": "enable_thinking",
+ "reasoning_always_on": False,
+ "supports_preserve_thinking": False,
+ "supports_tools": False,
+ }
+
+
+def test_detect_safetensors_features_gptoss_disables_tools():
+ """gpt-oss Harmony: tools intentionally off even if template marks it."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = MagicMock()
+ backend.active_model_name = "unsloth/gpt-oss-20b"
+ backend._is_gpt_oss_model.return_value = True
+
+ flags = _detect_safetensors_features(backend, QWEN3_TEMPLATE)
+ assert flags["supports_reasoning"] is True
+ assert flags["reasoning_style"] == "reasoning_effort"
+ assert flags["supports_tools"] is False
+
+
+# Llama-3 / Mistral templates advertise tool handling but the model emits
+# tool calls in <|python_tag|> / [TOOL_CALLS] format -- not the
+# / system<|end_header_id|>' }}
+ {{- 'You have access to the following tools.' }}
+ {%- for tool in tools %}
+ {{- tool | tojson }}
+ {%- endfor %}
+{%- endif %}
+{%- for message in messages %}
+ {%- if message.role == 'tool' %}
+ {{- '<|start_header_id|>ipython<|end_header_id|>' }}
+ {{- '<|python_tag|>' }}
+ {{- message.content }}
+ {%- endif %}
+{%- endfor %}
+"""
+
+MISTRAL_TEMPLATE = """
+{%- if tools %}
+ {%- for tool in tools %}
+ {{- tool | tojson }}
+ {%- endfor %}
+{%- endif %}
+{%- for message in messages %}
+ {%- if message.role == 'tool' %}
+ {{- '[TOOL_CALLS]' + message.content + '[/TOOL_CALLS]' }}
+ {%- endif %}
+{%- endfor %}
+"""
+
+
+def test_detect_safetensors_features_llama3_template_suppresses_tools():
+ """Llama-3 emits <|python_tag|>; safetensors loop cannot parse it."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(active_model_name = "unsloth/Llama-3.2-3B-Instruct")
+ flags = _detect_safetensors_features(backend, LLAMA3_TEMPLATE)
+ assert flags["supports_tools"] is False
+
+
+def test_detect_safetensors_features_mistral_template_suppresses_tools():
+ """Mistral emits [TOOL_CALLS]; safetensors loop cannot parse it."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(active_model_name = "unsloth/mistral-7b-instruct-v0.3")
+ flags = _detect_safetensors_features(backend, MISTRAL_TEMPLATE)
+ assert flags["supports_tools"] is False
+
+
+def test_detect_safetensors_features_qwen_tool_call_keeps_tools_on():
+ """Sanity check: gate only suppresses non-Qwen formats."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(active_model_name = "unsloth/Qwen3-0.6B")
+ flags = _detect_safetensors_features(backend, QWEN3_TEMPLATE)
+ assert flags["supports_tools"] is True
+
+
+def test_detect_safetensors_features_function_xml_format_keeps_tools_on():
+ """Templates emitting XML are parser-compatible."""
+ from routes.inference import _detect_safetensors_features
+
+ tpl_with_function_xml = (
+ "{%- if tools %}<|im_start|>system\n"
+ "Tool call format: v"
+ "<|im_end|>{%- endif %}"
+ )
+ backend = SimpleNamespace(active_model_name = "custom/with-function-xml")
+ flags = _detect_safetensors_features(backend, tpl_with_function_xml)
+ assert flags["supports_tools"] is True
+
+
+# Qwen3.5 family pins -- the live GGUF + safetensors templates fetched
+# from the unsloth/Qwen3.5-0.8B(-GGUF) repos both wrap tool calls as
+# ``\n...``. Capture a faithful slice so the
+# classifier never silently regresses for this family.
+
+QWEN35_TOOL_INSTRUCTION = (
+ "{%- if tools %}\n"
+ " <|im_start|>system\n"
+ " # Tools\n"
+ " \n"
+ " {%- for tool in tools %}{{ tool | tojson }}{%- endfor %}\n"
+ " \n"
+ " If you choose to call a function ONLY reply in the following format:\n"
+ " \n"
+ " \n"
+ " \n"
+ " value_1\n"
+ " \n"
+ " \n"
+ " \n"
+ " <|im_end|>\n"
+ "{%- endif %}\n"
+ "{%- if enable_thinking is defined and enable_thinking %}{{- '' }}{%- endif %}\n"
+)
+
+
+def test_detect_safetensors_features_qwen35_keeps_tools_on():
+ """unsloth/Qwen3.5-0.8B family must surface tools+reasoning enabled."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(active_model_name = "unsloth/Qwen3.5-0.8B")
+ flags = _detect_safetensors_features(backend, QWEN35_TOOL_INSTRUCTION)
+ assert flags["supports_tools"] is True
+ assert flags["supports_reasoning"] is True
+ assert flags["reasoning_style"] == "enable_thinking"
+
+
+# ── Tests: IPC bridge contract ───────────────────────────────────────
+
+
+def test_orchestrator_mirrors_chat_template_info_into_models_dict():
+ """Worker → orchestrator must copy chat_template_info verbatim."""
+ from core.inference.orchestrator import InferenceOrchestrator
+
+ orch = InferenceOrchestrator.__new__(InferenceOrchestrator)
+ orch.models = {}
+ orch.active_model_name = None
+ orch.loading_models = set()
+
+ model_info = {
+ "identifier": "unsloth/Qwen3-0.6B",
+ "display_name": "Qwen3-0.6B",
+ "is_vision": False,
+ "is_lora": False,
+ "is_gguf": False,
+ "is_audio": False,
+ "audio_type": None,
+ "has_audio_input": False,
+ "chat_template_info": {
+ "has_template": True,
+ "template": QWEN3_TEMPLATE,
+ "format_type": "chatml",
+ "template_name": "qwen3",
+ "special_tokens": {"bos_token": "<|im_start|>"},
+ },
+ }
+
+ # Replay orchestrator.load_model's mirror block verbatim.
+ orch.active_model_name = model_info["identifier"]
+ orch.models[orch.active_model_name] = {
+ "is_vision": model_info.get("is_vision", False),
+ "is_lora": model_info.get("is_lora", False),
+ "display_name": model_info.get("display_name", "x"),
+ "is_audio": model_info.get("is_audio", False),
+ "audio_type": model_info.get("audio_type"),
+ "has_audio_input": model_info.get("has_audio_input", False),
+ }
+ _tpl_info = model_info.get("chat_template_info")
+ if isinstance(_tpl_info, dict):
+ orch.models[orch.active_model_name]["chat_template_info"] = _tpl_info
+
+ entry = orch.models[orch.active_model_name]
+ tpl = entry.get("chat_template_info", {}).get("template")
+ assert tpl == QWEN3_TEMPLATE
+
+ from routes.inference import _detect_safetensors_features
+
+ flags = _detect_safetensors_features(
+ SimpleNamespace(active_model_name = orch.active_model_name), tpl
+ )
+ assert flags["supports_tools"] is True
+ assert flags["supports_reasoning"] is True
+
+
+def test_orchestrator_missing_chat_template_info_falls_back_to_all_false():
+ """Old / malformed worker reply: no crash, all flags False."""
+ from core.inference.orchestrator import InferenceOrchestrator
+ from routes.inference import _detect_safetensors_features
+
+ orch = InferenceOrchestrator.__new__(InferenceOrchestrator)
+ orch.models = {}
+ orch.active_model_name = "unsloth/Qwen3-0.6B"
+
+ model_info = {
+ "identifier": "unsloth/Qwen3-0.6B",
+ "is_vision": False,
+ "is_lora": False,
+ # NB: no chat_template_info key
+ }
+ orch.models[orch.active_model_name] = {
+ "is_vision": False,
+ "is_lora": False,
+ }
+ _tpl_info = model_info.get("chat_template_info")
+ if isinstance(_tpl_info, dict):
+ orch.models[orch.active_model_name]["chat_template_info"] = _tpl_info
+
+ entry = orch.models[orch.active_model_name]
+ tpl = entry.get("chat_template_info", {}).get("template")
+ assert tpl is None
+
+ flags = _detect_safetensors_features(
+ SimpleNamespace(active_model_name = orch.active_model_name), tpl
+ )
+ assert flags["supports_tools"] is False
+
+
+def test_worker_load_reply_payload_includes_chat_template_info():
+ """Worker IPC reply carries chat_template_info dict."""
+
+ class _StubBackend:
+ def __init__(self, identifier, template):
+ self.active_model_name = identifier
+ self.models = {
+ identifier: {
+ "chat_template_info": {
+ "has_template": True,
+ "template": template,
+ "format_type": "chatml",
+ "template_name": "qwen3",
+ "special_tokens": {"bos_token": "<|im_start|>"},
+ }
+ }
+ }
+
+ backend = _StubBackend("unsloth/Qwen3-0.6B", QWEN3_TEMPLATE)
+ mc = SimpleNamespace(
+ identifier = "unsloth/Qwen3-0.6B",
+ display_name = "Qwen3-0.6B",
+ is_vision = False,
+ is_lora = False,
+ )
+
+ # Replay the worker's payload-build block.
+ model_info = {
+ "identifier": mc.identifier,
+ "display_name": mc.display_name,
+ "is_vision": mc.is_vision,
+ "is_lora": mc.is_lora,
+ "is_gguf": False,
+ }
+ _bm = getattr(backend, "models", {}) or {}
+ _entry = (
+ _bm.get(mc.identifier)
+ or _bm.get(getattr(backend, "active_model_name", None))
+ or {}
+ )
+ _tpl_info = _entry.get("chat_template_info")
+ if isinstance(_tpl_info, dict):
+ model_info["chat_template_info"] = {
+ "has_template": bool(_tpl_info.get("has_template", False)),
+ "template": _tpl_info.get("template"),
+ "format_type": _tpl_info.get("format_type", "generic"),
+ "template_name": _tpl_info.get("template_name"),
+ "special_tokens": _tpl_info.get("special_tokens", {}) or {},
+ }
+
+ assert "chat_template_info" in model_info
+ assert model_info["chat_template_info"]["template"] == QWEN3_TEMPLATE
+ assert model_info["chat_template_info"]["has_template"] is True
+
+
+def test_worker_load_reply_payload_survives_missing_template():
+ """Tokenizer with no chat_template still produces a valid reply."""
+
+ class _StubBackend:
+ def __init__(self):
+ self.active_model_name = "legacy/no-template"
+ self.models = {"legacy/no-template": {}} # no chat_template_info
+
+ backend = _StubBackend()
+ mc = SimpleNamespace(
+ identifier = "legacy/no-template",
+ display_name = "legacy",
+ is_vision = False,
+ is_lora = False,
+ )
+
+ model_info = {
+ "identifier": mc.identifier,
+ "display_name": mc.display_name,
+ "is_vision": mc.is_vision,
+ "is_lora": mc.is_lora,
+ "is_gguf": False,
+ }
+ _bm = getattr(backend, "models", {}) or {}
+ _entry = _bm.get(mc.identifier) or {}
+ _tpl_info = _entry.get("chat_template_info")
+ if isinstance(_tpl_info, dict):
+ model_info["chat_template_info"] = dict(_tpl_info)
+
+ assert "chat_template_info" not in model_info
+
+
+# ── End-to-end: route layer sees the template, advertises True ───────
+
+
+def test_route_layer_emits_supports_tools_true_for_qwen3_safetensors():
+ """End-to-end: Qwen3 safetensors flips supports_tools=True."""
+ from routes.inference import _detect_safetensors_features
+
+ backend = SimpleNamespace(
+ active_model_name = "unsloth/Qwen3-0.6B",
+ models = {
+ "unsloth/Qwen3-0.6B": {
+ "is_vision": False,
+ "chat_template_info": {
+ "has_template": True,
+ "template": QWEN3_TEMPLATE,
+ "format_type": "chatml",
+ },
+ }
+ },
+ )
+
+ _model_info = backend.models.get(backend.active_model_name, {})
+ _tpl = _model_info.get("chat_template_info", {}).get("template")
+ flags = _detect_safetensors_features(backend, _tpl)
+
+ assert flags["supports_tools"] is True
+ assert flags["supports_reasoning"] is True
+ assert flags["supports_preserve_thinking"] is True
diff --git a/studio/backend/tests/test_safetensors_tool_loop.py b/studio/backend/tests/test_safetensors_tool_loop.py
new file mode 100644
index 0000000000..4464bd5fc6
--- /dev/null
+++ b/studio/backend/tests/test_safetensors_tool_loop.py
@@ -0,0 +1,1810 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Tests for the safetensors agentic tool loop.
+
+Covers the shared ``tool_call_parser`` helpers and the cumulative-text
+state machine inside ``safetensors_agentic.run_safetensors_tool_loop``.
+The loop is exercised with hand-crafted fake single-turn generators so
+no model load is needed; the tests run in CI under a few seconds.
+
+Edge cases under coverage:
+* Plain answers (no tool calls) flush full content.
+* Single ``{json}`` triggers the tool and re-enters.
+* Single ``...`` XML form triggers the same path.
+* Truncated unclosed ```` is still parsed.
+* Tool result is fed back as ``role=tool`` for the next iteration.
+* Bad JSON inside ```` does not raise and (when healed) is
+ routed as a ``{"query": ...}`` web search call.
+* Duplicate tool calls produce a synthetic "do not repeat" result the
+ second time.
+* ``__IMAGES__`` sentinel is stripped before the model sees the result.
+* Tool execution errors are tagged so the model gets a nudge but the
+ loop keeps streaming.
+* Cancel is honoured between iterations.
+* ``max_tool_iterations`` cap is respected and a final-answer attempt
+ closes the stream cleanly.
+"""
+
+import threading
+
+import pytest
+
+from core.inference import safetensors_agentic
+from core.inference.safetensors_agentic import (
+ _coerce_arguments,
+ run_safetensors_tool_loop,
+)
+from core.inference.tool_call_parser import (
+ has_tool_signal,
+ parse_tool_calls_from_text,
+ strip_tool_markup,
+)
+from utils.datasets import is_gpt_oss_model_name
+
+
+# ────────────────────────────────────────────────────────────────────
+# parse_tool_calls_from_text
+# ────────────────────────────────────────────────────────────────────
+
+
+class TestParser:
+ def test_json_tool_call(self):
+ text = (
+ '{"name":"web_search","arguments":{"query":"hello"}}'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ tc = result[0]
+ assert tc["type"] == "function"
+ assert tc["function"]["name"] == "web_search"
+ # Arguments must always be a JSON string.
+ assert isinstance(tc["function"]["arguments"], str)
+ assert "hello" in tc["function"]["arguments"]
+
+ def test_json_tool_call_unclosed(self):
+ # No ; balanced-brace extractor must still close.
+ text = '{"name":"python","arguments":{"code":"print(1)"}}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "python"
+
+ def test_xml_function_call(self):
+ text = "print('hi')"
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "python"
+ assert "print('hi')" in result[0]["function"]["arguments"]
+
+ def test_xml_unclosed(self):
+ # Closing tags omitted; parser must still extract the value.
+ text = "ls -la"
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "terminal"
+ assert "ls -la" in result[0]["function"]["arguments"]
+
+ def test_code_with_embedded_xml(self):
+ # A code parameter contains the literal . Must not
+ # truncate the value because the parser uses end-of-body as the
+ # only boundary for single-parameter calls.
+ text = (
+ "html = ''\n"
+ "print('hi')"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert "print('hi')" in result[0]["function"]["arguments"]
+
+ def test_multiple_calls(self):
+ text = (
+ '{"name":"web_search","arguments":{"query":"a"}}'
+ '{"name":"web_search","arguments":{"query":"b"}}'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "web_search"
+ assert result[1]["function"]["name"] == "web_search"
+
+ def test_bad_json_does_not_raise(self):
+ text = "{not valid json}"
+ result = parse_tool_calls_from_text(text)
+ # Bad JSON is silently dropped; caller can fall back to text.
+ assert result == []
+
+ def test_has_tool_signal(self):
+ assert has_tool_signal("blah x")
+ assert has_tool_signal("hi ...")
+ assert not has_tool_signal("hello world")
+
+ def test_strip_markup_closed(self):
+ text = "before {} after"
+ assert strip_tool_markup(text) == "before after"
+
+ def test_strip_markup_unclosed_final(self):
+ text = "before {partial"
+ # With final=True the trailing run is dropped.
+ assert strip_tool_markup(text, final = True) == "before"
+ # Without final=True the unclosed run is preserved.
+ assert "partial" in strip_tool_markup(text)
+
+
+class TestParserMultiFormat:
+ """Parser coverage for Llama-3 / Mistral / Gemma 4 emission formats.
+
+ Each model family upstream of GGUF emits a different tool-call
+ shape. The shared parser must turn all of them into the same
+ OpenAI ``{name, arguments}`` shape so the safetensors / MLX
+ agentic loop is family-agnostic.
+ """
+
+ # ── Llama-3 ────────────────────────────────────────────────────
+
+ def test_llama3_python_tag_dot_call(self):
+ # Llama-3 built-in tools: <|python_tag|>NAME.call(k="v", ...).
+ import json
+
+ text = '<|python_tag|>brave_search.call(query="weather in Tokyo")'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "brave_search"
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"query": "weather in Tokyo"}
+
+ def test_llama3_python_tag_dot_call_multi_arg(self):
+ import json
+
+ text = (
+ "<|python_tag|>get_weather.call("
+ 'location="Tokyo", units="celsius", days=5)'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"location": "Tokyo", "units": "celsius", "days": 5}
+
+ def test_llama3_python_tag_json_form(self):
+ import json
+
+ text = (
+ '<|python_tag|>{"name":"web_search",' '"parameters":{"query":"hi","n":5}}'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"query": "hi", "n": 5}
+
+ def test_llama3_python_tag_json_form_with_eom(self):
+ # Llama-3 emits ``<|eom_id|>`` after the JSON; must not break parsing.
+ import json
+
+ text = (
+ '<|python_tag|>{"name":"python",'
+ '"parameters":{"code":"print(2+2)"}}<|eom_id|>'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"code": "print(2+2)"}
+
+ def test_llama3_strip_markup_final(self):
+ text = '<|python_tag|>brave_search.call(query="x")'
+ assert strip_tool_markup(text, final = True) == ""
+
+ # ── Llama-3.2 bare JSON ``custom_tools`` ─────────────────────
+
+ def test_llama3_2_bare_json_parameters(self):
+ # Llama-3.2-Instruct emits bare JSON directly as content; no
+ # <|python_tag|> prefix per its training template.
+ import json
+
+ text = '{"name":"web_search","parameters":{"query":"Tokyo weather"}}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"query": "Tokyo weather"}
+
+ def test_llama3_2_bare_json_arguments_key(self):
+ import json
+
+ text = '{"name":"add","arguments":{"a":1,"b":2}}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"a": 1, "b": 2}
+
+ def test_llama3_2_bare_json_multi_call(self):
+ # Llama-3 may chain calls with ``; `` per training template.
+ text = '{"name":"a","parameters":{}}; ' '{"name":"b","parameters":{}}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "a"
+ assert result[1]["function"]["name"] == "b"
+
+ def test_llama3_2_bare_json_with_eom_sentinel(self):
+ text = '{"name":"x","parameters":{"y":1}}<|eom_id|>'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "x"
+
+ def test_llama3_2_bare_json_leading_sentinel_skipped(self):
+ # Sometimes prior <|eot_id|> leaks into the next turn.
+ text = '<|eot_id|>{"name":"x","parameters":{}}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "x"
+
+ def test_llama3_2_bare_json_plain_prose_does_not_fire(self):
+ # Defensive: must NOT fire on plain assistant prose.
+ text = "Hello world, how are you today?"
+ assert parse_tool_calls_from_text(text) == []
+
+ def test_llama3_2_bare_json_embedded_in_prose_does_not_fire(self):
+ # Defensive: JSON embedded in prose must NOT fire (parser is
+ # strict about content STARTING with `{`).
+ text = 'The tool result was: {"name":"foo"}'
+ assert parse_tool_calls_from_text(text) == []
+
+ def test_llama3_2_bare_json_missing_name_does_not_fire(self):
+ text = '{"result":"ok","data":[1,2,3]}'
+ assert parse_tool_calls_from_text(text) == []
+
+ def test_llama3_2_bare_json_missing_args_does_not_fire(self):
+ text = '{"name":"x"}'
+ assert parse_tool_calls_from_text(text) == []
+
+ def test_llama3_2_bare_json_args_not_dict_does_not_fire(self):
+ text = '{"name":"x","parameters":42}'
+ assert parse_tool_calls_from_text(text) == []
+
+ # ── Mistral pre-v11 ───────────────────────────────────────────
+
+ def test_mistral_pre_v11_array(self):
+ import json
+
+ text = (
+ '[TOOL_CALLS] [{"name":"web_search",'
+ '"arguments":{"query":"hello"},"id":"abc"}]'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+ # Mistral provides its own id; preserve it.
+ assert result[0]["id"] == "abc"
+ assert json.loads(result[0]["function"]["arguments"]) == {"query": "hello"}
+
+ def test_mistral_pre_v11_array_multi(self):
+ text = (
+ '[TOOL_CALLS] [{"name":"a","arguments":{"x":1},"id":"id1"},'
+ '{"name":"b","arguments":{"y":2},"id":"id2"}]'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "a"
+ assert result[1]["function"]["name"] == "b"
+
+ def test_mistral_pre_v11_unclosed_array(self):
+ # Closing ``]`` truncated -- parser must heal off individual objects.
+ text = '[TOOL_CALLS] [{"name":"web_search","arguments":{"q":"x"},"id":"id"}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+
+ # ── Mistral v11+ ───────────────────────────────────────────────
+
+ def test_mistral_v11_single(self):
+ # Magistral / Mistral Small 3.1: bare ``name{json}`` after trigger.
+ import json
+
+ text = '[TOOL_CALLS]add{"a":3.5,"b":4}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "add"
+ assert json.loads(result[0]["function"]["arguments"]) == {"a": 3.5, "b": 4}
+
+ def test_mistral_v11_parallel(self):
+ # v11+ parallel: ``[TOOL_CALLS]a{...}[TOOL_CALLS]b{...}``.
+ text = '[TOOL_CALLS]add{"a":1}[TOOL_CALLS]sub{"b":2}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "add"
+ assert result[1]["function"]["name"] == "sub"
+
+ def test_mistral_v11_with_args_marker(self):
+ # Ministral / Mistral Large 3: ``[TOOL_CALLS]name[ARGS]{json}``.
+ import json
+
+ text = '[TOOL_CALLS]add[ARGS]{"a":1,"b":2}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "add"
+ assert json.loads(result[0]["function"]["arguments"]) == {"a": 1, "b": 2}
+
+ def test_mistral_strip_markup_v11(self):
+ text = '[TOOL_CALLS]add{"a":1}'
+ assert strip_tool_markup(text, final = True) == ""
+
+ # ── Gemma 4 ───────────────────────────────────────────────────
+
+ def test_gemma4_simple_call(self):
+ import json
+
+ text = (
+ "<|tool_call>call:get_weather{"
+ 'location:<|"|>Tokyo<|"|>,units:<|"|>celsius<|"|>}'
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "get_weather"
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {"location": "Tokyo", "units": "celsius"}
+
+ def test_gemma4_with_primitives(self):
+ import json
+
+ text = (
+ "<|tool_call>call:set_pref{"
+ "enabled:true,attempts:5,threshold:1.5,nickname:null}"
+ )
+ result = parse_tool_calls_from_text(text)
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args == {
+ "enabled": True,
+ "attempts": 5,
+ "threshold": 1.5,
+ "nickname": None,
+ }
+
+ def test_gemma4_nested_args(self):
+ # Gemma 4 nests dicts / lists with bare keys and ``<|"|>`` strings.
+ import json
+
+ text = (
+ "<|tool_call>call:search{"
+ 'query:<|"|>foo<|"|>,filters:{site:<|"|>example.com<|"|>,recent:true},'
+ 'tags:[<|"|>a<|"|>,<|"|>b<|"|>]}'
+ )
+ result = parse_tool_calls_from_text(text)
+ args = json.loads(result[0]["function"]["arguments"])
+ assert args["query"] == "foo"
+ assert args["filters"] == {"site": "example.com", "recent": True}
+ assert args["tags"] == ["a", "b"]
+
+ def test_gemma4_multi_call(self):
+ text = (
+ "<|tool_call>call:a{x:1}" "<|tool_call>call:b{y:2}"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "a"
+ assert result[1]["function"]["name"] == "b"
+
+ def test_gemma4_unclosed_does_not_raise(self):
+ # Truncated mid-stream; must not raise.
+ text = '<|tool_call>call:foo{x:<|"|>bar<|"|>'
+ result = parse_tool_calls_from_text(text)
+ assert isinstance(result, list)
+
+ def test_gemma4_strip_markup_final(self):
+ text = "<|tool_call>call:foo{x:1}"
+ assert strip_tool_markup(text, final = True) == ""
+
+ # ── Cross-format sentinels ────────────────────────────────────
+
+ def test_all_markers_in_tool_xml_signals(self):
+ # Streaming buffer wakes up on every emission marker.
+ from core.inference.tool_call_parser import TOOL_XML_SIGNALS
+
+ for marker in (
+ "",
+ "",
+ "[TOOL_CALLS]",
+ "<|tool_call>",
+ ):
+ assert (
+ marker in TOOL_XML_SIGNALS
+ ), f"streaming loop would not wake on {marker!r}"
+
+ def test_has_tool_signal_for_all_formats(self):
+ assert has_tool_signal('<|python_tag|>brave_search.call(q="x")')
+ assert has_tool_signal('[TOOL_CALLS] [{"name":"x"}]')
+ assert has_tool_signal('[TOOL_CALLS]add{"a":1}')
+ assert has_tool_signal("<|tool_call>call:foo{}")
+
+
+# ────────────────────────────────────────────────────────────────────
+# run_safetensors_tool_loop
+# ────────────────────────────────────────────────────────────────────
+
+
+def _fake_stream(chunks):
+ """Build a single-turn generator that yields cumulative snapshots."""
+
+ def _gen(_messages):
+ acc = ""
+ for c in chunks:
+ acc += c
+ yield acc
+
+ return _gen
+
+
+def _const_stream(text):
+ """A single-turn generator that yields one cumulative snapshot."""
+
+ def _gen(_messages):
+ yield text
+
+ return _gen
+
+
+class FakeExecuteTool:
+ """Stand-in for ``core.inference.tools.execute_tool``."""
+
+ def __init__(self, results):
+ # ``results`` is a list of strings or RuntimeError instances.
+ self.results = list(results)
+ self.calls: list[tuple[str, dict]] = []
+
+ def __call__(
+ self,
+ name,
+ arguments,
+ *,
+ cancel_event = None,
+ timeout = None,
+ session_id = None,
+ ):
+ self.calls.append((name, arguments))
+ result = self.results.pop(0) if self.results else "OK"
+ if isinstance(result, Exception):
+ raise result
+ return result
+
+
+def _collect_events(generator, max_events = 200):
+ events = []
+ for ev in generator:
+ events.append(ev)
+ if len(events) >= max_events:
+ break
+ return events
+
+
+def _make_loop(*, turns, exec_results = None, **kwargs):
+ """Build a configured loop with a multi-turn fake generator.
+
+ ``turns`` is a list of chunk-lists; iteration N yields chunks from
+ ``turns[N]``.
+ """
+ turn_iter = iter(turns)
+
+ def _gen(_messages):
+ try:
+ chunks = next(turn_iter)
+ except StopIteration:
+ return
+ acc = ""
+ for c in chunks:
+ acc += c
+ yield acc
+
+ exec_fn = FakeExecuteTool(exec_results or [])
+ return run_safetensors_tool_loop(
+ single_turn = _gen,
+ messages = [{"role": "user", "content": "hi"}],
+ tools = [
+ {"type": "function", "function": {"name": "web_search"}},
+ {"type": "function", "function": {"name": "python"}},
+ {"type": "function", "function": {"name": "terminal"}},
+ ],
+ execute_tool = exec_fn,
+ **kwargs,
+ ), exec_fn
+
+
+class TestParserDeepSeek:
+ """DeepSeek R1 / V3 / V3.1 coverage. Markers use full-width pipes
+ (U+FF5C) and lower-one-eighth-block (U+2581). R1 wraps args in a
+ Markdown ``` ```json ``` ``` fence; V3 / V3.1 emit bare JSON."""
+
+ def test_r1_simple_call_with_code_fence(self):
+ import json as _json
+ text = (
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>function"
+ "<|tool▁sep|>special_function\n"
+ "```json\n"
+ '{"arg1": 1}\n'
+ "```"
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "special_function"
+ assert _json.loads(result[0]["function"]["arguments"]) == {"arg1": 1}
+
+ def test_r1_short_form_outer_marker(self):
+ # llama.cpp accepts ``<|tool▁calls|>`` as the short-form opener.
+ import json as _json
+ text = (
+ "<|tool▁calls|>function"
+ "<|tool▁sep|>get_time\n"
+ "```json\n"
+ '{"city": "Paris"}\n'
+ "```"
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "get_time"
+
+ def test_v3_1_bare_json(self):
+ # V3 / V3.1 omit the ``function`` prefix and the code fence.
+ import json as _json
+ text = (
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>get_time"
+ "<|tool▁sep|>"
+ '{"city": "Tokyo"}'
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "get_time"
+ assert _json.loads(result[0]["function"]["arguments"]) == {"city": "Tokyo"}
+
+ def test_v3_1_multi_call_shares_envelope(self):
+ # Parallel calls share one outer envelope; each inner call has
+ # its own ``<|tool▁call▁begin|>...<|tool▁call▁end|>``.
+ text = (
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>get_time"
+ "<|tool▁sep|>"
+ '{"city": "Paris"}'
+ "<|tool▁call▁end|>"
+ "<|tool▁call▁begin|>get_weather"
+ "<|tool▁sep|>"
+ '{"city": "Paris"}'
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "get_time"
+ assert result[1]["function"]["name"] == "get_weather"
+
+ def test_v3_1_with_reasoning(self):
+ # Reasoning ... precedes the tool block. The
+ # parser only sees the tool block (reasoning handling lives
+ # in the streaming buffer / template helper); confirm the
+ # block parses even with leading prose.
+ text = (
+ "I'm thinking\n"
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>get_time"
+ "<|tool▁sep|>"
+ '{"city": "Tokyo"}'
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "get_time"
+
+ def test_deepseek_strip_markup(self):
+ text = (
+ "before "
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>foo"
+ "<|tool▁sep|>"
+ "{}"
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>"
+ " after"
+ )
+ assert strip_tool_markup(text, final = True) == "before after"
+
+ def test_deepseek_signal_wakes_streaming(self):
+ # The streaming buffer state machine must wake on the DeepSeek
+ # opener so the rest of the section is drained instead of
+ # leaked.
+ text = "<|tool▁calls▁begin|>..."
+ assert has_tool_signal(text)
+
+
+class TestParserGLM:
+ """GLM 4.5 / 4.6 / 4.7 coverage. Marker collides with Qwen's
+ ```` but the body shape is XML kv pairs instead of JSON,
+ so the dispatch order keeps both formats working."""
+
+ def test_glm_simple_call(self):
+ import json as _json
+ text = (
+ "web_search\n"
+ "query\n"
+ "weather Tokyo\n"
+ ""
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+ args = _json.loads(result[0]["function"]["arguments"])
+ # Strings come through raw; the parser does not double-quote.
+ assert args == {"query": "weather Tokyo"}
+
+ def test_glm_mixed_types_decode_correctly(self):
+ # Per the chat_template.jinja, strings are emitted raw and
+ # non-strings are JSON-encoded. The parser must decode the
+ # mixed shape back to native types.
+ import json as _json
+ text = (
+ "complex_function\n"
+ "name\nJohn Doe\n"
+ "age\n30\n"
+ "active\ntrue\n"
+ "score\n95.5\n"
+ ""
+ )
+ result = parse_tool_calls_from_text(text)
+ args = _json.loads(result[0]["function"]["arguments"])
+ assert args == {
+ "name": "John Doe",
+ "age": 30,
+ "active": True,
+ "score": 95.5,
+ }
+
+ def test_glm_multi_call_back_to_back(self):
+ # GLM emits parallel calls as consecutive ``...
+ # `` blocks with no outer envelope.
+ text = (
+ "a\nx\n1\n"
+ "b\ny\n2\n"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "a"
+ assert result[1]["function"]["name"] == "b"
+
+ def test_glm_unclosed_tool_call_does_not_lose_value(self):
+ # Truncated mid-stream (no ) -- the parser must
+ # still surface what it found rather than dropping the call.
+ text = (
+ "web_search\n"
+ "query\n"
+ "partial"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+
+ def test_glm_does_not_break_qwen_path(self):
+ # Real Qwen emission must still be parsed by the Qwen branch,
+ # not silently misrouted to GLM (the marker is shared).
+ text = '{"name":"web_search","arguments":{"q":"x"}}'
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "web_search"
+
+ def test_glm_strip_markup(self):
+ text = (
+ "before "
+ "a\nx\n1\n"
+ " after"
+ )
+ assert strip_tool_markup(text, final = True) == "before after"
+
+
+class TestParserKimi:
+ """Kimi K2 / Moonshot coverage. ASCII pipes only (NOT full-width).
+ Name arrives as ``functions.NAME:IDX``; the parser strips the
+ prefix and the index to recover the bare callable name while
+ preserving the full id for round-trip rendering."""
+
+ def test_kimi_simple_call(self):
+ import json as _json
+ text = (
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>functions.special_function:0"
+ "<|tool_call_argument_begin|>"
+ '{"arg1": 1}'
+ "<|tool_call_end|>"
+ "<|tool_calls_section_end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ # Bare name recovered; full id preserved verbatim.
+ assert result[0]["function"]["name"] == "special_function"
+ assert result[0]["id"] == "functions.special_function:0"
+ assert _json.loads(result[0]["function"]["arguments"]) == {"arg1": 1}
+
+ def test_kimi_multi_call_with_index(self):
+ # Multiple consecutive calls inside a single section, each
+ # with its own monotonically incrementing ``:IDX``.
+ text = (
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>functions.read_file:0"
+ "<|tool_call_argument_begin|>"
+ '{"path":"a"}'
+ "<|tool_call_end|>"
+ "<|tool_call_begin|>functions.web_search:1"
+ "<|tool_call_argument_begin|>"
+ '{"query":"x"}'
+ "<|tool_call_end|>"
+ "<|tool_calls_section_end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 2
+ assert result[0]["function"]["name"] == "read_file"
+ assert result[0]["id"].endswith(":0")
+ assert result[1]["function"]["name"] == "web_search"
+ assert result[1]["id"].endswith(":1")
+
+ def test_kimi_dotted_name_keeps_last_segment(self):
+ # Bare ``NAME:IDX`` (no ``functions.`` prefix) and ``a.b.c:IDX``
+ # nested names must both resolve to the final dot-segment per
+ # vLLM / SGLang behaviour.
+ text = (
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>a.b.c:2"
+ "<|tool_call_argument_begin|>"
+ "{}"
+ "<|tool_call_end|>"
+ "<|tool_calls_section_end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "c"
+
+ def test_kimi_handles_unclosed_section(self):
+ # End marker missing -- the parser must still extract the call.
+ text = (
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>functions.foo:0"
+ "<|tool_call_argument_begin|>"
+ '{"a":1}'
+ "<|tool_call_end|>"
+ )
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1
+ assert result[0]["function"]["name"] == "foo"
+
+ def test_kimi_strip_markup(self):
+ text = (
+ "before "
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>functions.x:0"
+ "<|tool_call_argument_begin|>"
+ "{}"
+ "<|tool_call_end|>"
+ "<|tool_calls_section_end|>"
+ " after"
+ )
+ assert strip_tool_markup(text, final = True) == "before after"
+
+ def test_kimi_signal_wakes_streaming(self):
+ text = "<|tool_calls_section_begin|>..."
+ assert has_tool_signal(text)
+
+
+class TestParserCrossFormatRouting:
+ """Ensure the per-format dispatch order doesn't misroute any
+ family. Real emissions for each new family + every old family
+ must still parse correctly when intermixed."""
+
+ def test_dispatch_routes_each_family_correctly(self):
+ cases = [
+ (
+ "Qwen",
+ '{"name":"a","arguments":{"x":1}}',
+ "a",
+ ),
+ (
+ "DeepSeek V3.1",
+ "<|tool▁calls▁begin|>"
+ "<|tool▁call▁begin|>get_time"
+ "<|tool▁sep|>"
+ '{"city":"Tokyo"}'
+ "<|tool▁call▁end|>"
+ "<|tool▁calls▁end|>",
+ "get_time",
+ ),
+ (
+ "GLM",
+ "web_search\n"
+ "q\nx\n"
+ "",
+ "web_search",
+ ),
+ (
+ "Kimi",
+ "<|tool_calls_section_begin|>"
+ "<|tool_call_begin|>functions.add:0"
+ "<|tool_call_argument_begin|>"
+ '{"a":1}'
+ "<|tool_call_end|>"
+ "<|tool_calls_section_end|>",
+ "add",
+ ),
+ ]
+ for label, text, expected_name in cases:
+ result = parse_tool_calls_from_text(text)
+ assert len(result) == 1, f"{label}: parser missed the call"
+ assert result[0]["function"]["name"] == expected_name, (
+ f"{label}: got {result[0]['function']['name']!r}, "
+ f"expected {expected_name!r}"
+ )
+
+ def test_all_new_markers_in_tool_xml_signals(self):
+ # The safetensors / MLX streaming buffer must wake on every
+ # supported emission marker -- otherwise the BUFFERING state
+ # leaks tool content to the user before parse.
+ from core.inference.tool_call_parser import TOOL_XML_SIGNALS
+
+ for marker in (
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>",
+ "<|tool_calls_section_begin|>",
+ "<|tool_call_begin|>",
+ ):
+ assert marker in TOOL_XML_SIGNALS, (
+ f"streaming loop would not wake on {marker!r}"
+ )
+
+
+class TestLoopBasic:
+ def test_plain_answer(self):
+ # No tool XML; loop should yield content then status="".
+ loop, _exec = _make_loop(
+ turns = [["Hello", " world", "!"]],
+ exec_results = [],
+ )
+ events = _collect_events(loop)
+ contents = [e for e in events if e["type"] == "content"]
+ statuses = [e for e in events if e["type"] == "status"]
+ assert contents, "expected at least one content event"
+ # Final cumulative content should contain the answer.
+ final_text = contents[-1]["text"]
+ assert "Hello world!" in final_text
+ assert statuses and statuses[-1]["text"] == ""
+
+ def test_single_tool_then_answer(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ # : tool call only.
+ [
+ '{"name":"web_search",',
+ '"arguments":{"query":"weather"}}',
+ "",
+ ],
+ # : final answer.
+ ["The ", "weather is ", "sunny."],
+ ],
+ exec_results = ["Sunny and 22C"],
+ )
+ events = _collect_events(loop)
+ kinds = [e["type"] for e in events]
+
+ assert "tool_start" in kinds
+ assert "tool_end" in kinds
+ # Tool was actually called with the parsed arguments.
+ assert exec_fn.calls == [("web_search", {"query": "weather"})]
+
+ tool_start = next(e for e in events if e["type"] == "tool_start")
+ assert tool_start["tool_name"] == "web_search"
+ tool_end = next(e for e in events if e["type"] == "tool_end")
+ assert tool_end["result"] == "Sunny and 22C"
+
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents and "sunny" in contents[-1]["text"].lower()
+
+ def test_function_xml_form(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ ["print(1)"],
+ ["Result: 1"],
+ ],
+ exec_results = ["1\n"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("python", {"code": "print(1)"})]
+ contents = [e for e in events if e["type"] == "content"]
+ assert "Result: 1" in contents[-1]["text"]
+
+ def test_llama3_python_tag_form(self):
+ # The agentic loop must recognise Llama-3's <|python_tag|>
+ # marker, drain the rest of the turn, and execute the call.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ "<|python_tag|>web_search.call(",
+ 'query="weather in Tokyo"',
+ ")",
+ ],
+ ["The weather is sunny."],
+ ],
+ exec_results = ["Sunny, 22C"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "weather in Tokyo"})]
+ contents = [e for e in events if e["type"] == "content"]
+ assert "sunny" in contents[-1]["text"].lower()
+
+ def test_mistral_pre_v11_form(self):
+ # Pre-v11 Mistral emission: ``[TOOL_CALLS] [{...}]``.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '[TOOL_CALLS] [{"name":"web_search",',
+ '"arguments":{"query":"hi"},"id":"abc"}]',
+ ],
+ ["done"],
+ ],
+ exec_results = ["ok"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "hi"})]
+ # Mistral-provided ids must propagate to tool_start events.
+ tool_start = next(e for e in events if e["type"] == "tool_start")
+ assert tool_start["tool_call_id"] == "abc"
+
+ def test_mistral_v11_form(self):
+ # v11+ Mistral emission: bare ``name{json}`` after the trigger.
+ loop, exec_fn = _make_loop(
+ turns = [
+ ['[TOOL_CALLS]web_search{"query":"hi"}'],
+ ["done"],
+ ],
+ exec_results = ["ok"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "hi"})]
+
+ def test_gemma4_form(self):
+ # Gemma 4 emission: ``<|tool_call>call:NAME{...}``.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ "<|tool_call>call:web_search{",
+ 'query:<|"|>weather<|"|>',
+ "}",
+ ],
+ ["sunny"],
+ ],
+ exec_results = ["Sunny, 22C"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "weather"})]
+
+ def test_deepseek_v3_1_form(self):
+ # DeepSeek V3.1 emission inside the agentic loop -- the buffer
+ # state machine must wake on ``<|tool▁calls▁begin|>`` and the
+ # parser must extract the V3.1 bare-JSON body.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>web_search",
+ "<|tool▁sep|>",
+ '{"query":"Tokyo weather"}',
+ "<|tool▁call▁end|>",
+ "<|tool▁calls▁end|>",
+ ],
+ ["The weather is sunny."],
+ ],
+ exec_results = ["Sunny, 22C"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "Tokyo weather"})]
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents and "sunny" in contents[-1]["text"].lower()
+
+ def test_glm_form(self):
+ # GLM 4.x emission: ``NAME\n...``.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ "web_search\n",
+ "query\n",
+ "Tokyo\n",
+ "",
+ ],
+ ["found"],
+ ],
+ exec_results = ["..."],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "Tokyo"})]
+
+ def test_kimi_form(self):
+ # Kimi K2 emission ``<|tool_calls_section_begin|>...``.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ "<|tool_calls_section_begin|>",
+ "<|tool_call_begin|>functions.web_search:0",
+ "<|tool_call_argument_begin|>",
+ '{"query":"Tokyo"}',
+ "<|tool_call_end|>",
+ "<|tool_calls_section_end|>",
+ ],
+ ["done"],
+ ],
+ exec_results = ["..."],
+ )
+ events = _collect_events(loop)
+ # The bare name must reach execute_tool, even though the model
+ # emitted ``functions.web_search:0`` as the formatted id.
+ assert exec_fn.calls == [("web_search", {"query": "Tokyo"})]
+ # tool_start carries the original full id so the conversation
+ # roundtrip can replay it verbatim.
+ tool_start = next(e for e in events if e["type"] == "tool_start")
+ assert tool_start["tool_call_id"] == "functions.web_search:0"
+
+ def test_truncated_unclosed_tool_call(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ # No ; balanced-brace parser must still
+ # succeed because the JSON itself is balanced.
+ ['{"name":"web_search","arguments":{"query":"x"}}'],
+ ["done"],
+ ],
+ exec_results = ["result"],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "x"})]
+
+ def test_bad_json_healed_to_query(self):
+ # Tool call with non-JSON string arguments. With auto_heal_tool_calls
+ # the string is routed as {"query": ...}.
+ loop, exec_fn = _make_loop(
+ turns = [
+ # JSON inside the tool call is well-formed; the
+ # ``arguments`` is a string that is not itself valid
+ # JSON for ``_coerce_arguments`` to parse, so the
+ # heal path runs.
+ [
+ '{"name":"web_search","arguments":"hello world"}'
+ ],
+ ["ok"],
+ ],
+ exec_results = ["..."],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls and exec_fn.calls[0][0] == "web_search"
+ assert exec_fn.calls[0][1] == {"query": "hello world"}
+
+
+class TestLoopBehaviour:
+ def test_duplicate_tool_call_synthetic_result(self):
+ # Two identical successful calls in a row: the second is short-
+ # circuited with a "do not repeat" message and execute_tool is
+ # called only once.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ ["final"],
+ ],
+ exec_results = ["search-result-1"],
+ )
+ events = _collect_events(loop)
+ # Only one real call.
+ assert len(exec_fn.calls) == 1
+ tool_end_events = [e for e in events if e["type"] == "tool_end"]
+ assert len(tool_end_events) == 2
+ assert "do not repeat" in tool_end_events[1]["result"].lower()
+
+ def test_image_sentinel_stripped_from_model_feed(self):
+ # The tool result has a frontend image sentinel that should be
+ # stripped before being fed back into the next turn, BUT the
+ # tool_end event still carries the raw result for the UI.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"python","arguments":{"code":"plot()"}}'
+ ],
+ ["see chart"],
+ ],
+ exec_results = ["chart\n__IMAGES__:/tmp/chart.png"],
+ )
+ events = _collect_events(loop)
+ tool_end = next(e for e in events if e["type"] == "tool_end")
+ assert "__IMAGES__" in tool_end["result"]
+
+ def test_image_sentinel_stripped_with_leading_marker(self):
+ # Sentinel at start (no newline) must not leak to the model.
+ from core.inference import safetensors_agentic as _sa
+
+ captured: list[list[dict]] = []
+
+ def fake_single_turn(messages, **_kw):
+ captured.append([dict(m) for m in messages])
+ if len(captured) == 1:
+ yield '{"name":"python","arguments":{"code":"plot()"}}'
+ else:
+ yield "done"
+
+ events = list(
+ _sa.run_safetensors_tool_loop(
+ single_turn = fake_single_turn,
+ messages = [{"role": "user", "content": "plot please"}],
+ tools = [{"function": {"name": "python"}}],
+ execute_tool = lambda *_a, **_kw: "__IMAGES__:/tmp/x.png",
+ cancel_event = threading.Event(),
+ max_tool_iterations = 3,
+ auto_heal_tool_calls = True,
+ )
+ )
+ # Model's second turn must not see "__IMAGES__".
+ assert len(captured) >= 2
+ tool_msgs = [m for m in captured[1] if m.get("role") == "tool"]
+ assert tool_msgs, "no tool message reached the model"
+ for tm in tool_msgs:
+ assert (
+ "__IMAGES__" not in tm["content"]
+ ), f"sentinel leaked to model: {tm['content']!r}"
+
+ def test_image_sentinel_stripped_with_multiple_markers(self):
+ # Consecutive sentinels: cut at the first, nothing leaks.
+ from core.inference import safetensors_agentic as _sa
+
+ captured: list[list[dict]] = []
+
+ def fake_single_turn(messages, **_kw):
+ captured.append([dict(m) for m in messages])
+ if len(captured) == 1:
+ yield '{"name":"python","arguments":{"code":"plot()"}}'
+ else:
+ yield "done"
+
+ multi = "panel\n__IMAGES__:/tmp/a.png\n__IMAGES__:/tmp/b.png"
+ events = list(
+ _sa.run_safetensors_tool_loop(
+ single_turn = fake_single_turn,
+ messages = [{"role": "user", "content": "plot please"}],
+ tools = [{"function": {"name": "python"}}],
+ execute_tool = lambda *_a, **_kw: multi,
+ cancel_event = threading.Event(),
+ max_tool_iterations = 3,
+ auto_heal_tool_calls = True,
+ )
+ )
+ tool_msgs = [m for m in captured[1] if m.get("role") == "tool"]
+ assert tool_msgs
+ for tm in tool_msgs:
+ assert (
+ "__IMAGES__" not in tm["content"]
+ ), f"second sentinel leaked: {tm['content']!r}"
+ assert (
+ tm["content"] == "panel"
+ ), f"expected payload-only 'panel', got {tm['content']!r}"
+
+ def test_tool_execution_error_is_emitted_but_loop_continues(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ ["sorry, that failed"],
+ ],
+ exec_results = ["Error: network unreachable"],
+ )
+ events = _collect_events(loop)
+ tool_end = next(e for e in events if e["type"] == "tool_end")
+ assert tool_end["result"].startswith("Error")
+ # The loop must still produce a content event after the failure.
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents
+
+ def test_exception_in_executor_does_not_raise(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ ["recovered"],
+ ],
+ exec_results = [RuntimeError("boom")],
+ )
+ events = _collect_events(loop)
+ tool_end = next(e for e in events if e["type"] == "tool_end")
+ assert "boom" in tool_end["result"]
+
+
+class TestLoopRePrompt:
+ """Re-prompt-on-plan-without-action parity with the GGUF path.
+
+ When the model emits forward-looking intent ("Let me search for
+ that") without actually calling a tool, the loop must nudge it to
+ act instead of silently terminating. Up to ``_MAX_REPROMPTS`` (3)
+ re-prompts per request, drawn from extra iteration slots so the
+ caller's tool-call budget is preserved.
+ """
+
+ def test_intent_signal_triggers_reprompt(self):
+ # Turn 1: intent signal, no tool call.
+ # Turn 2 (re-prompt): proper tool call -> executes.
+ # Turn 3: final answer.
+ loop, exec_fn = _make_loop(
+ turns = [
+ ["Let me search for that."],
+ [
+ '{"name":"web_search","arguments":'
+ '{"query":"sky color"}}'
+ ],
+ ["The sky is blue."],
+ ],
+ exec_results = ["Blue (Rayleigh scattering)"],
+ )
+ events = _collect_events(loop)
+ # web_search must have been called once (after the re-prompt).
+ assert exec_fn.calls == [("web_search", {"query": "sky color"})]
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents and "blue" in contents[-1]["text"].lower()
+
+ def test_intent_signal_without_tools_does_not_reprompt(self):
+ # Same intent signal but no tools enabled -- must NOT re-prompt.
+ loop, exec_fn = _make_loop(
+ turns = [["Let me think about that for a moment."]],
+ exec_results = [],
+ )
+ # _make_loop hard-codes three tools; rebuild without tools.
+ from core.inference.safetensors_agentic import run_safetensors_tool_loop
+
+ def _gen(_messages):
+ yield "Let me think about that for a moment."
+
+ exec_fn = FakeExecuteTool([])
+ events = _collect_events(
+ run_safetensors_tool_loop(
+ single_turn = _gen,
+ messages = [{"role": "user", "content": "hi"}],
+ tools = [],
+ execute_tool = exec_fn,
+ )
+ )
+ assert exec_fn.calls == []
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents and "think" in contents[-1]["text"].lower()
+
+ def test_direct_answer_does_not_trigger_reprompt(self):
+ # Plain answer with no intent words: do NOT re-prompt.
+ loop, exec_fn = _make_loop(
+ turns = [["4"]],
+ exec_results = [],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == []
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents and contents[-1]["text"].strip() == "4"
+
+ def test_max_reprompts_capped_at_three(self):
+ # Model keeps stalling with intent -- after 3 re-prompts the
+ # loop must give up rather than burn forever.
+ turns = [["Let me search for that."]] * 6 # well over the cap
+ loop, exec_fn = _make_loop(
+ turns = turns,
+ exec_results = [],
+ )
+ events = _collect_events(loop, max_events = 500)
+ # No tool ever ran, but the loop terminated cleanly.
+ assert exec_fn.calls == []
+ statuses = [e for e in events if e["type"] == "status"]
+ assert statuses and statuses[-1]["text"] == ""
+
+ def test_short_intent_below_buffer_threshold_triggers_reprompt(self):
+ # Short emission that never exits BUFFERING (< 32 chars + no
+ # marker prefix). The unified buffer-end path must still
+ # trigger the intent re-prompt, not silently terminate.
+ loop, exec_fn = _make_loop(
+ turns = [
+ ["Let me check."],
+ [
+ '{"name":"web_search","arguments":'
+ '{"query":"x"}}'
+ ],
+ ["found"],
+ ],
+ exec_results = ["..."],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "x"})]
+
+ def test_reprompt_does_not_consume_tool_budget(self):
+ # max_tool_iterations=1: one re-prompt, then one real tool call,
+ # then the budget-exhausted final answer must still fire. If the
+ # re-prompt ate the slot the tool call would never run.
+ loop, exec_fn = _make_loop(
+ turns = [
+ # 1. Intent stall (re-prompt 1/3).
+ ["Let me search for that."],
+ # 2. Real tool call (uses the budget slot).
+ [
+ '{"name":"web_search","arguments":'
+ '{"query":"weather"}}'
+ ],
+ # 3. Budget exhausted -> nudged final answer.
+ ["Final: it is sunny"],
+ ],
+ exec_results = ["sunny"],
+ max_tool_iterations = 1,
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "weather"})]
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents and "sunny" in contents[-1]["text"].lower()
+
+
+class TestLoopCanonicalHealKey:
+ """Per-tool canonical heal key (``code`` for python, ``command`` for
+ terminal, ``query`` for everything else). Mirrors GGUF after the
+ PR-5615 follow-up that ported this mapping over."""
+
+ def test_python_bare_string_heals_to_code(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ ['{"name":"python","arguments":"print(1)"}' ""],
+ ["done"],
+ ],
+ exec_results = ["1\n"],
+ )
+ events = _collect_events(loop)
+ # The bare string must heal to {"code": "print(1)"}, not
+ # {"query": ...}, so the python sandbox actually executes it.
+ assert exec_fn.calls == [("python", {"code": "print(1)"})]
+
+ def test_terminal_bare_string_heals_to_command(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ ['{"name":"terminal","arguments":"ls -la"}' ""],
+ ["done"],
+ ],
+ exec_results = ["..."],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("terminal", {"command": "ls -la"})]
+
+ def test_unknown_tool_bare_string_heals_to_query(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ ['{"name":"web_search","arguments":"hello"}' ""],
+ ["ok"],
+ ],
+ exec_results = ["..."],
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "hello"})]
+
+
+class TestGGUFSafetensorsHealingParity:
+ """Pin parity between the GGUF agentic loop and the safetensors /
+ MLX loop so a regression on either side breaks CI."""
+
+ def test_gguf_imports_shared_signal_markers(self):
+ # The GGUF BUFFERING state machine must wake on every emission
+ # marker the shared parser knows -- otherwise Llama-3 / Mistral
+ # / Gemma 4 emissions slip past as plain prose when the
+ # llama-server structured channel fails.
+ import inspect
+
+ from core.inference.llama_cpp import LlamaCppBackend
+
+ src = inspect.getsource(LlamaCppBackend.generate_chat_completion_with_tools)
+ assert "_SHARED_TOOL_XML_SIGNALS" in src, (
+ "GGUF agentic loop must reuse the shared TOOL_XML_SIGNALS "
+ "tuple so it wakes on all five emission formats"
+ )
+
+ def test_gguf_uses_shared_strip_helper(self):
+ # The GGUF stream-cleanup function must delegate to the shared
+ # strip_tool_markup so closed-pair markup is removed for every
+ # emission family (Llama-3 <|python_tag|>, Mistral [TOOL_CALLS],
+ # Gemma 4 <|tool_call>...).
+ import inspect
+
+ from core.inference.llama_cpp import LlamaCppBackend
+
+ src = inspect.getsource(LlamaCppBackend.generate_chat_completion_with_tools)
+ assert "_shared_strip_tool_markup" in src, (
+ "GGUF stream cleanup must delegate to the shared "
+ "strip_tool_markup helper"
+ )
+
+ def test_gguf_uses_canonical_heal_keys(self):
+ # GGUF must heal a bare-string ``arguments`` to the same per-tool
+ # canonical key as safetensors -- ``code`` for python, ``command``
+ # for terminal, ``query`` for everything else.
+ import inspect
+
+ from core.inference.llama_cpp import LlamaCppBackend
+
+ src = inspect.getsource(LlamaCppBackend.generate_chat_completion_with_tools)
+ # The canonical key dict literal must be present in the heal
+ # path so a Llama-3 / Mistral / Gemma 4 bare-string emission
+ # for python doesn't get routed as {"query": "print(1)"}.
+ assert '"python": "code"' in src
+ assert '"terminal": "command"' in src
+
+ def test_intent_regex_matches_same_phrases_as_gguf(self):
+ # The intent re-prompt regex must match the SAME forward-looking
+ # phrases on both backends so behaviour is the same on Mac (MLX
+ # / safetensors) and on Linux (GGUF).
+ from core.inference.llama_cpp import _INTENT_SIGNAL as gguf_re
+ from core.inference.safetensors_agentic import (
+ _INTENT_SIGNAL as sf_re,
+ )
+
+ for phrase in (
+ "I'll search for that",
+ "I will look it up",
+ "Let me check",
+ "I am going to call the tool",
+ "First, I will explore",
+ "Here's my plan",
+ "Now I need to call web_search",
+ ):
+ assert gguf_re.search(phrase), f"GGUF missed {phrase!r}"
+ assert sf_re.search(phrase), f"safetensors missed {phrase!r}"
+
+ for plain in (
+ "4",
+ "Hello!",
+ "The sky is blue.",
+ "I can help with that.",
+ "I should mention",
+ "Let's go.",
+ ):
+ assert not gguf_re.search(plain), f"GGUF wrongly fired on {plain!r}"
+ assert not sf_re.search(plain), f"safetensors wrongly fired on {plain!r}"
+
+ def test_max_reprompts_equal_on_both_backends(self):
+ from core.inference.llama_cpp import _MAX_REPROMPTS as gguf_cap
+ from core.inference.safetensors_agentic import _MAX_REPROMPTS as sf_cap
+
+ assert gguf_cap == sf_cap == 3
+
+
+class TestLoopControl:
+ def test_cancel_event_breaks_loop(self):
+ cancel = threading.Event()
+ cancel.set()
+ # Even with a fake stream that emits tool calls, the loop must
+ # bail before invoking execute_tool when cancel is set.
+ exec_fn = FakeExecuteTool([])
+ events = list(
+ run_safetensors_tool_loop(
+ single_turn = _const_stream(
+ '{"name":"web_search",'
+ '"arguments":{"query":"x"}}'
+ ),
+ messages = [{"role": "user", "content": "hi"}],
+ tools = [],
+ execute_tool = exec_fn,
+ cancel_event = cancel,
+ )
+ )
+ assert events == []
+ assert exec_fn.calls == []
+
+ def test_max_iterations_caps_loop(self):
+ # The loop should stop after max_tool_iterations even if the
+ # model keeps asking for tools, then emit a final-attempt round.
+ loop, exec_fn = _make_loop(
+ turns = [
+ # : tool call (executes once)
+ [
+ '{"name":"web_search","arguments":{"query":"a"}}'
+ ],
+ # : model gives a final answer when nudged.
+ ["here is the final answer"],
+ ],
+ exec_results = ["result"],
+ max_tool_iterations = 1,
+ )
+ events = _collect_events(loop)
+ contents = [e for e in events if e["type"] == "content"]
+ # Final content must include the final answer.
+ assert contents and "final answer" in contents[-1]["text"]
+
+
+class TestStatusFormatting:
+ def test_status_for_known_tools(self):
+ # Use the private helper directly to verify status formatting.
+ assert (
+ safetensors_agentic._status_for_tool("web_search", {"query": "abc"})
+ == "Searching: abc"
+ )
+ assert (
+ safetensors_agentic._status_for_tool(
+ "web_search", {"url": "https://www.example.com/x"}
+ )
+ == "Reading: example.com"
+ )
+ assert safetensors_agentic._status_for_tool(
+ "python", {"code": "x = 1"}
+ ).startswith("Running Python:")
+ assert safetensors_agentic._status_for_tool(
+ "terminal", {"command": "ls"}
+ ).startswith("Running:")
+ assert safetensors_agentic._status_for_tool("unknown_tool", {}).startswith(
+ "Calling:"
+ )
+
+
+class TestProseMentioningToolCall:
+ def test_assistant_prose_with_literal_tool_call_text_survives(self):
+ # Regression: if the assistant text legitimately mentions
+ # ```` as a literal string and the parser finds no
+ # actual call, the loop must surface the full content instead
+ # of silently stripping everything past the literal marker.
+ loop, exec_fn = _make_loop(
+ turns = [
+ # : a real tool call so the loop moves to
+ # .
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ # : prose that mentions the literal text.
+ ["the docs say means an LLM tool call wrapper"],
+ ],
+ exec_results = ["result"],
+ )
+ events = _collect_events(loop)
+ contents = [e for e in events if e["type"] == "content"]
+ assert contents, "expected at least one content event"
+ final = contents[-1]["text"]
+ assert (
+ "LLM tool" in final
+ ), f"prose mentioning should not be truncated; got {final!r}"
+
+ def test_tool_result_with_tool_call_text_does_not_retrigger(self):
+ # Tool result text contains the literal ```` string.
+ # The loop must only parse the MODEL output, not the tool
+ # result, so we should see exactly one call.
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ ["the docs mention wrappers"],
+ ],
+ exec_results = ["Page text: appears here in the docs"],
+ )
+ events = _collect_events(loop)
+ assert len(exec_fn.calls) == 1
+
+
+class TestChatTemplateHelper:
+ """Cover the dependency-light helper used by InferenceBackend."""
+
+ def setup_method(self):
+ from core.inference.chat_template_helpers import (
+ apply_chat_template_for_generation,
+ )
+
+ self.apply = apply_chat_template_for_generation
+
+ class _Tok:
+ def __init__(self, accepted):
+ self.accepted = accepted
+ self.call_count = 0
+ self.last_kwargs = None
+
+ def apply_chat_template(
+ self, messages, *, tokenize = False, add_generation_prompt = True, **kw
+ ):
+ self.call_count += 1
+ unknown = set(kw) - self.accepted
+ if unknown:
+ raise TypeError(f"unexpected kwargs: {sorted(unknown)}")
+ self.last_kwargs = dict(kw)
+ return "PROMPT"
+
+ def test_richest_call_wins_when_template_supports_all(self):
+ tok = self._Tok({"tools", "enable_thinking"})
+ self.apply(tok, [], tools = [{}], enable_thinking = True)
+ assert tok.call_count == 1
+ assert "tools" in tok.last_kwargs
+ assert "enable_thinking" in tok.last_kwargs
+
+ def test_falls_back_when_template_rejects_reasoning_kwarg(self):
+ tok = self._Tok({"tools"})
+ self.apply(tok, [], tools = [{}], enable_thinking = True)
+ assert tok.call_count >= 2
+ assert tok.last_kwargs == {"tools": [{}]}
+
+ def test_falls_back_to_bare_call(self):
+ tok = self._Tok(set())
+ self.apply(tok, [], tools = [{}], enable_thinking = True)
+ assert tok.last_kwargs == {}
+
+ def test_jinja_error_propagates(self):
+ class Boom:
+ def apply_chat_template(self, *a, **kw):
+ raise ValueError("jinja: missing var")
+
+ with pytest.raises(ValueError):
+ self.apply(Boom(), [])
+
+ def test_no_kwargs_single_call(self):
+ tok = self._Tok(set())
+ self.apply(tok, [])
+ assert tok.call_count == 1
+
+
+# ────────────────────────────────────────────────────────────────────
+# Guardrails (allowlist, budget, streaming-leak, dedup, id offset,
+# auto_heal=False, canonical healed-arg key)
+# ────────────────────────────────────────────────────────────────────
+
+
+class TestGuardrails:
+ def test_disabled_tool_is_not_executed(self):
+ exec_fn = FakeExecuteTool([])
+ loop = run_safetensors_tool_loop(
+ single_turn = _fake_stream(
+ [
+ '{"name":"terminal","arguments":{"command":"echo bypass"}}'
+ ]
+ ),
+ messages = [{"role": "user", "content": "hi"}],
+ tools = [{"type": "function", "function": {"name": "web_search"}}],
+ execute_tool = exec_fn,
+ max_tool_iterations = 2,
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == []
+ tool_ends = [e for e in events if e["type"] == "tool_end"]
+ assert tool_ends and "not enabled" in tool_ends[0]["result"].lower()
+
+ def test_empty_tools_list_does_not_enforce_allowlist(self):
+ exec_fn = FakeExecuteTool(["OK"])
+ loop = run_safetensors_tool_loop(
+ single_turn = _fake_stream(
+ [
+ '{"name":"python","arguments":{"code":"print(1)"}}'
+ ]
+ ),
+ messages = [{"role": "user", "content": "hi"}],
+ tools = [],
+ execute_tool = exec_fn,
+ max_tool_iterations = 2,
+ )
+ _collect_events(loop)
+ assert exec_fn.calls == [("python", {"code": "print(1)"})]
+
+ def test_max_iterations_zero_executes_no_tools(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ]
+ ],
+ exec_results = ["OK"],
+ max_tool_iterations = 0,
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == []
+ assert events and events[-1] == {"type": "status", "text": ""}
+
+ def test_streaming_clips_before_tool_signal_no_leak(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ "I will look this up. ",
+ "Some more prose that's long enough to leave the buffer. ",
+ '{"name":"web_search","arguments":{"query":"x"}}',
+ ],
+ ["all done"],
+ ],
+ exec_results = ["weather: sunny"],
+ max_tool_iterations = 2,
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "x"})]
+ for e in events:
+ if e["type"] == "content":
+ assert "" not in e["text"]
+ assert "web_search" not in e["text"]
+
+ def test_auto_heal_disabled_still_parses_valid_tool_call(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"x"}}'
+ ],
+ ["done"],
+ ],
+ exec_results = ["OK"],
+ auto_heal_tool_calls = False,
+ max_tool_iterations = 2,
+ )
+ _collect_events(loop)
+ assert exec_fn.calls == [("web_search", {"query": "x"})]
+
+ def test_non_consecutive_duplicate_is_short_circuited(self):
+ loop, exec_fn = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"A"}}'
+ ],
+ [
+ '{"name":"web_search","arguments":{"query":"B"}}'
+ ],
+ [
+ '{"name":"web_search","arguments":{"query":"A"}}'
+ ],
+ ["final"],
+ ],
+ exec_results = ["res-A", "res-B"],
+ max_tool_iterations = 4,
+ )
+ events = _collect_events(loop)
+ assert exec_fn.calls == [
+ ("web_search", {"query": "A"}),
+ ("web_search", {"query": "B"}),
+ ]
+ tool_ends = [e for e in events if e["type"] == "tool_end"]
+ assert "already made this exact call" in tool_ends[-1]["result"]
+
+ def test_coerce_string_args_python_uses_code_key(self):
+ assert _coerce_arguments("print(1)", heal = True, tool_name = "python") == {
+ "code": "print(1)"
+ }
+
+ def test_coerce_string_args_terminal_uses_command_key(self):
+ assert _coerce_arguments("ls -la", heal = True, tool_name = "terminal") == {
+ "command": "ls -la"
+ }
+
+ def test_tool_call_ids_unique_across_loop_iterations(self):
+ loop, _exec = _make_loop(
+ turns = [
+ [
+ '{"name":"web_search","arguments":{"query":"A"}}'
+ ],
+ [
+ '{"name":"web_search","arguments":{"query":"B"}}'
+ ],
+ ["done"],
+ ],
+ exec_results = ["A", "B"],
+ max_tool_iterations = 3,
+ )
+ events = _collect_events(loop)
+ ids = [e["tool_call_id"] for e in events if e["type"] == "tool_start"]
+ assert len(ids) == 2 and ids[0] != ids[1]
+
+
+# ────────────────────────────────────────────────────────────────────
+# Shared gpt-oss name detector
+# ────────────────────────────────────────────────────────────────────
+
+
+class TestGptOssNameDetection:
+ def test_substring_match(self):
+ assert is_gpt_oss_model_name("unsloth/gpt-oss-20b") is True
+
+ def test_negative_known_non_oss_model(self):
+ assert is_gpt_oss_model_name("meta-llama/Llama-3.1-8B-Instruct") is False
+
+ def test_empty_or_none_returns_false(self):
+ assert is_gpt_oss_model_name("") is False
+ assert is_gpt_oss_model_name(None) is False
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
diff --git a/studio/backend/utils/datasets/__init__.py b/studio/backend/utils/datasets/__init__.py
index c9237d83c1..7988b09972 100644
--- a/studio/backend/utils/datasets/__init__.py
+++ b/studio/backend/utils/datasets/__init__.py
@@ -59,6 +59,7 @@
TEMPLATE_TO_MODEL_MAPPER,
MODEL_TO_TEMPLATE_MAPPER,
TEMPLATE_TO_RESPONSES_MAPPER,
+ is_gpt_oss_model_name,
)
# Legacy imports from the original dataset_utils.py for backward compatibility
@@ -98,6 +99,7 @@
"TEMPLATE_TO_MODEL_MAPPER",
"MODEL_TO_TEMPLATE_MAPPER",
"TEMPLATE_TO_RESPONSES_MAPPER",
+ "is_gpt_oss_model_name",
# Main entry points
"check_dataset_format",
"format_and_template_dataset",
diff --git a/studio/backend/utils/datasets/model_mappings.py b/studio/backend/utils/datasets/model_mappings.py
index 21e8566ac5..eb2e5482c9 100644
--- a/studio/backend/utils/datasets/model_mappings.py
+++ b/studio/backend/utils/datasets/model_mappings.py
@@ -442,6 +442,26 @@
MODEL_TO_TEMPLATE_MAPPER[value.lower()] = lowered_key
+def is_gpt_oss_model_name(name: str) -> bool:
+ """Name-based check for gpt-oss / harmony models.
+
+ Used by both the in-process backend and the parent-process
+ orchestrator to detect harmony models without an IPC round-trip.
+ """
+ name = (name or "").lower()
+ if not name:
+ return False
+ try:
+ if MODEL_TO_TEMPLATE_MAPPER.get(name) == "gpt-oss":
+ return True
+ for key, tmpl in MODEL_TO_TEMPLATE_MAPPER.items():
+ if tmpl == "gpt-oss" and (key in name or name in key):
+ return True
+ except Exception:
+ pass
+ return "gpt-oss" in name
+
+
TEMPLATE_TO_RESPONSES_MAPPER = {
"gemma-4-thinking": {
"instruction": "<|turn>user\n",
diff --git a/studio/frontend/src/features/chat/api/chat-adapter.ts b/studio/frontend/src/features/chat/api/chat-adapter.ts
index aeca357f1c..626e6dbb2d 100644
--- a/studio/frontend/src/features/chat/api/chat-adapter.ts
+++ b/studio/frontend/src/features/chat/api/chat-adapter.ts
@@ -3,7 +3,7 @@
import type { ChatModelAdapter } from "@assistant-ui/react";
import type { MessageTiming, ToolCallMessagePart } from "@assistant-ui/core";
-import { toast } from "sonner";
+import { toast } from "@/lib/toast";
import { getAuthToken } from "@/features/auth/session";
import { apiUrl } from "@/lib/api-base";
import {
@@ -16,7 +16,10 @@ import {
validateModel,
} from "./chat-api";
import { pickFriendlyContainerName } from "../lib/friendly-names";
-import { createOpenAIContainer } from "./openai-containers";
+import {
+ createOpenAIContainer,
+ listOpenAIContainers,
+} from "./openai-containers";
import {
encryptProviderApiKey,
isProviderKeyRotationError,
@@ -31,6 +34,7 @@ import {
isCustomProviderType,
loadExternalProviders,
parseExternalModelId,
+ providerTypeSupportsVision,
supportsProviderPromptCaching,
toExternalBackendProviderType,
} from "../external-providers";
@@ -46,6 +50,7 @@ import {
import { useChatRuntimeStore } from "../stores/chat-runtime-store";
import { isMultimodalResponse } from "../types/api";
import type { ChatModelSummary } from "../types/runtime";
+import { getImageInputUnavailableReason } from "../utils/image-input-support";
import {
hasClosedThinkTag,
parseAssistantContent,
@@ -588,6 +593,12 @@ async function autoLoadSmallestModel(): Promise<{
reasoningStyle: sfLoadResp.reasoning_style ?? "enable_thinking",
supportsPreserveThinking: sfLoadResp.supports_preserve_thinking ?? false,
supportsTools: sfLoadResp.supports_tools ?? false,
+ // Parity with the GGUF branch above.
+ toolsEnabled: sfLoadResp.supports_tools ?? false,
+ codeToolsEnabled: sfLoadResp.supports_tools ?? false,
+ defaultChatTemplate: sfLoadResp.chat_template ?? null,
+ chatTemplateOverride: null,
+ loadedChatTemplateOverride: null,
});
const sfModel: ChatModelSummary = {
id: repo.repo_id,
@@ -779,6 +790,37 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
}
const imageBase64 = findLatestUserImageBase64(messages);
const audioBase64 = findLatestUserAudioBase64(messages);
+
+ // Block when ANY image is in the outbound payload (current or
+ // prior turns) and the loaded model can't process images. Keeps
+ // the gate simple: once a chat contains an image, a non-vision
+ // model can't respond — user starts a new chat to switch models.
+ if (imageBase64) {
+ const activeModel = runtime.models.find(
+ (m) => m.id === params.checkpoint,
+ );
+ const imageGateReason = getImageInputUnavailableReason({
+ activeModel,
+ isExternalModel: isExternalRequest,
+ externalSupportsVision: providerTypeSupportsVision(
+ externalProvider?.providerType,
+ ),
+ externalModelLabel: externalSelection?.modelId ?? null,
+ loadedIsMultimodal: runtime.loadedIsMultimodal,
+ modelLoaded: !!params.checkpoint && !runtime.modelLoading,
+ });
+ if (imageGateReason) {
+ toast.error(imageGateReason);
+ // Flip the per-thread running flag on→off so the compare-mode
+ // waitForRunEnd resolves instead of hanging. This gate fires
+ // before the streaming path's setThreadRunning(true), so the
+ // wait promise would otherwise never settle.
+ const gatedThreadKey = resolvedThreadId || "__default";
+ runtime.setThreadRunning(gatedThreadKey, true);
+ runtime.setThreadRunning(gatedThreadKey, false);
+ throw new Error(imageGateReason);
+ }
+ }
// Clear pending audio from store after extracting (consumed on send)
if (audioBase64) {
const audioName = runtime.pendingAudioName;
@@ -1013,6 +1055,41 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
openaiCodeExecContainerId = null;
anthropicCodeExecContainerId = null;
}
+ // Pre-send container validation (OpenAI only). The list
+ // endpoint already filters status==="expired" server-side
+ // (studio/backend/routes/inference.py — list_openai_containers),
+ // so membership in this set means "OpenAI will accept it
+ // as container_reference". A stale id silently dropped here
+ // falls through to the inheritance + lazy-create logic
+ // below, so the user never sees "Container is expired" in
+ // the chat thread. On list-call failure we leave
+ // activeContainerIds null and skip validation — the
+ // backend's transparent retry path is the safety net for
+ // that case.
+ let activeContainerIds: Set | null = null;
+ if (externalProvider.providerType === "openai") {
+ try {
+ const list = await listOpenAIContainers({
+ apiKey: externalApiKey,
+ baseUrl: externalProvider.baseUrl || null,
+ });
+ activeContainerIds = new Set(list.map((c) => c.id));
+ } catch {
+ activeContainerIds = null;
+ }
+ if (
+ activeContainerIds &&
+ openaiCodeExecContainerId &&
+ !activeContainerIds.has(openaiCodeExecContainerId)
+ ) {
+ void db.threads
+ .update(resolvedThreadId, {
+ openaiCodeExecContainerId: null,
+ })
+ .catch(() => {});
+ openaiCodeExecContainerId = null;
+ }
+ }
// Cross-thread inheritance: when the active thread has
// no container yet, default to the one most recently
// used on *any* other thread (provider-scoped).
@@ -1033,15 +1110,27 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
.toArray();
for (const t of others) {
if (t.id === resolvedThreadId) continue;
- if (t.openaiCodeExecContainerId) {
- openaiCodeExecContainerId = t.openaiCodeExecContainerId;
+ if (!t.openaiCodeExecContainerId) continue;
+ // Skip inherited ids that are not in the active
+ // container set — they would 400 on send. Also
+ // null them on the source thread so the next
+ // inheritance pass doesn't re-pick the same dead id.
+ if (
+ activeContainerIds &&
+ !activeContainerIds.has(t.openaiCodeExecContainerId)
+ ) {
void db.threads
- .update(resolvedThreadId, {
- openaiCodeExecContainerId,
- })
+ .update(t.id, { openaiCodeExecContainerId: null })
.catch(() => {});
- break;
+ continue;
}
+ openaiCodeExecContainerId = t.openaiCodeExecContainerId;
+ void db.threads
+ .update(resolvedThreadId, {
+ openaiCodeExecContainerId,
+ })
+ .catch(() => {});
+ break;
}
} catch {
/* fall through to lazy-create below */