diff --git a/.github/workflows/embedding.yml b/.github/workflows/embedding.yml
new file mode 100644
index 0000000000000..f3689461774af
--- /dev/null
+++ b/.github/workflows/embedding.yml
@@ -0,0 +1,220 @@
+# Embedding CLI build and tests
+name: Embedding CLI
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [master, feature/**]
+    paths:
+      - '.github/workflows/embedding.yml'
+      - 'examples/**'
+      - 'src/**'
+      - 'ggml/**'
+      - 'include/**'
+      - '**/CMakeLists.txt'
+      - 'tests/e2e/embedding/**'
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - '.github/workflows/embedding.yml'
+      - 'examples/**'
+      - 'src/**'
+      - 'ggml/**'
+      - 'include/**'
+      - '**/CMakeLists.txt'
+      - 'tests/e2e/embedding/**'
+
+jobs:
+  embedding-cli-tests-linux:
+    runs-on: ubuntu-latest
+    env:
+      LLAMA_CACHE: tmp  # stable path for cache
+      EMBD_TEST_DEBUG: "1"
+
+    steps:
+      - uses: actions/checkout@v4
+        with: { fetch-depth: 0 }
+
+      - name: Restore model cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/llama.cpp
+            tmp
+          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
+          restore-keys: |
+            hf-${{ runner.os }}-
+            hf-
+
+      - name: Install system deps
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install \
+            build-essential cmake curl libcurl4-openssl-dev python3-pip
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with: { python-version: '3.11' }
+
+      - name: Install Python deps
+        run: |
+          python -m pip install -r requirements.txt || echo "No extra requirements found"
+          python -m pip install pytest numpy pytest-timeout
+
+      - name: Build llama-embedding
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release
+          cmake --build build --target llama-embedding -j $(nproc)
+
+      - name: Pre-download tiny model (retry x3 on network)
+        run: |
+          set -e
+          tries=0
+          until ./build/bin/llama-embedding \
+            -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
+            -hff embeddinggemma-300M-qat-Q4_0.gguf \
+            --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
+            tries=$((tries+1))
+            if [ $tries -ge 3 ]; then
+              echo "Pre-download failed after $tries attempts"
+              exit 1
+            fi
+            echo "Retrying download ($tries/3)..."
+            sleep 3
+          done
+
+      - name: Run embedding tests (30s per-test cap)
+        shell: bash
+        run: |
+          set -o pipefail
+          pytest -v tests/e2e/embedding \
+            --timeout=30 \
+            --durations=10 \
+            --junitxml=pytest-report.xml | tee pytest-output.txt
+
+      - name: Upload test artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: linux-embedding-tests
+          path: |
+            pytest-output.txt
+            pytest-report.xml
+
+      # a second actions/cache@v4 step would only restore again;
+      # cache/save is the primitive that saves even after failures
+      - name: Save model cache
+        if: always()
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            ~/.cache/llama.cpp
+            tmp
+          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
+
+  embedding-cli-tests-windows:
+    runs-on: windows-latest
+    continue-on-error: true
+    env:
+      LLAMA_CACHE: tmp
+      EMBD_TEST_DEBUG: "1"
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with: { python-version: '3.11' }
+
+      # --- vcpkg plain bootstrap (no actions, no submodules) ---
+      - name: Bootstrap vcpkg
+        shell: pwsh
+        run: |
+          $env:VCPKG_ROOT = "$env:RUNNER_TEMP\vcpkg"
+          git clone https://github.com/microsoft/vcpkg $env:VCPKG_ROOT
+          & "$env:VCPKG_ROOT\bootstrap-vcpkg.bat" -disableMetrics
+          echo "VCPKG_ROOT=$env:VCPKG_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Install curl with OpenSSL via vcpkg
+        shell: pwsh
+        run: |
+          & "$env:VCPKG_ROOT\vcpkg.exe" install curl[openssl]:x64-windows
+
+      - name: Restore model cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/llama.cpp
+            tmp
+          key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
+          restore-keys: |
+            hf-${{ runner.os }}-
+            hf-
+
+      - name: Install Python deps
+        run: pip install pytest numpy
+
+      - name: Configure & Build (Release)
+        shell: pwsh
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release `
+            -DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_ROOT\scripts\buildsystems\vcpkg.cmake"
+          cmake --build build --target llama-embedding --config Release -j 2
+
+      - name: Pre-download tiny model (retry x3)
+        shell: bash
+        run: |
+          set -e
+          tries=0
+          until ./build/bin/Release/llama-embedding.exe \
+            -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
+            -hff embeddinggemma-300M-qat-Q4_0.gguf \
+            --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
+            tries=$((tries+1))
+            if [ $tries -ge 3 ]; then
+              echo "Pre-download failed after $tries attempts"; exit 1
+            fi
+            echo "Retrying download ($tries/3)..."; sleep 3
+          done
+
+      - name: Run smoke tests
+        shell: bash
+        run: |
+          pytest -q tests/e2e/embedding -k raw_vs_json_consistency
+
+  embedding-cli-tests-macos:
+    runs-on: macos-latest
+    continue-on-error: true
+    env:
+      LLAMA_CACHE: tmp
+      EMBD_TEST_DEBUG: "1"
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with: { python-version: '3.11' }
+
+      - name: Install Python deps
+        run: pip install pytest numpy
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release
+          cmake --build build --target llama-embedding -j 3
+
+      - name: Pre-download tiny model (retry x3)
+        run: |
+          set -e
+          tries=0
+          until ./build/bin/llama-embedding \
+            -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
+            -hff embeddinggemma-300M-qat-Q4_0.gguf \
+            --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
+            tries=$((tries+1))
+            if [ $tries -ge 3 ]; then
+              echo "Pre-download failed after $tries attempts"; exit 1
+            fi
+            echo "Retrying download ($tries/3)..."; sleep 3
+          done
+
+      - name: Warm cache & run a tiny smoke
+        run: |
+          ./build/bin/llama-embedding --help >/dev/null 2>&1
+          pytest -q tests/e2e/embedding -k raw_vs_json_consistency
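For local debugging it can help to reproduce the CI preflight outside of Actions. Below is a minimal Python sketch of the same retry loop the bash steps above implement; the binary path, model repo/file, and flags are copied from the workflow, while the helper name preflight_download and the fixed 3-attempt/3-second policy simply mirror it rather than extend it.

import subprocess
import time

def preflight_download(exe: str = "./build/bin/llama-embedding", attempts: int = 3) -> None:
    # Same model, flags, and stdin payload as the CI pre-download step.
    cmd = [
        exe,
        "-hfr", "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF",
        "-hff", "embeddinggemma-300M-qat-Q4_0.gguf",
        "--ctx-size", "16", "--embd-output-format", "json",
        "--no-warmup", "--threads", "1", "--seed", "42",
    ]
    for attempt in range(1, attempts + 1):
        res = subprocess.run(cmd, input="ok", text=True, capture_output=True)
        if res.returncode == 0:
            return
        if attempt == attempts:
            raise RuntimeError(f"pre-download failed after {attempts} attempts")
        print(f"Retrying download ({attempt}/{attempts})...")
        time.sleep(3)  # fixed backoff, matching the workflow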
diff --git a/tests/e2e/embedding/test_embedding_cli.py b/tests/e2e/embedding/test_embedding_cli.py
new file mode 100644
index 0000000000000..80f986ec86bb6
--- /dev/null
+++ b/tests/e2e/embedding/test_embedding_cli.py
@@ -0,0 +1,251 @@
+import json
+import hashlib
+import logging
+import os
+import pytest
+import subprocess
+from pathlib import Path
+import numpy as np
+import time
+from typing import Optional, List
+
+# ---------------------------------------------------------------------------
+# Configuration constants
+# ---------------------------------------------------------------------------
+
+EPS = 1e-3
+REPO_ROOT = Path(__file__).resolve().parents[3]
+EXE = REPO_ROOT / ("build/bin/llama-embedding.exe" if os.name == "nt" else "build/bin/llama-embedding")
+DEFAULT_ENV = {**os.environ, "LLAMA_CACHE": os.environ.get("LLAMA_CACHE", "tmp")}
+SEED = "42"
+ALLOWED_DIMS = {384, 768, 1024, 4096}
+
+SMALL_CTX = 16   # preflight/cache
+TEST_CTX = 1024  # main tests
+
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Shared helpers (single source of truth for command building)
+# ---------------------------------------------------------------------------
+
+
+def resolve_exe() -> Path:
+    exe = EXE
+    if not exe.exists() and os.name == "nt":
+        alt = REPO_ROOT / "build/bin/Release/llama-embedding.exe"
+        if alt.exists():
+            exe = alt
+    if not exe.exists():
+        raise FileNotFoundError(f"llama-embedding not found under {REPO_ROOT}/build/bin")
+    return exe
+
+
+def hf_params_default():
+    return {
+        "hf_repo": "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF",
+        "hf_file": "embeddinggemma-300M-qat-Q4_0.gguf",
+    }
+
+
+def build_cmd(
+    *,
+    exe: Path,
+    params: dict,
+    fmt: str,
+    threads: int,
+    ctx: int,
+    seed: str,
+    extra: Optional[List[str]] = None,  # was: list[str] | None
+) -> List[str]:  # was: list[str]
+    assert fmt in {"raw", "json"}, f"unsupported fmt={fmt}"
+    cmd = [
+        str(exe),
+        "-hfr", params["hf_repo"],
+        "-hff", params["hf_file"],
+        "--ctx-size", str(ctx),
+        "--embd-output-format", fmt,
+        "--threads", str(threads),
+        "--seed", seed,
+    ]
+    if extra:
+        cmd.extend(extra)
+    return cmd
+
+
+def run_cmd(cmd: List[str], text: str, timeout: int = 60) -> str:
+    t0 = time.perf_counter()
+    res = subprocess.run(cmd, input=text, capture_output=True, text=True,
+                         env=DEFAULT_ENV, timeout=timeout)
+    dur_ms = (time.perf_counter() - t0) * 1000.0
+    if os.environ.get("EMBD_TEST_DEBUG") == "1":
+        log.debug("embedding cmd finished in %.1f ms", dur_ms)
+
+    if res.returncode != 0:
+        raise AssertionError(f"embedding failed ({res.returncode}):\n{res.stderr[:400]}")
+    out = res.stdout.strip()
+    assert out, "empty stdout from llama-embedding"
+    return out
+
+# ---------------------------------------------------------------------------
+# Session model preflight/cache
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="session")
+def embedding_model():
+    """Download/cache model once per session with a tiny ctx + no warmup."""
+    exe = resolve_exe()
+    params = hf_params_default()
+    cmd = build_cmd(
+        exe=exe, params=params, fmt="json",
+        threads=1, ctx=SMALL_CTX, seed=SEED,
+        extra=["--no-warmup"],
+    )
+    _ = run_cmd(cmd, text="ok")
+    return params
+
+# ---------------------------------------------------------------------------
+# Utility functions
+# ---------------------------------------------------------------------------
+
+
+def run_embedding(
+    text: str,
+    *,
+    fmt: str = "raw",
+    threads: int = 1,
+    ctx: int = TEST_CTX,
+    params: Optional[dict] = None,  # was: dict | None
+    timeout: int = 60,
+) -> str:
+    exe = resolve_exe()
+    params = params or hf_params_default()
+    cmd = build_cmd(exe=exe, params=params, fmt=fmt, threads=threads, ctx=ctx, seed=SEED)
+    return run_cmd(cmd, text, timeout=timeout)
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+
+def embedding_hash(vec: np.ndarray) -> str:
+    """Return short deterministic signature for regression tracking."""
+    return hashlib.sha256(vec[:8].tobytes()).hexdigest()[:16]
+
+
+def parse_vec(out: str, fmt: str) -> np.ndarray:
+    if fmt == "raw":
+        arr = np.array(out.split(), dtype=np.float32)
+    else:
+        arr = np.array(json.loads(out)["data"][0]["embedding"], dtype=np.float32)
+    return arr
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+# Silence unknown-mark warnings rather than registering custom marks in pytest.ini
+pytestmark = pytest.mark.filterwarnings("ignore::pytest.PytestUnknownMarkWarning")
+
+
+@pytest.mark.parametrize("fmt", ["raw", "json"])
+@pytest.mark.parametrize("text", ["hello world", "hi 🌎", "line1\nline2\nline3"])
+def test_embedding_runs_and_finite(fmt, text, embedding_model):
+    out = run_embedding(text, fmt=fmt, threads=1, ctx=TEST_CTX, params=embedding_model)
+    vec = parse_vec(out, fmt)
+    assert vec.dtype == np.float32
+    # dim & finiteness
+    assert len(vec) in ALLOWED_DIMS, f"unexpected dim={len(vec)}"
+    assert np.all(np.isfinite(vec))
+    assert 0.1 < np.linalg.norm(vec) < 10
+
+
+def test_raw_vs_json_consistency(embedding_model):
+    text = "hello world"
+    raw = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
+    jsn = parse_vec(run_embedding(text, fmt="json", params=embedding_model), "json")
+    assert raw.shape == jsn.shape
+    cos = cosine_similarity(raw, jsn)
+    assert cos > 0.999, f"raw/json divergence: cos={cos:.6f}"
+    assert embedding_hash(raw) == embedding_hash(jsn)
+
+
+def test_empty_input_deterministic(embedding_model):
+    v1 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw")
+    v2 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw")
+    assert np.all(np.isfinite(v1))
+    assert embedding_hash(v1) == embedding_hash(v2)
+    assert cosine_similarity(v1, v2) > 0.99999
+
+
+def test_very_long_input_stress(embedding_model):
+    """Stress test: large input near context window."""
+    text = "lorem " * 2000
+    vec = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
+    assert len(vec) in ALLOWED_DIMS
+    assert np.isfinite(np.linalg.norm(vec))
+
+
+@pytest.mark.parametrize("text", [" ", "\n\n\n", "123 456 789"])
+def test_low_information_inputs_stable(text, embedding_model):
+    """Whitespace/numeric inputs should yield stable embeddings."""
+    v1 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
+    v2 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
+    cos = cosine_similarity(v1, v2)
+    assert cos > 0.999, f"unstable embedding for {text!r}"
+
+
+@pytest.mark.parametrize("flag", ["--no-such-flag", "--help"])
+def test_invalid_or_help_flag(flag):
+    """Invalid flags should fail; help should succeed."""
+    exe = resolve_exe()
+    res = subprocess.run([str(exe), flag], capture_output=True, text=True, env=DEFAULT_ENV)
+    if flag == "--no-such-flag":
+        assert res.returncode != 0
+        assert any(k in res.stderr.lower() for k in ("error", "invalid", "unknown"))
+    else:
+        assert res.returncode == 0
+        assert "usage" in (res.stdout.lower() + res.stderr.lower())
+
+
+@pytest.mark.parametrize("fmt", ["raw", "json"])
+def test_threads_two_similarity_vs_single(fmt, embedding_model):
+    text = "determinism vs threads"
+    single = parse_vec(run_embedding(text, fmt=fmt, threads=1, params=embedding_model), fmt)
+    multi = parse_vec(run_embedding(text, fmt=fmt, threads=2, params=embedding_model), fmt)
+    assert single.shape == multi.shape
+    cos = cosine_similarity(single, multi)
+    assert cos >= 0.999, f"threads>1 similarity too low: {cos:.6f}"
+
+
+def test_json_shape_schema_minimal(embedding_model):
+    js = json.loads(run_embedding("schema check", fmt="json", params=embedding_model))
+    assert isinstance(js, dict)
+
+    # Top-level "object" (present in CLI) is optional for us
+    if "object" in js:
+        assert js["object"] in ("list", "embeddings", "embedding_list")
+
+    # Required: data[0].embedding + index
+    assert "data" in js and isinstance(js["data"], list) and len(js["data"]) >= 1
+    item0 = js["data"][0]
+    assert isinstance(item0, dict)
+    if "object" in item0:
+        assert item0["object"] in ("embedding",)
+    assert "index" in item0 and item0["index"] == 0
+    assert "embedding" in item0 and isinstance(item0["embedding"], list)
+    assert len(item0["embedding"]) in ALLOWED_DIMS
+
+    # Optional fields: tolerate absence in current CLI output
+    if "model" in js:
+        assert isinstance(js["model"], str)
+    if "dim" in js:
+        assert js["dim"] == len(item0["embedding"])
+    usage = js.get("usage", {})
+    if usage:
+        assert isinstance(usage, dict)
+        # if present, prompt_tokens should be int
+        if "prompt_tokens" in usage:
+            assert isinstance(usage["prompt_tokens"], int)