From c1c3d99ef61bb37f2d6af7f02ebd57a6fc770498 Mon Sep 17 00:00:00 2001
From: Sam Malayek <malayek@gmail.com>
Date: Fri, 31 Oct 2025 17:42:24 -0700
Subject: [PATCH 1/3] Add e2e tests for embedding raw flag

---
 .github/workflows/embeddings.yml |  59 ++++++++++
 examples/tests/__init__.py       |   0
 examples/tests/test_embedding.py | 184 +++++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+)
 create mode 100644 .github/workflows/embeddings.yml
 create mode 100644 examples/tests/__init__.py
 create mode 100644 examples/tests/test_embedding.py

diff --git a/.github/workflows/embeddings.yml b/.github/workflows/embeddings.yml
new file mode 100644
index 0000000000000..c530a696d6f9e
--- /dev/null
+++ b/.github/workflows/embeddings.yml
@@ -0,0 +1,59 @@
+# Embedding CLI build and tests
+name: Embedding CLI
+
+on:
+    workflow_dispatch:
+    push:
+        branches:
+            - feature/*
+            - master
+        paths:
+            - '.github/workflows/embeddings.yml'
+            - 'examples/embedding/**'
+            - 'examples/tests/**'
+    pull_request:
+        types: [opened, synchronize, reopened]
+        paths:
+            - '.github/workflows/embeddings.yml'
+            - 'examples/embedding/**'
+            - 'examples/tests/**'
+
+jobs:
+    embedding-cli-tests:
+        runs-on: ubuntu-latest
+
+        steps:
+            - name: Install system deps
+              run: |
+                  sudo apt-get update
+                  sudo apt-get -y install \
+                    build-essential \
+                    cmake \
+                    curl \
+                    libcurl4-openssl-dev \
+                    python3-pip
+
+            - name: Checkout repository
+              uses: actions/checkout@v4
+              with:
+                  fetch-depth: 0
+
+            - name: Set up Python
+              uses: actions/setup-python@v5
+              with:
+                  python-version: '3.11'
+
+            - name: Install Python deps
+              run: |
+                  pip install -r requirements.txt || echo "No extra requirements found"
+                  pip install pytest
+
+            - name: Build llama-embedding
+              run: |
+                  cmake -B build \
+                    -DCMAKE_BUILD_TYPE=Release
+                  cmake --build build --target llama-embedding -j $(nproc)
+
+            - name: Run embedding tests
+              run: |
+                  pytest -v examples/tests
diff --git a/examples/tests/__init__.py b/examples/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/examples/tests/test_embedding.py b/examples/tests/test_embedding.py
new file mode 100644
index 0000000000000..414b01d4d475f
--- /dev/null
+++ b/examples/tests/test_embedding.py
@@ -0,0 +1,184 @@
+import os, json, subprocess, hashlib
+from pathlib import Path
+import numpy as np
+import pytest
+
+# ---------------------------------------------------------------------------
+# Configuration constants
+# ---------------------------------------------------------------------------
+
+EPS = 1e-3
+REPO_ROOT = Path(__file__).resolve().parents[2]
+EXE = REPO_ROOT / ("build/bin/llama-embedding.exe" if os.name == "nt" else "build/bin/llama-embedding")
+DEFAULT_ENV = {**os.environ, "LLAMA_CACHE": os.environ.get("LLAMA_CACHE", "tmp")}
+SEED = "42"
+
+
+# ---------------------------------------------------------------------------
+# Model setup helpers
+# ---------------------------------------------------------------------------
+
+def get_model_hf_params():
+    """Default lightweight embedding model."""
+    return {
+        "hf_repo": "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF",
+        "hf_file": "embeddinggemma-300M-qat-Q4_0.gguf",
+    }
+
+
+@pytest.fixture(scope="session")
+def embedding_model():
+    """Download/cache model once per session."""
+    exe_path = EXE
+    if not exe_path.exists():
+        alt = REPO_ROOT / "build/bin/Release/llama-embedding.exe"
+        if alt.exists():
+            exe_path = alt
+        else:
+            raise FileNotFoundError(f"llama-embedding binary not found under {REPO_ROOT}/build/bin")
+
+    params = get_model_hf_params()
+    cmd = [
+        str(exe_path),
+        "-hfr", params["hf_repo"],
+        "-hff", params["hf_file"],
+        "--ctx-size", "16",
+        "--embd-output-format", "json",
+        "--no-warmup",
+        "--threads", "1",
+        "--seed", SEED,
+    ]
+    res = subprocess.run(cmd, input="ok", capture_output=True, text=True, env=DEFAULT_ENV)
+    assert res.returncode == 0, f"model download failed: {res.stderr}"
+    return params
+
+
+# ---------------------------------------------------------------------------
+# Utility functions
+# ---------------------------------------------------------------------------
+
+def run_embedding(text: str, fmt: str = "raw", params=None) -> str:
+    """Runs llama-embedding and returns stdout (string)."""
+    exe_path = EXE
+    if not exe_path.exists():
+        raise FileNotFoundError(f"Missing binary: {exe_path}")
+    params = params or get_model_hf_params()
+    cmd = [
+        str(exe_path),
+        "-hfr", params["hf_repo"],
+        "-hff", params["hf_file"],
+        "--ctx-size", "2048",
+        "--embd-output-format", fmt,
+        "--threads", "1",
+        "--seed", SEED,
+    ]
+    result = subprocess.run(cmd, input=text, capture_output=True, text=True, env=DEFAULT_ENV)
+    if result.returncode:
+        raise AssertionError(f"embedding failed ({result.returncode}):\n{result.stderr[:400]}")
+    out = result.stdout.strip()
+    assert out, f"empty output for text={text!r}, fmt={fmt}"
+    return out
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+
+def embedding_hash(vec: np.ndarray) -> str:
+    """Return short deterministic signature for regression tracking."""
+    return hashlib.sha256(vec[:8].tobytes()).hexdigest()[:16]
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+# Register custom mark so pytest doesn't warn about it
+pytestmark = pytest.mark.filterwarnings("ignore::pytest.PytestUnknownMarkWarning")
+
+@pytest.mark.slow
+@pytest.mark.parametrize("fmt", ["raw", "json"])
+@pytest.mark.parametrize("text", ["hello world", "hi 🌎", "line1\nline2\nline3"])
+def test_embedding_runs_and_finite(fmt, text, embedding_model):
+    """Ensure embeddings run end-to-end and produce finite floats."""
+    out = run_embedding(text, fmt, embedding_model)
+    floats = (
+        np.array(out.split(), float)
+        if fmt == "raw"
+        else np.array(json.loads(out)["data"][0]["embedding"], float)
+    )
+    assert len(floats) > 100
+    assert np.all(np.isfinite(floats)), f"non-finite values in {fmt} output"
+    assert 0.1 < np.linalg.norm(floats) < 10
+
+
+def test_raw_vs_json_consistency(embedding_model):
+    """Compare raw vs JSON embedding output for same text."""
+    text = "hello world"
+    raw = np.array(run_embedding(text, "raw", embedding_model).split(), float)
+    jsn = np.array(json.loads(run_embedding(text, "json", embedding_model))["data"][0]["embedding"], float)
+
+    assert raw.shape == jsn.shape
+    cos = cosine_similarity(raw, jsn)
+    assert cos > 0.999, f"divergence: cos={cos:.4f}"
+    assert embedding_hash(raw) == embedding_hash(jsn), "hash mismatch → possible nondeterminism"
+
+
+def test_empty_input_deterministic(embedding_model):
+    """Empty input should yield finite, deterministic vector."""
+    v1 = np.array(run_embedding("", "raw", embedding_model).split(), float)
+    v2 = np.array(run_embedding("", "raw", embedding_model).split(), float)
+    assert np.all(np.isfinite(v1))
+    cos = cosine_similarity(v1, v2)
+    assert cos > 0.9999, f"Empty input not deterministic (cos={cos:.5f})"
+    assert 0.1 < np.linalg.norm(v1) < 10
+
+
+@pytest.mark.slow
+def test_very_long_input_stress(embedding_model):
+    """Stress test: large input near context window."""
+    text = "lorem " * 2000
+    vec = np.array(run_embedding(text, "raw", embedding_model).split(), float)
+    assert len(vec) > 100
+    assert np.isfinite(np.linalg.norm(vec))
+
+
+@pytest.mark.parametrize(
+    "text",
+    ["   ", "\n\n\n", "123 456 789"],
+)
+def test_low_information_inputs_stable(text, embedding_model):
+    """Whitespace/numeric inputs should yield stable embeddings."""
+    v1 = np.array(run_embedding(text, "raw", embedding_model).split(), float)
+    v2 = np.array(run_embedding(text, "raw", embedding_model).split(), float)
+    cos = cosine_similarity(v1, v2)
+    assert cos > 0.999, f"unstable embedding for {text!r}"
+
+
+@pytest.mark.parametrize("flag", ["--no-such-flag", "--help"])
+def test_invalid_or_help_flag(flag):
+    """Invalid flags should fail; help should succeed."""
+    res = subprocess.run([str(EXE), flag], capture_output=True, text=True)
+    if flag == "--no-such-flag":
+        assert res.returncode != 0
+        assert any(k in res.stderr.lower() for k in ("error", "invalid", "unknown"))
+    else:
+        assert res.returncode == 0
+        assert "usage" in (res.stdout.lower() + res.stderr.lower())
+
+
+@pytest.mark.parametrize("fmt", ["raw", "json"])
+@pytest.mark.parametrize("text", ["deterministic test", "deterministic test again"])
+def test_repeated_call_consistent(fmt, text, embedding_model):
+    """Same input → same hash across repeated runs."""
+    out1 = run_embedding(text, fmt, embedding_model)
+    out2 = run_embedding(text, fmt, embedding_model)
+
+    if fmt == "json":
+        v1 = np.array(json.loads(out1)["data"][0]["embedding"], float)
+        v2 = np.array(json.loads(out2)["data"][0]["embedding"], float)
+    else:
+        v1 = np.array(out1.split(), float)
+        v2 = np.array(out2.split(), float)
+
+    assert embedding_hash(v1) == embedding_hash(v2)

From 2de1e6871f99d324ebfa703495cb06a6ed0a2521 Mon Sep 17 00:00:00 2001
From: Sam Malayek <malayek@gmail.com>
Date: Sun, 2 Nov 2025 13:55:01 -0800
Subject: [PATCH 2/3] Increase scope of embedding cli tests

---
 .../{embeddings.yml => embedding.yml}         | 26 ++++++++++++-------
 examples/tests/__init__.py                    |  0
 .../e2e/embedding/test_embedding_cli.py       | 10 ++++---
 3 files changed, 23 insertions(+), 13 deletions(-)
 rename .github/workflows/{embeddings.yml => embedding.yml} (71%)
 delete mode 100644 examples/tests/__init__.py
 rename examples/tests/test_embedding.py => tests/e2e/embedding/test_embedding_cli.py (98%)

diff --git a/.github/workflows/embeddings.yml b/.github/workflows/embedding.yml
similarity index 71%
rename from .github/workflows/embeddings.yml
rename to .github/workflows/embedding.yml
index c530a696d6f9e..db566b36d5f77 100644
--- a/.github/workflows/embeddings.yml
+++ b/.github/workflows/embedding.yml
@@ -4,19 +4,25 @@ name: Embedding CLI
 on:
     workflow_dispatch:
     push:
-        branches:
-            - feature/*
-            - master
+        branches: [master, feature/**]
         paths:
-            - '.github/workflows/embeddings.yml'
-            - 'examples/embedding/**'
-            - 'examples/tests/**'
+            - '.github/workflows/embedding.yml'
+            - 'examples/**'
+            - 'src/**'
+            - 'ggml/**'
+            - 'include/**'
+            - '**/CMakeLists.txt'
+            - 'tests/e2e/embedding/**'
     pull_request:
         types: [opened, synchronize, reopened]
         paths:
-            - '.github/workflows/embeddings.yml'
-            - 'examples/embedding/**'
-            - 'examples/tests/**'
+            - '.github/workflows/embedding.yml'
+            - 'examples/**'
+            - 'src/**'
+            - 'ggml/**'
+            - 'include/**'
+            - '**/CMakeLists.txt'
+            - 'tests/e2e/embedding/**'
 
 jobs:
     embedding-cli-tests:
@@ -56,4 +62,4 @@ jobs:
 
             - name: Run embedding tests
               run: |
-                  pytest -v examples/tests
+                  pytest -v tests/e2e/embedding
diff --git a/examples/tests/__init__.py b/examples/tests/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/examples/tests/test_embedding.py b/tests/e2e/embedding/test_embedding_cli.py
similarity index 98%
rename from examples/tests/test_embedding.py
rename to tests/e2e/embedding/test_embedding_cli.py
index 414b01d4d475f..cf4731d6bfd65 100644
--- a/examples/tests/test_embedding.py
+++ b/tests/e2e/embedding/test_embedding_cli.py
@@ -1,14 +1,17 @@
-import os, json, subprocess, hashlib
+import json
+import hashlib
+import os
+import pytest
+import subprocess
 from pathlib import Path
 import numpy as np
-import pytest
 
 # ---------------------------------------------------------------------------
 # Configuration constants
 # ---------------------------------------------------------------------------
 
 EPS = 1e-3
-REPO_ROOT = Path(__file__).resolve().parents[2]
+REPO_ROOT = Path(__file__).resolve().parents[3]
 EXE = REPO_ROOT / ("build/bin/llama-embedding.exe" if os.name == "nt" else "build/bin/llama-embedding")
 DEFAULT_ENV = {**os.environ, "LLAMA_CACHE": os.environ.get("LLAMA_CACHE", "tmp")}
 SEED = "42"
@@ -96,6 +99,7 @@ def embedding_hash(vec: np.ndarray) -> str:
 # Register custom mark so pytest doesn't warn about it
 pytestmark = pytest.mark.filterwarnings("ignore::pytest.PytestUnknownMarkWarning")
 
+
 @pytest.mark.slow
 @pytest.mark.parametrize("fmt", ["raw", "json"])
 @pytest.mark.parametrize("text", ["hello world", "hi 🌎", "line1\nline2\nline3"])

From 5ce810ee516d87134bd5a1536a38a9564204c26a Mon Sep 17 00:00:00 2001
From: Sam Malayek <malayek@gmail.com>
Date: Mon, 3 Nov 2025 00:35:33 -0800
Subject: [PATCH 3/3] Update test and workflow to match new RFC

---
 .github/workflows/embedding.yml           | 193 +++++++++++++++--
 tests/e2e/embedding/test_embedding_cli.py | 247 ++++++++++++++--------
 2 files changed, 329 insertions(+), 111 deletions(-)

diff --git a/.github/workflows/embedding.yml b/.github/workflows/embedding.yml
index db566b36d5f77..f3689461774af 100644
--- a/.github/workflows/embedding.yml
+++ b/.github/workflows/embedding.yml
@@ -25,41 +25,196 @@ on:
             - 'tests/e2e/embedding/**'
 
 jobs:
-    embedding-cli-tests:
+    embedding-cli-tests-linux:
         runs-on: ubuntu-latest
+        env:
+            LLAMA_CACHE: tmp   # stable path for cache
+            EMBD_TEST_DEBUG: "1"
 
         steps:
+            - uses: actions/checkout@v4
+              with: { fetch-depth: 0 }
+
+            - name: Restore model cache
+              uses: actions/cache@v4
+              with:
+                  path: |
+                      ~/.cache/llama.cpp
+                      tmp
+                  key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
+                  restore-keys: |
+                      hf-${{ runner.os }}-
+                      hf-
+
             - name: Install system deps
               run: |
                   sudo apt-get update
                   sudo apt-get -y install \
-                    build-essential \
-                    cmake \
-                    curl \
-                    libcurl4-openssl-dev \
-                    python3-pip
-
-            - name: Checkout repository
-              uses: actions/checkout@v4
-              with:
-                  fetch-depth: 0
+                    build-essential cmake curl libcurl4-openssl-dev python3-pip
 
             - name: Set up Python
               uses: actions/setup-python@v5
-              with:
-                  python-version: '3.11'
+              with: { python-version: '3.11' }
 
             - name: Install Python deps
               run: |
-                  pip install -r requirements.txt || echo "No extra requirements found"
-                  pip install pytest
+                  python -m pip install -r requirements.txt || echo "No extra requirements found"
+                  python -m pip install pytest numpy pytest-timeout
 
             - name: Build llama-embedding
               run: |
-                  cmake -B build \
-                    -DCMAKE_BUILD_TYPE=Release
+                  cmake -B build -DCMAKE_BUILD_TYPE=Release
                   cmake --build build --target llama-embedding -j $(nproc)
 
-            - name: Run embedding tests
+            - name: Pre-download tiny model (retry x3 on network)
+              run: |
+                  set -e
+                  tries=0
+                  until ./build/bin/llama-embedding \
+                      -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
+                      -hff embeddinggemma-300M-qat-Q4_0.gguf \
+                      --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
+                    tries=$((tries+1))
+                    if [ $tries -ge 3 ]; then
+                      echo "Pre-download failed after $tries attempts"
+                      exit 1
+                    fi
+                    echo "Retrying download ($tries/3)..."
+                    sleep 3
+                  done
+
+            - name: Run embedding tests (30s per-test cap)
+              shell: bash
+              run: |
+                  set -o pipefail
+                  pytest -v tests/e2e/embedding \
+                  --timeout=30 \
+                  --durations=10 \
+                  --junitxml=pytest-report.xml | tee pytest-output.txt
+
+            - name: Upload test artifacts
+              if: always()
+              uses: actions/upload-artifact@v4
+              with:
+                  name: linux-embedding-tests
+                  path: |
+                      pytest-output.txt
+                      pytest-report.xml
+
+            - name: Save model cache
+              if: always()
+              uses: actions/cache@v4
+              with:
+                  path: |
+                      ~/.cache/llama.cpp
+                      tmp
+                  key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
+
+    embedding-cli-tests-windows:
+        runs-on: windows-latest
+        continue-on-error: true
+        env:
+            LLAMA_CACHE: tmp
+            EMBD_TEST_DEBUG: "1"
+
+        steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-python@v5
+              with: { python-version: '3.11' }
+
+            # --- vcpkg plain bootstrap (no actions, no submodules) ---
+            - name: Bootstrap vcpkg
+              shell: pwsh
+              run: |
+                  $env:VCPKG_ROOT = "$env:RUNNER_TEMP\vcpkg"
+                  git clone https://github.com/microsoft/vcpkg $env:VCPKG_ROOT
+                  & "$env:VCPKG_ROOT\bootstrap-vcpkg.bat" -disableMetrics
+                  echo "VCPKG_ROOT=$env:VCPKG_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+            - name: Install curl with OpenSSL via vcpkg
+              shell: pwsh
+              run: |
+                  & "$env:VCPKG_ROOT\vcpkg.exe" install curl[openssl]:x64-windows
+
+            - name: Restore model cache
+              uses: actions/cache@v4
+              with:  # cache paths are not shell-expanded: use '~', not $HOME
+                  path: |
+                      ~/.cache/llama.cpp
+                      tmp
+                  key: hf-${{ runner.os }}-embeddinggemma-300M-q4_0-v1
+                  restore-keys: |
+                      hf-${{ runner.os }}-
+                      hf-
+
+            - name: Install Python deps
+              run: pip install pytest numpy
+
+            - name: Configure & Build (Release)
+              shell: pwsh
+              run: |
+                  cmake -B build -DCMAKE_BUILD_TYPE=Release `
+                    -DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_ROOT\scripts\buildsystems\vcpkg.cmake"
+                  cmake --build build --target llama-embedding --config Release -j 2
+
+            - name: Pre-download tiny model (retry x3)
+              shell: bash
+              run: |
+                  set -e
+                  tries=0
+                  until ./build/bin/Release/llama-embedding.exe \
+                    -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
+                    -hff embeddinggemma-300M-qat-Q4_0.gguf \
+                    --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
+                    tries=$((tries+1))
+                    if [ $tries -ge 3 ]; then
+                      echo "Pre-download failed after $tries attempts"; exit 1
+                    fi
+                    echo "Retrying download ($tries/3)..."; sleep 3
+                  done
+
+            - name: Run smoke tests
+              shell: bash
+              run: |
+                  pytest -q tests/e2e/embedding -k raw_vs_json_consistency
+
+
+
+    embedding-cli-tests-macos:
+        runs-on: macos-latest
+        continue-on-error: true
+        env:
+            LLAMA_CACHE: tmp
+            EMBD_TEST_DEBUG: "1"
+        steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-python@v5
+              with: { python-version: '3.11' }
+
+            - name: Install Python deps
+              run: pip install pytest numpy
+
+            - name: Build
+              run: |
+                  cmake -B build -DCMAKE_BUILD_TYPE=Release
+                  cmake --build build --target llama-embedding -j 3
+
+            - name: Pre-download tiny model (retry x3)
+              run: |
+                  set -e
+                  tries=0
+                  until ./build/bin/llama-embedding \
+                    -hfr ggml-org/embeddinggemma-300M-qat-q4_0-GGUF \
+                    -hff embeddinggemma-300M-qat-Q4_0.gguf \
+                    --ctx-size 16 --embd-output-format json --no-warmup --threads 1 --seed 42 <<< "ok"; do
+                    tries=$((tries+1))
+                    if [ $tries -ge 3 ]; then
+                      echo "Pre-download failed after $tries attempts"; exit 1
+                    fi
+                    echo "Retrying download ($tries/3)..."; sleep 3
+                  done
+
+            - name: Warm cache & run a tiny smoke
               run: |
-                  pytest -v tests/e2e/embedding
+                  ./build/bin/llama-embedding --help >/dev/null 2>&1
+                  pytest -q tests/e2e/embedding -k raw_vs_json_consistency
diff --git a/tests/e2e/embedding/test_embedding_cli.py b/tests/e2e/embedding/test_embedding_cli.py
index cf4731d6bfd65..80f986ec86bb6 100644
--- a/tests/e2e/embedding/test_embedding_cli.py
+++ b/tests/e2e/embedding/test_embedding_cli.py
@@ -1,10 +1,13 @@
 import json
 import hashlib
+import logging
 import os
 import pytest
 import subprocess
 from pathlib import Path
 import numpy as np
+import time
+from typing import Optional, List
 
 # ---------------------------------------------------------------------------
 # Configuration constants
@@ -15,72 +18,111 @@
 EXE = REPO_ROOT / ("build/bin/llama-embedding.exe" if os.name == "nt" else "build/bin/llama-embedding")
 DEFAULT_ENV = {**os.environ, "LLAMA_CACHE": os.environ.get("LLAMA_CACHE", "tmp")}
 SEED = "42"
+ALLOWED_DIMS = {384, 768, 1024, 4096}
 
+SMALL_CTX = 16        # preflight/cache
+TEST_CTX  = 1024      # main tests
+
+log = logging.getLogger(__name__)
 
 # ---------------------------------------------------------------------------
-# Model setup helpers
+# Shared helpers (single source of truth for command building)
 # ---------------------------------------------------------------------------
 
-def get_model_hf_params():
-    """Default lightweight embedding model."""
+
+def resolve_exe() -> Path:
+    """Return the llama-embedding binary path; raise FileNotFoundError if it is absent."""
+    if EXE.exists():
+        return EXE
+    # On Windows only, fall back to the Release/ subdirectory of build/bin.
+    release_exe = REPO_ROOT / "build/bin/Release/llama-embedding.exe"
+    if os.name == "nt" and release_exe.exists():
+        return release_exe
+    raise FileNotFoundError(f"llama-embedding not found under {REPO_ROOT}/build/bin")
+
+
+def hf_params_default():
     return {
         "hf_repo": "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF",
         "hf_file": "embeddinggemma-300M-qat-Q4_0.gguf",
     }
 
 
-@pytest.fixture(scope="session")
-def embedding_model():
-    """Download/cache model once per session."""
-    exe_path = EXE
-    if not exe_path.exists():
-        alt = REPO_ROOT / "build/bin/Release/llama-embedding.exe"
-        if alt.exists():
-            exe_path = alt
-        else:
-            raise FileNotFoundError(f"llama-embedding binary not found under {REPO_ROOT}/build/bin")
-
-    params = get_model_hf_params()
+def build_cmd(
+    *,
+    exe: Path,
+    params: dict,
+    fmt: str,
+    threads: int,
+    ctx: int,
+    seed: str,
+    extra: Optional[List[str]] = None,  # was: list[str] | None
+) -> List[str]:  # was: list[str]
+    assert fmt in {"raw", "json"}, f"unsupported fmt={fmt}"
     cmd = [
-        str(exe_path),
+        str(exe),
         "-hfr", params["hf_repo"],
         "-hff", params["hf_file"],
-        "--ctx-size", "16",
-        "--embd-output-format", "json",
-        "--no-warmup",
-        "--threads", "1",
-        "--seed", SEED,
+        "--ctx-size", str(ctx),
+        "--embd-output-format", fmt,
+        "--threads", str(threads),
+        "--seed", seed,
     ]
-    res = subprocess.run(cmd, input="ok", capture_output=True, text=True, env=DEFAULT_ENV)
-    assert res.returncode == 0, f"model download failed: {res.stderr}"
-    return params
+    if extra:
+        cmd.extend(extra)
+    return cmd
+
+
+def run_cmd(cmd: List[str], text: str, timeout: int = 60) -> str:
+    """Run cmd with text on stdin; return stripped stdout. Fails on non-zero exit or empty output."""
+    t0 = time.perf_counter()
+    res = subprocess.run(cmd, input=text, capture_output=True, text=True,
+                         env=DEFAULT_ENV, timeout=timeout)
+    dur_ms = (time.perf_counter() - t0) * 1000.0
+    if os.environ.get("EMBD_TEST_DEBUG") == "1":
+        log.debug("embedding cmd finished in %.1f ms", dur_ms)
+    if res.returncode != 0:
+        raise AssertionError(f"embedding failed ({res.returncode}):\n{res.stderr[:400]}")
+    out = res.stdout.strip()
+    assert out, "empty stdout from llama-embedding"
+    return out
+
+# ---------------------------------------------------------------------------
+# Session model preflight/cache
+# ---------------------------------------------------------------------------
 
 
+@pytest.fixture(scope="session")
+def embedding_model():
+    """Run one tiny-ctx, no-warmup call per session (downloads/caches the model); return HF params."""
+    hf = hf_params_default()
+    preflight_cmd = build_cmd(
+        exe=resolve_exe(),
+        params=hf, fmt="json",
+        threads=1, ctx=SMALL_CTX, seed=SEED,
+        extra=["--no-warmup"],
+    )
+    run_cmd(preflight_cmd, text="ok")  # output is discarded; only success matters
+    return hf
+
 # ---------------------------------------------------------------------------
 # Utility functions
 # ---------------------------------------------------------------------------
 
-def run_embedding(text: str, fmt: str = "raw", params=None) -> str:
-    """Runs llama-embedding and returns stdout (string)."""
-    exe_path = EXE
-    if not exe_path.exists():
-        raise FileNotFoundError(f"Missing binary: {exe_path}")
-    params = params or get_model_hf_params()
-    cmd = [
-        str(exe_path),
-        "-hfr", params["hf_repo"],
-        "-hff", params["hf_file"],
-        "--ctx-size", "2048",
-        "--embd-output-format", fmt,
-        "--threads", "1",
-        "--seed", SEED,
-    ]
-    result = subprocess.run(cmd, input=text, capture_output=True, text=True, env=DEFAULT_ENV)
-    if result.returncode:
-        raise AssertionError(f"embedding failed ({result.returncode}):\n{result.stderr[:400]}")
-    out = result.stdout.strip()
-    assert out, f"empty output for text={text!r}, fmt={fmt}"
-    return out
+
+def run_embedding(
+    text: str,
+    *,
+    fmt: str = "raw",
+    threads: int = 1,
+    ctx: int = TEST_CTX,
+    params: Optional[dict] = None,  # was: dict | None
+    timeout: int = 60,
+) -> str:
+    """Embed text via the CLI and return run_cmd's result; params falls back to the default model."""
+    model = params or hf_params_default()
+    argv = build_cmd(exe=resolve_exe(), params=model, fmt=fmt, threads=threads, ctx=ctx, seed=SEED)
+    return run_cmd(argv, text, timeout=timeout)
 
 
 def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
@@ -92,69 +134,65 @@ def embedding_hash(vec: np.ndarray) -> str:
     return hashlib.sha256(vec[:8].tobytes()).hexdigest()[:16]
 
 
+def parse_vec(out: str, fmt: str) -> np.ndarray:
+    """Decode llama-embedding stdout ("raw" text, otherwise JSON) into a float32 vector."""
+    if fmt == "raw":
+        return np.array(out.split(), dtype=np.float32)
+    # Any other fmt is parsed as JSON; only the first entry of "data" is read.
+    return np.array(json.loads(out)["data"][0]["embedding"], dtype=np.float32)
+
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 
+
 # Register custom mark so pytest doesn't warn about it
 pytestmark = pytest.mark.filterwarnings("ignore::pytest.PytestUnknownMarkWarning")
 
 
-@pytest.mark.slow
 @pytest.mark.parametrize("fmt", ["raw", "json"])
 @pytest.mark.parametrize("text", ["hello world", "hi 🌎", "line1\nline2\nline3"])
 def test_embedding_runs_and_finite(fmt, text, embedding_model):
-    """Ensure embeddings run end-to-end and produce finite floats."""
-    out = run_embedding(text, fmt, embedding_model)
-    floats = (
-        np.array(out.split(), float)
-        if fmt == "raw"
-        else np.array(json.loads(out)["data"][0]["embedding"], float)
-    )
-    assert len(floats) > 100
-    assert np.all(np.isfinite(floats)), f"non-finite values in {fmt} output"
-    assert 0.1 < np.linalg.norm(floats) < 10
+    out = run_embedding(text, fmt=fmt, threads=1, ctx=TEST_CTX, params=embedding_model)
+    vec = parse_vec(out, fmt)
+    assert vec.dtype == np.float32
+    # dim & finiteness
+    assert len(vec) in ALLOWED_DIMS, f"unexpected dim={len(vec)}"
+    assert np.all(np.isfinite(vec))
+    assert 0.1 < np.linalg.norm(vec) < 10
 
 
 def test_raw_vs_json_consistency(embedding_model):
-    """Compare raw vs JSON embedding output for same text."""
     text = "hello world"
-    raw = np.array(run_embedding(text, "raw", embedding_model).split(), float)
-    jsn = np.array(json.loads(run_embedding(text, "json", embedding_model))["data"][0]["embedding"], float)
-
+    raw = parse_vec(run_embedding(text, fmt="raw",  params=embedding_model), "raw")
+    jsn = parse_vec(run_embedding(text, fmt="json", params=embedding_model), "json")
     assert raw.shape == jsn.shape
     cos = cosine_similarity(raw, jsn)
-    assert cos > 0.999, f"divergence: cos={cos:.4f}"
-    assert embedding_hash(raw) == embedding_hash(jsn), "hash mismatch → possible nondeterminism"
+    assert cos > 0.999, f"raw/json divergence: cos={cos:.6f}"
+    assert embedding_hash(raw) == embedding_hash(jsn)
 
 
 def test_empty_input_deterministic(embedding_model):
-    """Empty input should yield finite, deterministic vector."""
-    v1 = np.array(run_embedding("", "raw", embedding_model).split(), float)
-    v2 = np.array(run_embedding("", "raw", embedding_model).split(), float)
+    v1 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw")
+    v2 = parse_vec(run_embedding("", fmt="raw", params=embedding_model), "raw")
     assert np.all(np.isfinite(v1))
-    cos = cosine_similarity(v1, v2)
-    assert cos > 0.9999, f"Empty input not deterministic (cos={cos:.5f})"
-    assert 0.1 < np.linalg.norm(v1) < 10
+    assert embedding_hash(v1) == embedding_hash(v2)
+    assert cosine_similarity(v1, v2) > 0.99999
 
 
-@pytest.mark.slow
 def test_very_long_input_stress(embedding_model):
     """Stress test: large input near context window."""
     text = "lorem " * 2000
-    vec = np.array(run_embedding(text, "raw", embedding_model).split(), float)
-    assert len(vec) > 100
+    vec = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
+    assert len(vec) in ALLOWED_DIMS
     assert np.isfinite(np.linalg.norm(vec))
 
 
-@pytest.mark.parametrize(
-    "text",
-    ["   ", "\n\n\n", "123 456 789"],
-)
+@pytest.mark.parametrize("text", ["   ", "\n\n\n", "123 456 789"])
 def test_low_information_inputs_stable(text, embedding_model):
     """Whitespace/numeric inputs should yield stable embeddings."""
-    v1 = np.array(run_embedding(text, "raw", embedding_model).split(), float)
-    v2 = np.array(run_embedding(text, "raw", embedding_model).split(), float)
+    v1 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
+    v2 = parse_vec(run_embedding(text, fmt="raw", params=embedding_model), "raw")
     cos = cosine_similarity(v1, v2)
     assert cos > 0.999, f"unstable embedding for {text!r}"
 
@@ -162,7 +200,8 @@ def test_low_information_inputs_stable(text, embedding_model):
 @pytest.mark.parametrize("flag", ["--no-such-flag", "--help"])
 def test_invalid_or_help_flag(flag):
     """Invalid flags should fail; help should succeed."""
-    res = subprocess.run([str(EXE), flag], capture_output=True, text=True)
+    exe = resolve_exe()
+    res = subprocess.run([str(exe), flag], capture_output=True, text=True, env=DEFAULT_ENV)
     if flag == "--no-such-flag":
         assert res.returncode != 0
         assert any(k in res.stderr.lower() for k in ("error", "invalid", "unknown"))
@@ -172,17 +211,41 @@ def test_invalid_or_help_flag(flag):
 
 
 @pytest.mark.parametrize("fmt", ["raw", "json"])
-@pytest.mark.parametrize("text", ["deterministic test", "deterministic test again"])
-def test_repeated_call_consistent(fmt, text, embedding_model):
-    """Same input → same hash across repeated runs."""
-    out1 = run_embedding(text, fmt, embedding_model)
-    out2 = run_embedding(text, fmt, embedding_model)
-
-    if fmt == "json":
-        v1 = np.array(json.loads(out1)["data"][0]["embedding"], float)
-        v2 = np.array(json.loads(out2)["data"][0]["embedding"], float)
-    else:
-        v1 = np.array(out1.split(), float)
-        v2 = np.array(out2.split(), float)
-
-    assert embedding_hash(v1) == embedding_hash(v2)
+def test_threads_two_similarity_vs_single(fmt, embedding_model):
+    text = "determinism vs threads"  # same prompt for the 1-thread and 2-thread runs
+    single = parse_vec(run_embedding(text, fmt=fmt, threads=1, params=embedding_model), fmt)
+    multi  = parse_vec(run_embedding(text, fmt=fmt, threads=2, params=embedding_model), fmt)
+    assert single.shape == multi.shape
+    cos = cosine_similarity(single, multi)  # tolerance-based check; no hash/bit-exact comparison here
+    assert cos >= 0.999, f"threads>1 similarity too low: {cos:.6f}"
+
+
+def test_json_shape_schema_minimal(embedding_model):  # checks structure and types only
+    js = json.loads(run_embedding("schema check", fmt="json", params=embedding_model))
+    assert isinstance(js, dict)
+
+    # Top-level "object" is optional; when present it must be one of the accepted names
+    if "object" in js:
+        assert js["object"] in ("list", "embeddings", "embedding_list")
+
+    # Required: non-empty "data" list whose first item has index 0 and an embedding list
+    assert "data" in js and isinstance(js["data"], list) and len(js["data"]) >= 1
+    item0 = js["data"][0]
+    assert isinstance(item0, dict)
+    if "object" in item0:
+        assert item0["object"] in ("embedding",)
+    assert "index" in item0 and item0["index"] == 0
+    assert "embedding" in item0 and isinstance(item0["embedding"], list)
+    assert len(item0["embedding"]) in ALLOWED_DIMS
+
+    # Optional fields ("model", "dim", "usage"): validated only when present
+    if "model" in js:
+        assert isinstance(js["model"], str)
+    if "dim" in js:
+        assert js["dim"] == len(item0["embedding"])
+    usage = js.get("usage", {})
+    if usage:
+        assert isinstance(usage, dict)
+        # if present, prompt_tokens must be an int
+        if "prompt_tokens" in usage:
+            assert isinstance(usage["prompt_tokens"], int)