From 6969c5276329dd350d7e4f1fdd46ccd1ed5f4107 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Mon, 16 Mar 2026 23:54:32 +0000 Subject: [PATCH 1/4] checkpoint --- docker/.gitignore | 2 + docker/AGENTS.md | 17 +++ docker/Dockerfile.cuda | 235 ++++++++++++++++++++++++++++++ docker/Dockerfile.vulkan | 241 +++++++++++++++++++++++++++++++ docker/build-image.sh | 300 +++++++++++++++++++++++++++++++++++++++ docker/test-binaries.sh | 120 ++++++++++++++++ 6 files changed, 915 insertions(+) create mode 100644 docker/.gitignore create mode 100644 docker/AGENTS.md create mode 100644 docker/Dockerfile.cuda create mode 100644 docker/Dockerfile.vulkan create mode 100755 docker/build-image.sh create mode 100755 docker/test-binaries.sh diff --git a/docker/.gitignore b/docker/.gitignore new file mode 100644 index 00000000..b7697f06 --- /dev/null +++ b/docker/.gitignore @@ -0,0 +1,2 @@ +# BuildKit configuration file (generated by build-image.sh) +buildkitd.toml diff --git a/docker/AGENTS.md b/docker/AGENTS.md new file mode 100644 index 00000000..8c82679e --- /dev/null +++ b/docker/AGENTS.md @@ -0,0 +1,17 @@ +- you are working in my VM sandbox. It is safe to use sudo. 
+- use or install whatever tools you need to complete your goal +- use DOCKER_BUILDKIT=1 docker build -t llama-swap:optimized + - DOCKER_BUILDKIT=1 is important to use the caching +- ALWAYS send notifications to get the user's attention +- when running `./build-image.sh`, use a 2-hour (7200000ms) timeout minimum as CUDA builds take 60-120+ minutes to compile for multiple architectures + +# Notifications + +ALWAYS send notifications to keep the user informed: + +- When starting or finishing a job +- For progress updates on long-running tasks (especially Docker builds) +- For todo list progress updates (when items start/complete) +- When you need feedback or to elicit information from the user +- use pushover.sh , example: `pushover.sh "notification to send"` + diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda new file mode 100644 index 00000000..bebcb771 --- /dev/null +++ b/docker/Dockerfile.cuda @@ -0,0 +1,235 @@ +# Multi-stage Dockerfile for CUDA-optimized AI inference tools +# Includes: llama.cpp, whisper.cpp, stable-diffusion.cpp +# +# CUDA architectures targeting consumer/prosumer GPUs from Pascal through Blackwell: +# sm_60 - Pascal (HPC): Tesla P100 (GP100) - full FP16 2x throughput, HBM2 +# sm_61 - Pascal (consumer): Tesla P40, GTX 1080/1070/1060, Titan Xp (GP102/104/106) +# sm_75 - Turing: RTX 2080/2070/2060, Quadro RTX 6000, Tesla T4 - 2nd gen Tensor Cores, INT8/INT4 +# sm_86 - Ampere (consumer): RTX 3090/3080/3070/3060, RTX A6000 - 3rd gen Tensor Cores, BF16/TF32, 128 FP32 cores/SM +# sm_89 - Ada Lovelace: RTX 4090/4080/4070/4060, RTX 6000 Ada - FP8 support, 4th gen Tensor Cores +# sm_120 - Blackwell (consumer): RTX 5090/5080/5070 - requires CUDA 12.8+ (not included, base image is 12.4) +# +# Build with BuildKit for cache mounts: +# DOCKER_BUILDKIT=1 docker build -t llama-swap:latest . 
+# +# Features: +# - Separate git clone stages for independent caching +# - ccache support for C++ compilation caching +# - BuildKit cache mounts for cmake incremental builds +# - Independent project rebuilds + +# ============================================================================ +# Stage 1: Base Builder with Common Dependencies +# ============================================================================ +FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder-base + +ENV DEBIAN_FRONTEND=noninteractive +ENV CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" +ENV CCACHE_DIR=/ccache +ENV CCACHE_MAXSIZE=2G +ENV PATH="/usr/lib/ccache:${PATH}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + python3 \ + python3-pip \ + libssl-dev \ + curl \ + ca-certificates \ + ccache \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /src + +# ============================================================================ +# Stage 2: Source Cloning (Cached Independently) +# ============================================================================ + +FROM builder-base AS llama-source +ARG LLAMA_COMMIT_HASH=master +RUN git clone --filter=blob:none --no-checkout https://github.com/ggml-org/llama.cpp.git /src/llama.cpp && \ + cd /src/llama.cpp && \ + git fetch --depth=1 origin ${LLAMA_COMMIT_HASH} && \ + git checkout FETCH_HEAD + +FROM builder-base AS whisper-source +ARG WHISPER_COMMIT_HASH=master +RUN git clone --filter=blob:none --no-checkout https://github.com/ggml-org/whisper.cpp.git /src/whisper.cpp && \ + cd /src/whisper.cpp && \ + git fetch --depth=1 origin ${WHISPER_COMMIT_HASH} && \ + git checkout FETCH_HEAD + +FROM builder-base AS sd-source +ARG SD_COMMIT_HASH=master +RUN git clone --filter=blob:none --no-checkout https://github.com/leejet/stable-diffusion.cpp.git /src/stable-diffusion.cpp && \ + cd /src/stable-diffusion.cpp && \ + git fetch --depth=1 origin ${SD_COMMIT_HASH} && \ + git checkout FETCH_HEAD && \ + git 
submodule update --init --recursive --depth=1 + +# ============================================================================ +# Stage 3: Individual Project Builds with Cache Mounts +# ============================================================================ + +# Build whisper.cpp (first - quick to build) +FROM builder-base AS whisper-build +COPY --from=whisper-source /src/whisper.cpp /build/whisper.cpp +WORKDIR /build/whisper.cpp +RUN --mount=type=cache,target=/ccache \ + --mount=type=cache,target=/build/whisper.cpp/build \ + cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_VULKAN=OFF \ + -DCMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \ + -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \ + cmake --build build --config Release -j$(nproc) --target whisper-cli whisper-server && \ + mkdir -p /install/bin /install/lib && \ + cp -v build/bin/whisper-cli build/bin/whisper-server /install/bin/ 2>/dev/null || true && \ + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; + +# Build stable-diffusion.cpp (second - medium build time) +FROM builder-base AS sd-build +COPY --from=sd-source /src/stable-diffusion.cpp /build/stable-diffusion.cpp +WORKDIR /build/stable-diffusion.cpp +RUN --mount=type=cache,target=/ccache \ + --mount=type=cache,target=/build/stable-diffusion.cpp/build \ + set -e && \ + rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true && \ + cmake -B build \ + -DSD_CUDA=ON \ + -DGGML_CUDA=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_VULKAN=OFF \ + -DCMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs 
-lcuda" \ + -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \ + cmake --build build --config Release -j$(nproc) --target sd-cli sd-server && \ + mkdir -p /install/bin /install/lib && \ + cp -v build/bin/sd-cli build/bin/sd-server /install/bin/ && \ + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; + +# Build llama.cpp (last - longest build time) +FROM builder-base AS llama-build +COPY --from=llama-source /src/llama.cpp /build/llama.cpp +WORKDIR /build/llama.cpp +RUN --mount=type=cache,target=/ccache \ + --mount=type=cache,target=/build/llama.cpp/build \ + cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_VULKAN=OFF \ + -DCMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" \ + -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \ + -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \ + cmake --build build --config Release -j$(nproc) --target llama-cli llama-server && \ + # Copy outputs to install directory for easy extraction + mkdir -p /install/bin /install/lib /install/examples && \ + cp -v build/bin/llama-cli build/bin/llama-server /install/bin/ 2>/dev/null || true && \ + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; && \ + cp -vr examples/* /install/examples/ 2>/dev/null || true + +# ============================================================================ +# Stage 4: Runtime Stage +# ============================================================================ +FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime + +ARG LLAMA_SWAP_VERSION=v198 +ARG LLAMA_COMMIT_HASH=unknown +ARG WHISPER_COMMIT_HASH=unknown +ARG SD_COMMIT_HASH=unknown +ENV 
DEBIAN_FRONTEND=noninteractive +ENV PATH="/usr/local/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + python3 \ + python3-pip \ + curl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Download and install llama-swap binary +RUN LLAMA_SWAP_VER_NUM=$(echo "${LLAMA_SWAP_VERSION}" | sed 's/^v//') && \ + curl -L -o /tmp/llama-swap.tar.gz \ + "https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VER_NUM}_linux_amd64.tar.gz" && \ + tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin/ && \ + rm /tmp/llama-swap.tar.gz && \ + chmod +x /usr/local/bin/llama-swap + +# Copy libcuda stubs for CPU-only testing (not in ldconfig path to avoid conflicts with real drivers) +# Real NVIDIA drivers will be used when available; stubs are fallback for testing on CPU-only hosts +COPY --from=builder-base /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so +COPY --from=builder-base /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 + +# Copy test script for verifying binaries work correctly +COPY test-binaries.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/test-binaries.sh + +RUN pip3 install --no-cache-dir numpy sentencepiece + +WORKDIR /app + +# Copy specific binaries from each build stage +# Copy only specific binaries from each build stage +COPY --from=llama-build /install/bin/llama-server /usr/local/bin/ +COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/ +COPY --from=llama-build /install/lib/ /usr/local/lib/ +COPY --from=llama-build /install/examples/ /opt/llama.cpp/examples/ + +COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/ +COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/ +COPY --from=whisper-build /install/lib/ /usr/local/lib/ + +COPY --from=sd-build /install/bin/sd-server /usr/local/bin/ +COPY 
--from=sd-build /install/bin/sd-cli /usr/local/bin/ +COPY --from=sd-build /install/lib/ /usr/local/lib/ + +# Update library cache +RUN ldconfig + +# Create symlinks for common command names +RUN ln -sf /usr/local/bin/llama-cli /usr/local/bin/llama && \ + ln -sf /usr/local/bin/whisper-cli /usr/local/bin/whisper && \ + ln -sf /usr/local/bin/sd-cli /usr/local/bin/stable-diffusion + +# Verify installation +RUN echo "=== Installed Binaries ===" && \ + ls -la /usr/local/bin/llama* /usr/local/bin/whisper* /usr/local/bin/sd* /usr/local/bin/llama-swap 2>/dev/null || true && \ + echo "" && \ + echo "=== Library Check ===" && \ + ldconfig -p | grep -E "(ggml|llama|whisper|sd)" || true && \ + echo "" && \ + echo "=== llama-swap Version ===" && \ + llama-swap --version 2>/dev/null || true && \ + echo "" && \ + echo "=== llama-cli Version ===" && \ + llama-cli --version 2>/dev/null || true + +# Write version information to /versions.txt +RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \ + echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ + echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \ + echo "llama-swap: ${LLAMA_SWAP_VERSION}" >> /versions.txt && \ + echo "backend: CUDA" >> /versions.txt && \ + echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt + +WORKDIR /models +CMD ["bash"] diff --git a/docker/Dockerfile.vulkan b/docker/Dockerfile.vulkan new file mode 100644 index 00000000..1df1de5c --- /dev/null +++ b/docker/Dockerfile.vulkan @@ -0,0 +1,241 @@ +# Multi-stage Dockerfile for Vulkan GPU-accelerated AI inference tools +# Includes: llama.cpp, whisper.cpp, stable-diffusion.cpp +# Supports: AMD GPUs and other Vulkan-compatible hardware +# +# Build with BuildKit for cache mounts: +# DOCKER_BUILDKIT=1 docker build -t llama-swap:vulkan . 
+# +# Features: +# - Separate git clone stages for independent caching +# - ccache support for C++ compilation caching +# - BuildKit cache mounts for cmake incremental builds +# - Independent project rebuilds +# - No architecture targeting needed (Vulkan is portable) + +# ============================================================================ +# Stage 1: Base Builder with Common Dependencies +# ============================================================================ +FROM ubuntu:22.04 AS builder-base + +ENV DEBIAN_FRONTEND=noninteractive +ENV CCACHE_DIR=/ccache +ENV CCACHE_MAXSIZE=2G +ENV VULKAN_SDK=/opt/vulkan-sdk + +# Install build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + python3 \ + python3-pip \ + libssl-dev \ + curl \ + ca-certificates \ + ccache \ + wget \ + xz-utils \ + && rm -rf /var/lib/apt/lists/* + +# Install newer Vulkan SDK (1.3.275.0) - needed for whisper.cpp +RUN wget -q --show-progress \ + https://sdk.lunarg.com/sdk/download/1.3.275.0/linux/vulkansdk-linux-x86_64-1.3.275.0.tar.xz \ + -O /tmp/vulkan-sdk.tar.xz && \ + mkdir -p /opt && \ + tar -xf /tmp/vulkan-sdk.tar.xz -C /opt && \ + rm /tmp/vulkan-sdk.tar.xz + +# Set up Vulkan environment (SDK extracts to /opt/1.3.275.0/x86_64) +ENV VULKAN_SDK=/opt/1.3.275.0/x86_64 +ENV PATH="${VULKAN_SDK}/bin:${PATH}" +ENV CMAKE_PREFIX_PATH="${VULKAN_SDK}" +ENV VULKAN_INCLUDE_DIRS="${VULKAN_SDK}/include" + +# Create ccache symlinks for compiler caching +RUN mkdir -p /usr/lib/ccache && \ + ln -sf /usr/bin/ccache /usr/lib/ccache/gcc && \ + ln -sf /usr/bin/ccache /usr/lib/ccache/g++ && \ + ln -sf /usr/bin/ccache /usr/lib/ccache/cc && \ + ln -sf /usr/bin/ccache /usr/lib/ccache/c++ +ENV PATH="/usr/lib/ccache:${PATH}" + +WORKDIR /src + +# ============================================================================ +# Stage 2: Source Cloning (Cached Independently) +# 
============================================================================ + +FROM builder-base AS llama-source +ARG LLAMA_COMMIT_HASH=master +RUN git clone --filter=blob:none --no-checkout https://github.com/ggml-org/llama.cpp.git /src/llama.cpp && \ + cd /src/llama.cpp && \ + git fetch --depth=1 origin ${LLAMA_COMMIT_HASH} && \ + git checkout FETCH_HEAD + +FROM builder-base AS whisper-source +ARG WHISPER_COMMIT_HASH=master +RUN git clone --filter=blob:none --no-checkout https://github.com/ggml-org/whisper.cpp.git /src/whisper.cpp && \ + cd /src/whisper.cpp && \ + git fetch --depth=1 origin ${WHISPER_COMMIT_HASH} && \ + git checkout FETCH_HEAD + +FROM builder-base AS sd-source +ARG SD_COMMIT_HASH=master +RUN git clone --filter=blob:none --no-checkout https://github.com/leejet/stable-diffusion.cpp.git /src/stable-diffusion.cpp && \ + cd /src/stable-diffusion.cpp && \ + git fetch --depth=1 origin ${SD_COMMIT_HASH} && \ + git checkout FETCH_HEAD && \ + git submodule update --init --recursive --depth=1 + +# ============================================================================ +# Stage 3: Individual Project Builds with Cache Mounts +# ============================================================================ + +# Build whisper.cpp (first - quick to build) +FROM builder-base AS whisper-build +COPY --from=whisper-source /src/whisper.cpp /build/whisper.cpp +WORKDIR /build/whisper.cpp +RUN --mount=type=cache,target=/ccache \ + --mount=type=cache,target=/build/whisper.cpp/build \ + rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true && \ + cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_VULKAN=ON \ + -DVulkan_INCLUDE_DIR="${VULKAN_SDK}/include" \ + -DVulkan_LIBRARY="${VULKAN_SDK}/lib/libvulkan.so" \ + -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build --config Release -j$(nproc) --target whisper-cli whisper-server && \ + mkdir -p /install/bin /install/lib && \ + cp -v build/bin/whisper-cli build/bin/whisper-server /install/bin/ && \ + find build 
-name "*.so*" -type f -exec cp -v {} /install/lib/ \; + +# Build stable-diffusion.cpp (second - medium build time) +FROM builder-base AS sd-build +COPY --from=sd-source /src/stable-diffusion.cpp /build/stable-diffusion.cpp +WORKDIR /build/stable-diffusion.cpp +RUN --mount=type=cache,target=/ccache \ + --mount=type=cache,target=/build/stable-diffusion.cpp/build \ + set -e && \ + rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true && \ + cmake -B build \ + -DSD_VULKAN=ON \ + -DGGML_VULKAN=ON \ + -DSD_BUILD_EXAMPLES=ON \ + -DGGML_NATIVE=OFF \ + -DVulkan_INCLUDE_DIR="${VULKAN_SDK}/include" \ + -DVulkan_LIBRARY="${VULKAN_SDK}/lib/libvulkan.so" \ + -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build --config Release -j$(nproc) --target sd-cli sd-server && \ + mkdir -p /install/bin /install/lib && \ + cp -v build/bin/sd-cli build/bin/sd-server /install/bin/ && \ + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; + +# Build llama.cpp (last - longest build time) +FROM builder-base AS llama-build +COPY --from=llama-source /src/llama.cpp /build/llama.cpp +WORKDIR /build/llama.cpp +RUN --mount=type=cache,target=/ccache \ + --mount=type=cache,target=/build/llama.cpp/build \ + rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true && \ + cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_VULKAN=ON \ + -DGGML_BACKEND_DL=ON \ + -DVulkan_INCLUDE_DIR="${VULKAN_SDK}/include" \ + -DVulkan_LIBRARY="${VULKAN_SDK}/lib/libvulkan.so" \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build --config Release -j$(nproc) --target llama-cli llama-server && \ + # Copy outputs to install directory for easy extraction + mkdir -p /install/bin /install/lib /install/examples && \ + cp -v build/bin/llama-cli build/bin/llama-server /install/bin/ 2>/dev/null || true && \ + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; && \ + cp -vr examples/* /install/examples/ 2>/dev/null || true + +# 
============================================================================ +# Stage 4: Runtime Stage +# ============================================================================ +FROM ubuntu:22.04 AS runtime + +ARG LLAMA_SWAP_VERSION=v198 +ARG LLAMA_COMMIT_HASH=unknown +ARG WHISPER_COMMIT_HASH=unknown +ARG SD_COMMIT_HASH=unknown +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH="/usr/local/bin:${PATH}" +# Allow the Vulkan loader to find the Mesa ICDs +ENV VK_DRIVER_FILES=/usr/share/vulkan/icd.d/lvp_icd.x86_64.json:/usr/share/vulkan/icd.d/radeon_icd.x86_64.json:/usr/share/vulkan/icd.d/intel_icd.x86_64.json + +# Install runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + libvulkan1 \ + mesa-vulkan-drivers \ + vulkan-tools \ + python3 \ + python3-pip \ + curl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Download and install llama-swap binary +RUN LLAMA_SWAP_VER_NUM=$(echo "${LLAMA_SWAP_VERSION}" | sed 's/^v//') && \ + curl -L -o /tmp/llama-swap.tar.gz \ + "https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VER_NUM}_linux_amd64.tar.gz" && \ + tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin/ && \ + rm /tmp/llama-swap.tar.gz && \ + chmod +x /usr/local/bin/llama-swap + +RUN pip3 install --no-cache-dir numpy sentencepiece + +WORKDIR /app + +# Copy specific binaries from each build stage +# Copy only specific binaries from each build stage +COPY --from=llama-build /install/bin/llama-server /usr/local/bin/ +COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/ +COPY --from=llama-build /install/lib/ /usr/local/lib/ +COPY --from=llama-build /install/examples/ /opt/llama.cpp/examples/ + +COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/ +COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/ +COPY --from=whisper-build /install/lib/ /usr/local/lib/ + +COPY --from=sd-build /install/bin/sd-server /usr/local/bin/ +COPY 
--from=sd-build /install/bin/sd-cli /usr/local/bin/ +COPY --from=sd-build /install/lib/ /usr/local/lib/ + +# Update library cache +RUN ldconfig + +# Create symlinks for common command names +RUN ln -sf /usr/local/bin/llama-cli /usr/local/bin/llama && \ + ln -sf /usr/local/bin/whisper-cli /usr/local/bin/whisper && \ + ln -sf /usr/local/bin/sd-cli /usr/local/bin/stable-diffusion + +# Verify installation +RUN echo "=== Installed Binaries ===" && \ + ls -la /usr/local/bin/llama* /usr/local/bin/whisper* /usr/local/bin/sd* /usr/local/bin/llama-swap 2>/dev/null || true && \ + echo "" && \ + echo "=== Library Check ===" && \ + ldconfig -p | grep -E "(ggml|llama|whisper|sd)" || true && \ + echo "" && \ + echo "=== Vulkan Support Check ===" && \ + ls -la /usr/share/vulkan/icd.d/ 2>/dev/null || true && \ + echo "" && \ + echo "=== llama-swap Version ===" && \ + llama-swap --version 2>/dev/null || true && \ + echo "" && \ + echo "=== llama-cli Version ===" && \ + llama-cli --version 2>/dev/null || true + +# Write version information to /versions.txt +RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \ + echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ + echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \ + echo "llama-swap: ${LLAMA_SWAP_VERSION}" >> /versions.txt && \ + echo "backend: Vulkan" >> /versions.txt && \ + echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt + +WORKDIR /models +CMD ["bash"] diff --git a/docker/build-image.sh b/docker/build-image.sh new file mode 100755 index 00000000..d1f420fe --- /dev/null +++ b/docker/build-image.sh @@ -0,0 +1,300 @@ +#!/bin/bash +# +# Build script for llama-swap-docker with commit hash pinning +# +# Usage: +# ./build-image.sh --cuda # Build CUDA image +# ./build-image.sh --vulkan # Build Vulkan image +# ./build-image.sh --cuda --no-cache # Build CUDA image without cache +# LLAMA_COMMIT_HASH=abc123 ./build-image.sh --cuda # Override llama.cpp commit +# 
WHISPER_COMMIT_HASH=def456 ./build-image.sh --vulkan # Override whisper.cpp commit +# SD_COMMIT_HASH=ghi789 ./build-image.sh --cuda # Override stable-diffusion.cpp commit +# LLAMA_SWAP_VERSION=v198 ./build-image.sh --cuda # Override llama-swap version +# +# Features: +# - Auto-detects latest commit hashes from git repos +# - Auto-detects latest llama-swap release +# - Allows environment variable overrides for reproducible builds +# - Cache-friendly: changing commit hash busts cache appropriately +# - Supports both CUDA and Vulkan backends (requires explicit flag) +# + +set -euo pipefail + +# Parse command line arguments +BACKEND="" +NO_CACHE=false + +if [[ $# -eq 0 ]]; then + echo "Error: No backend specified. Please use --cuda or --vulkan." + echo "" + echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]" + echo "" + echo "Options:" + echo " --cuda Build CUDA image (NVIDIA GPUs)" + echo " --vulkan Build Vulkan image (AMD GPUs and compatible hardware)" + echo " --no-cache Force rebuild without using Docker cache" + echo " --help, -h Show this help message" + echo "" + echo "Environment variables:" + echo " DOCKER_IMAGE_TAG Set custom image tag (default: llama-swap:latest or llama-swap:vulkan)" + echo " LLAMA_COMMIT_HASH Override llama.cpp commit hash" + echo " WHISPER_COMMIT_HASH Override whisper.cpp commit hash" + echo " SD_COMMIT_HASH Override stable-diffusion.cpp commit hash" + echo " LLAMA_SWAP_VERSION Override llama-swap version (default: latest, e.g., v198)" + exit 1 +fi + +for arg in "$@"; do + case $arg in + --cuda) + BACKEND="cuda" + ;; + --vulkan) + BACKEND="vulkan" + ;; + --no-cache) + NO_CACHE=true + ;; + --help|-h) + echo "Usage: ./build-image.sh --cuda|--vulkan [--no-cache]" + echo "" + echo "Options:" + echo " --cuda Build CUDA image (NVIDIA GPUs)" + echo " --vulkan Build Vulkan image (AMD GPUs and compatible hardware)" + echo " --no-cache Force rebuild without using Docker cache" + echo " --help, -h Show this help message" + echo "" + echo 
"Environment variables:" + echo " DOCKER_IMAGE_TAG Set custom image tag (default: llama-swap:latest or llama-swap:vulkan)" + echo " LLAMA_COMMIT_HASH Override llama.cpp commit hash" + echo " WHISPER_COMMIT_HASH Override whisper.cpp commit hash" + echo " SD_COMMIT_HASH Override stable-diffusion.cpp commit hash" + echo " LLAMA_SWAP_VERSION Override llama-swap version (default: latest, e.g., v198)" + exit 0 + ;; + esac +done + +# Validate backend selection +if [[ -z "$BACKEND" ]]; then + echo "Error: No backend specified. Please use --cuda or --vulkan." + exit 1 +fi + +# Configuration +if [[ -n "${DOCKER_IMAGE_TAG:-}" ]]; then + # User provided a custom tag, use it as-is + : +elif [[ "$BACKEND" == "vulkan" ]]; then + DOCKER_IMAGE_TAG="llama-swap:vulkan" +else + DOCKER_IMAGE_TAG="llama-swap:cuda" +fi +DOCKER_BUILDKIT="${DOCKER_BUILDKIT:-1}" + +# Set Dockerfile based on backend +if [[ "$BACKEND" == "vulkan" ]]; then + DOCKERFILE="Dockerfile.vulkan" + echo "Building for: Vulkan (AMD GPUs and compatible hardware)" +else + DOCKERFILE="Dockerfile.cuda" + echo "Building for: CUDA (NVIDIA GPUs)" +fi + +# Git repository URLs +LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git" +WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git" +SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git" +LLAMA_SWAP_REPO="mostlygeek/llama-swap" + +# Function to get the latest llama-swap release version +get_latest_llama_swap_version() { + curl -s "https://api.github.com/repos/${LLAMA_SWAP_REPO}/releases/latest" | grep -oP '"tag_name": "\K[^"]+' || echo "" +} + +# Function to get the latest commit hash from a git repo's default branch +get_latest_commit() { + local repo_url="$1" + local branch="${2:-master}" + + # Try to get the latest commit hash for the specified branch + git ls-remote --heads "${repo_url}" "${branch}" 2>/dev/null | head -1 | cut -f1 +} + +# Function to get the default branch name (master or main) +get_default_branch() { + local repo_url="$1" + + # Check for 
master first + if git ls-remote --heads "${repo_url}" master &>/dev/null; then + echo "master" + elif git ls-remote --heads "${repo_url}" main &>/dev/null; then + echo "main" + else + echo "master" # fallback + fi +} + +echo "==========================================" +echo "llama-swap-docker Build Script" +echo "==========================================" +echo "" + +# Determine commit hashes - use env vars or auto-detect +if [[ -n "${LLAMA_COMMIT_HASH:-}" ]]; then + LLAMA_HASH="${LLAMA_COMMIT_HASH}" + echo "llama.cpp: Using provided commit hash: ${LLAMA_HASH}" +else + LLAMA_BRANCH=$(get_default_branch "${LLAMA_REPO}") + LLAMA_HASH=$(get_latest_commit "${LLAMA_REPO}" "${LLAMA_BRANCH}") + if [[ -z "${LLAMA_HASH}" ]]; then + echo "ERROR: Could not determine latest commit for llama.cpp" >&2 + exit 1 + fi + echo "llama.cpp: Auto-detected latest commit (${LLAMA_BRANCH}): ${LLAMA_HASH}" +fi + +if [[ -n "${WHISPER_COMMIT_HASH:-}" ]]; then + WHISPER_HASH="${WHISPER_COMMIT_HASH}" + echo "whisper.cpp: Using provided commit hash: ${WHISPER_HASH}" +else + WHISPER_BRANCH=$(get_default_branch "${WHISPER_REPO}") + WHISPER_HASH=$(get_latest_commit "${WHISPER_REPO}" "${WHISPER_BRANCH}") + if [[ -z "${WHISPER_HASH}" ]]; then + echo "ERROR: Could not determine latest commit for whisper.cpp" >&2 + exit 1 + fi + echo "whisper.cpp: Auto-detected latest commit (${WHISPER_BRANCH}): ${WHISPER_HASH}" +fi + +if [[ -n "${SD_COMMIT_HASH:-}" ]]; then + SD_HASH="${SD_COMMIT_HASH}" + echo "stable-diffusion.cpp: Using provided commit hash: ${SD_HASH}" +else + SD_BRANCH=$(get_default_branch "${SD_REPO}") + SD_HASH=$(get_latest_commit "${SD_REPO}" "${SD_BRANCH}") + if [[ -z "${SD_HASH}" ]]; then + echo "ERROR: Could not determine latest commit for stable-diffusion.cpp" >&2 + exit 1 + fi + echo "stable-diffusion.cpp: Auto-detected latest commit (${SD_BRANCH}): ${SD_HASH}" +fi + +# Determine llama-swap version - use env var or auto-detect +if [[ -n "${LLAMA_SWAP_VERSION:-}" ]]; then + 
LLAMA_SWAP_VER="${LLAMA_SWAP_VERSION}" + echo "llama-swap: Using provided version: ${LLAMA_SWAP_VER}" +else + LLAMA_SWAP_VER=$(get_latest_llama_swap_version) + if [[ -z "${LLAMA_SWAP_VER}" ]]; then + echo "ERROR: Could not determine latest llama-swap version" >&2 + exit 1 + fi + echo "llama-swap: Auto-detected latest version: ${LLAMA_SWAP_VER}" +fi + +echo "" +echo "==========================================" +echo "Starting Docker build..." +echo "==========================================" +echo "" + +# Build the Docker image with commit hashes and llama-swap version as build args +BUILD_ARGS=( + --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}" + --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}" + --build-arg "SD_COMMIT_HASH=${SD_HASH}" + --build-arg "LLAMA_SWAP_VERSION=${LLAMA_SWAP_VER}" + -t "${DOCKER_IMAGE_TAG}" + -f "${DOCKERFILE}" +) + +if [[ "$NO_CACHE" == true ]]; then + BUILD_ARGS+=(--no-cache) + echo "Note: Building without cache" +fi + +# Use docker buildx with a custom builder for parallelism control +# The legacy DOCKER_BUILDKIT=1 docker build doesn't respect BUILDKIT_MAX_PARALLELISM env var +# We need to use a custom builder with a buildkitd.toml config file +BUILDER_NAME="llama-swap-builder" + +# Check if our custom builder exists with the right config, create/update if needed +if ! docker buildx inspect "$BUILDER_NAME" >/dev/null 2>&1; then + echo "Creating custom buildx builder with max-parallelism=1..." + + # Create buildkitd.toml config file + cat > buildkitd.toml << 'BUILDKIT_EOF' +[worker.oci] + max-parallelism = 1 +BUILDKIT_EOF + + # Create the builder with the config + docker buildx create --name "$BUILDER_NAME" \ + --driver docker-container \ + --buildkitd-config buildkitd.toml \ + --use +else + # Switch to our builder + docker buildx use "$BUILDER_NAME" +fi + +echo "Building with sequential stages (one at a time), each using all CPU cores..." 
+echo "Using builder: $BUILDER_NAME" + +# Use docker buildx build with --load to load the image into Docker +# The --builder flag ensures we use our custom builder with max-parallelism=1 +docker buildx build --builder "$BUILDER_NAME" --load "${BUILD_ARGS[@]}" . + +echo "" +echo "==========================================" +echo "Verifying build artifacts..." +echo "==========================================" +echo "" + +# Verify all expected binaries exist in the image +MISSING_BINARIES=() + +for binary in llama-server llama-cli whisper-server whisper-cli sd-server sd-cli llama-swap; do + if ! docker run --rm "${DOCKER_IMAGE_TAG}" which "${binary}" >/dev/null 2>&1; then + MISSING_BINARIES+=("${binary}") + fi +done + +if [[ ${#MISSING_BINARIES[@]} -gt 0 ]]; then + echo "ERROR: Build succeeded but the following binaries are missing from the image:" + for binary in "${MISSING_BINARIES[@]}"; do + echo " - ${binary}" + done + echo "" + echo "This usually indicates a build stage failure. Try running with --no-cache flag:" + echo " ./build-image.sh --vulkan --no-cache" + exit 1 +fi + +echo "All expected binaries verified: llama-server, llama-cli, whisper-server, whisper-cli, sd-server, sd-cli, llama-swap" + +echo "" +echo "==========================================" +echo "Build complete!" 
+echo "==========================================" +echo "" +echo "Image tag: ${DOCKER_IMAGE_TAG}" +echo "" +echo "Built with:" +echo " llama.cpp: ${LLAMA_HASH}" +echo " whisper.cpp: ${WHISPER_HASH}" +echo " stable-diffusion.cpp: ${SD_HASH}" +echo " llama-swap: ${LLAMA_SWAP_VER}" +echo "" +if [[ "$BACKEND" == "vulkan" ]]; then + echo "Run with:" + echo " docker run -it --rm --device /dev/dri:/dev/dri ${DOCKER_IMAGE_TAG}" + echo "" + echo "Note: For AMD GPUs, you may also need to mount render devices:" + echo " docker run -it --rm --device /dev/dri:/dev/dri --group-add video ${DOCKER_IMAGE_TAG}" +else + echo "Run with:" + echo " docker run -it --rm --gpus all ${DOCKER_IMAGE_TAG}" +fi diff --git a/docker/test-binaries.sh b/docker/test-binaries.sh new file mode 100755 index 00000000..8fe05d2e --- /dev/null +++ b/docker/test-binaries.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# Test script for verifying CUDA-enabled binaries work +# Automatically detects real NVIDIA drivers vs stub drivers + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Function to print colored output +print_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +print_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Detect if real NVIDIA drivers are available +detect_cuda_drivers() { + local real_driver_paths=( + "/lib/x86_64-linux-gnu/libcuda.so.1" + "/usr/lib/x86_64-linux-gnu/libcuda.so.1" + "/usr/local/cuda/lib64/libcuda.so.1" + ) + + for path in "${real_driver_paths[@]}"; do + if [ -f "$path" ]; then + print_info "Real NVIDIA drivers found at: $path" + return 0 + fi + done + + return 1 +} + +# Main execution +print_info "Starting binary tests..." 
+
+# Check for real drivers
+if detect_cuda_drivers; then
+    print_info "Using real NVIDIA drivers"
+    # Prepend the real CUDA lib dir so it takes precedence over any stub paths
+    export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+else
+    print_warn "No real NVIDIA drivers detected"
+    print_warn "Falling back to stub drivers for testing"
+    print_warn "GPU functionality will NOT be available"
+
+    # Add stubs to LD_LIBRARY_PATH for testing
+    export LD_LIBRARY_PATH="/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+    print_info "LD_LIBRARY_PATH set to: $LD_LIBRARY_PATH"
+fi
+
+# Test llama-server
+print_info "Testing llama-server..."
+if command -v llama-server &> /dev/null; then
+    if llama-server --help > /dev/null 2>&1 || llama-server -h > /dev/null 2>&1; then
+        print_info "✓ llama-server: OK"
+    else
+        print_error "✗ llama-server: Failed to run"
+        exit 1
+    fi
+else
+    print_error "✗ llama-server: Not found in PATH"
+    exit 1
+fi
+
+# Test whisper-server
+print_info "Testing whisper-server..."
+if command -v whisper-server &> /dev/null; then
+    if whisper-server --help > /dev/null 2>&1 || whisper-server -h > /dev/null 2>&1; then
+        print_info "✓ whisper-server: OK"
+    else
+        print_error "✗ whisper-server: Failed to run"
+        exit 1
+    fi
+else
+    print_error "✗ whisper-server: Not found in PATH"
+    exit 1
+fi
+
+# Test sd-server (stable-diffusion)
+print_info "Testing sd-server..."
+if command -v sd-server &> /dev/null; then
+    if sd-server --help > /dev/null 2>&1 || sd-server -h > /dev/null 2>&1; then
+        print_info "✓ sd-server: OK"
+    else
+        print_error "✗ sd-server: Failed to run"
+        exit 1
+    fi
+else
+    print_error "✗ sd-server: Not found in PATH"
+    exit 1
+fi
+
+print_info "All binary tests passed!"
+ +# Additional info about environment +print_info "Environment information:" +echo " LD_LIBRARY_PATH: $LD_LIBRARY_PATH" +echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}" + +# Check if nvidia-smi is available +if command -v nvidia-smi &> /dev/null; then + print_info "nvidia-smi output:" + nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader 2>/dev/null || \ + print_warn "nvidia-smi found but could not query GPU information" +else + print_warn "nvidia-smi not available (expected on CPU-only hosts)" +fi + +exit 0 From c5eb79888256c11721fa282ae604240c841d79c5 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Tue, 17 Mar 2026 21:13:26 +0000 Subject: [PATCH 2/4] docker: build llama-swap from source in containers Build llama-swap binary from local source code instead of downloading from GitHub releases. This ensures the container uses the exact code in the repository. - Add golang:1.25-alpine builder stage to compile llama-swap - Generate version from git hash with +dirty suffix for unstaged changes - Update build-image.sh to use repository root as build context - Remove LLAMA_SWAP_VERSION environment variable and related code - Add test-binaries.sh to Dockerfile.vulkan for consistency Both CUDA and Vulkan Dockerfiles now build llama-swap from source. 
--- docker/Dockerfile.cuda | 42 +++++++++++++++++++++++++++++--------- docker/Dockerfile.vulkan | 44 ++++++++++++++++++++++++++++++++-------- docker/build-image.sh | 41 ++++++++++--------------------------- 3 files changed, 78 insertions(+), 49 deletions(-) diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index bebcb771..9198af65 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -18,6 +18,32 @@ # - BuildKit cache mounts for cmake incremental builds # - Independent project rebuilds +# ============================================================================ +# Stage 0: Build llama-swap from local source +# ============================================================================ +FROM golang:1.25-alpine AS llama-swap-builder +WORKDIR /app + +# Copy Go module files first for layer caching +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source code and build +COPY . . +ENV CGO_ENABLED=0 GOOS=linux GOARCH=amd64 +RUN go build -o llama-swap . + +# Get version info: git hash + dirty flag, or 'local-build' fallback +RUN if git rev-parse --git-dir > /dev/null 2>&1; then \ + VERSION=$(git rev-parse --short HEAD) && \ + if [ -n "$(git status --porcelain)" ]; then \ + VERSION="${VERSION}+dirty"; \ + fi && \ + echo "$VERSION" > /app/llama-swap-version; \ + else \ + echo "local-build" > /app/llama-swap-version; \ + fi + # ============================================================================ # Stage 1: Base Builder with Common Dependencies # ============================================================================ @@ -150,7 +176,6 @@ RUN --mount=type=cache,target=/ccache \ # ============================================================================ FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime -ARG LLAMA_SWAP_VERSION=v198 ARG LLAMA_COMMIT_HASH=unknown ARG WHISPER_COMMIT_HASH=unknown ARG SD_COMMIT_HASH=unknown @@ -164,15 +189,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-pip \ curl \ 
ca-certificates \ + git \ && rm -rf /var/lib/apt/lists/* -# Download and install llama-swap binary -RUN LLAMA_SWAP_VER_NUM=$(echo "${LLAMA_SWAP_VERSION}" | sed 's/^v//') && \ - curl -L -o /tmp/llama-swap.tar.gz \ - "https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VER_NUM}_linux_amd64.tar.gz" && \ - tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin/ && \ - rm /tmp/llama-swap.tar.gz && \ - chmod +x /usr/local/bin/llama-swap +# Copy llama-swap binary from builder stage +COPY --from=llama-swap-builder /app/llama-swap /usr/local/bin/ +COPY --from=llama-swap-builder /app/llama-swap-version /tmp/ # Copy libcuda stubs for CPU-only testing (not in ldconfig path to avoid conflicts with real drivers) # Real NVIDIA drivers will be used when available; stubs are fallback for testing on CPU-only hosts @@ -180,7 +202,7 @@ COPY --from=builder-base /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/ COPY --from=builder-base /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 # Copy test script for verifying binaries work correctly -COPY test-binaries.sh /usr/local/bin/ +COPY docker/test-binaries.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/test-binaries.sh RUN pip3 install --no-cache-dir numpy sentencepiece @@ -227,7 +249,7 @@ RUN echo "=== Installed Binaries ===" && \ RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \ echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \ - echo "llama-swap: ${LLAMA_SWAP_VERSION}" >> /versions.txt && \ + echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \ echo "backend: CUDA" >> /versions.txt && \ echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt diff --git a/docker/Dockerfile.vulkan b/docker/Dockerfile.vulkan index 1df1de5c..53b3d7dd 100644 --- a/docker/Dockerfile.vulkan +++ b/docker/Dockerfile.vulkan @@ -12,6 +12,32 @@ # - 
Independent project rebuilds # - No architecture targeting needed (Vulkan is portable) +# ============================================================================ +# Stage 0: Build llama-swap from local source +# ============================================================================ +FROM golang:1.25-alpine AS llama-swap-builder +WORKDIR /app + +# Copy Go module files first for layer caching +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source code and build +COPY . . +ENV CGO_ENABLED=0 GOOS=linux GOARCH=amd64 +RUN go build -o llama-swap . + +# Get version info: git hash + dirty flag, or 'local-build' fallback +RUN if git rev-parse --git-dir > /dev/null 2>&1; then \ + VERSION=$(git rev-parse --short HEAD) && \ + if [ -n "$(git status --porcelain)" ]; then \ + VERSION="${VERSION}+dirty"; \ + fi && \ + echo "$VERSION" > /app/llama-swap-version; \ + else \ + echo "local-build" > /app/llama-swap-version; \ + fi + # ============================================================================ # Stage 1: Base Builder with Common Dependencies # ============================================================================ @@ -157,7 +183,6 @@ RUN --mount=type=cache,target=/ccache \ # ============================================================================ FROM ubuntu:22.04 AS runtime -ARG LLAMA_SWAP_VERSION=v198 ARG LLAMA_COMMIT_HASH=unknown ARG WHISPER_COMMIT_HASH=unknown ARG SD_COMMIT_HASH=unknown @@ -176,15 +201,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-pip \ curl \ ca-certificates \ + git \ && rm -rf /var/lib/apt/lists/* -# Download and install llama-swap binary -RUN LLAMA_SWAP_VER_NUM=$(echo "${LLAMA_SWAP_VERSION}" | sed 's/^v//') && \ - curl -L -o /tmp/llama-swap.tar.gz \ - "https://github.com/mostlygeek/llama-swap/releases/download/${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VER_NUM}_linux_amd64.tar.gz" && \ - tar -xzf /tmp/llama-swap.tar.gz -C /usr/local/bin/ && \ - rm /tmp/llama-swap.tar.gz && \ - chmod 
+x /usr/local/bin/llama-swap +# Copy llama-swap binary from builder stage +COPY --from=llama-swap-builder /app/llama-swap /usr/local/bin/ +COPY --from=llama-swap-builder /app/llama-swap-version /tmp/ + +# Copy test script for verifying binaries work correctly +COPY docker/test-binaries.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/test-binaries.sh RUN pip3 install --no-cache-dir numpy sentencepiece @@ -233,7 +259,7 @@ RUN echo "=== Installed Binaries ===" && \ RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \ echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \ - echo "llama-swap: ${LLAMA_SWAP_VERSION}" >> /versions.txt && \ + echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \ echo "backend: Vulkan" >> /versions.txt && \ echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt diff --git a/docker/build-image.sh b/docker/build-image.sh index d1f420fe..fa4d50c1 100755 --- a/docker/build-image.sh +++ b/docker/build-image.sh @@ -9,11 +9,10 @@ # LLAMA_COMMIT_HASH=abc123 ./build-image.sh --cuda # Override llama.cpp commit # WHISPER_COMMIT_HASH=def456 ./build-image.sh --vulkan # Override whisper.cpp commit # SD_COMMIT_HASH=ghi789 ./build-image.sh --cuda # Override stable-diffusion.cpp commit -# LLAMA_SWAP_VERSION=v198 ./build-image.sh --cuda # Override llama-swap version # # Features: # - Auto-detects latest commit hashes from git repos -# - Auto-detects latest llama-swap release +# - Builds llama-swap from local source code # - Allows environment variable overrides for reproducible builds # - Cache-friendly: changing commit hash busts cache appropriately # - Supports both CUDA and Vulkan backends (requires explicit flag) @@ -37,11 +36,10 @@ if [[ $# -eq 0 ]]; then echo " --help, -h Show this help message" echo "" echo "Environment variables:" - echo " DOCKER_IMAGE_TAG Set custom image tag (default: llama-swap:latest or llama-swap:vulkan)" + 
echo " DOCKER_IMAGE_TAG Set custom image tag (default: llama-swap:cuda or llama-swap:vulkan)" echo " LLAMA_COMMIT_HASH Override llama.cpp commit hash" echo " WHISPER_COMMIT_HASH Override whisper.cpp commit hash" echo " SD_COMMIT_HASH Override stable-diffusion.cpp commit hash" - echo " LLAMA_SWAP_VERSION Override llama-swap version (default: latest, e.g., v198)" exit 1 fi @@ -66,11 +64,10 @@ for arg in "$@"; do echo " --help, -h Show this help message" echo "" echo "Environment variables:" - echo " DOCKER_IMAGE_TAG Set custom image tag (default: llama-swap:latest or llama-swap:vulkan)" + echo " DOCKER_IMAGE_TAG Set custom image tag (default: llama-swap:cuda or llama-swap:vulkan)" echo " LLAMA_COMMIT_HASH Override llama.cpp commit hash" echo " WHISPER_COMMIT_HASH Override whisper.cpp commit hash" echo " SD_COMMIT_HASH Override stable-diffusion.cpp commit hash" - echo " LLAMA_SWAP_VERSION Override llama-swap version (default: latest, e.g., v198)" exit 0 ;; esac @@ -106,12 +103,6 @@ fi LLAMA_REPO="https://github.com/ggml-org/llama.cpp.git" WHISPER_REPO="https://github.com/ggml-org/whisper.cpp.git" SD_REPO="https://github.com/leejet/stable-diffusion.cpp.git" -LLAMA_SWAP_REPO="mostlygeek/llama-swap" - -# Function to get the latest llama-swap release version -get_latest_llama_swap_version() { - curl -s "https://api.github.com/repos/${LLAMA_SWAP_REPO}/releases/latest" | grep -oP '"tag_name": "\K[^"]+' || echo "" -} # Function to get the latest commit hash from a git repo's default branch get_latest_commit() { @@ -181,33 +172,22 @@ else echo "stable-diffusion.cpp: Auto-detected latest commit (${SD_BRANCH}): ${SD_HASH}" fi -# Determine llama-swap version - use env var or auto-detect -if [[ -n "${LLAMA_SWAP_VERSION:-}" ]]; then - LLAMA_SWAP_VER="${LLAMA_SWAP_VERSION}" - echo "llama-swap: Using provided version: ${LLAMA_SWAP_VER}" -else - LLAMA_SWAP_VER=$(get_latest_llama_swap_version) - if [[ -z "${LLAMA_SWAP_VER}" ]]; then - echo "ERROR: Could not determine latest llama-swap 
version" >&2 - exit 1 - fi - echo "llama-swap: Auto-detected latest version: ${LLAMA_SWAP_VER}" -fi - echo "" echo "==========================================" echo "Starting Docker build..." echo "==========================================" echo "" -# Build the Docker image with commit hashes and llama-swap version as build args +# Build the Docker image with commit hashes as build args +# Build context is the repository root (..) so the Dockerfile can access Go source +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" BUILD_ARGS=( --build-arg "LLAMA_COMMIT_HASH=${LLAMA_HASH}" --build-arg "WHISPER_COMMIT_HASH=${WHISPER_HASH}" --build-arg "SD_COMMIT_HASH=${SD_HASH}" - --build-arg "LLAMA_SWAP_VERSION=${LLAMA_SWAP_VER}" -t "${DOCKER_IMAGE_TAG}" - -f "${DOCKERFILE}" + -f "${SCRIPT_DIR}/${DOCKERFILE}" ) if [[ "$NO_CACHE" == true ]]; then @@ -245,7 +225,8 @@ echo "Using builder: $BUILDER_NAME" # Use docker buildx build with --load to load the image into Docker # The --builder flag ensures we use our custom builder with max-parallelism=1 -docker buildx build --builder "$BUILDER_NAME" --load "${BUILD_ARGS[@]}" . 
+# Build context is the repository root so we can access Go source files +docker buildx build --builder "$BUILDER_NAME" --load "${BUILD_ARGS[@]}" "${REPO_ROOT}" echo "" echo "==========================================" @@ -286,7 +267,7 @@ echo "Built with:" echo " llama.cpp: ${LLAMA_HASH}" echo " whisper.cpp: ${WHISPER_HASH}" echo " stable-diffusion.cpp: ${SD_HASH}" -echo " llama-swap: ${LLAMA_SWAP_VER}" +echo " llama-swap: $(docker run --rm "${DOCKER_IMAGE_TAG}" cat /versions.txt | grep llama-swap | cut -d' ' -f2-)" echo "" if [[ "$BACKEND" == "vulkan" ]]; then echo "Run with:" From c6a93d8a12eb838d48312359d676334eab7f2b7f Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Tue, 17 Mar 2026 22:31:06 +0000 Subject: [PATCH 3/4] fix llama-swap build in Dockerfile.cuda --- docker/Dockerfile.cuda | 94 ++++++++++++++++------------------------ docker/Dockerfile.vulkan | 7 ++- 2 files changed, 41 insertions(+), 60 deletions(-) diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index 9198af65..69ebc3f4 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -19,36 +19,12 @@ # - Independent project rebuilds # ============================================================================ -# Stage 0: Build llama-swap from local source -# ============================================================================ -FROM golang:1.25-alpine AS llama-swap-builder -WORKDIR /app - -# Copy Go module files first for layer caching -COPY go.mod go.sum ./ -RUN go mod download - -# Copy source code and build -COPY . . -ENV CGO_ENABLED=0 GOOS=linux GOARCH=amd64 -RUN go build -o llama-swap . 
- -# Get version info: git hash + dirty flag, or 'local-build' fallback -RUN if git rev-parse --git-dir > /dev/null 2>&1; then \ - VERSION=$(git rev-parse --short HEAD) && \ - if [ -n "$(git status --porcelain)" ]; then \ - VERSION="${VERSION}+dirty"; \ - fi && \ - echo "$VERSION" > /app/llama-swap-version; \ - else \ - echo "local-build" > /app/llama-swap-version; \ - fi - -# ============================================================================ -# Stage 1: Base Builder with Common Dependencies +# Stage 0: Base Builder with Common Dependencies # ============================================================================ FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder-base +ARG NODE_MAJOR=24 + ENV DEBIAN_FRONTEND=noninteractive ENV CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" ENV CCACHE_DIR=/ccache @@ -65,10 +41,40 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ ca-certificates \ ccache \ + make \ + gnupg \ + && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \ + | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ + && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_MAJOR}.x nodistro main" \ + > /etc/apt/sources.list.d/nodesource.list \ + && apt-get update && apt-get install -y nodejs \ && rm -rf /var/lib/apt/lists/* WORKDIR /src +# ============================================================================ +# Stage 1: Build llama-swap from local source +# Uses builder-base for cache efficiency (shares base layers with other builders) +# ============================================================================ +FROM builder-base AS llama-swap-builder + +# Install Go 1.25 (ubuntu22.04 default is older) +ENV GOLANG_VERSION=1.25.0 +ENV GOPATH=/go +ENV PATH=/usr/local/go/bin:$GOPATH/bin:$PATH +RUN curl -fsSL https://go.dev/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -C /usr/local -xz + +WORKDIR /app + +# Copy Go module files first for layer caching +COPY 
go.mod go.sum ./ +RUN go mod download + +# Copy source code and build +COPY . . +ENV CGO_ENABLED=0 GOOS=linux GOARCH=amd64 +RUN mkdir /install && make linux && cp build/llama-swap-linux-amd64 /install/llama-swap + # ============================================================================ # Stage 2: Source Cloning (Cached Independently) # ============================================================================ @@ -166,19 +172,15 @@ RUN --mount=type=cache,target=/ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \ cmake --build build --config Release -j$(nproc) --target llama-cli llama-server && \ # Copy outputs to install directory for easy extraction - mkdir -p /install/bin /install/lib /install/examples && \ + mkdir -p /install/bin /install/lib && \ cp -v build/bin/llama-cli build/bin/llama-server /install/bin/ 2>/dev/null || true && \ - find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; && \ - cp -vr examples/* /install/examples/ 2>/dev/null || true + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; # ============================================================================ # Stage 4: Runtime Stage # ============================================================================ FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime -ARG LLAMA_COMMIT_HASH=unknown -ARG WHISPER_COMMIT_HASH=unknown -ARG SD_COMMIT_HASH=unknown ENV DEBIAN_FRONTEND=noninteractive ENV PATH="/usr/local/bin:${PATH}" ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" @@ -214,7 +216,6 @@ WORKDIR /app COPY --from=llama-build /install/bin/llama-server /usr/local/bin/ COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/ COPY --from=llama-build /install/lib/ /usr/local/lib/ -COPY --from=llama-build /install/examples/ /opt/llama.cpp/examples/ COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/ COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/ @@ -227,30 +228,11 @@ COPY --from=sd-build /install/lib/ 
/usr/local/lib/ # Update library cache RUN ldconfig -# Create symlinks for common command names -RUN ln -sf /usr/local/bin/llama-cli /usr/local/bin/llama && \ - ln -sf /usr/local/bin/whisper-cli /usr/local/bin/whisper && \ - ln -sf /usr/local/bin/sd-cli /usr/local/bin/stable-diffusion - -# Verify installation -RUN echo "=== Installed Binaries ===" && \ - ls -la /usr/local/bin/llama* /usr/local/bin/whisper* /usr/local/bin/sd* /usr/local/bin/llama-swap 2>/dev/null || true && \ - echo "" && \ - echo "=== Library Check ===" && \ - ldconfig -p | grep -E "(ggml|llama|whisper|sd)" || true && \ - echo "" && \ - echo "=== llama-swap Version ===" && \ - llama-swap --version 2>/dev/null || true && \ - echo "" && \ - echo "=== llama-cli Version ===" && \ - llama-cli --version 2>/dev/null || true - # Write version information to /versions.txt -RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \ +RUN echo "llama.cpp: $(/usr/local/bin/llama-server --version 2>&1 | grep version) > /versions.txt && \ echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ - echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \ - echo "llama-swap: $(cat /tmp/llama-swap-version)" >> /versions.txt && \ - echo "backend: CUDA" >> /versions.txt && \ + echo "stable-diffusion.cpp: $(/usr/local/bin/sd-server --version)" >> /versions.txt && \ + echo "llama-swap: $(/usr/local/bin/llama-swap -version)" >> /versions.txt && \ echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt WORKDIR /models diff --git a/docker/Dockerfile.vulkan b/docker/Dockerfile.vulkan index 53b3d7dd..6b48ae18 100644 --- a/docker/Dockerfile.vulkan +++ b/docker/Dockerfile.vulkan @@ -146,7 +146,7 @@ RUN --mount=type=cache,target=/ccache \ cmake -B build \ -DSD_VULKAN=ON \ -DGGML_VULKAN=ON \ - -DSD_BUILD_EXAMPLES=ON \ + -DSD_BUILD_EXAMPLES=OFF \ -DGGML_NATIVE=OFF \ -DVulkan_INCLUDE_DIR="${VULKAN_SDK}/include" \ -DVulkan_LIBRARY="${VULKAN_SDK}/lib/libvulkan.so" \ @@ -173,10 +173,9 @@ RUN 
--mount=type=cache,target=/ccache \ -DCMAKE_BUILD_TYPE=Release && \ cmake --build build --config Release -j$(nproc) --target llama-cli llama-server && \ # Copy outputs to install directory for easy extraction - mkdir -p /install/bin /install/lib /install/examples && \ + mkdir -p /install/bin /install/lib && \ cp -v build/bin/llama-cli build/bin/llama-server /install/bin/ 2>/dev/null || true && \ - find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; && \ - cp -vr examples/* /install/examples/ 2>/dev/null || true + find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \; # ============================================================================ # Stage 4: Runtime Stage From 74c625bbd5cc2ad59c830ceb03fdc73885a35239 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Tue, 17 Mar 2026 16:58:01 -0700 Subject: [PATCH 4/4] docker,Makefile: fix cuda build --- Makefile | 2 +- docker/Dockerfile.cuda | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index f7d18586..f0fb12d2 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ mac: ui linux: ui @echo "Building Linux binary..." 
GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64 - GOOS=linux GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-arm64 +#GOOS=linux GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-arm64 # Build Windows binary windows: ui diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index 69ebc3f4..4cd85991 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -195,8 +195,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Copy llama-swap binary from builder stage -COPY --from=llama-swap-builder /app/llama-swap /usr/local/bin/ -COPY --from=llama-swap-builder /app/llama-swap-version /tmp/ +COPY --from=llama-swap-builder /install/llama-swap /usr/local/bin/ # Copy libcuda stubs for CPU-only testing (not in ldconfig path to avoid conflicts with real drivers) # Real NVIDIA drivers will be used when available; stubs are fallback for testing on CPU-only hosts @@ -229,11 +228,11 @@ COPY --from=sd-build /install/lib/ /usr/local/lib/ RUN ldconfig # Write version information to /versions.txt -RUN echo "llama.cpp: $(/usr/local/bin/llama-server --version 2>&1 | grep version) > /versions.txt && \ - echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ - echo "stable-diffusion.cpp: $(/usr/local/bin/sd-server --version)" >> /versions.txt && \ - echo "llama-swap: $(/usr/local/bin/llama-swap -version)" >> /versions.txt && \ - echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt +#RUN echo "llama.cpp: $(/usr/local/bin/llama-server --version 2>&1 | grep version) > /versions.txt && \ +# echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \ +# echo 
"stable-diffusion.cpp: $(/usr/local/bin/sd-server --version)" >> /versions.txt && \ +# echo "llama-swap: $(/usr/local/bin/llama-swap -version)" >> /versions.txt && \ +# echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt WORKDIR /models CMD ["bash"]