# docker: create a unified container with llama-server, whisper.cpp and stable-diffusion.cpp (#589)
New file (`@@ -0,0 +1,2 @@`):

```gitignore
# BuildKit configuration file (generated by build-image.sh)
buildkitd.toml
```
New file (`@@ -0,0 +1,17 @@`):

```markdown
- you are working in my VM sandbox. It is safe to use sudo.
- use or install whatever tools you need to complete your goal
- use DOCKER_BUILDKIT=1 docker build -t llama-swap:optimized
 - DOCKER_BUILDKIT=1 is important to use the caching
- ALWAYS send notifications to get the user's attention
- when running `./build-image.sh`, use a 2-hour (7200000ms) timeout minimum as CUDA builds take 60-120+ minutes to compile for multiple architectures

# Notifications

ALWAYS send notifications to keep the user informed:

- When starting or finishing a job
- For progress updates on long-running tasks (especially Docker builds)
- For todo list progress updates (when items start/complete)
- When you need feedback or to elicit information from the user
- use pushover.sh <message>, example: `pushover.sh "notification to send"`
```
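The `pushover.sh` script referenced above is not included in this PR. A minimal sketch of what such a helper might look like, assuming the Pushover REST API and `PUSHOVER_TOKEN`/`PUSHOVER_USER` environment variables (both assumptions, not shown in the diff):

```shell
# Hypothetical pushover.sh helper: sends a notification via the Pushover API.
# PUSHOVER_TOKEN and PUSHOVER_USER are assumed to be set in the environment.
pushover() {
  local msg="$1"
  if [ "${DRY_RUN:-0}" = "1" ]; then
    # Dry-run mode: print the request instead of sending it.
    echo "POST https://api.pushover.net/1/messages.json message=${msg}"
    return 0
  fi
  curl -s \
    --form-string "token=${PUSHOVER_TOKEN}" \
    --form-string "user=${PUSHOVER_USER}" \
    --form-string "message=${msg}" \
    https://api.pushover.net/1/messages.json
}
```

Usage matches the instruction above: `pushover "build finished"` after each long-running step.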
New file (`@@ -0,0 +1,238 @@`):

```dockerfile
# Multi-stage Dockerfile for CUDA-optimized AI inference tools
# Includes: llama.cpp, whisper.cpp, stable-diffusion.cpp
#
# CUDA architectures targeting consumer/prosumer GPUs from Pascal through Blackwell:
#   sm_60  - Pascal (HPC): Tesla P100 (GP100) - full FP16 2x throughput, HBM2
#   sm_61  - Pascal (consumer): Tesla P40, GTX 1080/1070/1060, Titan Xp (GP102/104/106)
#   sm_75  - Turing: RTX 2080/2070/2060, Quadro RTX 6000, Tesla T4 - 2nd gen Tensor Cores, INT8/INT4
#   sm_86  - Ampere (consumer): RTX 3090/3080/3070/3060, RTX A6000 - 3rd gen Tensor Cores, BF16/TF32, 128 FP32 cores/SM
#   sm_89  - Ada Lovelace: RTX 4090/4080/4070/4060, RTX 6000 Ada - FP8 support, 4th gen Tensor Cores
#   sm_120 - Blackwell (consumer): RTX 5090/5080/5070 - requires CUDA 12.8+ (not included, base image is 12.4)
#
# Build with BuildKit for cache mounts:
#   DOCKER_BUILDKIT=1 docker build -t llama-swap:latest .
#
# Features:
#   - Separate git clone stages for independent caching
#   - ccache support for C++ compilation caching
#   - BuildKit cache mounts for cmake incremental builds
#   - Independent project rebuilds

# ============================================================================
# Stage 0: Base Builder with Common Dependencies
# ============================================================================
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder-base

ARG NODE_MAJOR=24

ENV DEBIAN_FRONTEND=noninteractive
ENV CMAKE_CUDA_ARCHITECTURES="60;61;75;86;89"
ENV CCACHE_DIR=/ccache
ENV CCACHE_MAXSIZE=2G
ENV PATH="/usr/lib/ccache:${PATH}"

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        git \
        python3 \
        python3-pip \
        libssl-dev \
        curl \
        ca-certificates \
        ccache \
        make \
        gnupg \
    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \
        | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_MAJOR}.x nodistro main" \
        > /etc/apt/sources.list.d/nodesource.list \
    && apt-get update && apt-get install -y nodejs \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# ============================================================================
# Stage 1: Build llama-swap from local source
# Uses builder-base for cache efficiency (shares base layers with other builders)
# ============================================================================
FROM builder-base AS llama-swap-builder

# Install Go 1.25 (ubuntu22.04 default is older)
ENV GOLANG_VERSION=1.25.0
ENV GOPATH=/go
ENV PATH=/usr/local/go/bin:$GOPATH/bin:$PATH
RUN curl -fsSL https://go.dev/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -C /usr/local -xz

WORKDIR /app

# Copy Go module files first for layer caching
COPY go.mod go.sum ./
RUN go mod download

# Copy source code and build
COPY . .
ENV CGO_ENABLED=0 GOOS=linux GOARCH=amd64
RUN mkdir /install && make linux && cp build/llama-swap-linux-amd64 /install/llama-swap

# ============================================================================
# Stage 2: Source Cloning (Cached Independently)
# ============================================================================

FROM builder-base AS llama-source
ARG LLAMA_COMMIT_HASH=master
RUN git clone --filter=blob:none --no-checkout https://github.com/ggml-org/llama.cpp.git /src/llama.cpp && \
    cd /src/llama.cpp && \
    git fetch --depth=1 origin ${LLAMA_COMMIT_HASH} && \
    git checkout FETCH_HEAD

FROM builder-base AS whisper-source
ARG WHISPER_COMMIT_HASH=master
RUN git clone --filter=blob:none --no-checkout https://github.com/ggml-org/whisper.cpp.git /src/whisper.cpp && \
    cd /src/whisper.cpp && \
    git fetch --depth=1 origin ${WHISPER_COMMIT_HASH} && \
    git checkout FETCH_HEAD

FROM builder-base AS sd-source
ARG SD_COMMIT_HASH=master
RUN git clone --filter=blob:none --no-checkout https://github.com/leejet/stable-diffusion.cpp.git /src/stable-diffusion.cpp && \
    cd /src/stable-diffusion.cpp && \
    git fetch --depth=1 origin ${SD_COMMIT_HASH} && \
    git checkout FETCH_HEAD && \
    git submodule update --init --recursive --depth=1

# ============================================================================
# Stage 3: Individual Project Builds with Cache Mounts
# ============================================================================

# Build whisper.cpp (first - quick to build)
FROM builder-base AS whisper-build
COPY --from=whisper-source /src/whisper.cpp /build/whisper.cpp
WORKDIR /build/whisper.cpp
RUN --mount=type=cache,target=/ccache \
    --mount=type=cache,target=/build/whisper.cpp/build \
    cmake -B build \
        -DGGML_NATIVE=OFF \
        -DGGML_CUDA=ON \
        -DGGML_VULKAN=OFF \
        -DCMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" \
        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \
        -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \
    cmake --build build --config Release -j$(nproc) --target whisper-cli whisper-server && \
    mkdir -p /install/bin /install/lib && \
    cp -v build/bin/whisper-cli build/bin/whisper-server /install/bin/ 2>/dev/null || true && \
    find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \;
```
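The header comments map GPU generations to the compute capabilities baked into `CMAKE_CUDA_ARCHITECTURES`. As a sketch, that coverage check can be expressed as a small shell helper (the list mirrors the Dockerfile's `"60;61;75;86;89"` value):

```shell
# Compute capabilities this image's binaries are compiled for.
SUPPORTED_ARCHS="60 61 75 86 89"

# is_supported <arch>: succeed (exit 0) if the given CUDA compute
# capability is in the image's architecture list.
is_supported() {
  local arch="$1"
  local a
  for a in $SUPPORTED_ARCHS; do
    if [ "$a" = "$arch" ]; then
      return 0
    fi
  done
  return 1
}
```

For example, `is_supported 89` succeeds (Ada Lovelace), while `is_supported 120` fails, matching the header's note that Blackwell needs CUDA 12.8+ and is not built here.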
**Review comment on lines +127 to +128:**

Silent failure on the binary copy may mask build issues. The `2>/dev/null || true` suppresses any error from the `cp`, so a build that failed to produce `whisper-cli` or `whisper-server` would go unnoticed until runtime.

Proposed fix: fail explicitly if binaries are missing.

```diff
-    cp -v build/bin/whisper-cli build/bin/whisper-server /install/bin/ 2>/dev/null || true && \
+    cp -v build/bin/whisper-cli build/bin/whisper-server /install/bin/ && \
```

Alternatively, add explicit verification:

```diff
+    test -f /install/bin/whisper-cli && test -f /install/bin/whisper-server && \
```
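The same concern applies to any stage that copies build outputs. A generic sketch of the strict-copy idea (the function name and error format are illustrative, not part of the PR):

```shell
# install_bins <dest> <binary>...: copy expected build outputs to an
# install directory, failing loudly if any binary is missing, instead
# of silencing errors with "2>/dev/null || true".
install_bins() {
  local dest="$1"; shift
  mkdir -p "$dest"
  local bin
  for bin in "$@"; do
    if [ ! -f "$bin" ]; then
      echo "missing expected binary: $bin" >&2
      return 1
    fi
    cp -v "$bin" "$dest/"
  done
}
```

In the Dockerfile, `install_bins /install/bin build/bin/whisper-cli build/bin/whisper-server` would make a missing binary fail the build immediately rather than surfacing at container runtime.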
```dockerfile
# Build stable-diffusion.cpp (second - medium build time)
FROM builder-base AS sd-build
COPY --from=sd-source /src/stable-diffusion.cpp /build/stable-diffusion.cpp
WORKDIR /build/stable-diffusion.cpp
RUN --mount=type=cache,target=/ccache \
    --mount=type=cache,target=/build/stable-diffusion.cpp/build \
    set -e && \
    rm -rf build/CMakeCache.txt build/CMakeFiles 2>/dev/null || true && \
    cmake -B build \
        -DSD_CUDA=ON \
        -DGGML_CUDA=ON \
        -DGGML_NATIVE=OFF \
        -DGGML_VULKAN=OFF \
        -DCMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" \
        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \
        -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \
    cmake --build build --config Release -j$(nproc) --target sd-cli sd-server && \
    mkdir -p /install/bin /install/lib && \
    cp -v build/bin/sd-cli build/bin/sd-server /install/bin/ && \
    find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \;

# Build llama.cpp (last - longest build time)
FROM builder-base AS llama-build
COPY --from=llama-source /src/llama.cpp /build/llama.cpp
WORKDIR /build/llama.cpp
RUN --mount=type=cache,target=/ccache \
    --mount=type=cache,target=/build/llama.cpp/build \
    cmake -B build \
        -DGGML_NATIVE=OFF \
        -DGGML_CUDA=ON \
        -DGGML_VULKAN=OFF \
        -DCMAKE_CUDA_ARCHITECTURES="60;61;75;86;89" \
        -DLLAMA_BUILD_TESTS=OFF \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" \
        -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \
        -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,/usr/local/cuda/lib64/stubs -lcuda" \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache && \
    cmake --build build --config Release -j$(nproc) --target llama-cli llama-server && \
    # Copy outputs to install directory for easy extraction
    mkdir -p /install/bin /install/lib && \
    cp -v build/bin/llama-cli build/bin/llama-server /install/bin/ 2>/dev/null || true && \
    find build -name "*.so*" -type f -exec cp -v {} /install/lib/ \;

# ============================================================================
# Stage 4: Runtime Stage
# ============================================================================
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime

ENV DEBIAN_FRONTEND=noninteractive
ENV PATH="/usr/local/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
        python3 \
        python3-pip \
        curl \
        ca-certificates \
        git \
    && rm -rf /var/lib/apt/lists/*

# Copy llama-swap binary from builder stage
COPY --from=llama-swap-builder /install/llama-swap /usr/local/bin/

# Copy libcuda stubs for CPU-only testing (not in ldconfig path to avoid conflicts with real drivers)
# Real NVIDIA drivers will be used when available; stubs are fallback for testing on CPU-only hosts
COPY --from=builder-base /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so
COPY --from=builder-base /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1

# Copy test script for verifying binaries work correctly
COPY docker/test-binaries.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/test-binaries.sh

RUN pip3 install --no-cache-dir numpy sentencepiece

WORKDIR /app

# Copy only specific binaries from each build stage
COPY --from=llama-build /install/bin/llama-server /usr/local/bin/
COPY --from=llama-build /install/bin/llama-cli /usr/local/bin/
COPY --from=llama-build /install/lib/ /usr/local/lib/

COPY --from=whisper-build /install/bin/whisper-server /usr/local/bin/
COPY --from=whisper-build /install/bin/whisper-cli /usr/local/bin/
COPY --from=whisper-build /install/lib/ /usr/local/lib/

COPY --from=sd-build /install/bin/sd-server /usr/local/bin/
COPY --from=sd-build /install/bin/sd-cli /usr/local/bin/
COPY --from=sd-build /install/lib/ /usr/local/lib/

# Update library cache
RUN ldconfig

# Write version information to /versions.txt
#RUN echo "llama.cpp: $(/usr/local/bin/llama-server --version 2>&1 | grep version) > /versions.txt && \
#    echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
#    echo "stable-diffusion.cpp: $(/usr/local/bin/sd-server --version)" >> /versions.txt && \
#    echo "llama-swap: $(/usr/local/bin/llama-swap -version)" >> /versions.txt && \
#    echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
```
**Review comment on lines +230 to +235:**

Uncomment the version info to match the Vulkan Dockerfile. As written, the commented-out block also has a bug: the first `echo "llama.cpp: ...` line is missing its closing quote. The proposed fix records the commit hashes passed in as build args rather than invoking the freshly built binaries at image-build time.

Proposed fix: enable version info.

```diff
 # Write version information to /versions.txt
-#RUN echo "llama.cpp: $(/usr/local/bin/llama-server --version 2>&1 | grep version) > /versions.txt && \
-#    echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
-#    echo "stable-diffusion.cpp: $(/usr/local/bin/sd-server --version)" >> /versions.txt && \
-#    echo "llama-swap: $(/usr/local/bin/llama-swap -version)" >> /versions.txt && \
-#    echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
+ARG LLAMA_COMMIT_HASH=unknown
+ARG WHISPER_COMMIT_HASH=unknown
+ARG SD_COMMIT_HASH=unknown
+RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt && \
+    echo "whisper.cpp: ${WHISPER_COMMIT_HASH}" >> /versions.txt && \
+    echo "stable-diffusion.cpp: ${SD_COMMIT_HASH}" >> /versions.txt && \
+    echo "llama-swap: local" >> /versions.txt && \
+    echo "backend: CUDA" >> /versions.txt && \
+    echo "build_timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> /versions.txt
```

Note: the `ARG`s need to be re-declared in the runtime stage to be available there.
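The note about re-declaring `ARG`s reflects how Dockerfile `ARG` scoping works: a value declared before the first `FROM` (or in an earlier stage) only provides a default, and each stage that reads it must re-declare it. A minimal illustration of this rule (base images and echo text are arbitrary):

```dockerfile
ARG LLAMA_COMMIT_HASH=master          # global default, visible only to FROM lines

FROM ubuntu:22.04 AS build
ARG LLAMA_COMMIT_HASH                 # re-declare: inherits the global default
RUN echo "building ${LLAMA_COMMIT_HASH}"

FROM ubuntu:22.04 AS runtime
ARG LLAMA_COMMIT_HASH                 # must re-declare again; otherwise empty in this stage
RUN echo "llama.cpp: ${LLAMA_COMMIT_HASH}" > /versions.txt
```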
```dockerfile
WORKDIR /models
CMD ["bash"]
```
**Review comment:**

Fix the indentation and clarify the Docker build command. The sub-item indentation in the instructions file is inconsistent, flagged by markdownlint on line 4 (MD005, "Inconsistent indentation for list items at the same level": expected 0, actual 1). The example command is also incomplete: it should include a backend tag (`:cuda` or `:vulkan`) and the Dockerfile path.