From f5744e724afe2cc66e8161170aa722e01ad9d0a8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:26:44 +0000 Subject: [PATCH 1/7] Initial plan From 7aff62a6d0da56388b21bb609b851e6d6b546da2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:31:50 +0000 Subject: [PATCH 2/7] Add Llama 3.2 Vision docker setup with Flask webhook API Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- README.md | 1 + llama-vision/.gitignore | 42 +++++ llama-vision/Dockerfile | 57 +++++++ llama-vision/Makefile | 62 +++++++ llama-vision/README.md | 300 ++++++++++++++++++++++++++++++++++ llama-vision/app/schema.py | 49 ++++++ llama-vision/app/webhook.py | 297 +++++++++++++++++++++++++++++++++ llama-vision/compose.yml | 35 ++++ llama-vision/requirements.txt | 6 + llama-vision/test_api.py | 106 ++++++++++++ 10 files changed, 955 insertions(+) create mode 100644 llama-vision/.gitignore create mode 100644 llama-vision/Dockerfile create mode 100644 llama-vision/Makefile create mode 100644 llama-vision/README.md create mode 100644 llama-vision/app/schema.py create mode 100644 llama-vision/app/webhook.py create mode 100644 llama-vision/compose.yml create mode 100644 llama-vision/requirements.txt create mode 100644 llama-vision/test_api.py diff --git a/README.md b/README.md index 06309cb..793359a 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ A collection of docker containers I use for various hobby projects, currently co - `ghost` -> ghost6 based blog setup - `homebridge` -> latest Homebridge container for HomeKit integration (armv8 architecture) +- `llama-vision` -> Llama 3.2 Vision model with CPU inference (Q4_K_M quantization) and Flask webhook API - `python` -> base "hello-world" docker setup with poetry for dependency resolution and VSCode debug capabilities - `verilator` -> minimal Verilator simulator for RTL simulation (based on OpenTitan container setup) - `watchtower` -> automatic Docker image updates for all containers diff --git a/llama-vision/.gitignore b/llama-vision/.gitignore new file mode 100644 index 0000000..3dad26e --- /dev/null +++ b/llama-vision/.gitignore @@ -0,0 +1,42 @@ +# Models directory +models/ +*.gguf + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +.venv + +# Flask +instance/ +.webassets-cache + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Temporary files +tmp/ +temp/ +*.tmp diff --git a/llama-vision/Dockerfile b/llama-vision/Dockerfile new file mode 100644 index 0000000..9147faf --- /dev/null +++ b/llama-vision/Dockerfile @@ -0,0 +1,57 @@ +# Multi-stage build for llama.cpp with Llama 3.2 Vision support +FROM python:3.11-slim as builder + +# Install build dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + cmake \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Build llama.cpp from source +WORKDIR /build +RUN git clone https://github.com/ggerganov/llama.cpp.git +WORKDIR /build/llama.cpp +RUN cmake -B build -DLLAMA_CURL=ON +RUN cmake --build build --config Release + +# Final runtime image +FROM python:3.11-slim + +# Install runtime dependencies +RUN apt-get update && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy llama.cpp build from builder +COPY --from=builder /build/llama.cpp/build/bin/llama-server 
/usr/local/bin/ +COPY --from=builder /build/llama.cpp/build/bin/llama-cli /usr/local/bin/ + +# Set working directory +WORKDIR /app + +# Copy application files +COPY requirements.txt /app/ +RUN pip install --no-cache-dir -r requirements.txt + +COPY app/ /app/ + +# Environment variables +ENV MODEL_NAME="llama-3.2-11b-vision-instruct-q4_k_m.gguf" +ENV MODEL_PATH="/models" +ENV FLASK_APP=webhook.py +ENV PYTHONUNBUFFERED=1 + +# Create models directory +RUN mkdir -p /models + +# Expose Flask port +EXPOSE 5000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -f http://localhost:5000/health || exit 1 + +# Run Flask app +CMD ["python", "webhook.py"] diff --git a/llama-vision/Makefile b/llama-vision/Makefile new file mode 100644 index 0000000..3b1332e --- /dev/null +++ b/llama-vision/Makefile @@ -0,0 +1,62 @@ +.PHONY: build run stop clean test help + +# Default model name (can be overridden) +MODEL_NAME ?= llama-3.2-11b-vision-instruct-q4_k_m.gguf + +help: + @echo "Llama Vision Docker - Available targets:" + @echo "" + @echo " make build - Build the Docker image" + @echo " make run - Run the container with models volume" + @echo " make stop - Stop the running container" + @echo " make clean - Stop and remove the container" + @echo " make logs - Show container logs" + @echo " make test - Run API tests" + @echo " make shell - Open shell in running container" + @echo " make health - Check service health" + @echo "" + @echo "Environment variables:" + @echo " MODEL_NAME - Model filename (default: $(MODEL_NAME))" + +build: + docker build --pull --rm -t llama-vision:latest . + +run: + docker run -d \ + -p 5000:5000 \ + -v $$(pwd)/models:/models \ + -e MODEL_NAME=$(MODEL_NAME) \ + --name llama-vision \ + llama-vision:latest + +stop: + docker stop llama-vision || true + +clean: stop + docker rm llama-vision || true + +logs: + docker logs -f llama-vision + +shell: + docker exec -it llama-vision /bin/bash + +health: + @echo "Checking health..." + @curl -s http://localhost:5000/health | python -m json.tool + +test: + python test_api.py + +# Docker compose targets +compose-up: + docker compose up -d + +compose-down: + docker compose down + +compose-logs: + docker compose logs -f + +compose-build: + docker compose build diff --git a/llama-vision/README.md b/llama-vision/README.md new file mode 100644 index 0000000..018190b --- /dev/null +++ b/llama-vision/README.md @@ -0,0 +1,300 @@ +# Llama 3.2 Vision Docker Setup + +Docker image that runs Llama 3.2 Vision model with CPU inference (no GPU/NVIDIA acceleration) using the recommended Q4_K_M quantization. The service exposes a Flask webhook interface to accept images and text prompts and provides responses via JSON. 
+ +## Features + +- šŸš€ CPU-only inference using llama.cpp (no GPU required) +- šŸ“ø Vision model support for image understanding +- šŸ”§ Q4_K_M quantization for optimal CPU performance (as recommended in [llama.cpp PR #5780](https://github.com/ggml-org/llama.cpp/pull/5780)) +- 🌐 RESTful API with Flask webhook interface +- šŸ“‹ Structured JSON response schema +- šŸ” Health check endpoint +- šŸ“Š Token usage tracking + +## Quick Start + +### Prerequisites + +- Docker installed on your system +- A Llama 3.2 Vision model in GGUF format with Q4_K_M quantization + +### Download Model + +You can download the recommended Q4_K_M quantized Llama 3.2 Vision model from Hugging Face: + +```bash +# Example: Download Llama 3.2 11B Vision Instruct Q4_K_M +# Visit https://huggingface.co/ and search for "llama-3.2-vision Q4_K_M GGUF" +# Download the .gguf file and place it in a models directory +mkdir -p models +# Place your downloaded model in ./models/ +``` + +### Build the Docker Image + +```bash +docker build -t llama-vision:latest . +``` + +### Run the Container + +```bash +docker run -d \ + -p 5000:5000 \ + -v $(pwd)/models:/models \ + -e MODEL_NAME="your-model-name.gguf" \ + --name llama-vision \ + llama-vision:latest +``` + +### Environment Variables + +- `MODEL_NAME`: Name of the GGUF model file (default: `llama-3.2-11b-vision-instruct-q4_k_m.gguf`) +- `MODEL_PATH`: Directory containing the model (default: `/models`) +- `PORT`: Port for the Flask server (default: `5000`) + +## API Documentation + +### Health Check + +**Endpoint:** `GET /health` + +**Response:** +```json +{ + "status": "healthy", + "model_loaded": true, + "model_name": "llama-3.2-11b-vision-instruct-q4_k_m.gguf", + "timestamp": "2024-01-13T12:00:00.000000" +} +``` + +### Inference Endpoint + +**Endpoint:** `POST /infer` + +**Request Body:** +```json +{ + "prompt": "Describe what you see in this image", + "image": "base64_encoded_image_data", + "max_tokens": 256, + "temperature": 0.7, + "top_p": 0.95 +} +``` + +**Parameters:** +- `prompt` (required): Text prompt/question about the image +- `image` (required): Base64 encoded image data (supports data URL format) +- `max_tokens` (optional): Maximum tokens in response (default: 256) +- `temperature` (optional): Sampling temperature (default: 0.7) +- `top_p` (optional): Nucleus sampling parameter (default: 0.95) + +**Success Response (200):** +```json +{ + "success": true, + "response_text": "The image shows a beautiful sunset over mountains...", + "model": "llama-3.2-11b-vision-instruct-q4_k_m.gguf", + "timestamp": "2024-01-13T12:00:00.000000", + "token_usage": { + "prompt_tokens": 150, + "completion_tokens": 200, + "total_tokens": 350 + }, + "metadata": { + "max_tokens": 256, + "temperature": 0.7, + "top_p": 0.95 + }, + "error": null +} +``` + +**Error Response (400/500):** +```json +{ + "success": false, + "error": "Missing required field: prompt", + "error_type": "validation", + "timestamp": "2024-01-13T12:00:00.000000", + "details": null +} +``` + +## Response Schema + +### VisionResponse + +```python +{ + "success": bool, # Request success status + "response_text": str, # Generated text response + "model": str, # Model used for inference + "timestamp": str, # ISO timestamp + "token_usage": { # Token usage statistics + "prompt_tokens": int, + "completion_tokens": int, + "total_tokens": int + }, + "metadata": dict, # Additional metadata + "error": str | null # Error message if any +} +``` + +### ErrorResponse + +```python +{ + "success": false, # Always false for errors + "error": 
str, # Error message + "error_type": str, # Error type (validation, model, system) + "timestamp": str, # ISO timestamp + "details": dict | null # Additional error details +} +``` + +## Example Usage + +### Python Example + +```python +import requests +import base64 + +# Read and encode image +with open("image.jpg", "rb") as f: + image_data = base64.b64encode(f.read()).decode() + +# Make request +response = requests.post( + "http://localhost:5000/infer", + json={ + "prompt": "What objects can you see in this image?", + "image": image_data, + "max_tokens": 300, + "temperature": 0.7 + } +) + +result = response.json() +print(result["response_text"]) +``` + +### cURL Example + +```bash +# Encode image to base64 +IMAGE_B64=$(base64 -w 0 image.jpg) + +# Make request +curl -X POST http://localhost:5000/infer \ + -H "Content-Type: application/json" \ + -d "{ + \"prompt\": \"Describe this image in detail\", + \"image\": \"$IMAGE_B64\", + \"max_tokens\": 256 + }" +``` + +## Technical Details + +### Quantization + +This setup uses **Q4_K_M** quantization, which is recommended for CPU inference as it provides: +- Excellent balance between quality and performance +- ~4GB memory footprint for 7B models +- Good accuracy preservation compared to higher precision formats +- Optimized for CPU matrix operations + +The recommendation comes from the llama.cpp community discussions, particularly [PR #5780](https://github.com/ggml-org/llama.cpp/pull/5780), where Q4_K_M is noted as the default choice for most use cases. + +### Architecture + +- **Base Image:** Python 3.11-slim (small footprint) +- **Build:** Multi-stage build for smaller final image +- **Inference Engine:** llama.cpp compiled from source with CPU optimizations +- **Python Bindings:** llama-cpp-python for easy integration +- **Web Framework:** Flask for RESTful API +- **Image Processing:** Pillow for image handling + +### Performance Considerations + +- CPU threads are automatically configured to use all available cores +- Context window: 2048 tokens (configurable in code) +- No GPU layers (n_gpu_layers=0) +- Optimized for multi-core CPU inference + +## Development + +### Project Structure + +``` +llama-vision/ +ā”œā”€ā”€ Dockerfile # Multi-stage Docker build +ā”œā”€ā”€ requirements.txt # Python dependencies +ā”œā”€ā”€ README.md # This file +└── app/ + ā”œā”€ā”€ webhook.py # Flask application + └── schema.py # Pydantic response schemas +``` + +### Building and Testing Locally + +```bash +# Build +docker build -t llama-vision:latest . + +# Run with mounted models directory +docker run -it --rm \ + -p 5000:5000 \ + -v $(pwd)/models:/models \ + -e MODEL_NAME="your-model.gguf" \ + llama-vision:latest + +# Test health endpoint +curl http://localhost:5000/health + +# Test inference +curl -X POST http://localhost:5000/infer \ + -H "Content-Type: application/json" \ + -d '{"prompt": "test", "image": "..."}' +``` + +## Troubleshooting + +### Model Not Found + +Ensure your model file is in the correct location: +```bash +ls -la models/ +``` + +The model file should match the `MODEL_NAME` environment variable. 
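+
+If the container starts but the model is not picked up, the `/health` endpoint reports whether a model was loaded and under which name. A small Python sketch of that check (assuming the default port mapping on `localhost:5000`; `requests` is already listed in `requirements.txt`):
+
+```python
+import requests
+
+# Ask the running service which model it actually loaded at startup
+health = requests.get("http://localhost:5000/health", timeout=10).json()
+print("status:      ", health.get("status"))
+print("model_loaded:", health.get("model_loaded"))
+print("model_name:  ", health.get("model_name"))
+# model_name should match the file placed in ./models and the value
+# passed via -e MODEL_NAME=... (or the MODEL_NAME default baked into the image)
+```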
+ +### Out of Memory + +If you experience OOM errors: +- Use a smaller model (1B or 3B instead of 11B) +- Reduce context window in webhook.py (n_ctx parameter) +- Ensure sufficient RAM (11B models need ~6-8GB RAM with Q4_K_M) + +### Slow Inference + +CPU inference is inherently slower than GPU: +- Consider using a smaller model for faster responses +- Ensure Docker has access to all CPU cores +- Use Q4_K_M quantization (already configured) + +## License + +MIT License - see repository LICENSE file for details. + +## References + +- [llama.cpp](https://github.com/ggerganov/llama.cpp) +- [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) +- [Llama 3.2 Models](https://huggingface.co/meta-llama) +- [GGUF Quantization Guide](https://github.com/ggerganov/llama.cpp/pull/5780) diff --git a/llama-vision/app/schema.py b/llama-vision/app/schema.py new file mode 100644 index 0000000..4ea3eb3 --- /dev/null +++ b/llama-vision/app/schema.py @@ -0,0 +1,49 @@ +""" +Response schema definitions for the Llama Vision webhook. +""" + +from typing import Optional, List, Dict, Any +from pydantic import BaseModel, Field +from datetime import datetime + + +class TokenUsage(BaseModel): + """Token usage statistics.""" + prompt_tokens: int = Field(description="Number of tokens in the prompt") + completion_tokens: int = Field(description="Number of tokens in the completion") + total_tokens: int = Field(description="Total number of tokens used") + + +class VisionResponse(BaseModel): + """Standard response format for vision inference requests.""" + + success: bool = Field(description="Whether the request was successful") + response_text: str = Field(description="The generated text response from the model") + model: str = Field(description="The model used for inference") + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), + description="ISO timestamp of the response") + token_usage: Optional[TokenUsage] = Field(None, description="Token usage statistics") + metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, + description="Additional metadata") + error: Optional[str] = Field(None, description="Error message if success is False") + + +class ErrorResponse(BaseModel): + """Error response format.""" + + success: bool = Field(default=False, description="Always False for error responses") + error: str = Field(description="Error message describing what went wrong") + error_type: str = Field(description="Type of error (e.g., validation, model, system)") + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), + description="ISO timestamp of the error") + details: Optional[Dict[str, Any]] = Field(None, description="Additional error details") + + +class HealthResponse(BaseModel): + """Health check response format.""" + + status: str = Field(description="Health status (healthy, unhealthy, degraded)") + model_loaded: bool = Field(description="Whether the model is loaded and ready") + model_name: str = Field(description="Name of the loaded model") + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), + description="ISO timestamp of health check") diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py new file mode 100644 index 0000000..233bcb4 --- /dev/null +++ b/llama-vision/app/webhook.py @@ -0,0 +1,297 @@ +""" +Flask webhook interface for Llama 3.2 Vision model inference. +Accepts images and text prompts, returns JSON responses. 
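+
+Endpoints:
+    GET  /health  - model load status
+    POST /infer   - vision inference on a base64-encoded image plus a text prompt
+    GET  /        - basic service information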
+""" + +import os +import logging +import base64 +import tempfile +from io import BytesIO +from pathlib import Path +from typing import Optional, List, Dict, Any + +from flask import Flask, request, jsonify +from werkzeug.exceptions import BadRequest +from PIL import Image +from llama_cpp import Llama +from llama_cpp.llama_chat_format import Llava15ChatHandler + +from schema import VisionResponse, ErrorResponse, HealthResponse, TokenUsage + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Initialize Flask app +app = Flask(__name__) + +# Global model instance +llama_model: Optional[Llama] = None +model_name: str = "" + + +def load_model(): + """Load the Llama model from the specified path.""" + global llama_model, model_name + + model_path_env = os.environ.get('MODEL_PATH', '/models') + model_name_env = os.environ.get('MODEL_NAME', 'llama-3.2-11b-vision-instruct-q4_k_m.gguf') + + full_model_path = Path(model_path_env) / model_name_env + + logger.info(f"Loading model from: {full_model_path}") + + if not full_model_path.exists(): + logger.error(f"Model file not found at: {full_model_path}") + raise FileNotFoundError(f"Model file not found at: {full_model_path}") + + try: + # Initialize with vision support using llava chat handler + # For vision models, we need the appropriate chat handler + llama_model = Llama( + model_path=str(full_model_path), + n_ctx=2048, # Context window + n_threads=os.cpu_count(), # Use all available CPU threads + n_gpu_layers=0, # CPU only, no GPU layers + verbose=False, + chat_format="llava-1-5", # Vision chat format + ) + model_name = model_name_env + logger.info(f"Model loaded successfully: {model_name}") + return True + except Exception as e: + logger.error(f"Failed to load model: {e}") + raise + + +def process_image(image_data: str, image_format: str = "base64") -> str: + """ + Process image data and return path to temporary file. + + Args: + image_data: Base64 encoded image or image bytes + image_format: Format of the image data (base64, bytes) + + Returns: + Path to temporary image file + """ + try: + if image_format == "base64": + # Remove data URL prefix if present + if "," in image_data: + image_data = image_data.split(",", 1)[1] + + image_bytes = base64.b64decode(image_data) + else: + image_bytes = image_data + + # Open image with PIL to validate and potentially convert + img = Image.open(BytesIO(image_bytes)) + + # Create temporary file + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png") + img.save(temp_file.name, format="PNG") + temp_file.close() + + return temp_file.name + except Exception as e: + logger.error(f"Failed to process image: {e}") + raise ValueError(f"Invalid image data: {e}") + + +def create_vision_prompt(text: str, image_path: str) -> List[Dict[str, Any]]: + """ + Create a prompt in the format expected by llama-cpp-python for vision models. 
+ + Args: + text: Text prompt + image_path: Path to the image file + + Returns: + Formatted prompt as list of message dicts + """ + return [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}, + {"type": "text", "text": text} + ] + } + ] + + +@app.route('/health', methods=['GET']) +def health_check(): + """Health check endpoint.""" + try: + is_loaded = llama_model is not None + status = "healthy" if is_loaded else "unhealthy" + + response = HealthResponse( + status=status, + model_loaded=is_loaded, + model_name=model_name if is_loaded else "not loaded" + ) + + return jsonify(response.model_dump()), 200 if is_loaded else 503 + except Exception as e: + logger.error(f"Health check failed: {e}") + return jsonify({"status": "error", "error": str(e)}), 500 + + +@app.route('/infer', methods=['POST']) +def infer(): + """ + Main inference endpoint. + + Expected JSON payload: + { + "prompt": "Describe this image", + "image": "base64_encoded_image_data", + "max_tokens": 256, + "temperature": 0.7, + "top_p": 0.95 + } + """ + try: + # Validate model is loaded + if llama_model is None: + error = ErrorResponse( + error="Model not loaded", + error_type="model", + details={"message": "Model failed to load at startup"} + ) + return jsonify(error.model_dump()), 503 + + # Parse request JSON + data = request.get_json() + if not data: + error = ErrorResponse( + error="No JSON data provided", + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + # Extract required fields + prompt_text = data.get('prompt') + image_data = data.get('image') + + if not prompt_text: + error = ErrorResponse( + error="Missing required field: prompt", + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + if not image_data: + error = ErrorResponse( + error="Missing required field: image", + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + # Extract optional parameters + max_tokens = data.get('max_tokens', 256) + temperature = data.get('temperature', 0.7) + top_p = data.get('top_p', 0.95) + + logger.info(f"Processing inference request with prompt: {prompt_text[:50]}...") + + # Process image + image_path = process_image(image_data) + + try: + # Create vision prompt + messages = create_vision_prompt(prompt_text, image_path) + + # Run inference + result = llama_model.create_chat_completion( + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + ) + + # Extract response + response_text = result['choices'][0]['message']['content'] + + # Get token usage + usage = result.get('usage', {}) + token_usage = TokenUsage( + prompt_tokens=usage.get('prompt_tokens', 0), + completion_tokens=usage.get('completion_tokens', 0), + total_tokens=usage.get('total_tokens', 0) + ) + + # Create response + response = VisionResponse( + success=True, + response_text=response_text, + model=model_name, + token_usage=token_usage, + metadata={ + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p + } + ) + + logger.info(f"Inference completed successfully") + return jsonify(response.model_dump()), 200 + + finally: + # Clean up temporary image file + try: + os.unlink(image_path) + except Exception as e: + logger.warning(f"Failed to delete temporary image: {e}") + + except ValueError as e: + logger.error(f"Validation error: {e}") + error = ErrorResponse( + error=str(e), + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + except Exception as e: + logger.error(f"Inference 
error: {e}", exc_info=True) + error = ErrorResponse( + error=str(e), + error_type="system", + details={"traceback": str(e)} + ) + return jsonify(error.model_dump()), 500 + + +@app.route('/', methods=['GET']) +def index(): + """Root endpoint with API information.""" + return jsonify({ + "service": "Llama 3.2 Vision Inference API", + "version": "1.0.0", + "endpoints": { + "/health": "GET - Health check", + "/infer": "POST - Run inference with image and text prompt", + }, + "model": model_name if llama_model else "not loaded" + }), 200 + + +if __name__ == '__main__': + # Load model at startup + try: + load_model() + except Exception as e: + logger.error(f"Failed to load model at startup: {e}") + logger.warning("Starting server anyway, but inference will fail") + + # Start Flask server + port = int(os.environ.get('PORT', 5000)) + app.run(host='0.0.0.0', port=port, debug=False) diff --git a/llama-vision/compose.yml b/llama-vision/compose.yml new file mode 100644 index 0000000..3499627 --- /dev/null +++ b/llama-vision/compose.yml @@ -0,0 +1,35 @@ +version: '3.8' + +services: + llama-vision: + build: + context: . + dockerfile: Dockerfile + image: llama-vision:latest + container_name: llama-vision + ports: + - "5000:5000" + volumes: + # Mount your models directory here + - ./models:/models + environment: + # Configure your model name + - MODEL_NAME=llama-3.2-11b-vision-instruct-q4_k_m.gguf + - MODEL_PATH=/models + - PORT=5000 + restart: unless-stopped + # Resource limits (adjust based on your needs) + deploy: + resources: + limits: + cpus: '4' + memory: 8G + reservations: + cpus: '2' + memory: 4G + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s diff --git a/llama-vision/requirements.txt b/llama-vision/requirements.txt new file mode 100644 index 0000000..f9e4749 --- /dev/null +++ b/llama-vision/requirements.txt @@ -0,0 +1,6 @@ +flask==3.0.0 +werkzeug==3.0.1 +llama-cpp-python==0.2.90 +pillow==10.1.0 +requests==2.31.0 +pydantic==2.5.0 diff --git a/llama-vision/test_api.py b/llama-vision/test_api.py new file mode 100644 index 0000000..4fb8ce9 --- /dev/null +++ b/llama-vision/test_api.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Example script to test the Llama Vision API. 
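+
+Usage:
+    python test_api.py IMAGE_PATH [PROMPT]
+
+The health check always runs first; when IMAGE_PATH is supplied, one /infer
+request is also sent with that image and the given (or default) prompt.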
+""" + +import requests +import base64 +import json +import sys +from pathlib import Path + + +def test_health(base_url: str = "http://localhost:5000"): + """Test the health endpoint.""" + print("Testing health endpoint...") + response = requests.get(f"{base_url}/health") + print(f"Status: {response.status_code}") + print(f"Response: {json.dumps(response.json(), indent=2)}") + return response.status_code == 200 + + +def test_inference(image_path: str, prompt: str, base_url: str = "http://localhost:5000"): + """Test the inference endpoint with an image.""" + print(f"\nTesting inference endpoint...") + print(f"Image: {image_path}") + print(f"Prompt: {prompt}") + + # Read and encode image + try: + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode() + except FileNotFoundError: + print(f"Error: Image file not found: {image_path}") + return False + + # Prepare request + payload = { + "prompt": prompt, + "image": image_data, + "max_tokens": 300, + "temperature": 0.7, + "top_p": 0.95 + } + + # Make request + try: + print("Sending request...") + response = requests.post( + f"{base_url}/infer", + json=payload, + timeout=120 # 2 minutes timeout for slow CPU inference + ) + + print(f"Status: {response.status_code}") + result = response.json() + print(f"Response: {json.dumps(result, indent=2)}") + + if result.get("success"): + print(f"\nāœ… Success!") + print(f"Model Response: {result.get('response_text')}") + if result.get("token_usage"): + usage = result["token_usage"] + print(f"Tokens: {usage['total_tokens']} (prompt: {usage['prompt_tokens']}, completion: {usage['completion_tokens']})") + else: + print(f"\nāŒ Error: {result.get('error')}") + + return response.status_code == 200 + + except requests.exceptions.Timeout: + print("Error: Request timed out (inference may take a long time on CPU)") + return False + except Exception as e: + print(f"Error: {e}") + return False + + +def main(): + """Main function.""" + base_url = "http://localhost:5000" + + # Test health + if not test_health(base_url): + print("\nāŒ Health check failed! Is the service running?") + sys.exit(1) + + print("\nāœ… Health check passed!") + + # Test inference if image provided + if len(sys.argv) > 1: + image_path = sys.argv[1] + prompt = sys.argv[2] if len(sys.argv) > 2 else "Describe this image in detail." 
+ + if test_inference(image_path, prompt, base_url): + print("\nāœ… Inference test passed!") + else: + print("\nāŒ Inference test failed!") + sys.exit(1) + else: + print("\nā„¹ļø To test inference, run:") + print(f" python {sys.argv[0]} [prompt]") + print("\nExample:") + print(f" python {sys.argv[0]} test_image.jpg 'What objects are in this image?'") + + +if __name__ == "__main__": + main() From 620e2ff63a2a5971b7d6c40a6ad5a8d15b3194f8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:34:10 +0000 Subject: [PATCH 3/7] Add CLIP model support for vision features and update documentation Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/Dockerfile | 1 + llama-vision/Makefile | 5 ++++- llama-vision/README.md | 37 ++++++++++++++++++++++++++++++------- llama-vision/app/webhook.py | 37 ++++++++++++++++++++++++++----------- llama-vision/compose.yml | 3 ++- 5 files changed, 63 insertions(+), 20 deletions(-) diff --git a/llama-vision/Dockerfile b/llama-vision/Dockerfile index 9147faf..b5c4b58 100644 --- a/llama-vision/Dockerfile +++ b/llama-vision/Dockerfile @@ -39,6 +39,7 @@ COPY app/ /app/ # Environment variables ENV MODEL_NAME="llama-3.2-11b-vision-instruct-q4_k_m.gguf" +ENV CLIP_MODEL_NAME="mmproj-model-f16.gguf" ENV MODEL_PATH="/models" ENV FLASK_APP=webhook.py ENV PYTHONUNBUFFERED=1 diff --git a/llama-vision/Makefile b/llama-vision/Makefile index 3b1332e..f24999c 100644 --- a/llama-vision/Makefile +++ b/llama-vision/Makefile @@ -1,7 +1,8 @@ .PHONY: build run stop clean test help -# Default model name (can be overridden) +# Default model names (can be overridden) MODEL_NAME ?= llama-3.2-11b-vision-instruct-q4_k_m.gguf +CLIP_MODEL_NAME ?= mmproj-model-f16.gguf help: @echo "Llama Vision Docker - Available targets:" @@ -17,6 +18,7 @@ help: @echo "" @echo "Environment variables:" @echo " MODEL_NAME - Model filename (default: $(MODEL_NAME))" + @echo " CLIP_MODEL_NAME - CLIP projector filename (default: $(CLIP_MODEL_NAME))" build: docker build --pull --rm -t llama-vision:latest . @@ -26,6 +28,7 @@ run: -p 5000:5000 \ -v $$(pwd)/models:/models \ -e MODEL_NAME=$(MODEL_NAME) \ + -e CLIP_MODEL_NAME=$(CLIP_MODEL_NAME) \ --name llama-vision \ llama-vision:latest diff --git a/llama-vision/README.md b/llama-vision/README.md index 018190b..d357f9b 100644 --- a/llama-vision/README.md +++ b/llama-vision/README.md @@ -21,16 +21,29 @@ Docker image that runs Llama 3.2 Vision model with CPU inference (no GPU/NVIDIA ### Download Model -You can download the recommended Q4_K_M quantized Llama 3.2 Vision model from Hugging Face: +You need two model files for Llama 3.2 Vision to work: +1. **Main model**: The language model in GGUF format with Q4_K_M quantization +2. **CLIP projector**: The vision encoder/projector model (usually named `mmproj-*.gguf`) + +You can download models from Hugging Face. 
For example: ```bash -# Example: Download Llama 3.2 11B Vision Instruct Q4_K_M -# Visit https://huggingface.co/ and search for "llama-3.2-vision Q4_K_M GGUF" -# Download the .gguf file and place it in a models directory +# Create models directory mkdir -p models -# Place your downloaded model in ./models/ + +# Download the main model and CLIP projector +# Visit https://huggingface.co/ and search for "llama-3.2-vision Q4_K_M GGUF" +# Or for LLaVA models: +# Main model: https://huggingface.co/mys/ggml_llava-v1.5-7b +# CLIP projector: https://huggingface.co/mys/ggml_llava-v1.5-7b (mmproj-model-f16.gguf) + +# Place both files in ./models/ +# - your-model-name.gguf (main language model) +# - mmproj-model-f16.gguf (CLIP vision projector) ``` +**Note:** Llama 3.2 Vision is a multimodal model that requires both components. Without the CLIP projector, vision features will not work. + ### Build the Docker Image ```bash @@ -44,14 +57,16 @@ docker run -d \ -p 5000:5000 \ -v $(pwd)/models:/models \ -e MODEL_NAME="your-model-name.gguf" \ + -e CLIP_MODEL_NAME="mmproj-model-f16.gguf" \ --name llama-vision \ llama-vision:latest ``` ### Environment Variables -- `MODEL_NAME`: Name of the GGUF model file (default: `llama-3.2-11b-vision-instruct-q4_k_m.gguf`) -- `MODEL_PATH`: Directory containing the model (default: `/models`) +- `MODEL_NAME`: Name of the main GGUF model file (default: `llama-3.2-11b-vision-instruct-q4_k_m.gguf`) +- `CLIP_MODEL_NAME`: Name of the CLIP projector GGUF file (default: `mmproj-model-f16.gguf`) +- `MODEL_PATH`: Directory containing the models (default: `/models`) - `PORT`: Port for the Flask server (default: `5000`) ## API Documentation @@ -211,6 +226,14 @@ This setup uses **Q4_K_M** quantization, which is recommended for CPU inference The recommendation comes from the llama.cpp community discussions, particularly [PR #5780](https://github.com/ggml-org/llama.cpp/pull/5780), where Q4_K_M is noted as the default choice for most use cases. +### Vision Model Components + +Llama 3.2 Vision (and similar multimodal models like LLaVA) require two components: +1. **Main Language Model**: The text generation model in GGUF format +2. **CLIP Projector**: A vision encoder that processes images and projects them into the language model's embedding space (typically named `mmproj-*.gguf`) + +Both files must be present in the models directory for vision inference to work properly. 
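+
+As a minimal sketch of how the two files fit together with llama-cpp-python's LLaVA-style chat handler (filenames here are placeholders, and whether a particular Llama 3.2 Vision GGUF conversion works with this handler depends on how it was produced; the webhook wires up the same two files from `MODEL_NAME` and `CLIP_MODEL_NAME`):
+
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import Llava15ChatHandler
+
+# The CLIP projector (mmproj) encodes the image; the main GGUF model generates the text
+chat_handler = Llava15ChatHandler(clip_model_path="models/mmproj-model-f16.gguf")
+llm = Llama(
+    model_path="models/llama-3.2-11b-vision-instruct-q4_k_m.gguf",
+    chat_handler=chat_handler,
+    n_ctx=2048,      # matches the webhook's default context window
+    n_gpu_layers=0,  # CPU-only inference
+)
+```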
+ ### Architecture - **Base Image:** Python 3.11-slim (small footprint) diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 233bcb4..3979393 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -15,7 +15,6 @@ from werkzeug.exceptions import BadRequest from PIL import Image from llama_cpp import Llama -from llama_cpp.llama_chat_format import Llava15ChatHandler from schema import VisionResponse, ErrorResponse, HealthResponse, TokenUsage @@ -41,26 +40,42 @@ def load_model(): model_path_env = os.environ.get('MODEL_PATH', '/models') model_name_env = os.environ.get('MODEL_NAME', 'llama-3.2-11b-vision-instruct-q4_k_m.gguf') + clip_model_name_env = os.environ.get('CLIP_MODEL_NAME', 'mmproj-model-f16.gguf') full_model_path = Path(model_path_env) / model_name_env + full_clip_path = Path(model_path_env) / clip_model_name_env logger.info(f"Loading model from: {full_model_path}") + logger.info(f"Loading CLIP model from: {full_clip_path}") if not full_model_path.exists(): logger.error(f"Model file not found at: {full_model_path}") raise FileNotFoundError(f"Model file not found at: {full_model_path}") + if not full_clip_path.exists(): + logger.warning(f"CLIP model file not found at: {full_clip_path}") + logger.warning("Vision features may not work without CLIP model") + try: - # Initialize with vision support using llava chat handler - # For vision models, we need the appropriate chat handler - llama_model = Llama( - model_path=str(full_model_path), - n_ctx=2048, # Context window - n_threads=os.cpu_count(), # Use all available CPU threads - n_gpu_layers=0, # CPU only, no GPU layers - verbose=False, - chat_format="llava-1-5", # Vision chat format - ) + # Initialize with vision support + # For Llama 3.2 Vision models, we need both the main model and CLIP projector + model_kwargs = { + "model_path": str(full_model_path), + "n_ctx": 2048, # Context window + "n_threads": os.cpu_count(), # Use all available CPU threads + "n_gpu_layers": 0, # CPU only, no GPU layers + "verbose": False, + } + + # Add CLIP model if available + if full_clip_path.exists(): + model_kwargs["clip_model_path"] = str(full_clip_path) + model_kwargs["chat_format"] = "llava-1-5" # Vision chat format + logger.info("CLIP model loaded for vision support") + else: + logger.warning("Running without CLIP model - vision features disabled") + + llama_model = Llama(**model_kwargs) model_name = model_name_env logger.info(f"Model loaded successfully: {model_name}") return True diff --git a/llama-vision/compose.yml b/llama-vision/compose.yml index 3499627..ec08d8b 100644 --- a/llama-vision/compose.yml +++ b/llama-vision/compose.yml @@ -13,8 +13,9 @@ services: # Mount your models directory here - ./models:/models environment: - # Configure your model name + # Configure your model names - MODEL_NAME=llama-3.2-11b-vision-instruct-q4_k_m.gguf + - CLIP_MODEL_NAME=mmproj-model-f16.gguf - MODEL_PATH=/models - PORT=5000 restart: unless-stopped From 556c1b90fa29426fc2c7b42d597b337c5bf993dc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:37:11 +0000 Subject: [PATCH 4/7] Fix security issues, resource handling, and dependency versions Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/README.md | 1 + llama-vision/app/webhook.py | 41 ++++++++++++++++++++++------------- llama-vision/requirements.txt | 12 +++++----- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git 
a/llama-vision/README.md b/llama-vision/README.md index d357f9b..07d7223 100644 --- a/llama-vision/README.md +++ b/llama-vision/README.md @@ -68,6 +68,7 @@ docker run -d \ - `CLIP_MODEL_NAME`: Name of the CLIP projector GGUF file (default: `mmproj-model-f16.gguf`) - `MODEL_PATH`: Directory containing the models (default: `/models`) - `PORT`: Port for the Flask server (default: `5000`) +- `N_THREADS`: Number of CPU threads for inference (default: half of available CPUs, or set to 0 for auto) ## API Documentation diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 3979393..2cf151b 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -59,10 +59,20 @@ def load_model(): try: # Initialize with vision support # For Llama 3.2 Vision models, we need both the main model and CLIP projector + + # Configure CPU threads conservatively to avoid resource contention + # Use environment variable if set, otherwise use half of available CPUs + n_threads = int(os.environ.get('N_THREADS', 0)) + if n_threads <= 0: + cpu_count = os.cpu_count() or 4 + n_threads = max(1, cpu_count // 2) + + logger.info(f"Using {n_threads} CPU threads for inference") + model_kwargs = { "model_path": str(full_model_path), "n_ctx": 2048, # Context window - "n_threads": os.cpu_count(), # Use all available CPU threads + "n_threads": n_threads, "n_gpu_layers": 0, # CPU only, no GPU layers "verbose": False, } @@ -216,10 +226,18 @@ def infer(): temperature = data.get('temperature', 0.7) top_p = data.get('top_p', 0.95) - logger.info(f"Processing inference request with prompt: {prompt_text[:50]}...") + logger.info(f"Processing inference request (prompt length: {len(prompt_text)} chars)") # Process image - image_path = process_image(image_data) + image_path = None + try: + image_path = process_image(image_data) + except ValueError as e: + error = ErrorResponse( + error=str(e), + error_type="validation" + ) + return jsonify(error.model_dump()), 400 try: # Create vision prompt @@ -262,18 +280,11 @@ def infer(): finally: # Clean up temporary image file - try: - os.unlink(image_path) - except Exception as e: - logger.warning(f"Failed to delete temporary image: {e}") - - except ValueError as e: - logger.error(f"Validation error: {e}") - error = ErrorResponse( - error=str(e), - error_type="validation" - ) - return jsonify(error.model_dump()), 400 + if image_path: + try: + os.unlink(image_path) + except Exception as e: + logger.warning(f"Failed to delete temporary image: {e}") except Exception as e: logger.error(f"Inference error: {e}", exc_info=True) diff --git a/llama-vision/requirements.txt b/llama-vision/requirements.txt index f9e4749..e11b555 100644 --- a/llama-vision/requirements.txt +++ b/llama-vision/requirements.txt @@ -1,6 +1,6 @@ -flask==3.0.0 -werkzeug==3.0.1 -llama-cpp-python==0.2.90 -pillow==10.1.0 -requests==2.31.0 -pydantic==2.5.0 +flask>=3.0.3,<4.0 +werkzeug>=3.0.3,<4.0 +llama-cpp-python>=0.2.90,<0.3.0 +pillow>=10.4.0,<11.0 +requests>=2.32.0,<3.0 +pydantic>=2.9.0,<3.0 From dbe653c5c6e5bc12236042b98763604bd8236c3c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:38:56 +0000 Subject: [PATCH 5/7] Remove unused import and update docker-compose format Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/app/webhook.py | 1 - llama-vision/compose.yml | 2 -- 2 files changed, 3 deletions(-) diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 
2cf151b..5ae20c3 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -12,7 +12,6 @@ from typing import Optional, List, Dict, Any from flask import Flask, request, jsonify -from werkzeug.exceptions import BadRequest from PIL import Image from llama_cpp import Llama diff --git a/llama-vision/compose.yml b/llama-vision/compose.yml index ec08d8b..6045f2c 100644 --- a/llama-vision/compose.yml +++ b/llama-vision/compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: llama-vision: build: From b49c53226d4576e9993c3a2fc690c14b89843f60 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:50:42 +0000 Subject: [PATCH 6/7] Refactor: Create proper JSON schema file and rename Python models - Add response_schema.json with formal JSON schema definitions - Rename schema.py to models.py to clarify it contains internal validation logic - Update README to reference the JSON schema file - Update documentation with JSON examples instead of Python type hints Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/README.md | 42 ++++---- llama-vision/app/{schema.py => models.py} | 0 llama-vision/app/response_schema.json | 118 ++++++++++++++++++++++ llama-vision/app/webhook.py | 2 +- 4 files changed, 143 insertions(+), 19 deletions(-) rename llama-vision/app/{schema.py => models.py} (100%) create mode 100644 llama-vision/app/response_schema.json diff --git a/llama-vision/README.md b/llama-vision/README.md index 07d7223..1a89f03 100644 --- a/llama-vision/README.md +++ b/llama-vision/README.md @@ -142,33 +142,38 @@ docker run -d \ ## Response Schema +The formal JSON schema definition is available in `app/response_schema.json`. Below are examples of the response formats: + ### VisionResponse -```python +```json { - "success": bool, # Request success status - "response_text": str, # Generated text response - "model": str, # Model used for inference - "timestamp": str, # ISO timestamp - "token_usage": { # Token usage statistics - "prompt_tokens": int, - "completion_tokens": int, - "total_tokens": int + "success": true, + "response_text": "The image shows...", + "model": "llama-3.2-11b-vision-instruct-q4_k_m.gguf", + "timestamp": "2024-01-13T12:00:00.000000", + "token_usage": { + "prompt_tokens": 150, + "completion_tokens": 200, + "total_tokens": 350 }, - "metadata": dict, # Additional metadata - "error": str | null # Error message if any + "metadata": { + "max_tokens": 256, + "temperature": 0.7 + }, + "error": null } ``` ### ErrorResponse -```python +```json { - "success": false, # Always false for errors - "error": str, # Error message - "error_type": str, # Error type (validation, model, system) - "timestamp": str, # ISO timestamp - "details": dict | null # Additional error details + "success": false, + "error": "Missing required field: prompt", + "error_type": "validation", + "timestamp": "2024-01-13T12:00:00.000000", + "details": null } ``` @@ -262,7 +267,8 @@ llama-vision/ ā”œā”€ā”€ README.md # This file └── app/ ā”œā”€ā”€ webhook.py # Flask application - └── schema.py # Pydantic response schemas + ā”œā”€ā”€ models.py # Pydantic validation models + └── response_schema.json # JSON schema for API responses ``` ### Building and Testing Locally diff --git a/llama-vision/app/schema.py b/llama-vision/app/models.py similarity index 100% rename from llama-vision/app/schema.py rename to llama-vision/app/models.py diff --git a/llama-vision/app/response_schema.json 
b/llama-vision/app/response_schema.json new file mode 100644 index 0000000..c63a0eb --- /dev/null +++ b/llama-vision/app/response_schema.json @@ -0,0 +1,118 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Llama Vision API Response Schemas", + "definitions": { + "TokenUsage": { + "type": "object", + "description": "Token usage statistics", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt" + }, + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the completion" + }, + "total_tokens": { + "type": "integer", + "description": "Total number of tokens used" + } + }, + "required": ["prompt_tokens", "completion_tokens", "total_tokens"] + }, + "VisionResponse": { + "type": "object", + "description": "Standard response format for vision inference requests", + "properties": { + "success": { + "type": "boolean", + "description": "Whether the request was successful" + }, + "response_text": { + "type": "string", + "description": "The generated text response from the model" + }, + "model": { + "type": "string", + "description": "The model used for inference" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of the response" + }, + "token_usage": { + "$ref": "#/definitions/TokenUsage", + "description": "Token usage statistics" + }, + "metadata": { + "type": "object", + "description": "Additional metadata", + "additionalProperties": true + }, + "error": { + "type": ["string", "null"], + "description": "Error message if success is False" + } + }, + "required": ["success", "response_text", "model", "timestamp"] + }, + "ErrorResponse": { + "type": "object", + "description": "Error response format", + "properties": { + "success": { + "type": "boolean", + "const": false, + "description": "Always False for error responses" + }, + "error": { + "type": "string", + "description": "Error message describing what went wrong" + }, + "error_type": { + "type": "string", + "enum": ["validation", "model", "system"], + "description": "Type of error" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of the error" + }, + "details": { + "type": ["object", "null"], + "description": "Additional error details", + "additionalProperties": true + } + }, + "required": ["success", "error", "error_type", "timestamp"] + }, + "HealthResponse": { + "type": "object", + "description": "Health check response format", + "properties": { + "status": { + "type": "string", + "enum": ["healthy", "unhealthy", "degraded"], + "description": "Health status" + }, + "model_loaded": { + "type": "boolean", + "description": "Whether the model is loaded and ready" + }, + "model_name": { + "type": "string", + "description": "Name of the loaded model" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of health check" + } + }, + "required": ["status", "model_loaded", "model_name", "timestamp"] + } + } +} diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 5ae20c3..c1fa9cd 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -15,7 +15,7 @@ from PIL import Image from llama_cpp import Llama -from schema import VisionResponse, ErrorResponse, HealthResponse, TokenUsage +from models import VisionResponse, ErrorResponse, HealthResponse, TokenUsage # Configure logging From e6b351d9b6169c4cf2252eadc6b41b6f0a14d763 Mon Sep 17 00:00:00 2001 From: 
"copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:52:23 +0000 Subject: [PATCH 7/7] Add enum validation to models for consistency with JSON schema - Use Literal types for error_type and status fields - Ensures runtime validation matches JSON schema constraints - Update docstring to reference response_schema.json Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/app/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama-vision/app/models.py b/llama-vision/app/models.py index 4ea3eb3..37a04b1 100644 --- a/llama-vision/app/models.py +++ b/llama-vision/app/models.py @@ -1,8 +1,9 @@ """ -Response schema definitions for the Llama Vision webhook. +Response models for runtime validation in the Llama Vision webhook. +For the formal JSON schema definition, see response_schema.json. """ -from typing import Optional, List, Dict, Any +from typing import Optional, Dict, Any, Literal from pydantic import BaseModel, Field from datetime import datetime @@ -33,7 +34,9 @@ class ErrorResponse(BaseModel): success: bool = Field(default=False, description="Always False for error responses") error: str = Field(description="Error message describing what went wrong") - error_type: str = Field(description="Type of error (e.g., validation, model, system)") + error_type: Literal["validation", "model", "system"] = Field( + description="Type of error (validation, model, or system)" + ) timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), description="ISO timestamp of the error") details: Optional[Dict[str, Any]] = Field(None, description="Additional error details") @@ -42,7 +45,9 @@ class ErrorResponse(BaseModel): class HealthResponse(BaseModel): """Health check response format.""" - status: str = Field(description="Health status (healthy, unhealthy, degraded)") + status: Literal["healthy", "unhealthy", "degraded"] = Field( + description="Health status (healthy, unhealthy, or degraded)" + ) model_loaded: bool = Field(description="Whether the model is loaded and ready") model_name: str = Field(description="Name of the loaded model") timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(),