From f5744e724afe2cc66e8161170aa722e01ad9d0a8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:26:44 +0000 Subject: [PATCH 1/7] Initial plan From 7aff62a6d0da56388b21bb609b851e6d6b546da2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:31:50 +0000 Subject: [PATCH 2/7] Add Llama 3.2 Vision docker setup with Flask webhook API Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- README.md | 1 + llama-vision/.gitignore | 42 +++++ llama-vision/Dockerfile | 57 +++++++ llama-vision/Makefile | 62 +++++++ llama-vision/README.md | 300 ++++++++++++++++++++++++++++++++++ llama-vision/app/schema.py | 49 ++++++ llama-vision/app/webhook.py | 297 +++++++++++++++++++++++++++++++++ llama-vision/compose.yml | 35 ++++ llama-vision/requirements.txt | 6 + llama-vision/test_api.py | 106 ++++++++++++ 10 files changed, 955 insertions(+) create mode 100644 llama-vision/.gitignore create mode 100644 llama-vision/Dockerfile create mode 100644 llama-vision/Makefile create mode 100644 llama-vision/README.md create mode 100644 llama-vision/app/schema.py create mode 100644 llama-vision/app/webhook.py create mode 100644 llama-vision/compose.yml create mode 100644 llama-vision/requirements.txt create mode 100644 llama-vision/test_api.py diff --git a/README.md b/README.md index 06309cb..793359a 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ A collection of docker containers I use for various hobby projects, currently co - `ghost` -> ghost6 based blog setup - `homebridge` -> latest Homebridge container for HomeKit integration (armv8 architecture) +- `llama-vision` -> Llama 3.2 Vision model with CPU inference (Q4_K_M quantization) and Flask webhook API - `python` -> base "hello-world" docker setup with poetry for dependency resolution and VSCode debug capabilities - `verilator` -> minimal Verilator simulator for RTL simulation (based on OpenTitan container setup) - `watchtower` -> automatic Docker image updates for all containers diff --git a/llama-vision/.gitignore b/llama-vision/.gitignore new file mode 100644 index 0000000..3dad26e --- /dev/null +++ b/llama-vision/.gitignore @@ -0,0 +1,42 @@ +# Models directory +models/ +*.gguf + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +.venv + +# Flask +instance/ +.webassets-cache + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Temporary files +tmp/ +temp/ +*.tmp diff --git a/llama-vision/Dockerfile b/llama-vision/Dockerfile new file mode 100644 index 0000000..9147faf --- /dev/null +++ b/llama-vision/Dockerfile @@ -0,0 +1,57 @@ +# Multi-stage build for llama.cpp with Llama 3.2 Vision support +FROM python:3.11-slim as builder + +# Install build dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + cmake \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Build llama.cpp from source +WORKDIR /build +RUN git clone https://github.com/ggerganov/llama.cpp.git +WORKDIR /build/llama.cpp +RUN cmake -B build -DLLAMA_CURL=ON +RUN cmake --build build --config Release + +# Final runtime image +FROM python:3.11-slim + +# Install runtime dependencies +RUN apt-get update && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy llama.cpp build from builder +COPY --from=builder /build/llama.cpp/build/bin/llama-server 
/usr/local/bin/ +COPY --from=builder /build/llama.cpp/build/bin/llama-cli /usr/local/bin/ + +# Set working directory +WORKDIR /app + +# Copy application files +COPY requirements.txt /app/ +RUN pip install --no-cache-dir -r requirements.txt + +COPY app/ /app/ + +# Environment variables +ENV MODEL_NAME="llama-3.2-11b-vision-instruct-q4_k_m.gguf" +ENV MODEL_PATH="/models" +ENV FLASK_APP=webhook.py +ENV PYTHONUNBUFFERED=1 + +# Create models directory +RUN mkdir -p /models + +# Expose Flask port +EXPOSE 5000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -f http://localhost:5000/health || exit 1 + +# Run Flask app +CMD ["python", "webhook.py"] diff --git a/llama-vision/Makefile b/llama-vision/Makefile new file mode 100644 index 0000000..3b1332e --- /dev/null +++ b/llama-vision/Makefile @@ -0,0 +1,62 @@ +.PHONY: build run stop clean test help + +# Default model name (can be overridden) +MODEL_NAME ?= llama-3.2-11b-vision-instruct-q4_k_m.gguf + +help: + @echo "Llama Vision Docker - Available targets:" + @echo "" + @echo " make build - Build the Docker image" + @echo " make run - Run the container with models volume" + @echo " make stop - Stop the running container" + @echo " make clean - Stop and remove the container" + @echo " make logs - Show container logs" + @echo " make test - Run API tests" + @echo " make shell - Open shell in running container" + @echo " make health - Check service health" + @echo "" + @echo "Environment variables:" + @echo " MODEL_NAME - Model filename (default: $(MODEL_NAME))" + +build: + docker build --pull --rm -t llama-vision:latest . + +run: + docker run -d \ + -p 5000:5000 \ + -v $$(pwd)/models:/models \ + -e MODEL_NAME=$(MODEL_NAME) \ + --name llama-vision \ + llama-vision:latest + +stop: + docker stop llama-vision || true + +clean: stop + docker rm llama-vision || true + +logs: + docker logs -f llama-vision + +shell: + docker exec -it llama-vision /bin/bash + +health: + @echo "Checking health..." + @curl -s http://localhost:5000/health | python -m json.tool + +test: + python test_api.py + +# Docker compose targets +compose-up: + docker compose up -d + +compose-down: + docker compose down + +compose-logs: + docker compose logs -f + +compose-build: + docker compose build diff --git a/llama-vision/README.md b/llama-vision/README.md new file mode 100644 index 0000000..018190b --- /dev/null +++ b/llama-vision/README.md @@ -0,0 +1,300 @@ +# Llama 3.2 Vision Docker Setup + +Docker image that runs Llama 3.2 Vision model with CPU inference (no GPU/NVIDIA acceleration) using the recommended Q4_K_M quantization. The service exposes a Flask webhook interface to accept images and text prompts and provides responses via JSON. 
+ +## Features + +- šŸš€ CPU-only inference using llama.cpp (no GPU required) +- šŸ“ø Vision model support for image understanding +- šŸ”§ Q4_K_M quantization for optimal CPU performance (as recommended in [llama.cpp PR #5780](https://github.com/ggml-org/llama.cpp/pull/5780)) +- 🌐 RESTful API with Flask webhook interface +- šŸ“‹ Structured JSON response schema +- šŸ” Health check endpoint +- šŸ“Š Token usage tracking + +## Quick Start + +### Prerequisites + +- Docker installed on your system +- A Llama 3.2 Vision model in GGUF format with Q4_K_M quantization + +### Download Model + +You can download the recommended Q4_K_M quantized Llama 3.2 Vision model from Hugging Face: + +```bash +# Example: Download Llama 3.2 11B Vision Instruct Q4_K_M +# Visit https://huggingface.co/ and search for "llama-3.2-vision Q4_K_M GGUF" +# Download the .gguf file and place it in a models directory +mkdir -p models +# Place your downloaded model in ./models/ +``` + +### Build the Docker Image + +```bash +docker build -t llama-vision:latest . +``` + +### Run the Container + +```bash +docker run -d \ + -p 5000:5000 \ + -v $(pwd)/models:/models \ + -e MODEL_NAME="your-model-name.gguf" \ + --name llama-vision \ + llama-vision:latest +``` + +### Environment Variables + +- `MODEL_NAME`: Name of the GGUF model file (default: `llama-3.2-11b-vision-instruct-q4_k_m.gguf`) +- `MODEL_PATH`: Directory containing the model (default: `/models`) +- `PORT`: Port for the Flask server (default: `5000`) + +## API Documentation + +### Health Check + +**Endpoint:** `GET /health` + +**Response:** +```json +{ + "status": "healthy", + "model_loaded": true, + "model_name": "llama-3.2-11b-vision-instruct-q4_k_m.gguf", + "timestamp": "2024-01-13T12:00:00.000000" +} +``` + +### Inference Endpoint + +**Endpoint:** `POST /infer` + +**Request Body:** +```json +{ + "prompt": "Describe what you see in this image", + "image": "base64_encoded_image_data", + "max_tokens": 256, + "temperature": 0.7, + "top_p": 0.95 +} +``` + +**Parameters:** +- `prompt` (required): Text prompt/question about the image +- `image` (required): Base64 encoded image data (supports data URL format) +- `max_tokens` (optional): Maximum tokens in response (default: 256) +- `temperature` (optional): Sampling temperature (default: 0.7) +- `top_p` (optional): Nucleus sampling parameter (default: 0.95) + +**Success Response (200):** +```json +{ + "success": true, + "response_text": "The image shows a beautiful sunset over mountains...", + "model": "llama-3.2-11b-vision-instruct-q4_k_m.gguf", + "timestamp": "2024-01-13T12:00:00.000000", + "token_usage": { + "prompt_tokens": 150, + "completion_tokens": 200, + "total_tokens": 350 + }, + "metadata": { + "max_tokens": 256, + "temperature": 0.7, + "top_p": 0.95 + }, + "error": null +} +``` + +**Error Response (400/500):** +```json +{ + "success": false, + "error": "Missing required field: prompt", + "error_type": "validation", + "timestamp": "2024-01-13T12:00:00.000000", + "details": null +} +``` + +## Response Schema + +### VisionResponse + +```python +{ + "success": bool, # Request success status + "response_text": str, # Generated text response + "model": str, # Model used for inference + "timestamp": str, # ISO timestamp + "token_usage": { # Token usage statistics + "prompt_tokens": int, + "completion_tokens": int, + "total_tokens": int + }, + "metadata": dict, # Additional metadata + "error": str | null # Error message if any +} +``` + +### ErrorResponse + +```python +{ + "success": false, # Always false for errors + "error": 
str, # Error message + "error_type": str, # Error type (validation, model, system) + "timestamp": str, # ISO timestamp + "details": dict | null # Additional error details +} +``` + +## Example Usage + +### Python Example + +```python +import requests +import base64 + +# Read and encode image +with open("image.jpg", "rb") as f: + image_data = base64.b64encode(f.read()).decode() + +# Make request +response = requests.post( + "http://localhost:5000/infer", + json={ + "prompt": "What objects can you see in this image?", + "image": image_data, + "max_tokens": 300, + "temperature": 0.7 + } +) + +result = response.json() +print(result["response_text"]) +``` + +### cURL Example + +```bash +# Encode image to base64 +IMAGE_B64=$(base64 -w 0 image.jpg) + +# Make request +curl -X POST http://localhost:5000/infer \ + -H "Content-Type: application/json" \ + -d "{ + \"prompt\": \"Describe this image in detail\", + \"image\": \"$IMAGE_B64\", + \"max_tokens\": 256 + }" +``` + +## Technical Details + +### Quantization + +This setup uses **Q4_K_M** quantization, which is recommended for CPU inference as it provides: +- Excellent balance between quality and performance +- ~4GB memory footprint for 7B models +- Good accuracy preservation compared to higher precision formats +- Optimized for CPU matrix operations + +The recommendation comes from the llama.cpp community discussions, particularly [PR #5780](https://github.com/ggml-org/llama.cpp/pull/5780), where Q4_K_M is noted as the default choice for most use cases. + +### Architecture + +- **Base Image:** Python 3.11-slim (small footprint) +- **Build:** Multi-stage build for smaller final image +- **Inference Engine:** llama.cpp compiled from source with CPU optimizations +- **Python Bindings:** llama-cpp-python for easy integration +- **Web Framework:** Flask for RESTful API +- **Image Processing:** Pillow for image handling + +### Performance Considerations + +- CPU threads are automatically configured to use all available cores +- Context window: 2048 tokens (configurable in code) +- No GPU layers (n_gpu_layers=0) +- Optimized for multi-core CPU inference + +## Development + +### Project Structure + +``` +llama-vision/ +ā”œā”€ā”€ Dockerfile # Multi-stage Docker build +ā”œā”€ā”€ requirements.txt # Python dependencies +ā”œā”€ā”€ README.md # This file +└── app/ + ā”œā”€ā”€ webhook.py # Flask application + └── schema.py # Pydantic response schemas +``` + +### Building and Testing Locally + +```bash +# Build +docker build -t llama-vision:latest . + +# Run with mounted models directory +docker run -it --rm \ + -p 5000:5000 \ + -v $(pwd)/models:/models \ + -e MODEL_NAME="your-model.gguf" \ + llama-vision:latest + +# Test health endpoint +curl http://localhost:5000/health + +# Test inference +curl -X POST http://localhost:5000/infer \ + -H "Content-Type: application/json" \ + -d '{"prompt": "test", "image": "..."}' +``` + +## Troubleshooting + +### Model Not Found + +Ensure your model file is in the correct location: +```bash +ls -la models/ +``` + +The model file should match the `MODEL_NAME` environment variable. 
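+
+If the container starts but the model is not picked up, the `/health` endpoint reports whether a model was loaded and under which name. A small Python sketch of that check (assuming the default port mapping on `localhost:5000`; `requests` is already listed in `requirements.txt`):
+
+```python
+import requests
+
+# Ask the running service which model it actually loaded at startup
+health = requests.get("http://localhost:5000/health", timeout=10).json()
+print("status:      ", health.get("status"))
+print("model_loaded:", health.get("model_loaded"))
+print("model_name:  ", health.get("model_name"))
+# model_name should match the file placed in ./models and the value
+# passed via -e MODEL_NAME=... (or the MODEL_NAME default baked into the image)
+```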
+ +### Out of Memory + +If you experience OOM errors: +- Use a smaller model (1B or 3B instead of 11B) +- Reduce context window in webhook.py (n_ctx parameter) +- Ensure sufficient RAM (11B models need ~6-8GB RAM with Q4_K_M) + +### Slow Inference + +CPU inference is inherently slower than GPU: +- Consider using a smaller model for faster responses +- Ensure Docker has access to all CPU cores +- Use Q4_K_M quantization (already configured) + +## License + +MIT License - see repository LICENSE file for details. + +## References + +- [llama.cpp](https://github.com/ggerganov/llama.cpp) +- [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) +- [Llama 3.2 Models](https://huggingface.co/meta-llama) +- [GGUF Quantization Guide](https://github.com/ggerganov/llama.cpp/pull/5780) diff --git a/llama-vision/app/schema.py b/llama-vision/app/schema.py new file mode 100644 index 0000000..4ea3eb3 --- /dev/null +++ b/llama-vision/app/schema.py @@ -0,0 +1,49 @@ +""" +Response schema definitions for the Llama Vision webhook. +""" + +from typing import Optional, List, Dict, Any +from pydantic import BaseModel, Field +from datetime import datetime + + +class TokenUsage(BaseModel): + """Token usage statistics.""" + prompt_tokens: int = Field(description="Number of tokens in the prompt") + completion_tokens: int = Field(description="Number of tokens in the completion") + total_tokens: int = Field(description="Total number of tokens used") + + +class VisionResponse(BaseModel): + """Standard response format for vision inference requests.""" + + success: bool = Field(description="Whether the request was successful") + response_text: str = Field(description="The generated text response from the model") + model: str = Field(description="The model used for inference") + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), + description="ISO timestamp of the response") + token_usage: Optional[TokenUsage] = Field(None, description="Token usage statistics") + metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, + description="Additional metadata") + error: Optional[str] = Field(None, description="Error message if success is False") + + +class ErrorResponse(BaseModel): + """Error response format.""" + + success: bool = Field(default=False, description="Always False for error responses") + error: str = Field(description="Error message describing what went wrong") + error_type: str = Field(description="Type of error (e.g., validation, model, system)") + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), + description="ISO timestamp of the error") + details: Optional[Dict[str, Any]] = Field(None, description="Additional error details") + + +class HealthResponse(BaseModel): + """Health check response format.""" + + status: str = Field(description="Health status (healthy, unhealthy, degraded)") + model_loaded: bool = Field(description="Whether the model is loaded and ready") + model_name: str = Field(description="Name of the loaded model") + timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), + description="ISO timestamp of health check") diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py new file mode 100644 index 0000000..233bcb4 --- /dev/null +++ b/llama-vision/app/webhook.py @@ -0,0 +1,297 @@ +""" +Flask webhook interface for Llama 3.2 Vision model inference. +Accepts images and text prompts, returns JSON responses. 
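+
+Endpoints:
+    GET  /health  - model load status
+    POST /infer   - vision inference on a base64-encoded image plus a text prompt
+    GET  /        - basic service information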
+""" + +import os +import logging +import base64 +import tempfile +from io import BytesIO +from pathlib import Path +from typing import Optional, List, Dict, Any + +from flask import Flask, request, jsonify +from werkzeug.exceptions import BadRequest +from PIL import Image +from llama_cpp import Llama +from llama_cpp.llama_chat_format import Llava15ChatHandler + +from schema import VisionResponse, ErrorResponse, HealthResponse, TokenUsage + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Initialize Flask app +app = Flask(__name__) + +# Global model instance +llama_model: Optional[Llama] = None +model_name: str = "" + + +def load_model(): + """Load the Llama model from the specified path.""" + global llama_model, model_name + + model_path_env = os.environ.get('MODEL_PATH', '/models') + model_name_env = os.environ.get('MODEL_NAME', 'llama-3.2-11b-vision-instruct-q4_k_m.gguf') + + full_model_path = Path(model_path_env) / model_name_env + + logger.info(f"Loading model from: {full_model_path}") + + if not full_model_path.exists(): + logger.error(f"Model file not found at: {full_model_path}") + raise FileNotFoundError(f"Model file not found at: {full_model_path}") + + try: + # Initialize with vision support using llava chat handler + # For vision models, we need the appropriate chat handler + llama_model = Llama( + model_path=str(full_model_path), + n_ctx=2048, # Context window + n_threads=os.cpu_count(), # Use all available CPU threads + n_gpu_layers=0, # CPU only, no GPU layers + verbose=False, + chat_format="llava-1-5", # Vision chat format + ) + model_name = model_name_env + logger.info(f"Model loaded successfully: {model_name}") + return True + except Exception as e: + logger.error(f"Failed to load model: {e}") + raise + + +def process_image(image_data: str, image_format: str = "base64") -> str: + """ + Process image data and return path to temporary file. + + Args: + image_data: Base64 encoded image or image bytes + image_format: Format of the image data (base64, bytes) + + Returns: + Path to temporary image file + """ + try: + if image_format == "base64": + # Remove data URL prefix if present + if "," in image_data: + image_data = image_data.split(",", 1)[1] + + image_bytes = base64.b64decode(image_data) + else: + image_bytes = image_data + + # Open image with PIL to validate and potentially convert + img = Image.open(BytesIO(image_bytes)) + + # Create temporary file + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png") + img.save(temp_file.name, format="PNG") + temp_file.close() + + return temp_file.name + except Exception as e: + logger.error(f"Failed to process image: {e}") + raise ValueError(f"Invalid image data: {e}") + + +def create_vision_prompt(text: str, image_path: str) -> List[Dict[str, Any]]: + """ + Create a prompt in the format expected by llama-cpp-python for vision models. 
+ + Args: + text: Text prompt + image_path: Path to the image file + + Returns: + Formatted prompt as list of message dicts + """ + return [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}, + {"type": "text", "text": text} + ] + } + ] + + +@app.route('/health', methods=['GET']) +def health_check(): + """Health check endpoint.""" + try: + is_loaded = llama_model is not None + status = "healthy" if is_loaded else "unhealthy" + + response = HealthResponse( + status=status, + model_loaded=is_loaded, + model_name=model_name if is_loaded else "not loaded" + ) + + return jsonify(response.model_dump()), 200 if is_loaded else 503 + except Exception as e: + logger.error(f"Health check failed: {e}") + return jsonify({"status": "error", "error": str(e)}), 500 + + +@app.route('/infer', methods=['POST']) +def infer(): + """ + Main inference endpoint. + + Expected JSON payload: + { + "prompt": "Describe this image", + "image": "base64_encoded_image_data", + "max_tokens": 256, + "temperature": 0.7, + "top_p": 0.95 + } + """ + try: + # Validate model is loaded + if llama_model is None: + error = ErrorResponse( + error="Model not loaded", + error_type="model", + details={"message": "Model failed to load at startup"} + ) + return jsonify(error.model_dump()), 503 + + # Parse request JSON + data = request.get_json() + if not data: + error = ErrorResponse( + error="No JSON data provided", + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + # Extract required fields + prompt_text = data.get('prompt') + image_data = data.get('image') + + if not prompt_text: + error = ErrorResponse( + error="Missing required field: prompt", + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + if not image_data: + error = ErrorResponse( + error="Missing required field: image", + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + # Extract optional parameters + max_tokens = data.get('max_tokens', 256) + temperature = data.get('temperature', 0.7) + top_p = data.get('top_p', 0.95) + + logger.info(f"Processing inference request with prompt: {prompt_text[:50]}...") + + # Process image + image_path = process_image(image_data) + + try: + # Create vision prompt + messages = create_vision_prompt(prompt_text, image_path) + + # Run inference + result = llama_model.create_chat_completion( + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + ) + + # Extract response + response_text = result['choices'][0]['message']['content'] + + # Get token usage + usage = result.get('usage', {}) + token_usage = TokenUsage( + prompt_tokens=usage.get('prompt_tokens', 0), + completion_tokens=usage.get('completion_tokens', 0), + total_tokens=usage.get('total_tokens', 0) + ) + + # Create response + response = VisionResponse( + success=True, + response_text=response_text, + model=model_name, + token_usage=token_usage, + metadata={ + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p + } + ) + + logger.info(f"Inference completed successfully") + return jsonify(response.model_dump()), 200 + + finally: + # Clean up temporary image file + try: + os.unlink(image_path) + except Exception as e: + logger.warning(f"Failed to delete temporary image: {e}") + + except ValueError as e: + logger.error(f"Validation error: {e}") + error = ErrorResponse( + error=str(e), + error_type="validation" + ) + return jsonify(error.model_dump()), 400 + + except Exception as e: + logger.error(f"Inference 
error: {e}", exc_info=True) + error = ErrorResponse( + error=str(e), + error_type="system", + details={"traceback": str(e)} + ) + return jsonify(error.model_dump()), 500 + + +@app.route('/', methods=['GET']) +def index(): + """Root endpoint with API information.""" + return jsonify({ + "service": "Llama 3.2 Vision Inference API", + "version": "1.0.0", + "endpoints": { + "/health": "GET - Health check", + "/infer": "POST - Run inference with image and text prompt", + }, + "model": model_name if llama_model else "not loaded" + }), 200 + + +if __name__ == '__main__': + # Load model at startup + try: + load_model() + except Exception as e: + logger.error(f"Failed to load model at startup: {e}") + logger.warning("Starting server anyway, but inference will fail") + + # Start Flask server + port = int(os.environ.get('PORT', 5000)) + app.run(host='0.0.0.0', port=port, debug=False) diff --git a/llama-vision/compose.yml b/llama-vision/compose.yml new file mode 100644 index 0000000..3499627 --- /dev/null +++ b/llama-vision/compose.yml @@ -0,0 +1,35 @@ +version: '3.8' + +services: + llama-vision: + build: + context: . + dockerfile: Dockerfile + image: llama-vision:latest + container_name: llama-vision + ports: + - "5000:5000" + volumes: + # Mount your models directory here + - ./models:/models + environment: + # Configure your model name + - MODEL_NAME=llama-3.2-11b-vision-instruct-q4_k_m.gguf + - MODEL_PATH=/models + - PORT=5000 + restart: unless-stopped + # Resource limits (adjust based on your needs) + deploy: + resources: + limits: + cpus: '4' + memory: 8G + reservations: + cpus: '2' + memory: 4G + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s diff --git a/llama-vision/requirements.txt b/llama-vision/requirements.txt new file mode 100644 index 0000000..f9e4749 --- /dev/null +++ b/llama-vision/requirements.txt @@ -0,0 +1,6 @@ +flask==3.0.0 +werkzeug==3.0.1 +llama-cpp-python==0.2.90 +pillow==10.1.0 +requests==2.31.0 +pydantic==2.5.0 diff --git a/llama-vision/test_api.py b/llama-vision/test_api.py new file mode 100644 index 0000000..4fb8ce9 --- /dev/null +++ b/llama-vision/test_api.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +Example script to test the Llama Vision API. 
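+
+Usage:
+    python test_api.py IMAGE_PATH [PROMPT]
+
+The health check always runs first; when IMAGE_PATH is supplied, one /infer
+request is also sent with that image and the given (or default) prompt.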
+""" + +import requests +import base64 +import json +import sys +from pathlib import Path + + +def test_health(base_url: str = "http://localhost:5000"): + """Test the health endpoint.""" + print("Testing health endpoint...") + response = requests.get(f"{base_url}/health") + print(f"Status: {response.status_code}") + print(f"Response: {json.dumps(response.json(), indent=2)}") + return response.status_code == 200 + + +def test_inference(image_path: str, prompt: str, base_url: str = "http://localhost:5000"): + """Test the inference endpoint with an image.""" + print(f"\nTesting inference endpoint...") + print(f"Image: {image_path}") + print(f"Prompt: {prompt}") + + # Read and encode image + try: + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode() + except FileNotFoundError: + print(f"Error: Image file not found: {image_path}") + return False + + # Prepare request + payload = { + "prompt": prompt, + "image": image_data, + "max_tokens": 300, + "temperature": 0.7, + "top_p": 0.95 + } + + # Make request + try: + print("Sending request...") + response = requests.post( + f"{base_url}/infer", + json=payload, + timeout=120 # 2 minutes timeout for slow CPU inference + ) + + print(f"Status: {response.status_code}") + result = response.json() + print(f"Response: {json.dumps(result, indent=2)}") + + if result.get("success"): + print(f"\nāœ… Success!") + print(f"Model Response: {result.get('response_text')}") + if result.get("token_usage"): + usage = result["token_usage"] + print(f"Tokens: {usage['total_tokens']} (prompt: {usage['prompt_tokens']}, completion: {usage['completion_tokens']})") + else: + print(f"\nāŒ Error: {result.get('error')}") + + return response.status_code == 200 + + except requests.exceptions.Timeout: + print("Error: Request timed out (inference may take a long time on CPU)") + return False + except Exception as e: + print(f"Error: {e}") + return False + + +def main(): + """Main function.""" + base_url = "http://localhost:5000" + + # Test health + if not test_health(base_url): + print("\nāŒ Health check failed! Is the service running?") + sys.exit(1) + + print("\nāœ… Health check passed!") + + # Test inference if image provided + if len(sys.argv) > 1: + image_path = sys.argv[1] + prompt = sys.argv[2] if len(sys.argv) > 2 else "Describe this image in detail." 
+ + if test_inference(image_path, prompt, base_url): + print("\nāœ… Inference test passed!") + else: + print("\nāŒ Inference test failed!") + sys.exit(1) + else: + print("\nā„¹ļø To test inference, run:") + print(f" python {sys.argv[0]} [prompt]") + print("\nExample:") + print(f" python {sys.argv[0]} test_image.jpg 'What objects are in this image?'") + + +if __name__ == "__main__": + main() From 620e2ff63a2a5971b7d6c40a6ad5a8d15b3194f8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:34:10 +0000 Subject: [PATCH 3/7] Add CLIP model support for vision features and update documentation Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/Dockerfile | 1 + llama-vision/Makefile | 5 ++++- llama-vision/README.md | 37 ++++++++++++++++++++++++++++++------- llama-vision/app/webhook.py | 37 ++++++++++++++++++++++++++----------- llama-vision/compose.yml | 3 ++- 5 files changed, 63 insertions(+), 20 deletions(-) diff --git a/llama-vision/Dockerfile b/llama-vision/Dockerfile index 9147faf..b5c4b58 100644 --- a/llama-vision/Dockerfile +++ b/llama-vision/Dockerfile @@ -39,6 +39,7 @@ COPY app/ /app/ # Environment variables ENV MODEL_NAME="llama-3.2-11b-vision-instruct-q4_k_m.gguf" +ENV CLIP_MODEL_NAME="mmproj-model-f16.gguf" ENV MODEL_PATH="/models" ENV FLASK_APP=webhook.py ENV PYTHONUNBUFFERED=1 diff --git a/llama-vision/Makefile b/llama-vision/Makefile index 3b1332e..f24999c 100644 --- a/llama-vision/Makefile +++ b/llama-vision/Makefile @@ -1,7 +1,8 @@ .PHONY: build run stop clean test help -# Default model name (can be overridden) +# Default model names (can be overridden) MODEL_NAME ?= llama-3.2-11b-vision-instruct-q4_k_m.gguf +CLIP_MODEL_NAME ?= mmproj-model-f16.gguf help: @echo "Llama Vision Docker - Available targets:" @@ -17,6 +18,7 @@ help: @echo "" @echo "Environment variables:" @echo " MODEL_NAME - Model filename (default: $(MODEL_NAME))" + @echo " CLIP_MODEL_NAME - CLIP projector filename (default: $(CLIP_MODEL_NAME))" build: docker build --pull --rm -t llama-vision:latest . @@ -26,6 +28,7 @@ run: -p 5000:5000 \ -v $$(pwd)/models:/models \ -e MODEL_NAME=$(MODEL_NAME) \ + -e CLIP_MODEL_NAME=$(CLIP_MODEL_NAME) \ --name llama-vision \ llama-vision:latest diff --git a/llama-vision/README.md b/llama-vision/README.md index 018190b..d357f9b 100644 --- a/llama-vision/README.md +++ b/llama-vision/README.md @@ -21,16 +21,29 @@ Docker image that runs Llama 3.2 Vision model with CPU inference (no GPU/NVIDIA ### Download Model -You can download the recommended Q4_K_M quantized Llama 3.2 Vision model from Hugging Face: +You need two model files for Llama 3.2 Vision to work: +1. **Main model**: The language model in GGUF format with Q4_K_M quantization +2. **CLIP projector**: The vision encoder/projector model (usually named `mmproj-*.gguf`) + +You can download models from Hugging Face. 
For example: ```bash -# Example: Download Llama 3.2 11B Vision Instruct Q4_K_M -# Visit https://huggingface.co/ and search for "llama-3.2-vision Q4_K_M GGUF" -# Download the .gguf file and place it in a models directory +# Create models directory mkdir -p models -# Place your downloaded model in ./models/ + +# Download the main model and CLIP projector +# Visit https://huggingface.co/ and search for "llama-3.2-vision Q4_K_M GGUF" +# Or for LLaVA models: +# Main model: https://huggingface.co/mys/ggml_llava-v1.5-7b +# CLIP projector: https://huggingface.co/mys/ggml_llava-v1.5-7b (mmproj-model-f16.gguf) + +# Place both files in ./models/ +# - your-model-name.gguf (main language model) +# - mmproj-model-f16.gguf (CLIP vision projector) ``` +**Note:** Llama 3.2 Vision is a multimodal model that requires both components. Without the CLIP projector, vision features will not work. + ### Build the Docker Image ```bash @@ -44,14 +57,16 @@ docker run -d \ -p 5000:5000 \ -v $(pwd)/models:/models \ -e MODEL_NAME="your-model-name.gguf" \ + -e CLIP_MODEL_NAME="mmproj-model-f16.gguf" \ --name llama-vision \ llama-vision:latest ``` ### Environment Variables -- `MODEL_NAME`: Name of the GGUF model file (default: `llama-3.2-11b-vision-instruct-q4_k_m.gguf`) -- `MODEL_PATH`: Directory containing the model (default: `/models`) +- `MODEL_NAME`: Name of the main GGUF model file (default: `llama-3.2-11b-vision-instruct-q4_k_m.gguf`) +- `CLIP_MODEL_NAME`: Name of the CLIP projector GGUF file (default: `mmproj-model-f16.gguf`) +- `MODEL_PATH`: Directory containing the models (default: `/models`) - `PORT`: Port for the Flask server (default: `5000`) ## API Documentation @@ -211,6 +226,14 @@ This setup uses **Q4_K_M** quantization, which is recommended for CPU inference The recommendation comes from the llama.cpp community discussions, particularly [PR #5780](https://github.com/ggml-org/llama.cpp/pull/5780), where Q4_K_M is noted as the default choice for most use cases. +### Vision Model Components + +Llama 3.2 Vision (and similar multimodal models like LLaVA) require two components: +1. **Main Language Model**: The text generation model in GGUF format +2. **CLIP Projector**: A vision encoder that processes images and projects them into the language model's embedding space (typically named `mmproj-*.gguf`) + +Both files must be present in the models directory for vision inference to work properly. 
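+
+As a minimal sketch of how the two files fit together with llama-cpp-python's LLaVA-style chat handler (filenames here are placeholders, and whether a particular Llama 3.2 Vision GGUF conversion works with this handler depends on how it was produced; the webhook wires up the same two files from `MODEL_NAME` and `CLIP_MODEL_NAME`):
+
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import Llava15ChatHandler
+
+# The CLIP projector (mmproj) encodes the image; the main GGUF model generates the text
+chat_handler = Llava15ChatHandler(clip_model_path="models/mmproj-model-f16.gguf")
+llm = Llama(
+    model_path="models/llama-3.2-11b-vision-instruct-q4_k_m.gguf",
+    chat_handler=chat_handler,
+    n_ctx=2048,      # matches the webhook's default context window
+    n_gpu_layers=0,  # CPU-only inference
+)
+```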
+ ### Architecture - **Base Image:** Python 3.11-slim (small footprint) diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 233bcb4..3979393 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -15,7 +15,6 @@ from werkzeug.exceptions import BadRequest from PIL import Image from llama_cpp import Llama -from llama_cpp.llama_chat_format import Llava15ChatHandler from schema import VisionResponse, ErrorResponse, HealthResponse, TokenUsage @@ -41,26 +40,42 @@ def load_model(): model_path_env = os.environ.get('MODEL_PATH', '/models') model_name_env = os.environ.get('MODEL_NAME', 'llama-3.2-11b-vision-instruct-q4_k_m.gguf') + clip_model_name_env = os.environ.get('CLIP_MODEL_NAME', 'mmproj-model-f16.gguf') full_model_path = Path(model_path_env) / model_name_env + full_clip_path = Path(model_path_env) / clip_model_name_env logger.info(f"Loading model from: {full_model_path}") + logger.info(f"Loading CLIP model from: {full_clip_path}") if not full_model_path.exists(): logger.error(f"Model file not found at: {full_model_path}") raise FileNotFoundError(f"Model file not found at: {full_model_path}") + if not full_clip_path.exists(): + logger.warning(f"CLIP model file not found at: {full_clip_path}") + logger.warning("Vision features may not work without CLIP model") + try: - # Initialize with vision support using llava chat handler - # For vision models, we need the appropriate chat handler - llama_model = Llama( - model_path=str(full_model_path), - n_ctx=2048, # Context window - n_threads=os.cpu_count(), # Use all available CPU threads - n_gpu_layers=0, # CPU only, no GPU layers - verbose=False, - chat_format="llava-1-5", # Vision chat format - ) + # Initialize with vision support + # For Llama 3.2 Vision models, we need both the main model and CLIP projector + model_kwargs = { + "model_path": str(full_model_path), + "n_ctx": 2048, # Context window + "n_threads": os.cpu_count(), # Use all available CPU threads + "n_gpu_layers": 0, # CPU only, no GPU layers + "verbose": False, + } + + # Add CLIP model if available + if full_clip_path.exists(): + model_kwargs["clip_model_path"] = str(full_clip_path) + model_kwargs["chat_format"] = "llava-1-5" # Vision chat format + logger.info("CLIP model loaded for vision support") + else: + logger.warning("Running without CLIP model - vision features disabled") + + llama_model = Llama(**model_kwargs) model_name = model_name_env logger.info(f"Model loaded successfully: {model_name}") return True diff --git a/llama-vision/compose.yml b/llama-vision/compose.yml index 3499627..ec08d8b 100644 --- a/llama-vision/compose.yml +++ b/llama-vision/compose.yml @@ -13,8 +13,9 @@ services: # Mount your models directory here - ./models:/models environment: - # Configure your model name + # Configure your model names - MODEL_NAME=llama-3.2-11b-vision-instruct-q4_k_m.gguf + - CLIP_MODEL_NAME=mmproj-model-f16.gguf - MODEL_PATH=/models - PORT=5000 restart: unless-stopped From 556c1b90fa29426fc2c7b42d597b337c5bf993dc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:37:11 +0000 Subject: [PATCH 4/7] Fix security issues, resource handling, and dependency versions Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/README.md | 1 + llama-vision/app/webhook.py | 41 ++++++++++++++++++++++------------- llama-vision/requirements.txt | 12 +++++----- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git 
a/llama-vision/README.md b/llama-vision/README.md index d357f9b..07d7223 100644 --- a/llama-vision/README.md +++ b/llama-vision/README.md @@ -68,6 +68,7 @@ docker run -d \ - `CLIP_MODEL_NAME`: Name of the CLIP projector GGUF file (default: `mmproj-model-f16.gguf`) - `MODEL_PATH`: Directory containing the models (default: `/models`) - `PORT`: Port for the Flask server (default: `5000`) +- `N_THREADS`: Number of CPU threads for inference (default: half of available CPUs, or set to 0 for auto) ## API Documentation diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 3979393..2cf151b 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -59,10 +59,20 @@ def load_model(): try: # Initialize with vision support # For Llama 3.2 Vision models, we need both the main model and CLIP projector + + # Configure CPU threads conservatively to avoid resource contention + # Use environment variable if set, otherwise use half of available CPUs + n_threads = int(os.environ.get('N_THREADS', 0)) + if n_threads <= 0: + cpu_count = os.cpu_count() or 4 + n_threads = max(1, cpu_count // 2) + + logger.info(f"Using {n_threads} CPU threads for inference") + model_kwargs = { "model_path": str(full_model_path), "n_ctx": 2048, # Context window - "n_threads": os.cpu_count(), # Use all available CPU threads + "n_threads": n_threads, "n_gpu_layers": 0, # CPU only, no GPU layers "verbose": False, } @@ -216,10 +226,18 @@ def infer(): temperature = data.get('temperature', 0.7) top_p = data.get('top_p', 0.95) - logger.info(f"Processing inference request with prompt: {prompt_text[:50]}...") + logger.info(f"Processing inference request (prompt length: {len(prompt_text)} chars)") # Process image - image_path = process_image(image_data) + image_path = None + try: + image_path = process_image(image_data) + except ValueError as e: + error = ErrorResponse( + error=str(e), + error_type="validation" + ) + return jsonify(error.model_dump()), 400 try: # Create vision prompt @@ -262,18 +280,11 @@ def infer(): finally: # Clean up temporary image file - try: - os.unlink(image_path) - except Exception as e: - logger.warning(f"Failed to delete temporary image: {e}") - - except ValueError as e: - logger.error(f"Validation error: {e}") - error = ErrorResponse( - error=str(e), - error_type="validation" - ) - return jsonify(error.model_dump()), 400 + if image_path: + try: + os.unlink(image_path) + except Exception as e: + logger.warning(f"Failed to delete temporary image: {e}") except Exception as e: logger.error(f"Inference error: {e}", exc_info=True) diff --git a/llama-vision/requirements.txt b/llama-vision/requirements.txt index f9e4749..e11b555 100644 --- a/llama-vision/requirements.txt +++ b/llama-vision/requirements.txt @@ -1,6 +1,6 @@ -flask==3.0.0 -werkzeug==3.0.1 -llama-cpp-python==0.2.90 -pillow==10.1.0 -requests==2.31.0 -pydantic==2.5.0 +flask>=3.0.3,<4.0 +werkzeug>=3.0.3,<4.0 +llama-cpp-python>=0.2.90,<0.3.0 +pillow>=10.4.0,<11.0 +requests>=2.32.0,<3.0 +pydantic>=2.9.0,<3.0 From dbe653c5c6e5bc12236042b98763604bd8236c3c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:38:56 +0000 Subject: [PATCH 5/7] Remove unused import and update docker-compose format Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/app/webhook.py | 1 - llama-vision/compose.yml | 2 -- 2 files changed, 3 deletions(-) diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 
2cf151b..5ae20c3 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -12,7 +12,6 @@ from typing import Optional, List, Dict, Any from flask import Flask, request, jsonify -from werkzeug.exceptions import BadRequest from PIL import Image from llama_cpp import Llama diff --git a/llama-vision/compose.yml b/llama-vision/compose.yml index ec08d8b..6045f2c 100644 --- a/llama-vision/compose.yml +++ b/llama-vision/compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: llama-vision: build: From b49c53226d4576e9993c3a2fc690c14b89843f60 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:50:42 +0000 Subject: [PATCH 6/7] Refactor: Create proper JSON schema file and rename Python models - Add response_schema.json with formal JSON schema definitions - Rename schema.py to models.py to clarify it contains internal validation logic - Update README to reference the JSON schema file - Update documentation with JSON examples instead of Python type hints Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/README.md | 42 ++++---- llama-vision/app/{schema.py => models.py} | 0 llama-vision/app/response_schema.json | 118 ++++++++++++++++++++++ llama-vision/app/webhook.py | 2 +- 4 files changed, 143 insertions(+), 19 deletions(-) rename llama-vision/app/{schema.py => models.py} (100%) create mode 100644 llama-vision/app/response_schema.json diff --git a/llama-vision/README.md b/llama-vision/README.md index 07d7223..1a89f03 100644 --- a/llama-vision/README.md +++ b/llama-vision/README.md @@ -142,33 +142,38 @@ docker run -d \ ## Response Schema +The formal JSON schema definition is available in `app/response_schema.json`. Below are examples of the response formats: + ### VisionResponse -```python +```json { - "success": bool, # Request success status - "response_text": str, # Generated text response - "model": str, # Model used for inference - "timestamp": str, # ISO timestamp - "token_usage": { # Token usage statistics - "prompt_tokens": int, - "completion_tokens": int, - "total_tokens": int + "success": true, + "response_text": "The image shows...", + "model": "llama-3.2-11b-vision-instruct-q4_k_m.gguf", + "timestamp": "2024-01-13T12:00:00.000000", + "token_usage": { + "prompt_tokens": 150, + "completion_tokens": 200, + "total_tokens": 350 }, - "metadata": dict, # Additional metadata - "error": str | null # Error message if any + "metadata": { + "max_tokens": 256, + "temperature": 0.7 + }, + "error": null } ``` ### ErrorResponse -```python +```json { - "success": false, # Always false for errors - "error": str, # Error message - "error_type": str, # Error type (validation, model, system) - "timestamp": str, # ISO timestamp - "details": dict | null # Additional error details + "success": false, + "error": "Missing required field: prompt", + "error_type": "validation", + "timestamp": "2024-01-13T12:00:00.000000", + "details": null } ``` @@ -262,7 +267,8 @@ llama-vision/ ā”œā”€ā”€ README.md # This file └── app/ ā”œā”€ā”€ webhook.py # Flask application - └── schema.py # Pydantic response schemas + ā”œā”€ā”€ models.py # Pydantic validation models + └── response_schema.json # JSON schema for API responses ``` ### Building and Testing Locally diff --git a/llama-vision/app/schema.py b/llama-vision/app/models.py similarity index 100% rename from llama-vision/app/schema.py rename to llama-vision/app/models.py diff --git a/llama-vision/app/response_schema.json 
b/llama-vision/app/response_schema.json new file mode 100644 index 0000000..c63a0eb --- /dev/null +++ b/llama-vision/app/response_schema.json @@ -0,0 +1,118 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Llama Vision API Response Schemas", + "definitions": { + "TokenUsage": { + "type": "object", + "description": "Token usage statistics", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt" + }, + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the completion" + }, + "total_tokens": { + "type": "integer", + "description": "Total number of tokens used" + } + }, + "required": ["prompt_tokens", "completion_tokens", "total_tokens"] + }, + "VisionResponse": { + "type": "object", + "description": "Standard response format for vision inference requests", + "properties": { + "success": { + "type": "boolean", + "description": "Whether the request was successful" + }, + "response_text": { + "type": "string", + "description": "The generated text response from the model" + }, + "model": { + "type": "string", + "description": "The model used for inference" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of the response" + }, + "token_usage": { + "$ref": "#/definitions/TokenUsage", + "description": "Token usage statistics" + }, + "metadata": { + "type": "object", + "description": "Additional metadata", + "additionalProperties": true + }, + "error": { + "type": ["string", "null"], + "description": "Error message if success is False" + } + }, + "required": ["success", "response_text", "model", "timestamp"] + }, + "ErrorResponse": { + "type": "object", + "description": "Error response format", + "properties": { + "success": { + "type": "boolean", + "const": false, + "description": "Always False for error responses" + }, + "error": { + "type": "string", + "description": "Error message describing what went wrong" + }, + "error_type": { + "type": "string", + "enum": ["validation", "model", "system"], + "description": "Type of error" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of the error" + }, + "details": { + "type": ["object", "null"], + "description": "Additional error details", + "additionalProperties": true + } + }, + "required": ["success", "error", "error_type", "timestamp"] + }, + "HealthResponse": { + "type": "object", + "description": "Health check response format", + "properties": { + "status": { + "type": "string", + "enum": ["healthy", "unhealthy", "degraded"], + "description": "Health status" + }, + "model_loaded": { + "type": "boolean", + "description": "Whether the model is loaded and ready" + }, + "model_name": { + "type": "string", + "description": "Name of the loaded model" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of health check" + } + }, + "required": ["status", "model_loaded", "model_name", "timestamp"] + } + } +} diff --git a/llama-vision/app/webhook.py b/llama-vision/app/webhook.py index 5ae20c3..c1fa9cd 100644 --- a/llama-vision/app/webhook.py +++ b/llama-vision/app/webhook.py @@ -15,7 +15,7 @@ from PIL import Image from llama_cpp import Llama -from schema import VisionResponse, ErrorResponse, HealthResponse, TokenUsage +from models import VisionResponse, ErrorResponse, HealthResponse, TokenUsage # Configure logging From e6b351d9b6169c4cf2252eadc6b41b6f0a14d763 Mon Sep 17 00:00:00 2001 From: 
"copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:52:23 +0000 Subject: [PATCH 7/7] Add enum validation to models for consistency with JSON schema - Use Literal types for error_type and status fields - Ensures runtime validation matches JSON schema constraints - Update docstring to reference response_schema.json Co-authored-by: wpowiertowski <671688+wpowiertowski@users.noreply.github.com> --- llama-vision/app/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llama-vision/app/models.py b/llama-vision/app/models.py index 4ea3eb3..37a04b1 100644 --- a/llama-vision/app/models.py +++ b/llama-vision/app/models.py @@ -1,8 +1,9 @@ """ -Response schema definitions for the Llama Vision webhook. +Response models for runtime validation in the Llama Vision webhook. +For the formal JSON schema definition, see response_schema.json. """ -from typing import Optional, List, Dict, Any +from typing import Optional, Dict, Any, Literal from pydantic import BaseModel, Field from datetime import datetime @@ -33,7 +34,9 @@ class ErrorResponse(BaseModel): success: bool = Field(default=False, description="Always False for error responses") error: str = Field(description="Error message describing what went wrong") - error_type: str = Field(description="Type of error (e.g., validation, model, system)") + error_type: Literal["validation", "model", "system"] = Field( + description="Type of error (validation, model, or system)" + ) timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(), description="ISO timestamp of the error") details: Optional[Dict[str, Any]] = Field(None, description="Additional error details") @@ -42,7 +45,9 @@ class ErrorResponse(BaseModel): class HealthResponse(BaseModel): """Health check response format.""" - status: str = Field(description="Health status (healthy, unhealthy, degraded)") + status: Literal["healthy", "unhealthy", "degraded"] = Field( + description="Health status (healthy, unhealthy, or degraded)" + ) model_loaded: bool = Field(description="Whether the model is loaded and ready") model_name: str = Field(description="Name of the loaded model") timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat(),