Derrick/bt 12247 add llama 3.2 11b (#358)
Added llama3.2 vision instruct with vllm
derrick-yyang authored Sep 30, 2024
1 parent b283c55 commit c32b6fd
Showing 5 changed files with 404 additions and 0 deletions.
71 changes: 71 additions & 0 deletions llama/llama-3_2-11b-vision-instruct/README.md
@@ -0,0 +1,71 @@
# Llama 3.2 11B Vision Instruct vLLM Truss

This is a [Truss](https://truss.baseten.co/) for Llama 3.2 11B Vision Instruct with vLLM. Llama 3.2 11B Vision Instruct is a multimodal (text + vision) LLM. This README will walk you through how to deploy this Truss on Baseten to get your own instance of it.


## Deployment

First, clone this repository:

```sh
git clone https://github.com/basetenlabs/truss-examples/
cd truss-examples/llama/llama-3_2-11b-vision-instruct
```

Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`
3. Apply for access to the Llama 3.2 11B Vision Instruct model on Hugging Face [here](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct).
4. Retrieve your Hugging Face access token from your [settings](https://huggingface.co/settings/tokens).
5. Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Note that you will *not* be able to deploy this model without doing this (an optional access check is sketched below).
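
Optionally, you can verify that your token actually has access to the gated repository before deploying. This is a minimal local sketch, not part of this Truss, and assumes the `huggingface_hub` package is installed:

```python
# Optional local sanity check (not part of this Truss): confirm your token can
# see the gated repo before you deploy.
from huggingface_hub import model_info

info = model_info(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    token="hf_...",  # your Hugging Face access token
)
print(info.id)  # prints the repo id if access has been granted
```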

With `llama-3_2-11b-vision-instruct` as your working directory, you can deploy the model with:

```sh
truss push --publish --trusted
```

Paste your Baseten API key if prompted.

For more information, see [Truss documentation](https://truss.baseten.co).

### Notes

Due to a limitation in vLLM, at most one image can be passed as input per request; passing more results in a memory error. You can track the upstream issue [here](https://github.com/vllm-project/vllm/issues/8826).
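
If you build requests programmatically, a simple client-side guard can catch multi-image payloads before they reach the model. The sketch below is illustrative and not part of this Truss; the payload shape mirrors the examples in this README:

```python
# Illustrative client-side guard (not part of this Truss): reject payloads
# with more than one image, since this deployment supports a single image.
def count_images(messages):
    return sum(
        1
        for message in messages
        for part in (message.get("content") or [])
        if isinstance(part, dict) and part.get("type") == "image_url"
    )


messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What type of animal is this?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://vetmed.illinois.edu/wp-content/uploads/2021/04/pc-keller-hedgehog.jpg"
                },
            },
        ],
    }
]

if count_images(messages) > 1:
    raise ValueError("This deployment accepts at most one image per request.")
```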

## Example usage

```sh
truss predict -d '{"messages": [{"role": "user", "content": "Tell me about yourself"}]}'
```

Here's an example of invoking the model via its REST API endpoint, this time with an image input:

```sh
curl -X POST "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict" \
     -H "Content-Type: application/json" \
     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
     -d '{
       "messages": [
         {
           "role": "user",
           "content": [
             {
               "type": "text",
               "text": "What type of animal is this? Answer in French only"
             },
             {
               "type": "image_url",
               "image_url": {
                 "url": "https://vetmed.illinois.edu/wp-content/uploads/2021/04/pc-keller-hedgehog.jpg"
               }
             }
           ]
         }
       ],
       "stream": true,
       "max_tokens": 64,
       "temperature": 0.2
     }'
```
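
The same request can also be sent from Python. The sketch below is illustrative rather than part of this Truss: it assumes the `requests` library and reuses the endpoint and payload from the `curl` example above, printing streamed chunks as they arrive:

```python
import requests

resp = requests.post(
    "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What type of animal is this? Answer in French only",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://vetmed.illinois.edu/wp-content/uploads/2021/04/pc-keller-hedgehog.jpg"
                        },
                    },
                ],
            }
        ],
        "stream": True,
        "max_tokens": 64,
        "temperature": 0.2,
    },
    stream=True,  # keep the HTTP connection open so chunks arrive incrementally
)
resp.raise_for_status()
for chunk in resp.iter_content(chunk_size=None):
    print(chunk.decode("utf-8"), end="", flush=True)
```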
45 changes: 45 additions & 0 deletions llama/llama-3_2-11b-vision-instruct/config.yaml
@@ -0,0 +1,45 @@
model_name: "Llama 3.2 11B Vision Instruct VLLM openai compatible"
python_version: py311
model_metadata:
  example_model_input: {
    messages: [
      {
        role: "user",
        content: [
          {
            type: "text",
            text: "Describe this image in one sentence."
          },
          {
            type: "image_url",
            image_url: {
              url: "https://picsum.photos/id/237/200/300"
            }
          }
        ]
      }
    ],
    stream: true,
    max_tokens: 512,
    temperature: 0.5
  }
  repo_id: meta-llama/Llama-3.2-11B-Vision-Instruct
  openai_compatible: true
  vllm_config:
    tensor_parallel_size: 1
    enforce_eager: true
    max_num_seqs: 16
    limit_mm_per_prompt: {image: 1}
  tags:
    - text-generation
    - multimodal
requirements:
  - vllm==0.6.2
  - uvloop>=0.18.0
resources:
  accelerator: A100
  use_gpu: true
runtime:
  predict_concurrency: 128
secrets:
  hf_access_token: null
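
For context, here is a rough sketch of how a `vllm_config` block like the one above could be unpacked into vLLM 0.6.2 engine arguments. The wiring below is an assumption for illustration; the `model.py` that actually consumes this config is not shown in this diff.

```python
# Illustrative only: mapping the vllm_config keys above onto vLLM 0.6.2
# engine arguments. The actual model.py in this commit is not shown here.
from vllm import AsyncEngineArgs, AsyncLLMEngine

vllm_config = {
    "tensor_parallel_size": 1,
    "enforce_eager": True,
    "max_num_seqs": 16,
    "limit_mm_per_prompt": {"image": 1},
}

engine_args = AsyncEngineArgs(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    **vllm_config,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```
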
Empty file.
57 changes: 57 additions & 0 deletions llama/llama-3_2-11b-vision-instruct/model/helper.py
@@ -0,0 +1,57 @@
import asyncio
import logging
import os
import threading

import httpx

logger = logging.getLogger(__name__)

DEFAULT_HEALTH_CHECK_INTERVAL = 5  # seconds


async def monitor_vllm_server_health(vllm_server_url, health_check_interval):
    """Poll the vLLM server's /health endpoint until it reports unhealthy."""
    assert vllm_server_url is not None, "vllm_server_url must not be None"
    try:
        async with httpx.AsyncClient() as client:
            while True:
                response = await client.get(f"{vllm_server_url}/health")
                if response.status_code != 200:
                    raise RuntimeError("vLLM is unhealthy")
                await asyncio.sleep(health_check_interval)
    except Exception as e:
        logger.error(
            f"vLLM has gone into an unhealthy state due to error: {e}, restarting service now..."
        )
        # Exit the whole process so the platform restarts the container.
        os._exit(1)


async def monitor_vllm_engine_health(vllm_engine, health_check_interval):
    """Poll the in-process vLLM engine's health check until it fails."""
    assert vllm_engine is not None, "vllm_engine must not be None"
    try:
        while True:
            await vllm_engine.check_health()
            await asyncio.sleep(health_check_interval)
    except Exception as e:
        logger.error(
            f"vLLM has gone into an unhealthy state due to error: {e}, restarting service now..."
        )
        os._exit(1)


def run_background_vllm_health_check(
    use_openai_compatible_server=False,
    health_check_interval=DEFAULT_HEALTH_CHECK_INTERVAL,
    vllm_engine=None,
    vllm_server_url=None,
):
    """Start the appropriate health monitor on a daemon thread with its own event loop."""
    logger.info("Starting background health check loop")
    loop = asyncio.new_event_loop()
    if use_openai_compatible_server:
        # OpenAI-compatible mode: vLLM runs as a separate HTTP server.
        loop.create_task(
            monitor_vllm_server_health(vllm_server_url, health_check_interval)
        )
    else:
        # Engine mode: vLLM runs in-process via its async engine.
        loop.create_task(monitor_vllm_engine_health(vllm_engine, health_check_interval))
    thread = threading.Thread(target=loop.run_forever, daemon=True)
    thread.start()
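
For reference, a hypothetical call site for this helper, assuming the vLLM OpenAI-compatible server has already been started and listens on localhost port 8000 (the real call site lives in `model.py`, which is not shown in this diff):

```python
# Hypothetical usage of the helper above; the import path and server URL are
# assumptions for illustration.
from model.helper import run_background_vllm_health_check

run_background_vllm_health_check(
    use_openai_compatible_server=True,
    vllm_server_url="http://localhost:8000",
)
```
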
(Diff for the remaining changed file did not load.)
