Commit c32b6fd: Derrick/bt 12247 add llama 3.2 11b (#358)

Added llama3.2 vision instruct with vllm

1 parent: b283c55. Showing 5 changed files with 404 additions and 0 deletions.
@@ -0,0 +1,71 @@
# Llama 3.2 11B Vision Instruct vLLM Truss

This is a [Truss](https://truss.baseten.co/) for Llama 3.2 11B Vision Instruct with vLLM. Llama 3.2 11B Vision Instruct is a multimodal (text + vision) LLM. This README will walk you through how to deploy this Truss on Baseten to get your own instance of it.

## Deployment

First, clone this repository:

```sh
git clone https://github.com/basetenlabs/truss-examples/
cd llama/llama-3_2-11b-vision-instruct
```
Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`
3. Apply for access to the Llama 3.2 11B Vision Instruct model on Hugging Face [here](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct).
4. Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens).
5. Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`. Note that you will *not* be able to successfully deploy this model without doing this.
With `llama-3_2-11b-vision-instruct` as your working directory, you can deploy the model with:

```sh
truss push --publish --trusted
```

Paste your Baseten API key if prompted.

For more information, see [Truss documentation](https://truss.baseten.co).
### Notes

Due to a limitation in vLLM, at most one image can be passed as input; passing more than one will cause a memory error. You can track the issue [here](https://github.com/vllm-project/vllm/issues/8826).
## Example usage

```sh
truss predict -d '{"messages": [{"role": "user", "content": "Tell me about yourself"}]}'
```
Here's another example of invoking your model via its REST API, this time with image input:

```sh
curl -X POST "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict" \
     -H "Content-Type: application/json" \
     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
     -d '{
           "messages": [
             {
               "role": "user",
               "content": [
                 {
                   "type": "text",
                   "text": "What type of animal is this? Answer in French only"
                 },
                 {
                   "type": "image_url",
                   "image_url": {
                     "url": "https://vetmed.illinois.edu/wp-content/uploads/2021/04/pc-keller-hedgehog.jpg"
                   }
                 }
               ]
             }
           ],
           "stream": true,
           "max_tokens": 64,
           "temperature": 0.2
         }'
```
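
The same request can be sent from Python. Below is a minimal sketch using the `requests` library that mirrors the curl call above; `YOUR_MODEL_VERSION_ID` and `YOUR_API_KEY` are placeholders, and the exact framing of streamed chunks may differ depending on your deployment.

```python
# Minimal Python equivalent of the curl example above, using `requests`.
# YOUR_MODEL_VERSION_ID and YOUR_API_KEY are placeholders to replace.
import requests

resp = requests.post(
    "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What type of animal is this? Answer in French only",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://vetmed.illinois.edu/wp-content/uploads/2021/04/pc-keller-hedgehog.jpg"
                        },
                    },
                ],
            }
        ],
        "stream": True,
        "max_tokens": 64,
        "temperature": 0.2,
    },
    stream=True,  # keep the connection open so tokens can be read as they arrive
)

# Print streamed chunks as they arrive; exact chunk framing depends on the deployment.
for chunk in resp.iter_content(chunk_size=None):
    print(chunk.decode("utf-8"), end="", flush=True)
```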
@@ -0,0 +1,45 @@
model_name: "Llama 3.2 11B Vision Instruct VLLM openai compatible"
python_version: py311
model_metadata:
  example_model_input: {
    messages: [
      {
        role: "user",
        content: [
          {
            type: "text",
            text: "Describe this image in one sentence."
          },
          {
            type: "image_url",
            image_url: {
              url: "https://picsum.photos/id/237/200/300"
            }
          }
        ]
      }
    ],
    stream: true,
    max_tokens: 512,
    temperature: 0.5
  }
  repo_id: meta-llama/Llama-3.2-11B-Vision-Instruct
  openai_compatible: true
  vllm_config:
    tensor_parallel_size: 1
    enforce_eager: true
    max_num_seqs: 16
    limit_mm_per_prompt: {image: 1}
  tags:
    - text-generation
    - multimodal
requirements:
  - vllm==0.6.2
  - uvloop>=0.18.0
resources:
  accelerator: A100
  use_gpu: true
runtime:
  predict_concurrency: 128
secrets:
  hf_access_token: null
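
For context, the keys under `vllm_config` above are the settings this Truss passes through to vLLM. The snippet below is not part of the commit; it is a rough sketch, assuming vLLM 0.6.x's Python API, of how the same settings would look as engine arguments if you constructed the engine directly.

```python
# Illustrative sketch only (not code from this commit): roughly how the
# vllm_config values above map onto vLLM engine arguments, assuming the
# vLLM 0.6.x Python API. Running this requires a GPU and model access.
from vllm import AsyncEngineArgs, AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",  # repo_id from the config
    tensor_parallel_size=1,
    enforce_eager=True,                 # skip CUDA graph capture, as in the config
    max_num_seqs=16,                    # cap concurrently scheduled sequences
    limit_mm_per_prompt={"image": 1},   # matches the one-image limit noted in the README
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```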
Empty file.
@@ -0,0 +1,57 @@
import asyncio
import logging
import os
import threading

import httpx

logger = logging.getLogger(__name__)

DEFAULT_HEALTH_CHECK_INTERVAL = 5  # seconds


async def monitor_vllm_server_health(vllm_server_url, health_check_interval):
    """Poll the vLLM server's /health endpoint and exit the process if it becomes unhealthy."""
    assert vllm_server_url is not None, "vllm_server_url must not be None"
    try:
        async with httpx.AsyncClient() as client:
            while True:
                response = await client.get(f"{vllm_server_url}/health")
                if response.status_code != 200:
                    raise RuntimeError("vLLM is unhealthy")
                await asyncio.sleep(health_check_interval)
    except Exception as e:
        logger.error(
            f"vLLM has gone into an unhealthy state due to error: {e}, restarting service now..."
        )
        os._exit(1)


async def monitor_vllm_engine_health(vllm_engine, health_check_interval):
    """Run the vLLM engine's own health check on an interval and exit the process if it fails."""
    assert vllm_engine is not None, "vllm_engine must not be None"
    try:
        while True:
            await vllm_engine.check_health()
            await asyncio.sleep(health_check_interval)
    except Exception as e:
        logger.error(
            f"vLLM has gone into an unhealthy state due to error: {e}, restarting service now..."
        )
        os._exit(1)


def run_background_vllm_health_check(
    use_openai_compatible_server=False,
    health_check_interval=DEFAULT_HEALTH_CHECK_INTERVAL,
    vllm_engine=None,
    vllm_server_url=None,
):
    """Start the appropriate health-check coroutine on a dedicated event loop in a daemon thread."""
    logger.info("Starting background health check loop")
    loop = asyncio.new_event_loop()
    if use_openai_compatible_server:
        loop.create_task(
            monitor_vllm_server_health(vllm_server_url, health_check_interval)
        )
    else:
        loop.create_task(monitor_vllm_engine_health(vllm_engine, health_check_interval))
    thread = threading.Thread(target=loop.run_forever, daemon=True)
    thread.start()
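
To show how this helper is meant to be wired up, here is a hypothetical sketch of a Truss `model.py` calling it from `load()`. The local server URL and the surrounding class are assumptions for illustration, not code from this commit.

```python
# Hypothetical usage sketch (not part of this commit): start the background
# health check once a local vLLM OpenAI-compatible server is running.
# Assumes run_background_vllm_health_check from the module above is importable.

VLLM_SERVER_URL = "http://localhost:8000"  # assumed address of the local vLLM server


class Model:
    def load(self):
        # ... start the vLLM OpenAI-compatible server here, then:
        run_background_vllm_health_check(
            use_openai_compatible_server=True,
            vllm_server_url=VLLM_SERVER_URL,
        )
```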