13 changes: 7 additions & 6 deletions docs/models/supported_models.md
@@ -800,12 +800,13 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API.

The following table lists those that are tested in vLLM.

| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | ✅︎ |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
| `SiglipModel` | SigLIP | T / I | `google/siglip-base-patch16-224` | | | ✅︎ |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |

<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
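
As a quick orientation for the table above, here is a minimal offline-embedding sketch. The model name and `runner="pooling"` argument mirror the examples touched in this PR; the exact `LLM` constructor arguments are an assumption and may vary across vLLM versions.

```python
from vllm import LLM

# Minimal sketch: load one of the pooling models listed above and embed text.
# `runner="pooling"` mirrors the EngineArgs used in the offline example below.
llm = LLM(model="google/siglip-base-patch16-224", runner="pooling")

outputs = llm.embed(["a photo of a cat"])
print(len(outputs[0].outputs.embedding))  # dimensionality of the embedding vector
```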
73 changes: 49 additions & 24 deletions examples/offline_inference/vision_language_pooling.py
@@ -110,6 +110,53 @@ def run_e5_v(query: Query) -> ModelRequestData:
)


def run_jinavl_reranker(query: Query) -> ModelRequestData:
if query["modality"] != "text+images":
raise ValueError(f"Unsupported query modality: '{query['modality']}'")

engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
max_model_len=32768,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
engine_args=engine_args,
query=query["text"],
documents=query["image"],
)
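
# The helper below is an illustrative sketch only (an assumption, not part of
# this change): it shows one way the reranker request data above could be
# consumed. The actual example dispatches through the run_score helper further
# down in this file, so treat the call signature here as a rough guide.
def _score_with_jinavl_sketch(req: ModelRequestData) -> None:
    from vllm import LLM  # local import keeps the sketch self-contained

    llm = LLM(**vars(req.engine_args))
    # A reranker scores (query, document) pairs instead of returning embeddings.
    outputs = llm.score(req.query, req.documents)
    for out in outputs:
        print("Relevance score:", out.outputs.score)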


def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")

engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
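
# Another illustrative sketch (an assumption, not part of this change): how the
# SigLIP request data above could be embedded offline. The real example drives
# this through the shared encode path elsewhere in this file.
def _embed_with_siglip_sketch(req: ModelRequestData) -> None:
    from vllm import LLM  # local import keeps the sketch self-contained

    llm = LLM(**vars(req.engine_args))
    prompt = {"prompt": req.prompt}
    if req.image is not None:
        # Attach the image as multi-modal data; text-only queries skip this.
        prompt["multi_modal_data"] = {"image": req.image}
    outputs = llm.embed(prompt)
    print("Embedding dimension:", len(outputs[0].outputs.embedding))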


def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
@@ -211,29 +258,6 @@ def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
)


def run_jinavl_reranker(query: Query) -> ModelRequestData:
if query["modality"] != "text+images":
raise ValueError(f"Unsupported query modality: '{query['modality']}'")

engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
max_model_len=32768,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
engine_args=engine_args,
query=query["text"],
documents=query["image"],
)


def get_query(modality: QueryModality):
if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass")
@@ -328,9 +352,10 @@ def run_score(model: str, modality: QueryModality, seed: int | None):
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
"jinavl_reranker": run_jinavl_reranker,
}


@@ -83,25 +83,29 @@ def run_clip(client: OpenAI, model: str):
print("Text embedding output:", response.data[0].embedding)


def run_vlm2vec(client: OpenAI, model: str):
def run_dse_qwen2_vl(client: OpenAI, model: str):
"""
Start the server using:

vllm serve TIGER-Lab/VLM2Vec-Full \
vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
--runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
"""

response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
{
"type": "image_url",
"image_url": {
"url": image_url,
},
},
{"type": "text", "text": "What is shown in this image?"},
],
}
],
@@ -111,33 +115,67 @@ def run_vlm2vec(client: OpenAI, model: str):

print("Image embedding output:", response.data[0].embedding)

# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "text",
"text": "Represent the given image with the following question: What is in the image.",
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_placeholder}",
},
},
{"type": "text", "text": "Query: What is the weather like today?"},
],
}
],
model=model,
encoding_format="float",
)

print("Image+Text embedding output:", response.data[0].embedding)
print("Text embedding output:", response.data[0].embedding)


def run_siglip(client: OpenAI, model: str):
"""
Start the server using:

vllm serve google/siglip-base-patch16-224 \
--runner pooling
"""

response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "A cat and a dog"},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
],
model=model,
encoding_format="float",
)

print("Image embedding output:", response.data[0].embedding)

response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "a photo of a cat"},
],
}
],
@@ -148,29 +186,25 @@ def run_vlm2vec(client: OpenAI, model: str):
print("Text embedding output:", response.data[0].embedding)


def run_dse_qwen2_vl(client: OpenAI, model: str):
def run_vlm2vec(client: OpenAI, model: str):
"""
Start the server using:

vllm serve MrLight/dse-qwen2-2b-mrl-v1 \
vllm serve TIGER-Lab/VLM2Vec-Full \
--runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
"""

response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url,
},
},
{"type": "text", "text": "What is shown in this image?"},
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "text", "text": "Represent the given image."},
],
}
],
@@ -180,26 +214,33 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):

print("Image embedding output:", response.data[0].embedding)

# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
# of the minimum input size
buffer = io.BytesIO()
image_placeholder = Image.new("RGB", (56, 56))
image_placeholder.save(buffer, "png")
buffer.seek(0)
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_placeholder}",
},
"type": "text",
"text": "Represent the given image with the following question: What is in the image.",
},
{"type": "text", "text": "Query: What is the weather like today?"},
],
}
],
model=model,
encoding_format="float",
)

print("Image+Text embedding output:", response.data[0].embedding)

response = create_chat_embeddings(
client,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "A cat and a dog"},
],
}
],
@@ -212,8 +253,9 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):

model_example_map = {
"clip": run_clip,
"vlm2vec": run_vlm2vec,
"dse_qwen2_vl": run_dse_qwen2_vl,
"siglip": run_siglip,
"vlm2vec": run_vlm2vec,
}
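
# Illustrative sketch (an assumption, not part of this change): point an OpenAI
# client at a locally running `vllm serve` instance and dispatch to one of the
# run_* functions above. It relies on the module-level OpenAI import; host,
# port, and API key are placeholders.
def _run_example_sketch(name: str) -> None:
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
    served_model = client.models.list().data[0].id
    model_example_map[name](client, served_model)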

