[Model][VLM] Support multi-images inputs for InternVL2 models #8201

Merged (10 commits) on Sep 7, 2024

Changes from 1 commit
refactor and add internvl examples
Isotr0py committed Sep 6, 2024
commit a0de29b22b4bb1b02a0bf0f12793551ceaab70cc
95 changes: 77 additions & 18 deletions examples/offline_inference_vision_language_multi_image.py
@@ -6,7 +6,9 @@
 from argparse import Namespace
 from typing import List
 
-from vllm import LLM
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser

@@ -17,36 +19,85 @@
 ]
 
 
-def _load_phi3v(image_urls: List[str]):
-    return LLM(
+def load_phi3v(question, image_urls: List[str]):
+    llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
-
-
-def run_phi3v_generate(question: str, image_urls: List[str]):
-    llm = _load_phi3v(image_urls)
-
     placeholders = "\n".join(f"<|image_{i}|>"
                              for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {
-            "image": [fetch_image(url) for url in image_urls]
+
+def load_internvl(question, image_urls: List[str]):
+    # model_name = "OpenGVLab/InternVL2-2B"
+    model_name = "/data/LLM-model/InternVL2-2B"
+
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_num_seqs=5,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for InternVL
+    # models variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {
+    "phi3_v": load_phi3v,
+    "internvl_chat": load_internvl,
+}
+
+
+def run_generate(model, question: str, image_urls: List[str]):
+    llm, prompt, stop_token_ids = model_example_map[model](question,
+                                                           image_urls)
+
+    sampling_params = SamplingParams(temperature=0.0,
+                                     max_tokens=128,
+                                     stop_token_ids=stop_token_ids)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": [fetch_image(url) for url in image_urls]
+            },
         },
-    })
+        sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
 
 
-def run_phi3v_chat(question: str, image_urls: List[str]):
-    llm = _load_phi3v(image_urls)
+def run_chat(model: str, question: str, image_urls: List[str]):
+    llm, _, stop_token_ids = model_example_map[model](question, image_urls)
 
+    sampling_params = SamplingParams(temperature=0.0,
+                                     max_tokens=128,
+                                     stop_token_ids=stop_token_ids)
+
     outputs = llm.chat([{
         "role":
@@ -63,20 +114,22 @@ def run_phi3v_chat(question: str, image_urls: List[str]):
                 },
             } for image_url in image_urls),
         ],
-    }])
+    }],
+                       sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
 
 
 def main(args: Namespace):
+    model = args.model_type
     method = args.method
 
     if method == "generate":
-        run_phi3v_generate(QUESTION, IMAGE_URLS)
+        run_generate(model, QUESTION, IMAGE_URLS)
     elif method == "chat":
-        run_phi3v_chat(QUESTION, IMAGE_URLS)
+        run_chat(model, QUESTION, IMAGE_URLS)
     else:
         raise ValueError(f"Invalid method: {method}")
 
@@ -85,6 +138,12 @@ def main(args: Namespace):
     parser = FlexibleArgumentParser(
         description='Demo on using vLLM for offline inference with '
         'vision language models that support multi-image input')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="phi3_v",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
     parser.add_argument("--method",
                         type=str,
                         default="generate",
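
For context, here is a minimal sketch (not part of this diff) of how the new --model-type flag can be exercised, assuming the example keeps its usual if __name__ == "__main__" entry point and is run from the vLLM repository root:

# Hypothetical invocation sketch: run the example as __main__ with the
# new --model-type flag. The script path and flag names mirror the diff above.
import runpy
import sys

sys.argv = [
    "offline_inference_vision_language_multi_image.py",
    "--model-type", "internvl_chat",
    "--method", "generate",
]
runpy.run_path(
    "examples/offline_inference_vision_language_multi_image.py",
    run_name="__main__")

Note that at this commit load_internvl still points at the local path "/data/LLM-model/InternVL2-2B"; the commented-out hub id OpenGVLab/InternVL2-2B would need to be restored to run it on another machine.
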
7 changes: 6 additions & 1 deletion vllm/model_executor/models/internvl.py
@@ -129,7 +129,12 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int,

     # calculate the number of blocks without thumbnail
     blocks, target_width, target_height = calculate_num_blocks(
-        orig_width, orig_height, min_num, max_num, image_size, use_thumbnail=False)
+        orig_width,
+        orig_height,
+        min_num,
+        max_num,
+        image_size,
+        use_thumbnail=False)
     # resize the image
     resized_img = image.resize((target_width, target_height))
     processed_images = []
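
As a reading aid, a minimal sketch (illustrative only, not the vLLM implementation) of the tiling step that follows the resize in dynamic_preprocess: the resized image is cut into image_size x image_size blocks, which is why calculate_num_blocks returns the block count together with the target width and height; when use_thumbnail is set, a thumbnail of the full image is handled separately.

# Illustrative helper, assuming PIL semantics: split a resized image into
# image_size-sized tiles. The real dynamic_preprocess in
# vllm/model_executor/models/internvl.py uses its own loop over the grid.
from PIL import Image


def split_into_tiles(resized_img: Image.Image, image_size: int) -> list:
    width, height = resized_img.size
    tiles = []
    for top in range(0, height, image_size):
        for left in range(0, width, image_size):
            box = (left, top, left + image_size, top + image_size)
            tiles.append(resized_img.crop(box))
    return tiles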