[Model][VLM] Support multi-images inputs for InternVL2 models #8201

Merged (10 commits) on Sep 7, 2024

Changes from 1 commit
refactor and add internvl examples
Isotr0py committed Sep 6, 2024
commit a0de29b22b4bb1b02a0bf0f12793551ceaab70cc
95 changes: 77 additions & 18 deletions examples/offline_inference_vision_language_multi_image.py
@@ -6,7 +6,9 @@
 from argparse import Namespace
 from typing import List
 
-from vllm import LLM
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser

@@ -17,36 +19,85 @@
 ]
 
 
-def _load_phi3v(image_urls: List[str]):
-    return LLM(
+def load_phi3v(question, image_urls: List[str]):
+    llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
-
-
-def run_phi3v_generate(question: str, image_urls: List[str]):
-    llm = _load_phi3v(image_urls)
-
     placeholders = "\n".join(f"<|image_{i}|>"
                              for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {
-            "image": [fetch_image(url) for url in image_urls]
+
+def load_internvl(question, image_urls: List[str]):
+    # model_name = "OpenGVLab/InternVL2-2B"
+    model_name = "/data/LLM-model/InternVL2-2B"
+
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_num_seqs=5,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for InternVL
+    # models variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {
+    "phi3_v": load_phi3v,
+    "internvl_chat": load_internvl,
+}
+
+
+def run_generate(model, question: str, image_urls: List[str]):
+    llm, prompt, stop_token_ids = model_example_map[model](question,
+                                                           image_urls)
+
+    sampling_params = SamplingParams(temperature=0.0,
+                                     max_tokens=128,
+                                     stop_token_ids=stop_token_ids)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": [fetch_image(url) for url in image_urls]
+            },
         },
-    })
+        sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
 
 
-def run_phi3v_chat(question: str, image_urls: List[str]):
-    llm = _load_phi3v(image_urls)
+def run_chat(model: str, question: str, image_urls: List[str]):
+    llm, _, stop_token_ids = model_example_map[model](question, image_urls)
 
+    sampling_params = SamplingParams(temperature=0.0,
+                                     max_tokens=128,
+                                     stop_token_ids=stop_token_ids)
+
     outputs = llm.chat([{
         "role":
@@ -63,20 +114,22 @@ def run_phi3v_chat(question: str, image_urls: List[str]):
                 },
             } for image_url in image_urls),
         ],
-    }])
+    }],
+                       sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
 
 
 def main(args: Namespace):
+    model = args.model_type
     method = args.method
 
     if method == "generate":
-        run_phi3v_generate(QUESTION, IMAGE_URLS)
+        run_generate(model, QUESTION, IMAGE_URLS)
     elif method == "chat":
-        run_phi3v_chat(QUESTION, IMAGE_URLS)
+        run_chat(model, QUESTION, IMAGE_URLS)
     else:
         raise ValueError(f"Invalid method: {method}")
 
@@ -85,6 +138,12 @@ def main(args: Namespace):
     parser = FlexibleArgumentParser(
         description='Demo on using vLLM for offline inference with '
         'vision language models that support multi-image input')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="phi3_v",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
     parser.add_argument("--method",
                         type=str,
                         default="generate",
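
For context, here is a minimal sketch (not part of this diff) of how the new --model-type flag can be exercised, assuming the example keeps its usual if __name__ == "__main__" entry point and is run from the vLLM repository root:

# Hypothetical invocation sketch: run the example as __main__ with the
# new --model-type flag. The script path and flag names mirror the diff above.
import runpy
import sys

sys.argv = [
    "offline_inference_vision_language_multi_image.py",
    "--model-type", "internvl_chat",
    "--method", "generate",
]
runpy.run_path(
    "examples/offline_inference_vision_language_multi_image.py",
    run_name="__main__")

Note that at this commit load_internvl still points at the local path "/data/LLM-model/InternVL2-2B"; the commented-out hub id OpenGVLab/InternVL2-2B would need to be restored to run it on another machine.
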
7 changes: 6 additions & 1 deletion vllm/model_executor/models/internvl.py
@@ -129,7 +129,12 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int,

     # calculate the number of blocks without thumbnail
     blocks, target_width, target_height = calculate_num_blocks(
-        orig_width, orig_height, min_num, max_num, image_size, use_thumbnail=False)
+        orig_width,
+        orig_height,
+        min_num,
+        max_num,
+        image_size,
+        use_thumbnail=False)
     # resize the image
     resized_img = image.resize((target_width, target_height))
     processed_images = []
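
As a reading aid, a minimal sketch (illustrative only, not the vLLM implementation) of the tiling step that follows the resize in dynamic_preprocess: the resized image is cut into image_size x image_size blocks, which is why calculate_num_blocks returns the block count together with the target width and height; when use_thumbnail is set, a thumbnail of the full image is handled separately.

# Illustrative helper, assuming PIL semantics: split a resized image into
# image_size-sized tiles. The real dynamic_preprocess in
# vllm/model_executor/models/internvl.py uses its own loop over the grid.
from PIL import Image


def split_into_tiles(resized_img: Image.Image, image_size: int) -> list:
    width, height = resized_img.size
    tiles = []
    for top in range(0, height, image_size):
        for left in range(0, width, image_size):
            box = (left, top, left + image_size, top + image_size)
            tiles.append(resized_img.crop(box))
    return tiles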