From d59f88fdbf73748c9d9e7d1971d514f1d14ea6eb Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Sun, 25 Feb 2024 21:57:50 -0800
Subject: [PATCH 1/5] add image-to-text and visual question answering example

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 README.md                                     |  1 +
 docs/source/index.mdx                         |  1 +
 examples/image-to-text/README.md              | 29 ++++++
 examples/image-to-text/run_pipeline.py        | 87 ++++++++++++++++
 examples/visual-question-answering/README.md  | 30 ++++++
 .../visual-question-answering/run_pipeline.py | 98 +++++++++++++++++++
 6 files changed, 246 insertions(+)
 create mode 100644 examples/image-to-text/README.md
 create mode 100644 examples/image-to-text/run_pipeline.py
 create mode 100644 examples/visual-question-answering/README.md
 create mode 100644 examples/visual-question-answering/run_pipeline.py
diff --git a/README.md b/README.md
index f1cd98c624..1b5b7da7e1 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,7 @@ The following model architectures, tasks and device distributions have been vali
 | CLIP | :heavy_check_mark: | :heavy_check_mark: | <li>[contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)</li> |
 | BridgeTower | :heavy_check_mark: | :heavy_check_mark: | <li>[contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)</li> |
 | ESMFold |   | <div style="text-align:left"><li>Single card</li></div> | <li>[protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)</li> |
+| Blip |   | <div style="text-align:left"><li>Single card</li></div> | <li>[visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)</li><li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 
 </div>
 
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 2f15ece915..8de7a1d1e2 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -62,6 +62,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | CLIP         | ✅       | ✅        | <li>[contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)</li> |
 | BridgeTower  | ✅       | ✅        | <li>[contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)</li> |
 | ESMFold      |          | <div style="text-align:left"><li>Single card</li></div> | <li>[protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)</li> |
+| Blip         |          | <div style="text-align:left"><li>Single card</li></div> | <li>[visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)</li><li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 
 - Diffusers
 
diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
new file mode 100644
index 0000000000..1c4ebb3857
--- /dev/null
+++ b/examples/image-to-text/README.md
@@ -0,0 +1,29 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Image to Text Examples
+
+This directory contains a script that showcases how to use transformers pipeline API to run image to text task on HPUs.
+
+## Single-HPU inference
+
+```bash
+python3 run_pipeline.py \
+    --model_name_or_path Salesforce/blip-image-captioning-large \
+    --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
+    --use_hpu_graphs
+```
+valided models: nlpconnect/vit-gpt2-image-captioning,Salesforce/blip-image-captioning-large,Salesforce/blip-image-captioning-base
diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
new file mode 100644
index 0000000000..ac02ac56dd
--- /dev/null
+++ b/examples/image-to-text/run_pipeline.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import argparse
+import logging
+import time
+
+import PIL.Image
+import requests
+import torch
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def main():  # CLI entry point: runs and times an image-to-text pipeline on HPU.
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default="Salesforce/blip-image-captioning-large",
+        type=str,
+        help="Path to pre-trained model",
+    )
+    parser.add_argument(
+        "--image_path",
+        default="https://ankur3107.github.io/assets/images/image-captioning-example.png",
+        type=str,
+        help="Path to image",
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    args = parser.parse_args()
+
+    adapt_transformers_to_gaudi()
+
+    image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw)
+
+    generator = pipeline(
+        "image-to-text",
+        model=args.model_name_or_path,
+        torch_dtype=torch.bfloat16,
+        device="hpu",
+    )
+    generate_kwargs = {"lazy_mode": True, "hpu_graphs": args.use_hpu_graphs}
+    if args.use_hpu_graphs:
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
+
+    # warm up
+    for i in range(5):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
+            generator(image, generate_kwargs=generate_kwargs)
+
+    start = time.time()
+    with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
+        result = generator(image, generate_kwargs=generate_kwargs)
+    end = time.time()
+    logger.info(f"result = {result}, time = {(end-start) * 1000}ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md
new file mode 100644
index 0000000000..6398ca10fc
--- /dev/null
+++ b/examples/visual-question-answering/README.md
@@ -0,0 +1,30 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Visual Question Answering Examples
+
+This directory contains a script that showcases how to use transformers pipeline API to run visual question answering task on HPUs.
+
+## Single-HPU inference
+
+```bash
+python3 run_pipeline.py \
+    --model_name_or_path Salesforce/blip-vqa-capfilt-large \
+    --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \
+    --question "how many dogs are in the picture?" \
+    --use_hpu_graphs
+```
+valided models: Salesforce/blip-vqa-base,dandelin/vilt-b32-finetuned-vqa,Salesforce/blip-vqa-capfilt-large
diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py
new file mode 100644
index 0000000000..e49493558b
--- /dev/null
+++ b/examples/visual-question-answering/run_pipeline.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import argparse
+import logging
+import time
+
+import PIL.Image
+import requests
+import torch
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def main():  # CLI entry point: runs and times a visual-question-answering pipeline on HPU.
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default="Salesforce/blip-vqa-capfilt-large",
+        type=str,
+        help="Path to pre-trained model",
+    )
+    parser.add_argument(
+        "--image_path",
+        default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",
+        type=str,
+        help="Path to image",
+    )
+    parser.add_argument(
+        "--topk",
+        default=1,
+        type=int,
+        help="topk num",
+    )
+    parser.add_argument(
+        "--question",
+        default="how many dogs are in the picture?",
+        type=str,
+        help="question input",
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    args = parser.parse_args()
+
+    adapt_transformers_to_gaudi()
+
+    image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw).convert("RGB")
+
+    generator = pipeline(
+        "visual-question-answering",
+        model=args.model_name_or_path,
+        torch_dtype=torch.bfloat16,
+        device="hpu",
+    )
+    if not generator.model.can_generate() and args.use_hpu_graphs:
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
+
+    # warm up
+    for i in range(5):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
+            generator(image, args.question, topk=args.topk)
+
+    start = time.time()
+    with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
+        result = generator(image, args.question, topk=args.topk)
+    end = time.time()
+    logger.info(f"result = {result}, time = {(end-start) * 1000}ms")
+
+
+if __name__ == "__main__":
+    main()

From 17f6c993545ee53246707f216f83be28f25c14bc Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Tue, 27 Feb 2024 21:53:55 -0800
Subject: [PATCH 2/5] update example to contain fp32 and batch_size

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/image-to-text/README.md              |  3 +-
 examples/image-to-text/run_pipeline.py        | 52 +++++++++++---
 examples/visual-question-answering/README.md  |  3 +-
 .../visual-question-answering/run_pipeline.py | 72 +++++++++++++++----
 4 files changed, 103 insertions(+), 27 deletions(-)

diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 1c4ebb3857..07bd2cdc9f 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -24,6 +24,7 @@ This directory contains a script that showcases how to use transformers pipeline
 python3 run_pipeline.py \
     --model_name_or_path Salesforce/blip-image-captioning-large \
     --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
-    --use_hpu_graphs
+    --use_hpu_graphs \
+    --bf16
 ```
 valided models: nlpconnect/vit-gpt2-image-captioning,Salesforce/blip-image-captioning-large,Salesforce/blip-image-captioning-base
diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index ac02ac56dd..c1681d7df6 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -38,31 +38,59 @@ def main():
 
     parser.add_argument(
         "--model_name_or_path",
-        default="Salesforce/blip-image-captioning-large",
+        default=None,
         type=str,
         help="Path to pre-trained model",
     )
     parser.add_argument(
         "--image_path",
-        default="https://ankur3107.github.io/assets/images/image-captioning-example.png",
+        default=None,
         type=str,
-        help="Path to image",
+        nargs="*",
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
     )
+
     parser.add_argument(
         "--use_hpu_graphs",
         action="store_true",
         help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
     )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform generation in bf16 precision.",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
     args = parser.parse_args()
 
     adapt_transformers_to_gaudi()
+    image_pathes = args.image_path
+    image_pathes_len = len(image_pathes)
+
+    if args.batch_size > image_pathes_len:
+        # Dynamically extends to support larger batch sizes
+        num_path_to_add = args.batch_size - image_pathes_len
+        for i in range(num_path_to_add):
+            image_pathes.append(image_pathes[i % image_pathes_len])
+    elif args.batch_size < image_pathes_len:
+        image_pathes = image_pathes[: args.batch_size]
+
+    images = []
+
+    for image_path in image_pathes:
+        images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw))
 
-    image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw)
+    if args.bf16:
+        model_dtype = torch.bfloat16
+    else:
+        model_dtype = torch.float32
 
     generator = pipeline(
         "image-to-text",
         model=args.model_name_or_path,
-        torch_dtype=torch.bfloat16,
+        torch_dtype=model_dtype,
         device="hpu",
     )
     generate_kwargs = {"lazy_mode": True, "hpu_graphs": args.use_hpu_graphs}
@@ -71,16 +99,18 @@ def main():
 
         generator.model = wrap_in_hpu_graph(generator.model)
 
+    autocast_enable = model_dtype == torch.bfloat16
     # warm up
-    for i in range(5):
-        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
-            generator(image, generate_kwargs=generate_kwargs)
+    for i in range(args.warmup):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
+            generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
 
     start = time.time()
-    with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
-        result = generator(image, generate_kwargs=generate_kwargs)
+    for i in range(args.n_iterations):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
+            result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
     end = time.time()
-    logger.info(f"result = {result}, time = {(end-start) * 1000}ms")
+    logger.info(f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms")
 
 
 if __name__ == "__main__":
diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md
index 6398ca10fc..767a56bae8 100644
--- a/examples/visual-question-answering/README.md
+++ b/examples/visual-question-answering/README.md
@@ -25,6 +25,7 @@ python3 run_pipeline.py \
     --model_name_or_path Salesforce/blip-vqa-capfilt-large \
     --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \
     --question "how many dogs are in the picture?" \
-    --use_hpu_graphs
+    --use_hpu_graphs \
+    --bf16
 ```
 valided models: Salesforce/blip-vqa-base,dandelin/vilt-b32-finetuned-vqa,Salesforce/blip-vqa-capfilt-large
diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py
index e49493558b..547684dbde 100644
--- a/examples/visual-question-answering/run_pipeline.py
+++ b/examples/visual-question-answering/run_pipeline.py
@@ -38,15 +38,16 @@ def main():
 
     parser.add_argument(
         "--model_name_or_path",
-        default="Salesforce/blip-vqa-capfilt-large",
+        default=None,
         type=str,
         help="Path to pre-trained model",
     )
     parser.add_argument(
         "--image_path",
-        default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",
+        default=None,
         type=str,
-        help="Path to image",
+        nargs="*",
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
     )
     parser.add_argument(
         "--topk",
@@ -56,25 +57,62 @@ def main():
     )
     parser.add_argument(
         "--question",
-        default="how many dogs are in the picture?",
+        default=None,
         type=str,
-        help="question input",
+        nargs="*",
+        help='question as input. Can be a single string (eg: --question "Q1"), or a list of space-separated strings (eg: --question "Q1" "Q2")',
     )
     parser.add_argument(
         "--use_hpu_graphs",
         action="store_true",
         help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
     )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform in bf16 precision.",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
     args = parser.parse_args()
 
     adapt_transformers_to_gaudi()
-
-    image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw).convert("RGB")
+    image_pathes = args.image_path
+    image_pathes_len = len(image_pathes)
+
+    if args.batch_size > image_pathes_len:
+        # Dynamically extends to support larger batch sizes
+        num_path_to_add = args.batch_size - image_pathes_len
+        for i in range(num_path_to_add):
+            image_pathes.append(image_pathes[i % image_pathes_len])
+    elif args.batch_size < image_pathes_len:
+        image_pathes = image_pathes[: args.batch_size]
+
+    questions = args.question
+    questions_len = len(questions)
+    if args.batch_size > questions_len:
+        # Dynamically extends to support larger batch sizes
+        num_question_to_add = args.batch_size - questions_len
+        for i in range(num_question_to_add):
+            questions.append(questions[i % questions_len])
+    elif args.batch_size < questions_len:
+        questions = questions[: args.batch_size]
+
+    images = []
+
+    for image_path in image_pathes:
+        images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw).convert("RGB"))
+
+    if args.bf16:
+        model_dtype = torch.bfloat16
+    else:
+        model_dtype = torch.float32
 
     generator = pipeline(
         "visual-question-answering",
         model=args.model_name_or_path,
-        torch_dtype=torch.bfloat16,
+        torch_dtype=model_dtype,
         device="hpu",
     )
     if not generator.model.can_generate() and args.use_hpu_graphs:
@@ -82,16 +120,22 @@ def main():
 
         generator.model = wrap_in_hpu_graph(generator.model)
 
+    autocast_enable = model_dtype == torch.bfloat16
+    model_input = []
+    for i in range(args.batch_size):
+        model_input.append({"image": images[i], "question": questions[i]})
+
     # warm up
-    for i in range(5):
-        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
-            generator(image, args.question, topk=args.topk)
+    for i in range(args.warmup):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
+            generator(model_input, batch_size=args.batch_size, topk=args.topk)
 
     start = time.time()
-    with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
-        result = generator(image, args.question, topk=args.topk)
+    for i in range(args.n_iterations):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
+            result = generator(model_input, batch_size=args.batch_size, topk=args.topk)
     end = time.time()
-    logger.info(f"result = {result}, time = {(end-start) * 1000}ms")
+    logger.info(f"result = {result}, time = {(end-start) * 1000/args.n_iterations}ms")
 
 
 if __name__ == "__main__":

From 381dbf5c6696da6291aced8151a42eba34ae97f6 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Wed, 28 Feb 2024 23:29:21 -0800
Subject: [PATCH 3/5] no need to autocast for image to text

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/image-to-text/run_pipeline.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index c1681d7df6..da17a3c43b 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -99,16 +99,13 @@ def main():
 
         generator.model = wrap_in_hpu_graph(generator.model)
 
-    autocast_enable = model_dtype == torch.bfloat16
     # warm up
     for i in range(args.warmup):
-        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
-            generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+        generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
 
     start = time.time()
     for i in range(args.n_iterations):
-        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
-            result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+        result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
     end = time.time()
     logger.info(f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms")
 

From 97a5c1bfbd1532e6589029156d48f04f7edc17c4 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Sun, 3 Mar 2024 20:52:36 -0800
Subject: [PATCH 4/5] add prompt input in the example, llava needs it

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/image-to-text/run_pipeline.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index da17a3c43b..9bceae88ba 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -50,11 +50,18 @@ def main():
         help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
     )
 
+    parser.add_argument(
+        "--prompt",
+        default=None,
+        type=str,
+        help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")',
+    )
     parser.add_argument(
         "--use_hpu_graphs",
         action="store_true",
         help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
     )
+    parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.")
     parser.add_argument(
         "--bf16",
         action="store_true",
@@ -93,7 +100,12 @@ def main():
         torch_dtype=model_dtype,
         device="hpu",
     )
-    generate_kwargs = {"lazy_mode": True, "hpu_graphs": args.use_hpu_graphs}
+    generate_kwargs = {
+        "lazy_mode": True,
+        "hpu_graphs": args.use_hpu_graphs,
+        "max_new_tokens": args.max_new_tokens,
+        "ignore_eos": False,
+    }
     if args.use_hpu_graphs:
         from habana_frameworks.torch.hpu import wrap_in_hpu_graph
 
@@ -101,11 +113,11 @@ def main():
 
     # warm up
     for i in range(args.warmup):
-        generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+        generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
 
     start = time.time()
     for i in range(args.n_iterations):
-        result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+        result = generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
     end = time.time()
     logger.info(f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms")
 

From caf309bc0e7f5524ba112ccba887ecfaf573db76 Mon Sep 17 00:00:00 2001
From: "Wang, Yi A" <yi.a.wang@intel.com>
Date: Wed, 6 Mar 2024 20:05:38 -0800
Subject: [PATCH 5/5] address the review comment

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 examples/image-to-text/README.md               |  7 +++++--
 examples/image-to-text/run_pipeline.py         | 16 ++++++++--------
 examples/visual-question-answering/README.md   |  8 ++++++--
 .../visual-question-answering/run_pipeline.py  | 18 +++++++++---------
 4 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 07bd2cdc9f..67a9857bcb 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -16,7 +16,7 @@ limitations under the License.
 
 # Image to Text Examples
 
-This directory contains a script that showcases how to use transformers pipeline API to run image to text task on HPUs.
+This directory contains a script that showcases how to use the Transformers pipeline API to run the image-to-text task on HPUs.
 
 ## Single-HPU inference
 
@@ -27,4 +27,7 @@ python3 run_pipeline.py \
     --use_hpu_graphs \
     --bf16
 ```
-valided models: nlpconnect/vit-gpt2-image-captioning,Salesforce/blip-image-captioning-large,Salesforce/blip-image-captioning-base
+Models that have been validated:
+  - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning)
+  - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large)
+  - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)
\ No newline at end of file
diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index 9bceae88ba..ccaf51a57b 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -73,20 +73,20 @@ def main():
     args = parser.parse_args()
 
     adapt_transformers_to_gaudi()
-    image_pathes = args.image_path
-    image_pathes_len = len(image_pathes)
+    image_paths = args.image_path
+    image_paths_len = len(image_paths)
 
-    if args.batch_size > image_pathes_len:
+    if args.batch_size > image_paths_len:
         # Dynamically extends to support larger batch sizes
-        num_path_to_add = args.batch_size - image_pathes_len
+        num_path_to_add = args.batch_size - image_paths_len
         for i in range(num_path_to_add):
-            image_pathes.append(image_pathes[i % image_pathes_len])
-    elif args.batch_size < image_pathes_len:
-        image_pathes = image_pathes[: args.batch_size]
+            image_paths.append(image_paths[i % image_paths_len])
+    elif args.batch_size < image_paths_len:
+        image_paths = image_paths[: args.batch_size]
 
     images = []
 
-    for image_path in image_pathes:
+    for image_path in image_paths:
         images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw))
 
     if args.bf16:
diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md
index 767a56bae8..efbe1f2a92 100644
--- a/examples/visual-question-answering/README.md
+++ b/examples/visual-question-answering/README.md
@@ -16,7 +16,7 @@ limitations under the License.
 
 # Visual Question Answering Examples
 
-This directory contains a script that showcases how to use transformers pipeline API to run visual question answering task on HPUs.
+This directory contains a script that showcases how to use the Transformers pipeline API to run the visual question answering task on HPUs.
 
 ## Single-HPU inference
 
@@ -28,4 +28,8 @@ python3 run_pipeline.py \
     --use_hpu_graphs \
     --bf16
 ```
-valided models: Salesforce/blip-vqa-base,dandelin/vilt-b32-finetuned-vqa,Salesforce/blip-vqa-capfilt-large
+
+Models that have been validated:
+  - [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base)
+  - [dandelin/vilt-b32-finetuned-vqa](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+  - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large)
\ No newline at end of file
diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py
index 547684dbde..7b4e817bb7 100644
--- a/examples/visual-question-answering/run_pipeline.py
+++ b/examples/visual-question-answering/run_pipeline.py
@@ -78,16 +78,16 @@ def main():
     args = parser.parse_args()
 
     adapt_transformers_to_gaudi()
-    image_pathes = args.image_path
-    image_pathes_len = len(image_pathes)
+    image_paths = args.image_path
+    image_paths_len = len(image_paths)
 
-    if args.batch_size > image_pathes_len:
+    if args.batch_size > image_paths_len:
         # Dynamically extends to support larger batch sizes
-        num_path_to_add = args.batch_size - image_pathes_len
+        num_path_to_add = args.batch_size - image_paths_len
         for i in range(num_path_to_add):
-            image_pathes.append(image_pathes[i % image_pathes_len])
-    elif args.batch_size < image_pathes_len:
-        image_pathes = image_pathes[: args.batch_size]
+            image_paths.append(image_paths[i % image_paths_len])
+    elif args.batch_size < image_paths_len:
+        image_paths = image_paths[: args.batch_size]
 
     questions = args.question
     questions_len = len(questions)
@@ -101,7 +101,7 @@ def main():
 
     images = []
 
-    for image_path in image_pathes:
+    for image_path in image_paths:
         images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw).convert("RGB"))
 
     if args.bf16:
@@ -115,7 +115,7 @@ def main():
         torch_dtype=model_dtype,
         device="hpu",
     )
-    if not generator.model.can_generate() and args.use_hpu_graphs:
+    if args.use_hpu_graphs:
         from habana_frameworks.torch.hpu import wrap_in_hpu_graph
 
         generator.model = wrap_in_hpu_graph(generator.model)