diff --git a/README.md b/README.md
index f1cd98c624..1b5b7da7e1 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,7 @@ The following model architectures, tasks and device distributions have been vali
| CLIP | :heavy_check_mark: | :heavy_check_mark: | [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text) |
| BridgeTower | :heavy_check_mark: | :heavy_check_mark: | [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text) |
| ESMFold | | Single card | [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding) |
+| Blip | | Single card | [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering), [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text) |
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 2f15ece915..8de7a1d1e2 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -62,6 +62,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
| CLIP | ✅ | ✅ | [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text) |
| BridgeTower | ✅ | ✅ | [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text) |
| ESMFold | | Single card | [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding) |
+| Blip | | Single card | [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering), [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text) |
- Diffusers
diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
new file mode 100644
index 0000000000..67a9857bcb
--- /dev/null
+++ b/examples/image-to-text/README.md
@@ -0,0 +1,33 @@
+
+
+# Image to Text Examples
+
+This directory contains a script that showcases how to use the Transformers pipeline API to run the image-to-text task on HPUs.
+
+## Single-HPU inference
+
+```bash
+python3 run_pipeline.py \
+ --model_name_or_path Salesforce/blip-image-captioning-large \
+ --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
+ --use_hpu_graphs \
+ --bf16
+```
+
+Models that have been validated:
+ - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning)
+ - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large)
+ - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)
\ No newline at end of file
diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
new file mode 100644
index 0000000000..ccaf51a57b
--- /dev/null
+++ b/examples/image-to-text/run_pipeline.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import time
+
+import PIL.Image
+import requests
+import torch
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+# Configure logging once at import time: timestamped records, INFO level and above.
+logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+)
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Benchmark the Transformers image-to-text pipeline on the "hpu" device.
+
+    Parses the command-line options, downloads the input images, builds the
+    pipeline, runs `--warmup` untimed iterations, then runs `--n_iterations`
+    timed iterations and logs the last result together with the average
+    latency per iteration in milliseconds.
+
+    Exits with a usage error (via `parser.error`) when no image URL is given
+    or when `--batch_size` / `--n_iterations` is smaller than 1.
+    """
+    # Local import: only needed to wrap downloaded bytes for PIL.
+    import io
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        help="Path to pre-trained model",
+    )
+    parser.add_argument(
+        "--image_path",
+        default=None,
+        type=str,
+        nargs="*",
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
+    )
+
+    parser.add_argument(
+        "--prompt",
+        default=None,
+        type=str,
+        help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")',
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.")
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform generation in bf16 precision.",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
+    args = parser.parse_args()
+
+    # Fail early with a usage message instead of a TypeError, ZeroDivisionError
+    # or UnboundLocalError further down.
+    if not args.image_path:
+        parser.error("--image_path requires at least one image URL")
+    if args.batch_size < 1:
+        parser.error("--batch_size must be at least 1")
+    if args.n_iterations < 1:
+        parser.error("--n_iterations must be at least 1")
+
+    adapt_transformers_to_gaudi()
+
+    # Cycle through (or truncate) the given URLs so that exactly `batch_size`
+    # images are used, without mutating the parsed argument list.
+    source_paths = args.image_path
+    image_paths = [source_paths[i % len(source_paths)] for i in range(args.batch_size)]
+
+    images = []
+    for image_path in image_paths:
+        # NOTE: `requests` timeouts are expressed in seconds.
+        response = requests.get(image_path, timeout=3000)
+        # Surface HTTP errors here rather than as an obscure image decoding failure.
+        response.raise_for_status()
+        # Decode from an in-memory copy so no network connection has to stay open.
+        images.append(PIL.Image.open(io.BytesIO(response.content)))
+
+    model_dtype = torch.bfloat16 if args.bf16 else torch.float32
+
+    generator = pipeline(
+        "image-to-text",
+        model=args.model_name_or_path,
+        torch_dtype=model_dtype,
+        device="hpu",
+    )
+    generate_kwargs = {
+        "lazy_mode": True,
+        "hpu_graphs": args.use_hpu_graphs,
+        "max_new_tokens": args.max_new_tokens,
+        "ignore_eos": False,
+    }
+    if args.use_hpu_graphs:
+        # Imported lazily so the Habana runtime is only required when HPU graphs are requested.
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
+
+    # warm up
+    for _ in range(args.warmup):
+        generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+
+    # perf_counter is monotonic, so the measurement cannot be skewed by system clock adjustments.
+    start = time.perf_counter()
+    for _ in range(args.n_iterations):
+        result = generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+    end = time.perf_counter()
+    latency_ms = (end - start) * 1000 / args.n_iterations
+    logger.info(f"result = {result}, time = {latency_ms}ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md
new file mode 100644
index 0000000000..efbe1f2a92
--- /dev/null
+++ b/examples/visual-question-answering/README.md
@@ -0,0 +1,35 @@
+
+
+# Visual Question Answering Examples
+
+This directory contains a script that showcases how to use the Transformers pipeline API to run the visual question answering task on HPUs.
+
+## Single-HPU inference
+
+```bash
+python3 run_pipeline.py \
+ --model_name_or_path Salesforce/blip-vqa-capfilt-large \
+ --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \
+ --question "how many dogs are in the picture?" \
+ --use_hpu_graphs \
+ --bf16
+```
+
+Models that have been validated:
+ - [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base)
+ - [dandelin/vilt-b32-finetuned-vqa](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+ - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large)
\ No newline at end of file
diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py
new file mode 100644
index 0000000000..7b4e817bb7
--- /dev/null
+++ b/examples/visual-question-answering/run_pipeline.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import time
+
+import PIL.Image
+import requests
+import torch
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+# Configure logging once at import time: timestamped records, INFO level and above.
+logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+)
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Benchmark the Transformers visual-question-answering pipeline on the "hpu" device.
+
+    Parses the command-line options, downloads the input images, pairs each
+    image with a question, builds the pipeline, runs `--warmup` untimed
+    iterations, then runs `--n_iterations` timed iterations and logs the last
+    result together with the average latency per iteration in milliseconds.
+
+    Exits with a usage error (via `parser.error`) when no image URL or no
+    question is given, or when `--batch_size` / `--n_iterations` is smaller
+    than 1.
+    """
+    # Local import: only needed to wrap downloaded bytes for PIL.
+    import io
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        help="Path to pre-trained model",
+    )
+    parser.add_argument(
+        "--image_path",
+        default=None,
+        type=str,
+        nargs="*",
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
+    )
+    parser.add_argument(
+        "--topk",
+        default=1,
+        type=int,
+        help="topk num",
+    )
+    parser.add_argument(
+        "--question",
+        default=None,
+        type=str,
+        nargs="*",
+        help='question as input. Can be a single string (eg: --question "Q1"), or a list of space-separated strings (eg: --question "Q1" "Q2")',
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform in bf16 precision.",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
+    args = parser.parse_args()
+
+    # Fail early with a usage message instead of a TypeError, ZeroDivisionError
+    # or UnboundLocalError further down.
+    if not args.image_path:
+        parser.error("--image_path requires at least one image URL")
+    if not args.question:
+        parser.error("--question requires at least one question")
+    if args.batch_size < 1:
+        parser.error("--batch_size must be at least 1")
+    if args.n_iterations < 1:
+        parser.error("--n_iterations must be at least 1")
+
+    adapt_transformers_to_gaudi()
+
+    # Cycle through (or truncate) the given URLs and questions so that exactly
+    # `batch_size` of each are used, without mutating the parsed argument lists.
+    source_paths = args.image_path
+    image_paths = [source_paths[i % len(source_paths)] for i in range(args.batch_size)]
+    source_questions = args.question
+    questions = [source_questions[i % len(source_questions)] for i in range(args.batch_size)]
+
+    images = []
+    for image_path in image_paths:
+        # NOTE: `requests` timeouts are expressed in seconds.
+        response = requests.get(image_path, timeout=3000)
+        # Surface HTTP errors here rather than as an obscure image decoding failure.
+        response.raise_for_status()
+        # Decode from an in-memory copy so no network connection has to stay open.
+        images.append(PIL.Image.open(io.BytesIO(response.content)).convert("RGB"))
+
+    model_dtype = torch.bfloat16 if args.bf16 else torch.float32
+
+    generator = pipeline(
+        "visual-question-answering",
+        model=args.model_name_or_path,
+        torch_dtype=model_dtype,
+        device="hpu",
+    )
+    if args.use_hpu_graphs:
+        # Imported lazily so the Habana runtime is only required when HPU graphs are requested.
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
+
+    # Autocast is only switched on when bf16 was requested.
+    autocast_enable = model_dtype == torch.bfloat16
+    model_input = [{"image": image, "question": question} for image, question in zip(images, questions)]
+
+    # warm up
+    for _ in range(args.warmup):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
+            generator(model_input, batch_size=args.batch_size, topk=args.topk)
+
+    # perf_counter is monotonic, so the measurement cannot be skewed by system clock adjustments.
+    start = time.perf_counter()
+    for _ in range(args.n_iterations):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
+            result = generator(model_input, batch_size=args.batch_size, topk=args.topk)
+    end = time.perf_counter()
+    latency_ms = (end - start) * 1000 / args.n_iterations
+    logger.info(f"result = {result}, time = {latency_ms}ms")
+
+
+if __name__ == "__main__":
+    main()