From d59f88fdbf73748c9d9e7d1971d514f1d14ea6eb Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Sun, 25 Feb 2024 21:57:50 -0800 Subject: [PATCH 1/5] add image-to-text and visual question answering example Signed-off-by: Wang, Yi A --- README.md | 1 + docs/source/index.mdx | 1 + examples/image-to-text/README.md | 29 ++++++ examples/image-to-text/run_pipeline.py | 87 ++++++++++++++++ examples/visual-question-answering/README.md | 30 ++++++ .../visual-question-answering/run_pipeline.py | 98 +++++++++++++++++++ 6 files changed, 246 insertions(+) create mode 100644 examples/image-to-text/README.md create mode 100644 examples/image-to-text/run_pipeline.py create mode 100644 examples/visual-question-answering/README.md create mode 100644 examples/visual-question-answering/run_pipeline.py diff --git a/README.md b/README.md index f1cd98c624..1b5b7da7e1 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,7 @@ The following model architectures, tasks and device distributions have been vali | CLIP | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | BridgeTower | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | +| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 2f15ece915..8de7a1d1e2 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -62,6 +62,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be | CLIP | ✅ | ✅ |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | BridgeTower | ✅ | ✅ |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | +| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | - Diffusers diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md new file mode 100644 index 0000000000..1c4ebb3857 --- /dev/null +++ b/examples/image-to-text/README.md @@ -0,0 +1,29 @@ + + +# Image to Text Examples + +This directory contains a script that showcases how to use transformers pipeline API to run image to text task on HPUs. + +## Single-HPU inference + +```bash +python3 run_pipeline.py \ + --model_name_or_path Salesforce/blip-image-captioning-large \ + --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ + --use_hpu_graphs +``` +valided models: nlpconnect/vit-gpt2-image-captioning,Salesforce/blip-image-captioning-large,Salesforce/blip-image-captioning-base diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py new file mode 100644 index 0000000000..ac02ac56dd --- /dev/null +++ b/examples/image-to-text/run_pipeline.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import logging +import time + +import PIL.Image +import requests +import torch +from transformers import pipeline + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name_or_path", + default="Salesforce/blip-image-captioning-large", + type=str, + help="Path to pre-trained model", + ) + parser.add_argument( + "--image_path", + default="https://ankur3107.github.io/assets/images/image-captioning-example.png", + type=str, + help="Path to image", + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + args = parser.parse_args() + + adapt_transformers_to_gaudi() + + image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw) + + generator = pipeline( + "image-to-text", + model=args.model_name_or_path, + torch_dtype=torch.bfloat16, + device="hpu", + ) + generate_kwargs = {"lazy_mode": True, "hpu_graphs": args.use_hpu_graphs} + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + generator.model = wrap_in_hpu_graph(generator.model) + + # warm up + for i in range(5): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): + generator(image, generate_kwargs=generate_kwargs) + + start = time.time() + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): + result = generator(image, generate_kwargs=generate_kwargs) + end = time.time() + logger.info(f"result = {result}, time = {(end-start) * 1000}ms") + + +if __name__ == "__main__": + main() diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md new file mode 100644 index 0000000000..6398ca10fc --- /dev/null +++ b/examples/visual-question-answering/README.md @@ -0,0 +1,30 @@ + + +# Visual Question Answering Examples + +This directory contains a script that showcases how to use transformers pipeline API to run visual question answering task on HPUs. + +## Single-HPU inference + +```bash +python3 run_pipeline.py \ + --model_name_or_path Salesforce/blip-vqa-capfilt-large \ + --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \ + --question "how many dogs are in the picture?" \ + --use_hpu_graphs +``` +valided models: Salesforce/blip-vqa-base,dandelin/vilt-b32-finetuned-vqa,Salesforce/blip-vqa-capfilt-large diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py new file mode 100644 index 0000000000..e49493558b --- /dev/null +++ b/examples/visual-question-answering/run_pipeline.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import logging +import time + +import PIL.Image +import requests +import torch +from transformers import pipeline + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name_or_path", + default="Salesforce/blip-vqa-capfilt-large", + type=str, + help="Path to pre-trained model", + ) + parser.add_argument( + "--image_path", + default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg", + type=str, + help="Path to image", + ) + parser.add_argument( + "--topk", + default=1, + type=int, + help="topk num", + ) + parser.add_argument( + "--question", + default="how many dogs are in the picture?", + type=str, + help="question input", + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + args = parser.parse_args() + + adapt_transformers_to_gaudi() + + image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw).convert("RGB") + + generator = pipeline( + "visual-question-answering", + model=args.model_name_or_path, + torch_dtype=torch.bfloat16, + device="hpu", + ) + if not generator.model.can_generate() and args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + generator.model = wrap_in_hpu_graph(generator.model) + + # warm up + for i in range(5): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): + generator(image, args.question, topk=args.topk) + + start = time.time() + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): + result = generator(image, args.question, topk=args.topk) + end = time.time() + logger.info(f"result = {result}, time = {(end-start) * 1000}ms") + + +if __name__ == "__main__": + main() From 17f6c993545ee53246707f216f83be28f25c14bc Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Tue, 27 Feb 2024 21:53:55 -0800 Subject: [PATCH 2/5] update example to contain fp32 and batch_size Signed-off-by: Wang, Yi A --- examples/image-to-text/README.md | 3 +- examples/image-to-text/run_pipeline.py | 52 +++++++++++--- examples/visual-question-answering/README.md | 3 +- .../visual-question-answering/run_pipeline.py | 72 +++++++++++++++---- 4 files changed, 103 insertions(+), 27 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 1c4ebb3857..07bd2cdc9f 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -24,6 +24,7 @@ This directory contains a script that showcases how to use transformers pipeline python3 run_pipeline.py \ --model_name_or_path Salesforce/blip-image-captioning-large \ --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ - --use_hpu_graphs + --use_hpu_graphs \ + --bf16 ``` valided models: nlpconnect/vit-gpt2-image-captioning,Salesforce/blip-image-captioning-large,Salesforce/blip-image-captioning-base diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index ac02ac56dd..c1681d7df6 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -38,31 +38,59 @@ def main(): parser.add_argument( "--model_name_or_path", - default="Salesforce/blip-image-captioning-large", + default=None, type=str, help="Path to pre-trained model", ) parser.add_argument( "--image_path", - default="https://ankur3107.github.io/assets/images/image-captioning-example.png", + default=None, type=str, - help="Path to image", + nargs="*", + help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")', ) + parser.add_argument( "--use_hpu_graphs", action="store_true", help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", ) + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") args = parser.parse_args() adapt_transformers_to_gaudi() + image_pathes = args.image_path + image_pathes_len = len(image_pathes) + + if args.batch_size > image_pathes_len: + # Dynamically extends to support larger batch sizes + num_path_to_add = args.batch_size - image_pathes_len + for i in range(num_path_to_add): + image_pathes.append(image_pathes[i % image_pathes_len]) + elif args.batch_size < image_pathes_len: + image_pathes = image_pathes[: args.batch_size] + + images = [] + + for image_path in image_pathes: + images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) - image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw) + if args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float32 generator = pipeline( "image-to-text", model=args.model_name_or_path, - torch_dtype=torch.bfloat16, + torch_dtype=model_dtype, device="hpu", ) generate_kwargs = {"lazy_mode": True, "hpu_graphs": args.use_hpu_graphs} @@ -71,16 +99,18 @@ def main(): generator.model = wrap_in_hpu_graph(generator.model) + autocast_enable = model_dtype == torch.bfloat16 # warm up - for i in range(5): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - generator(image, generate_kwargs=generate_kwargs) + for i in range(args.warmup): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): + generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) start = time.time() - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - result = generator(image, generate_kwargs=generate_kwargs) + for i in range(args.n_iterations): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): + result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) end = time.time() - logger.info(f"result = {result}, time = {(end-start) * 1000}ms") + logger.info(f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms") if __name__ == "__main__": diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md index 6398ca10fc..767a56bae8 100644 --- a/examples/visual-question-answering/README.md +++ b/examples/visual-question-answering/README.md @@ -25,6 +25,7 @@ python3 run_pipeline.py \ --model_name_or_path Salesforce/blip-vqa-capfilt-large \ --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \ --question "how many dogs are in the picture?" \ - --use_hpu_graphs + --use_hpu_graphs \ + --bf16 ``` valided models: Salesforce/blip-vqa-base,dandelin/vilt-b32-finetuned-vqa,Salesforce/blip-vqa-capfilt-large diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py index e49493558b..547684dbde 100644 --- a/examples/visual-question-answering/run_pipeline.py +++ b/examples/visual-question-answering/run_pipeline.py @@ -38,15 +38,16 @@ def main(): parser.add_argument( "--model_name_or_path", - default="Salesforce/blip-vqa-capfilt-large", + default=None, type=str, help="Path to pre-trained model", ) parser.add_argument( "--image_path", - default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg", + default=None, type=str, - help="Path to image", + nargs="*", + help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")', ) parser.add_argument( "--topk", @@ -56,25 +57,62 @@ def main(): ) parser.add_argument( "--question", - default="how many dogs are in the picture?", + default=None, type=str, - help="question input", + nargs="*", + help='question as input. Can be a single string (eg: --question "Q1"), or a list of space-separated strings (eg: --question "Q1" "Q2")', ) parser.add_argument( "--use_hpu_graphs", action="store_true", help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", ) + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to perform in bf16 precision.", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") args = parser.parse_args() adapt_transformers_to_gaudi() - - image = PIL.Image.open(requests.get(args.image_path, stream=True, timeout=3000).raw).convert("RGB") + image_pathes = args.image_path + image_pathes_len = len(image_pathes) + + if args.batch_size > image_pathes_len: + # Dynamically extends to support larger batch sizes + num_path_to_add = args.batch_size - image_pathes_len + for i in range(num_path_to_add): + image_pathes.append(image_pathes[i % image_pathes_len]) + elif args.batch_size < image_pathes_len: + image_pathes = image_pathes[: args.batch_size] + + questions = args.question + questions_len = len(questions) + if args.batch_size > questions_len: + # Dynamically extends to support larger batch sizes + num_question_to_add = args.batch_size - questions_len + for i in range(num_question_to_add): + questions.append(questions[i % questions_len]) + elif args.batch_size < questions_len: + questions = questions[: args.batch_size] + + images = [] + + for image_path in image_pathes: + images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw).convert("RGB")) + + if args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float32 generator = pipeline( "visual-question-answering", model=args.model_name_or_path, - torch_dtype=torch.bfloat16, + torch_dtype=model_dtype, device="hpu", ) if not generator.model.can_generate() and args.use_hpu_graphs: @@ -82,16 +120,22 @@ def main(): generator.model = wrap_in_hpu_graph(generator.model) + autocast_enable = model_dtype == torch.bfloat16 + model_input = [] + for i in range(args.batch_size): + model_input.append({"image": images[i], "question": questions[i]}) + # warm up - for i in range(5): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - generator(image, args.question, topk=args.topk) + for i in range(args.warmup): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): + generator(model_input, batch_size=args.batch_size, topk=args.topk) start = time.time() - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True): - result = generator(image, args.question, topk=args.topk) + for i in range(args.n_iterations): + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): + result = generator(model_input, batch_size=args.batch_size, topk=args.topk) end = time.time() - logger.info(f"result = {result}, time = {(end-start) * 1000}ms") + logger.info(f"result = {result}, time = {(end-start) * 1000/args.n_iterations}ms") if __name__ == "__main__": From 381dbf5c6696da6291aced8151a42eba34ae97f6 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Wed, 28 Feb 2024 23:29:21 -0800 Subject: [PATCH 3/5] no need to autocast for image to text Signed-off-by: Wang, Yi A --- examples/image-to-text/run_pipeline.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index c1681d7df6..da17a3c43b 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -99,16 +99,13 @@ def main(): generator.model = wrap_in_hpu_graph(generator.model) - autocast_enable = model_dtype == torch.bfloat16 # warm up for i in range(args.warmup): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): - generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) + generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) start = time.time() for i in range(args.n_iterations): - with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable): - result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) + result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) end = time.time() logger.info(f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms") From 97a5c1bfbd1532e6589029156d48f04f7edc17c4 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Sun, 3 Mar 2024 20:52:36 -0800 Subject: [PATCH 4/5] add prompt input in the example, llava needs it Signed-off-by: Wang, Yi A --- examples/image-to-text/run_pipeline.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index da17a3c43b..9bceae88ba 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -50,11 +50,18 @@ def main(): help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")', ) + parser.add_argument( + "--prompt", + default=None, + type=str, + help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")', + ) parser.add_argument( "--use_hpu_graphs", action="store_true", help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", ) + parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") parser.add_argument( "--bf16", action="store_true", @@ -93,7 +100,12 @@ def main(): torch_dtype=model_dtype, device="hpu", ) - generate_kwargs = {"lazy_mode": True, "hpu_graphs": args.use_hpu_graphs} + generate_kwargs = { + "lazy_mode": True, + "hpu_graphs": args.use_hpu_graphs, + "max_new_tokens": args.max_new_tokens, + "ignore_eos": False, + } if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph @@ -101,11 +113,11 @@ def main(): # warm up for i in range(args.warmup): - generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) + generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs) start = time.time() for i in range(args.n_iterations): - result = generator(images, batch_size=args.batch_size, generate_kwargs=generate_kwargs) + result = generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs) end = time.time() logger.info(f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms") From caf309bc0e7f5524ba112ccba887ecfaf573db76 Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Wed, 6 Mar 2024 20:05:38 -0800 Subject: [PATCH 5/5] address the review comment Signed-off-by: Wang, Yi A --- examples/image-to-text/README.md | 7 +++++-- examples/image-to-text/run_pipeline.py | 16 ++++++++-------- examples/visual-question-answering/README.md | 8 ++++++-- .../visual-question-answering/run_pipeline.py | 18 +++++++++--------- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 07bd2cdc9f..67a9857bcb 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -16,7 +16,7 @@ limitations under the License. # Image to Text Examples -This directory contains a script that showcases how to use transformers pipeline API to run image to text task on HPUs. +This directory contains a script that showcases how to use the Transformers pipeline API to run image to text task on HPUs. ## Single-HPU inference @@ -27,4 +27,7 @@ python3 run_pipeline.py \ --use_hpu_graphs \ --bf16 ``` -valided models: nlpconnect/vit-gpt2-image-captioning,Salesforce/blip-image-captioning-large,Salesforce/blip-image-captioning-base +Models that have been validated: + - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) + - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) + - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) \ No newline at end of file diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 9bceae88ba..ccaf51a57b 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -73,20 +73,20 @@ def main(): args = parser.parse_args() adapt_transformers_to_gaudi() - image_pathes = args.image_path - image_pathes_len = len(image_pathes) + image_paths = args.image_path + image_paths_len = len(image_paths) - if args.batch_size > image_pathes_len: + if args.batch_size > image_paths_len: # Dynamically extends to support larger batch sizes - num_path_to_add = args.batch_size - image_pathes_len + num_path_to_add = args.batch_size - image_paths_len for i in range(num_path_to_add): - image_pathes.append(image_pathes[i % image_pathes_len]) - elif args.batch_size < image_pathes_len: - image_pathes = image_pathes[: args.batch_size] + image_paths.append(image_paths[i % image_paths_len]) + elif args.batch_size < image_paths_len: + image_paths = image_paths[: args.batch_size] images = [] - for image_path in image_pathes: + for image_path in image_paths: images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) if args.bf16: diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md index 767a56bae8..efbe1f2a92 100644 --- a/examples/visual-question-answering/README.md +++ b/examples/visual-question-answering/README.md @@ -16,7 +16,7 @@ limitations under the License. # Visual Question Answering Examples -This directory contains a script that showcases how to use transformers pipeline API to run visual question answering task on HPUs. +This directory contains a script that showcases how to use the Transformers pipeline API to run visual question answering task on HPUs. ## Single-HPU inference @@ -28,4 +28,8 @@ python3 run_pipeline.py \ --use_hpu_graphs \ --bf16 ``` -valided models: Salesforce/blip-vqa-base,dandelin/vilt-b32-finetuned-vqa,Salesforce/blip-vqa-capfilt-large + +Models that have been validated: + - [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) + - [dandelin/vilt-b32-finetuned-vqa](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) + - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large) \ No newline at end of file diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py index 547684dbde..7b4e817bb7 100644 --- a/examples/visual-question-answering/run_pipeline.py +++ b/examples/visual-question-answering/run_pipeline.py @@ -78,16 +78,16 @@ def main(): args = parser.parse_args() adapt_transformers_to_gaudi() - image_pathes = args.image_path - image_pathes_len = len(image_pathes) + image_paths = args.image_path + image_paths_len = len(image_paths) - if args.batch_size > image_pathes_len: + if args.batch_size > image_paths_len: # Dynamically extends to support larger batch sizes - num_path_to_add = args.batch_size - image_pathes_len + num_path_to_add = args.batch_size - image_paths_len for i in range(num_path_to_add): - image_pathes.append(image_pathes[i % image_pathes_len]) - elif args.batch_size < image_pathes_len: - image_pathes = image_pathes[: args.batch_size] + image_paths.append(image_paths[i % image_paths_len]) + elif args.batch_size < image_paths_len: + image_paths = image_paths[: args.batch_size] questions = args.question questions_len = len(questions) @@ -101,7 +101,7 @@ def main(): images = [] - for image_path in image_pathes: + for image_path in image_paths: images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw).convert("RGB")) if args.bf16: @@ -115,7 +115,7 @@ def main(): torch_dtype=model_dtype, device="hpu", ) - if not generator.model.can_generate() and args.use_hpu_graphs: + if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph generator.model = wrap_in_hpu_graph(generator.model)