diff --git a/README.md b/README.md index f1cd98c624..1b5b7da7e1 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,7 @@ The following model architectures, tasks and device distributions have been vali | CLIP | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | BridgeTower | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | +| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 2f15ece915..8de7a1d1e2 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -62,6 +62,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be | CLIP | ✅ | ✅ |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | BridgeTower | ✅ | ✅ |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | +| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | - Diffusers diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md new file mode 100644 index 0000000000..67a9857bcb --- /dev/null +++ b/examples/image-to-text/README.md @@ -0,0 +1,33 @@ + + +# Image to Text Examples + +This directory contains a script that showcases how to use the Transformers pipeline API to run image-to-text tasks on HPUs. + +## Single-HPU inference + +```bash +python3 run_pipeline.py \ + --model_name_or_path Salesforce/blip-image-captioning-large \ + --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ + --use_hpu_graphs \ + --bf16 +``` +Models that have been validated: + - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) + - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) + - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) \ No newline at end of file diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py new file mode 100644 index 0000000000..ccaf51a57b --- /dev/null +++ b/examples/image-to-text/run_pipeline.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark the Transformers image-to-text pipeline on an HPU device."""
+
+import argparse
+import itertools
+import logging
+import time
+
+import PIL.Image
+import requests
+import torch
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def _fit_to_batch(items, batch_size):
+    """Return exactly `batch_size` entries, cycling through `items` or truncating them.
+
+    A new list is returned; `items` itself is never mutated.
+    """
+    return list(itertools.islice(itertools.cycle(items), batch_size))
+
+
+def _load_images(image_paths):
+    """Return one PIL image per entry of `image_paths`, fetching each distinct URL only once.
+
+    Raises:
+        requests.HTTPError: if a URL answers with a 4xx/5xx status.
+    """
+    cache = {}
+    for image_path in image_paths:
+        if image_path in cache:
+            continue
+        # NOTE(review): requests timeouts are expressed in seconds, so 3000 is 50 minutes.
+        # The value is kept from the original script; confirm it is intended.
+        response = requests.get(image_path, stream=True, timeout=3000)
+        # Fail on HTTP errors instead of handing an error page to PIL.
+        response.raise_for_status()
+        cache[image_path] = PIL.Image.open(response.raw)
+    return [cache[image_path] for image_path in image_paths]
+
+
+def main():
+    """Run the image-to-text pipeline on HPU and log the result with the mean latency.
+
+    The input images are repeated or truncated to match `--batch_size`, the pipeline is
+    called `--warmup` times untimed, then `--n_iterations` times timed. The last result
+    and the average time per iteration (in milliseconds) are logged.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        help="Path to pre-trained model",
+    )
+    # At least one image is required: the batch is built by cycling through this list.
+    parser.add_argument(
+        "--image_path",
+        type=str,
+        nargs="+",
+        required=True,
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
+    )
+
+    parser.add_argument(
+        "--prompt",
+        default=None,
+        type=str,
+        help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")',
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.")
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform generation in bf16 precision.",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
+    args = parser.parse_args()
+
+    # Reject values that would otherwise fail later with an unrelated error
+    # (empty batch, undefined `result`, division by zero in the average).
+    if args.batch_size < 1:
+        parser.error("--batch_size must be at least 1")
+    if args.n_iterations < 1:
+        parser.error("--n_iterations must be at least 1")
+
+    adapt_transformers_to_gaudi()
+
+    images = _load_images(_fit_to_batch(args.image_path, args.batch_size))
+
+    model_dtype = torch.bfloat16 if args.bf16 else torch.float32
+
+    generator = pipeline(
+        "image-to-text",
+        model=args.model_name_or_path,
+        torch_dtype=model_dtype,
+        device="hpu",
+    )
+    generate_kwargs = {
+        "lazy_mode": True,
+        "hpu_graphs": args.use_hpu_graphs,
+        "max_new_tokens": args.max_new_tokens,
+        "ignore_eos": False,
+    }
+    if args.use_hpu_graphs:
+        # Imported only when HPU graphs are requested.
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
+
+    # Warm-up calls are not timed.
+    for _ in range(args.warmup):
+        generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+
+    # perf_counter is monotonic, so the measurement is immune to system clock adjustments.
+    start = time.perf_counter()
+    for _ in range(args.n_iterations):
+        result = generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
+    end = time.perf_counter()
+    logger.info(f"result = {result}, time = {(end - start) * 1000 / args.n_iterations}ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md new file mode 100644 index 0000000000..efbe1f2a92 --- /dev/null +++ b/examples/visual-question-answering/README.md @@ -0,0 +1,35 @@ + + +# Visual Question Answering Examples + +This directory contains a script that showcases how to use the Transformers pipeline API to run visual question answering task on HPUs. + +## Single-HPU inference + +```bash +python3 run_pipeline.py \ + --model_name_or_path Salesforce/blip-vqa-capfilt-large \ + --image_path "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" \ + --question "how many dogs are in the picture?" \ + --use_hpu_graphs \ + --bf16 +``` + +Models that have been validated: + - [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) + - [dandelin/vilt-b32-finetuned-vqa](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) + - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large) \ No newline at end of file diff --git a/examples/visual-question-answering/run_pipeline.py b/examples/visual-question-answering/run_pipeline.py new file mode 100644 index 0000000000..7b4e817bb7 --- /dev/null +++ b/examples/visual-question-answering/run_pipeline.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark the Transformers visual-question-answering pipeline on an HPU device."""
+
+import argparse
+import itertools
+import logging
+import time
+
+import PIL.Image
+import requests
+import torch
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def _fit_to_batch(items, batch_size):
+    """Return exactly `batch_size` entries, cycling through `items` or truncating them.
+
+    A new list is returned; `items` itself is never mutated.
+    """
+    return list(itertools.islice(itertools.cycle(items), batch_size))
+
+
+def _load_rgb_images(image_paths):
+    """Return one RGB PIL image per entry of `image_paths`, fetching each distinct URL only once.
+
+    Raises:
+        requests.HTTPError: if a URL answers with a 4xx/5xx status.
+    """
+    cache = {}
+    for image_path in image_paths:
+        if image_path in cache:
+            continue
+        # NOTE(review): requests timeouts are expressed in seconds, so 3000 is 50 minutes.
+        # The value is kept from the original script; confirm it is intended.
+        response = requests.get(image_path, stream=True, timeout=3000)
+        # Fail on HTTP errors instead of handing an error page to PIL.
+        response.raise_for_status()
+        cache[image_path] = PIL.Image.open(response.raw).convert("RGB")
+    return [cache[image_path] for image_path in image_paths]
+
+
+def main():
+    """Run the visual-question-answering pipeline on HPU and log the result with the mean latency.
+
+    Images and questions are each repeated or truncated to `--batch_size` and paired by
+    position. The pipeline is called `--warmup` times untimed, then `--n_iterations` times
+    timed. The last result and the average time per iteration (in milliseconds) are logged.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        help="Path to pre-trained model",
+    )
+    # At least one image is required: the batch is built by cycling through this list.
+    parser.add_argument(
+        "--image_path",
+        type=str,
+        nargs="+",
+        required=True,
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
+    )
+    parser.add_argument(
+        "--topk",
+        default=1,
+        type=int,
+        help="topk num",
+    )
+    # At least one question is required for the same reason as --image_path.
+    parser.add_argument(
+        "--question",
+        type=str,
+        nargs="+",
+        required=True,
+        help='question as input. Can be a single string (eg: --question "Q1"), or a list of space-separated strings (eg: --question "Q1" "Q2")',
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform in bf16 precision.",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
+    args = parser.parse_args()
+
+    # Reject values that would otherwise fail later with an unrelated error
+    # (empty batch, undefined `result`, division by zero in the average).
+    if args.batch_size < 1:
+        parser.error("--batch_size must be at least 1")
+    if args.n_iterations < 1:
+        parser.error("--n_iterations must be at least 1")
+
+    adapt_transformers_to_gaudi()
+
+    questions = _fit_to_batch(args.question, args.batch_size)
+    images = _load_rgb_images(_fit_to_batch(args.image_path, args.batch_size))
+
+    model_dtype = torch.bfloat16 if args.bf16 else torch.float32
+
+    generator = pipeline(
+        "visual-question-answering",
+        model=args.model_name_or_path,
+        torch_dtype=model_dtype,
+        device="hpu",
+    )
+    if args.use_hpu_graphs:
+        # Imported only when HPU graphs are requested.
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        generator.model = wrap_in_hpu_graph(generator.model)
+
+    # Both lists hold exactly batch_size entries, so zip pairs them one-to-one.
+    model_input = [{"image": image, "question": question} for image, question in zip(images, questions)]
+
+    # Warm-up calls are not timed. Autocast is active only when --bf16 is set.
+    for _ in range(args.warmup):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16):
+            generator(model_input, batch_size=args.batch_size, topk=args.topk)
+
+    # perf_counter is monotonic, so the measurement is immune to system clock adjustments.
+    start = time.perf_counter()
+    for _ in range(args.n_iterations):
+        with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16):
+            result = generator(model_input, batch_size=args.batch_size, topk=args.topk)
+    end = time.perf_counter()
+    logger.info(f"result = {result}, time = {(end - start) * 1000 / args.n_iterations}ms")
+
+
+if __name__ == "__main__":
+    main()