# image_captioning_app.py
# Step 1: Set up the environment
# Run `pip install gradio transformers torch Pillow` to install the required libraries.
# Import the required libraries:
import gradio as gr
from PIL import Image
import numpy as np
import torch
# Alternative (smaller BLIP model): from transformers import AutoProcessor, BlipForConditionalGeneration
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Use a GPU if one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Step 2: Load the pretrained model and its processor
# Alternative (smaller BLIP model):
# processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# Use half precision on GPU to reduce memory; full precision on CPU
dtype = torch.float16 if device == "cuda" else torch.float32
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
).to(device)
# Step 3: Define the image captioning function
def caption_image(input_image: np.ndarray) -> str:
    # Convert the NumPy array to a PIL image and ensure it is RGB
    raw_image = Image.fromarray(input_image).convert("RGB")
    # Preprocess the image and move the tensors to the model's device and dtype
    inputs = processor(images=raw_image, return_tensors="pt").to(device, dtype)
    # Generate a caption for the image
    outputs = model.generate(**inputs, max_new_tokens=50)
    # Decode the generated tokens to text and store it in 'caption'
    caption = processor.decode(outputs[0], skip_special_tokens=True).strip()
    return caption
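# Optional: a quick sanity check of the captioning function without the web UI.
# This is a sketch, not part of the original script; it assumes a local image
# file named "sample.jpg" exists next to the script. Uncomment to try it.
# test_image = np.array(Image.open("sample.jpg").convert("RGB"))
# print(caption_image(test_image))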
# Step 4: Create the Gradio interface
iface = gr.Interface(
    fn=caption_image,
    inputs=gr.Image(),
    outputs="text",
    title="Image Captioning with BLIP-2",
    description="Simple web app for generating captions for images using a pretrained BLIP-2 model.",
)
# Step 5: Launch the web app (Gradio interface)
iface.launch()
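# Optional: launch() accepts extra arguments if the app needs to be reachable
# beyond localhost, e.g. iface.launch(server_name="0.0.0.0", server_port=7860)
# or iface.launch(share=True) for a temporary public link.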
# Step 6: Run the application script
# python image_captioning_app.py