2 changes: 2 additions & 0 deletions src/transformers/pipelines/depth_estimation.py
@@ -91,6 +91,8 @@ def preprocess(self, image, timeout=None):
         image = load_image(image, timeout)
         self.image_size = image.size
         model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+        if self.framework == 'pt':
+            model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
         return model_inputs

     def _forward(self, model_inputs):
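Note: the two added lines above are the pattern this PR repeats in every preprocess below: when the PyTorch backend is active, any float32 tensor produced by the image processor is cast to the pipeline's torch_dtype, while non-float tensors (masks, token ids) keep their dtype. A minimal standalone sketch of the idea, assuming a plain dict of tensors in place of the processor's BatchFeature (keys and shapes are illustrative, not taken from this diff):

import torch

torch_dtype = torch.float16  # what the pipeline was constructed with

# Stand-in for image-processor output: float32 pixels plus an integer mask.
model_inputs = {
    "pixel_values": torch.rand(1, 3, 224, 224),               # float32 by default
    "pixel_mask": torch.ones(1, 224, 224, dtype=torch.long),  # must stay integer
}

# Cast only float32 tensors; everything else passes through unchanged.
model_inputs = {k: v.type(torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}

assert model_inputs["pixel_values"].dtype == torch.float16
assert model_inputs["pixel_mask"].dtype == torch.long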
5 changes: 4 additions & 1 deletion src/transformers/pipelines/document_question_answering.py
@@ -294,7 +294,10 @@ def preprocess(
         if input.get("image", None) is not None:
             image = load_image(input["image"], timeout=timeout)
             if self.image_processor is not None:
-                image_features.update(self.image_processor(images=image, return_tensors=self.framework))
+                image_inputs = self.image_processor(images=image, return_tensors=self.framework)
+                if self.framework == 'pt':
+                    image_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_inputs.items()}
+                image_features.update(image_inputs)
             elif self.feature_extractor is not None:
                 image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
             elif self.model_type == ModelType.VisionEncoderDecoder:
4 changes: 4 additions & 0 deletions src/transformers/pipelines/image_classification.py
@@ -23,6 +23,8 @@
     from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES

 if is_torch_available():
+    import torch
+
     from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES

 logger = logging.get_logger(__name__)
@@ -159,6 +161,8 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
     def preprocess(self, image, timeout=None):
         image = load_image(image, timeout=timeout)
         model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+        if self.framework == 'pt':
+            model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
         return model_inputs

     def _forward(self, model_inputs):
6 changes: 5 additions & 1 deletion src/transformers/pipelines/image_feature_extraction.py
@@ -1,12 +1,14 @@
 from typing import Dict

-from ..utils import add_end_docstrings, is_vision_available
+from ..utils import add_end_docstrings, is_vision_available, is_torch_available
 from .base import GenericTensor, Pipeline, build_pipeline_init_args


 if is_vision_available():
     from ..image_utils import load_image

+if is_torch_available():
+    import torch
+
 @add_end_docstrings(
     build_pipeline_init_args(has_image_processor=True),
@@ -60,6 +62,8 @@ def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None,
     def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]:
         image = load_image(image, timeout=timeout)
         model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs)
+        if self.framework == 'pt':
+            model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
         return model_inputs

     def _forward(self, model_inputs):
6 changes: 6 additions & 0 deletions src/transformers/pipelines/image_segmentation.py
@@ -12,6 +12,8 @@
     from ..image_utils import load_image

 if is_torch_available():
+    import torch
+
     from ..models.auto.modeling_auto import (
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES,
         MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
@@ -147,6 +149,8 @@ def preprocess(self, image, subtask=None, timeout=None):
             else:
                 kwargs = {"task_inputs": [subtask]}
             inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs)
+            if self.framework == 'pt':
+                inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()}
             inputs["task_inputs"] = self.tokenizer(
                 inputs["task_inputs"],
                 padding="max_length",
@@ -155,6 +159,8 @@ def preprocess(self, image, subtask=None, timeout=None):
             )["input_ids"]
         else:
             inputs = self.image_processor(images=[image], return_tensors="pt")
+            if self.framework == 'pt':
+                inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()}
         inputs["target_size"] = target_size
         return inputs

4 changes: 4 additions & 0 deletions src/transformers/pipelines/image_to_image.py
@@ -31,6 +31,8 @@
     from ..image_utils import load_image

 if is_torch_available():
+    import torch
+
     from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES

 logger = logging.get_logger(__name__)
@@ -119,6 +121,8 @@ def _forward(self, model_inputs):
     def preprocess(self, image, timeout=None):
         image = load_image(image, timeout=timeout)
         inputs = self.image_processor(images=[image], return_tensors="pt")
+        if self.framework == 'pt':
+            inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()}
         return inputs

     def postprocess(self, model_outputs):
8 changes: 8 additions & 0 deletions src/transformers/pipelines/image_to_text.py
@@ -138,17 +138,23 @@ def preprocess(self, image, prompt=None, timeout=None):

             if model_type == "git":
                 model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+                if self.framework == 'pt':
+                    model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
                 input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                 input_ids = [self.tokenizer.cls_token_id] + input_ids
                 input_ids = torch.tensor(input_ids).unsqueeze(0)
                 model_inputs.update({"input_ids": input_ids})

             elif model_type == "pix2struct":
                 model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
+                if self.framework == 'pt':
+                    model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}

             elif model_type != "vision-encoder-decoder":
                 # vision-encoder-decoder does not support conditional generation
                 model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+                if self.framework == 'pt':
+                    model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
                 text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                 model_inputs.update(text_inputs)

@@ -157,6 +163,8 @@ def preprocess(self, image, prompt=None, timeout=None):

         else:
             model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+            if self.framework == 'pt':
+                model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}

         if self.model.config.model_type == "git" and prompt is None:
             model_inputs["input_ids"] = None
2 changes: 2 additions & 0 deletions src/transformers/pipelines/mask_generation.py
@@ -181,6 +181,8 @@ def preprocess(
             image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
         )
         model_inputs = self.image_processor(images=cropped_images, return_tensors="pt")
+        if self.framework == 'pt':
+            model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}

         with self.device_placement():
             if self.framework == "pt":
2 changes: 2 additions & 0 deletions src/transformers/pipelines/object_detection.py
@@ -107,6 +107,8 @@ def preprocess(self, image, timeout=None):
         image = load_image(image, timeout=timeout)
         target_size = torch.IntTensor([[image.height, image.width]])
         inputs = self.image_processor(images=[image], return_tensors="pt")
+        if self.framework == 'pt':
+            inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()}
         if self.tokenizer is not None:
             inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
         inputs["target_size"] = target_size
4 changes: 4 additions & 0 deletions src/transformers/pipelines/video_classification.py
@@ -19,6 +19,8 @@


 if is_torch_available():
+    import torch
+
     from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES

 logger = logging.get_logger(__name__)
@@ -106,6 +108,8 @@ def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
         video = list(video)

         model_inputs = self.image_processor(video, return_tensors=self.framework)
+        if self.framework == 'pt':
+            model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
         return model_inputs

     def _forward(self, model_inputs):
4 changes: 4 additions & 0 deletions src/transformers/pipelines/visual_question_answering.py
@@ -10,6 +10,8 @@
     from ..image_utils import load_image

 if is_torch_available():
+    import torch
+
     from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
     from .pt_utils import KeyDataset

@@ -155,6 +157,8 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
             truncation=truncation,
         )
         image_features = self.image_processor(images=image, return_tensors=self.framework)
+        if self.framework == 'pt':
+            image_features = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_features.items()}
         model_inputs.update(image_features)
         return model_inputs

2 changes: 2 additions & 0 deletions src/transformers/pipelines/zero_shot_image_classification.py
@@ -120,6 +120,8 @@ def _sanitize_parameters(self, **kwargs):
     def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None):
         image = load_image(image, timeout=timeout)
         inputs = self.image_processor(images=[image], return_tensors=self.framework)
+        if self.framework == 'pt':
+            inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()}
         inputs["candidate_labels"] = candidate_labels
         sequences = [hypothesis_template.format(x) for x in candidate_labels]
         padding = "max_length" if self.model.config.model_type == "siglip" else True
2 changes: 2 additions & 0 deletions src/transformers/pipelines/zero_shot_object_detection.py
@@ -156,6 +156,8 @@ def preprocess(self, inputs, timeout=None):
         for i, candidate_label in enumerate(candidate_labels):
             text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
             image_features = self.image_processor(image, return_tensors=self.framework)
+            if self.framework == 'pt':
+                image_features = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_features.items()}
             yield {
                 "is_last": i == len(candidate_labels) - 1,
                 "target_size": target_size,
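End-to-end, these changes let a pipeline built with torch_dtype=torch.float16 feed half-precision pixel values straight into a half-precision model instead of hitting a dtype mismatch in its first layer. A usage sketch of what that looks like once the casts are in place (the checkpoint and image path are illustrative, not taken from this PR):

import torch
from transformers import pipeline

# Any vision pipeline works the same way; image classification shown here.
classifier = pipeline("image-classification", model="google/vit-base-patch16-224", torch_dtype=torch.float16)

inputs = classifier.preprocess("cat.jpg")  # local image path, illustrative
print(inputs["pixel_values"].dtype)        # torch.float16 with this PR applied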