Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,12 @@ def run(self):
package_data={"": ["**/*.cu", "**/*.cpp", "**/*.cuh", "**/*.h", "**/*.pyx", "py.typed"]},
zip_safe=False,
extras_require=extras,
entry_points={"console_scripts": ["transformers=transformers.commands.transformers_cli:main", "transformers-cli=transformers.commands.transformers_cli:main_cli"]},
entry_points={
"console_scripts": [
"transformers=transformers.commands.transformers_cli:main",
"transformers-cli=transformers.commands.transformers_cli:main_cli",
]
},
python_requires=">=3.9.0",
install_requires=list(install_requires),
classifiers=[
Expand Down
30 changes: 6 additions & 24 deletions src/transformers/models/aria/image_processing_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Iterable, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
Expand Down Expand Up @@ -71,23 +70,6 @@ def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> Li
return patches


def _get_patch_output_size(image, target_resolution, input_data_format):
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution

scale_w = target_width / original_width
scale_h = target_height / original_height

if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)

return new_height, new_width


class AriaImageProcessor(BaseImageProcessor):
"""
A vision processor for the Aria model that handles image preprocessing.
Expand Down Expand Up @@ -375,7 +357,7 @@ def _resize_for_patching(
Returns:
np.array: The resized and padded image.
"""
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

# Resize the image
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
Expand All @@ -389,12 +371,12 @@ def _pad_for_patching(
Pad an image to a target resolution while maintaining aspect ratio.
"""
target_height, target_width = target_resolution
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)

padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))

return padded_image

Expand Down
30 changes: 6 additions & 24 deletions src/transformers/models/aria/modular_aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

from ...activations import ACT2FN
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution
from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
Expand Down Expand Up @@ -461,23 +460,6 @@ def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tens
return out


def _get_patch_output_size(image, target_resolution, input_data_format):
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution

scale_w = target_width / original_width
scale_h = target_height / original_height

if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)

return new_height, new_width


class AriaImageProcessor(BaseImageProcessor):
"""
A vision processor for the Aria model that handles image preprocessing.
Expand Down Expand Up @@ -765,7 +747,7 @@ def _resize_for_patching(
Returns:
np.array: The resized and padded image.
"""
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

# Resize the image
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
Expand All @@ -779,12 +761,12 @@ def _pad_for_patching(
Pad an image to a target resolution while maintaining aspect ratio.
"""
target_height, target_width = target_resolution
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)

padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))

return padded_image

Expand Down
36 changes: 12 additions & 24 deletions src/transformers/models/llava_next/image_processing_llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@
# limitations under the License.
"""Image processor class for LLaVa-NeXT."""

import math
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
from ...image_processing_utils import (
BaseImageProcessor,
BatchFeature,
get_patch_output_size,
get_size_dict,
select_best_resolution,
)
from ...image_transforms import (
PaddingMode,
convert_to_rgb,
Expand Down Expand Up @@ -99,23 +104,6 @@ def expand_to_square(image: np.array, background_color, input_data_format) -> np
return result


def _get_patch_output_size(image, target_resolution, input_data_format):
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution

scale_w = target_width / original_width
scale_h = target_height / original_height

if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)

return new_height, new_width


class LlavaNextImageProcessor(BaseImageProcessor):
r"""
Constructs a LLaVa-NeXT image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
Expand Down Expand Up @@ -429,7 +417,7 @@ def _resize_for_patching(
Returns:
np.array: The resized and padded image.
"""
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

# Resize the image
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
Expand All @@ -443,12 +431,12 @@ def _pad_for_patching(
Pad an image to a target resolution while maintaining aspect ratio.
"""
target_height, target_width = target_resolution
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)

padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))

return padded_image

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]):
A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
of the form `(height, width)`.
do_pad (`bool`, *optional*):
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
""",
)
def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature:
Expand Down Expand Up @@ -164,10 +164,10 @@ def _pad_for_patching(
target_height, target_width = target_resolution
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)

padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x, paste_y])
padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x + r_x, paste_y + r_y])

return padded_image

Expand Down
12 changes: 6 additions & 6 deletions src/transformers/models/llava_next/modeling_llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,14 @@ def unpad_image(tensor, original_size):

if original_aspect_ratio > current_aspect_ratio:
scale_factor = current_width / original_width
new_height = int(round(original_height * scale_factor, 7))
padding = (current_height - new_height) // 2
unpadded_tensor = tensor[:, padding : current_height - padding, :]
new_height = min(math.ceil(original_height * scale_factor), current_height)
padding, r = divmod(current_height - new_height, 2)
unpadded_tensor = tensor[:, padding : current_height - (padding + r), :]
else:
scale_factor = current_height / original_height
new_width = int(round(original_width * scale_factor, 7))
padding = (current_width - new_width) // 2
unpadded_tensor = tensor[:, :, padding : current_width - padding]
new_width = min(math.ceil(original_width * scale_factor), current_width)
padding, r = divmod(current_width - new_width, 2)
unpadded_tensor = tensor[:, :, padding : current_width - (padding + r)]

return unpadded_tensor

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,14 +262,14 @@ def unpad_image(tensor, original_size):

if original_aspect_ratio > current_aspect_ratio:
scale_factor = current_width / original_width
new_height = int(round(original_height * scale_factor, 7))
padding = (current_height - new_height) // 2
unpadded_tensor = tensor[:, padding : current_height - padding, :]
new_height = min(math.ceil(original_height * scale_factor), current_height)
padding, r = divmod(current_height - new_height, 2)
unpadded_tensor = tensor[:, padding : current_height - (padding + r), :]
else:
scale_factor = current_height / original_height
new_width = int(round(original_width * scale_factor, 7))
padding = (current_width - new_width) // 2
unpadded_tensor = tensor[:, :, padding : current_width - padding]
new_width = min(math.ceil(original_width * scale_factor), current_width)
padding, r = divmod(current_width - new_width, 2)
unpadded_tensor = tensor[:, :, padding : current_width - (padding + r)]

return unpadded_tensor

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@
# limitations under the License.
"""Image processor class for LLaVa-Onevision."""

import math
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
from ...image_processing_utils import (
BaseImageProcessor,
BatchFeature,
get_patch_output_size,
get_size_dict,
select_best_resolution,
)
from ...image_transforms import (
PaddingMode,
convert_to_rgb,
Expand Down Expand Up @@ -99,24 +104,6 @@ def expand_to_square(image: np.array, background_color, input_data_format) -> np
return result


# Copied from transformers.models.llava_next.image_processing_llava_next._get_patch_output_size
def _get_patch_output_size(image, target_resolution, input_data_format):
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution

scale_w = target_width / original_width
scale_h = target_height / original_height

if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)

return new_height, new_width


class LlavaOnevisionImageProcessor(BaseImageProcessor):
r"""
Constructs a LLaVa-Onevision image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.
Expand Down Expand Up @@ -151,8 +138,8 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor):
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
"""
Expand Down Expand Up @@ -321,7 +308,7 @@ def _resize_for_patching(
Returns:
np.array: The resized and padded image.
"""
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

# Resize the image
resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
Expand All @@ -336,12 +323,12 @@ def _pad_for_patching(
Pad an image to a target resolution while maintaining aspect ratio.
"""
target_height, target_width = target_resolution
new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)

padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x)))

return padded_image

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]):
A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
of the form `(height, width)`.
do_pad (`bool`, *optional*):
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
number of patches in the batch. Padding will be applied to the bottom and right with zeros.
""",
)
def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature:
Expand Down Expand Up @@ -146,10 +146,10 @@ def _pad_for_patching(
target_height, target_width = target_resolution
new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)

paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
paste_x, r_x = divmod(target_width - new_width, 2)
paste_y, r_y = divmod(target_height - new_height, 2)

padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x, paste_y])
padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x + r_x, paste_y + r_y])

return padded_image

Expand Down
Loading