@@ -22,13 +22,7 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from transformers import (
-    AutoConfig,
-    AutoFeatureExtractor,
-    AutoTokenizer,
-    FlaxVisionEncoderDecoderModel,
-    HfArgumentParser,
-)
+from transformers import AutoConfig, AutoImageProcessor, AutoTokenizer, FlaxVisionEncoderDecoderModel, HfArgumentParser
 
 
 @dataclass
@@ -108,13 +102,13 @@ def main():
     model.config.decoder_start_token_id = decoder_start_token_id
     model.config.pad_token_id = pad_token_id
 
-    feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.encoder_model_name_or_path)
+    image_processor = AutoImageProcessor.from_pretrained(model_args.encoder_model_name_or_path)
 
     tokenizer = AutoTokenizer.from_pretrained(model_args.decoder_model_name_or_path)
     tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id)
 
     model.save_pretrained(model_args.output_dir)
-    feature_extractor.save_pretrained(model_args.output_dir)
+    image_processor.save_pretrained(model_args.output_dir)
     tokenizer.save_pretrained(model_args.output_dir)
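For reference, here is how the renamed classes fit together in this script's workflow. This is a minimal sketch, with `google/vit-base-patch16-224-in21k` and `gpt2` as illustrative encoder/decoder checkpoints; in the actual script these names come from CLI arguments:

```python
from transformers import AutoImageProcessor, AutoTokenizer, FlaxVisionEncoderDecoderModel

# The image processor belongs to the *encoder* checkpoint, the tokenizer to the *decoder*.
model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "gpt2"
)
model.config.decoder_start_token_id = model.config.decoder.bos_token_id
model.config.pad_token_id = model.config.decoder.eos_token_id  # GPT-2 has no pad token

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id)

# Save all three artifacts to one directory so the training script can reload them together.
model.save_pretrained("vit-gpt2")
image_processor.save_pretrained("vit-gpt2")
tokenizer.save_pretrained("vit-gpt2")
```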
examples/flax/image-captioning/run_image_captioning_flax.py (30 additions, 30 deletions)

@@ -47,7 +47,7 @@
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoTokenizer,
     FlaxVisionEncoderDecoderModel,
     HfArgumentParser,
@@ -106,12 +106,12 @@ class TrainingArguments:
         default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
     )
     _block_size_doc = """
-        The default value `0` will preprocess (tokenization + feature extraction) the whole dataset before training and
+        The default value `0` will preprocess (tokenization + image processing) the whole dataset before training and
         cache the results. This uses more disk space, but avoids (repeated) processing time during training. This is a
         good option if your disk space is large enough to store the whole processed dataset.
         If a positive value is given, the captions in the dataset will be tokenized before training and the results are
         cached. During training, it iterates the dataset in chunks of size `block_size`. On each block, images are
-        transformed by the feature extractor with the results being kept in memory (no cache), and batches of size
+        transformed by the image processor with the results being kept in memory (no cache), and batches of size
         `batch_size` are yielded before processing the next block. This could avoid the heavy disk usage when the
         dataset is large.
    """
@@ -477,7 +477,7 @@ def main():
         dtype=getattr(jnp, model_args.dtype),
         use_auth_token=True if model_args.use_auth_token else None,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
+    image_processor = AutoImageProcessor.from_pretrained(
         model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -546,7 +546,7 @@ def filter_fn(examples):
        for image_file in examples[image_column]:
            try:
                image = Image.open(image_file)
-                feature_extractor(images=image, return_tensors="np")
+                image_processor(images=image, return_tensors="np")
                bools.append(True)
            except Exception:
                bools.append(False)
@@ -582,9 +582,9 @@ def tokenization_fn(examples, max_target_length):
 
         return model_inputs
 
-    def feature_extraction_fn(examples, check_image=True):
+    def image_processing_fn(examples, check_image=True):
         """
-        Run feature extraction on images
+        Run preprocessing on images
 
         If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
         Otherwise, an exception will be thrown.
@@ -609,18 +609,18 @@ def feature_extraction_fn(examples, check_image=True):
         else:
             images = [Image.open(image_file) for image_file in examples[image_column]]
 
-        encoder_inputs = feature_extractor(images=images, return_tensors="np")
+        encoder_inputs = image_processor(images=images, return_tensors="np")
         model_inputs["pixel_values"] = encoder_inputs.pixel_values
 
         return model_inputs
 
     def preprocess_fn(examples, max_target_length, check_image=True):
-        """Run tokenization + image feature extraction"""
+        """Run tokenization + image processing"""
 
         model_inputs = {}
         # This contains image path column
         model_inputs.update(tokenization_fn(examples, max_target_length))
-        model_inputs.update(feature_extraction_fn(model_inputs, check_image=check_image))
+        model_inputs.update(image_processing_fn(model_inputs, check_image=check_image))
         # Remove image path column
         model_inputs.pop(image_column)
@@ -644,15 +644,15 @@ def preprocess_fn(examples, max_target_length, check_image=True):
         }
     )
 
-    # If `block_size` is `0`, tokenization & image feature extraction is done at the beginning
-    run_feat_ext_at_beginning = training_args.block_size == 0
+    # If `block_size` is `0`, tokenization & image processing is done at the beginning
+    run_img_proc_at_beginning = training_args.block_size == 0
     # Used in .map() below
-    function_kwarg = preprocess_fn if run_feat_ext_at_beginning else tokenization_fn
+    function_kwarg = preprocess_fn if run_img_proc_at_beginning else tokenization_fn
     # `features` is used only for the final preprocessed dataset (for the performance purpose).
-    features_kwarg = features if run_feat_ext_at_beginning else None
-    # Keep `image_column` if the feature extraction is done during training
-    remove_columns_kwarg = [x for x in column_names if x != image_column or run_feat_ext_at_beginning]
-    processor_names = "tokenizer and feature extractor" if run_feat_ext_at_beginning else "tokenizer"
+    features_kwarg = features if run_img_proc_at_beginning else None
+    # Keep `image_column` if the image processing is done during training
+    remove_columns_kwarg = [x for x in column_names if x != image_column or run_img_proc_at_beginning]
+    processor_names = "tokenizer and image processor" if run_img_proc_at_beginning else "tokenizer"
 
     # Store some constant
     train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
@@ -671,9 +671,9 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
             train_dataset = train_dataset.select(range(max_train_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             train_dataset = train_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
         train_dataset = train_dataset.map(
             function=function_kwarg,
@@ -686,7 +686,7 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             fn_kwargs={"max_target_length": data_args.max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             train_dataset = train_dataset.with_format("numpy")
 
@@ -705,9 +705,9 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
             eval_dataset = eval_dataset.select(range(max_eval_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             eval_dataset = eval_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
         eval_dataset = eval_dataset.map(
             function=function_kwarg,
@@ -720,7 +720,7 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             fn_kwargs={"max_target_length": data_args.val_max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             eval_dataset = eval_dataset.with_format("numpy")
 
@@ -735,9 +735,9 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
             predict_dataset = predict_dataset.select(range(max_predict_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             predict_dataset = predict_dataset.filter(
                 filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers
             )
@@ -752,7 +752,7 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             fn_kwargs={"max_target_length": data_args.val_max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             predict_dataset = predict_dataset.with_format("numpy")
 
@@ -771,8 +771,8 @@ def blockwise_data_loader(
         """
         Wrap the simple `data_loader` in a block-wise way if `block_size` > 0, else it's the same as `data_loader`.
 
-        If `block_size` > 0, it requires `ds` to have a column that gives image paths in order to perform image feature
-        extraction (with the column name being specified by `image_column`). The tokenization should be done before
+        If `block_size` > 0, it requires `ds` to have a column that gives image paths in order to perform image
+        processing (with the column name being specified by `image_column`). The tokenization should be done before
         training in this case.
         """
 
@@ -804,7 +804,7 @@ def blockwise_data_loader(
             _ds = ds.select(selected_indices)
 
             _ds = _ds.map(
-                feature_extraction_fn,
+                image_processing_fn,
                 batched=True,
                 num_proc=data_args.preprocessing_num_workers,
                 remove_columns=[image_column],
@@ -813,7 +813,7 @@ def blockwise_data_loader(
                 keep_in_memory=keep_in_memory,
                 # The images are already checked either in `.filter()` or in `preprocess_fn()`
                 fn_kwargs={"check_image": False},
-                desc=f"Running feature extraction on {split} dataset".replace("  ", " "),
+                desc=f"Running image processing on {split} dataset".replace("  ", " "),
             )
             _ds = _ds.with_format("numpy")
 
examples/pytorch/contrastive-image-text/README.md (7 additions, 7 deletions)

@@ -52,24 +52,24 @@ ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_

 ### Create a model from a vision encoder model and a text decoder model
 Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder).
-The `VisionTextDualEncoderModel` class let's you load any vision and text encoder model to create a dual encoder.
+The `VisionTextDualEncoderModel` class lets you load any vision and text encoder model to create a dual encoder.
 Here is an example of how to load the model using pre-trained vision and text models.
 
 ```python3
 from transformers import (
-    VisionTextDualEncoderModel,
-    VisionTextDualEncoderProcessor,
-    AutoTokenizer,
-    AutoFeatureExtractor
+    VisionTextDualEncoderModel,
+    VisionTextDualEncoderProcessor,
+    AutoTokenizer,
+    AutoImageProcessor
 )
 
 model = VisionTextDualEncoderModel.from_vision_text_pretrained(
     "openai/clip-vit-base-patch32", "roberta-base"
 )
 
 tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-feat_ext = AutoFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
-processor = VisionTextDualEncoderProcessor(feat_ext, tokenizer)
+image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
 
 # save the model and processor
 model.save_pretrained("clip-roberta")
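As a sanity check of the saved artifacts (an assumed follow-up, not part of the README diff), both the model and the combined processor reload from the same directory:

```python
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor

# The combined processor bundles the image processor and the tokenizer.
model = VisionTextDualEncoderModel.from_pretrained("clip-roberta")
processor = VisionTextDualEncoderProcessor.from_pretrained("clip-roberta")
```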
examples/pytorch/contrastive-image-text/run_clip.py (7 additions, 7 deletions)

@@ -38,7 +38,7 @@

 import transformers
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModel,
     AutoTokenizer,
     HfArgumentParser,
@@ -74,7 +74,7 @@ class ModelArguments:
     tokenizer_name: Optional[str] = field(
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     cache_dir: Optional[str] = field(
         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
     )
@@ -308,7 +308,7 @@ def main():
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
 
-    # 5. Load pretrained model, tokenizer, and feature extractor
+    # 5. Load pretrained model, tokenizer, and image processor
     if model_args.tokenizer_name:
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
@@ -323,9 +323,9 @@ def main():
             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
         )
 
-    # Load feature_extractor, in this script we only use this to get the mean and std for normalization.
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    # Load image_processor, in this script we only use this to get the mean and std for normalization.
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -386,7 +386,7 @@ def _freeze_params(module):
     # 7. Preprocessing the datasets.
     # Initialize torchvision transforms and jit it for faster processing.
     image_transformations = Transform(
-        config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
+        config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
     )
     image_transformations = torch.jit.script(image_transformations)
 
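For context, the `Transform` module being jit-scripted here is roughly the following. This is a sketch: the real class lives in `run_clip.py`, and the image size and CLIP mean/std values shown are illustrative defaults, not read from the diff:

```python
import torch
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize

class Transform(torch.nn.Module):
    """Resize, crop, and normalize an image tensor using the image processor's statistics."""

    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size]),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            return self.transforms(x)

# Every step is a TorchScript-compatible nn.Module, so the whole pipeline can be scripted.
image_transformations = torch.jit.script(
    Transform(224, [0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])
)
```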
@@ -38,7 +38,7 @@
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForImageClassification,
     HfArgumentParser,
     Trainer,
@@ -141,7 +141,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -283,19 +283,19 @@ def compute_metrics(p):
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
 
     # Define torchvision transforms to be applied to each image.
-    if "shortest_edge" in feature_extractor.size:
-        size = feature_extractor.size["shortest_edge"]
+    if "shortest_edge" in image_processor.size:
+        size = image_processor.size["shortest_edge"]
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
-    normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        size = (image_processor.size["height"], image_processor.size["width"])
+    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
     _train_transforms = Compose(
         [
             RandomResizedCrop(size),
@@ -352,7 +352,7 @@ def val_transforms(example_batch):
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=collate_fn,
     )
 
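The `shortest_edge` branch in the hunk above reflects the image-processor convention that `size` is a dict whose keys vary by model family. A minimal sketch of the same branch (checkpoint name illustrative):

```python
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

# ViT-style processors expose {"height": ..., "width": ...}; others (e.g. some
# CLIP/ConvNeXT processors) expose {"shortest_edge": ...} instead.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])
print(size)  # -> (224, 224) for this checkpoint
```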