@@ -22,13 +22,7 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from transformers import (
-    AutoConfig,
-    AutoFeatureExtractor,
-    AutoTokenizer,
-    FlaxVisionEncoderDecoderModel,
-    HfArgumentParser,
-)
+from transformers import AutoConfig, AutoImageProcessor, AutoTokenizer, FlaxVisionEncoderDecoderModel, HfArgumentParser
 
 
 @dataclass
@@ -108,13 +102,13 @@ def main():
     model.config.decoder_start_token_id = decoder_start_token_id
     model.config.pad_token_id = pad_token_id
 
-    feature_extractor = AutoFeatureExtractor.from_pretrained(model_args.encoder_model_name_or_path)
+    image_processor = AutoImageProcessor.from_pretrained(model_args.encoder_model_name_or_path)
 
     tokenizer = AutoTokenizer.from_pretrained(model_args.decoder_model_name_or_path)
     tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id)
 
     model.save_pretrained(model_args.output_dir)
-    feature_extractor.save_pretrained(model_args.output_dir)
+    image_processor.save_pretrained(model_args.output_dir)
     tokenizer.save_pretrained(model_args.output_dir)
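For reference, here is how the renamed classes fit together in this script's workflow. This is a minimal sketch, with `google/vit-base-patch16-224-in21k` and `gpt2` as illustrative encoder/decoder checkpoints; in the actual script these names come from CLI arguments:

```python
from transformers import AutoImageProcessor, AutoTokenizer, FlaxVisionEncoderDecoderModel

# The image processor belongs to the *encoder* checkpoint, the tokenizer to the *decoder*.
model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "gpt2"
)
model.config.decoder_start_token_id = model.config.decoder.bos_token_id
model.config.pad_token_id = model.config.decoder.eos_token_id  # GPT-2 has no pad token

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(model.config.pad_token_id)

# Save all three artifacts to one directory so the training script can reload them together.
model.save_pretrained("vit-gpt2")
image_processor.save_pretrained("vit-gpt2")
tokenizer.save_pretrained("vit-gpt2")
```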
examples/flax/image-captioning/run_image_captioning_flax.py (30 additions, 30 deletions)

@@ -47,7 +47,7 @@
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoTokenizer,
     FlaxVisionEncoderDecoderModel,
     HfArgumentParser,
@@ -106,12 +106,12 @@ class TrainingArguments:
         default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
     )
     _block_size_doc = """
-        The default value `0` will preprocess (tokenization + feature extraction) the whole dataset before training and
+        The default value `0` will preprocess (tokenization + image processing) the whole dataset before training and
         cache the results. This uses more disk space, but avoids (repeated) processing time during training. This is a
         good option if your disk space is large enough to store the whole processed dataset.
         If a positive value is given, the captions in the dataset will be tokenized before training and the results are
         cached. During training, it iterates the dataset in chunks of size `block_size`. On each block, images are
-        transformed by the feature extractor with the results being kept in memory (no cache), and batches of size
+        transformed by the image processor with the results being kept in memory (no cache), and batches of size
         `batch_size` are yielded before processing the next block. This could avoid the heavy disk usage when the
         dataset is large.
    """
@@ -477,7 +477,7 @@ def main():
         dtype=getattr(jnp, model_args.dtype),
         use_auth_token=True if model_args.use_auth_token else None,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
+    image_processor = AutoImageProcessor.from_pretrained(
         model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -546,7 +546,7 @@ def filter_fn(examples):
        for image_file in examples[image_column]:
            try:
                image = Image.open(image_file)
-                feature_extractor(images=image, return_tensors="np")
+                image_processor(images=image, return_tensors="np")
                bools.append(True)
            except Exception:
                bools.append(False)
@@ -582,9 +582,9 @@ def tokenization_fn(examples, max_target_length):
 
         return model_inputs
 
-    def feature_extraction_fn(examples, check_image=True):
+    def image_processing_fn(examples, check_image=True):
         """
-        Run feature extraction on images
+        Run preprocessing on images
 
         If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
         Otherwise, an exception will be thrown.
@@ -609,18 +609,18 @@ def feature_extraction_fn(examples, check_image=True):
         else:
             images = [Image.open(image_file) for image_file in examples[image_column]]
 
-        encoder_inputs = feature_extractor(images=images, return_tensors="np")
+        encoder_inputs = image_processor(images=images, return_tensors="np")
         model_inputs["pixel_values"] = encoder_inputs.pixel_values
 
         return model_inputs
 
     def preprocess_fn(examples, max_target_length, check_image=True):
-        """Run tokenization + image feature extraction"""
+        """Run tokenization + image processing"""
 
         model_inputs = {}
         # This contains image path column
         model_inputs.update(tokenization_fn(examples, max_target_length))
-        model_inputs.update(feature_extraction_fn(model_inputs, check_image=check_image))
+        model_inputs.update(image_processing_fn(model_inputs, check_image=check_image))
         # Remove image path column
         model_inputs.pop(image_column)
@@ -644,15 +644,15 @@ def preprocess_fn(examples, max_target_length, check_image=True):
         }
     )
 
-    # If `block_size` is `0`, tokenization & image feature extraction is done at the beginning
-    run_feat_ext_at_beginning = training_args.block_size == 0
+    # If `block_size` is `0`, tokenization & image processing is done at the beginning
+    run_img_proc_at_beginning = training_args.block_size == 0
     # Used in .map() below
-    function_kwarg = preprocess_fn if run_feat_ext_at_beginning else tokenization_fn
+    function_kwarg = preprocess_fn if run_img_proc_at_beginning else tokenization_fn
     # `features` is used only for the final preprocessed dataset (for the performance purpose).
-    features_kwarg = features if run_feat_ext_at_beginning else None
-    # Keep `image_column` if the feature extraction is done during training
-    remove_columns_kwarg = [x for x in column_names if x != image_column or run_feat_ext_at_beginning]
-    processor_names = "tokenizer and feature extractor" if run_feat_ext_at_beginning else "tokenizer"
+    features_kwarg = features if run_img_proc_at_beginning else None
+    # Keep `image_column` if the image processing is done during training
+    remove_columns_kwarg = [x for x in column_names if x != image_column or run_img_proc_at_beginning]
+    processor_names = "tokenizer and image processor" if run_img_proc_at_beginning else "tokenizer"
 
     # Store some constant
     train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
@@ -671,9 +671,9 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             max_train_samples = min(len(train_dataset), data_args.max_train_samples)
             train_dataset = train_dataset.select(range(max_train_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             train_dataset = train_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
         train_dataset = train_dataset.map(
             function=function_kwarg,
@@ -686,7 +686,7 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             fn_kwargs={"max_target_length": data_args.max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             train_dataset = train_dataset.with_format("numpy")
 
@@ -705,9 +705,9 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
             eval_dataset = eval_dataset.select(range(max_eval_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             eval_dataset = eval_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
         eval_dataset = eval_dataset.map(
             function=function_kwarg,
@@ -720,7 +720,7 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             fn_kwargs={"max_target_length": data_args.val_max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             eval_dataset = eval_dataset.with_format("numpy")
 
@@ -735,9 +735,9 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
             predict_dataset = predict_dataset.select(range(max_predict_samples))
         # remove problematic examples
-        # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
+        # (if image processing is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
-        if not run_feat_ext_at_beginning:
+        if not run_img_proc_at_beginning:
             predict_dataset = predict_dataset.filter(
                 filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers
             )
@@ -752,7 +752,7 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             fn_kwargs={"max_target_length": data_args.val_max_target_length},
             features=features_kwarg,
         )
-        if run_feat_ext_at_beginning:
+        if run_img_proc_at_beginning:
             # set format (for performance) since the dataset is ready to be used
             predict_dataset = predict_dataset.with_format("numpy")
 
@@ -771,8 +771,8 @@ def blockwise_data_loader(
         """
         Wrap the simple `data_loader` in a block-wise way if `block_size` > 0, else it's the same as `data_loader`.
 
-        If `block_size` > 0, it requires `ds` to have a column that gives image paths in order to perform image feature
-        extraction (with the column name being specified by `image_column`). The tokenization should be done before
+        If `block_size` > 0, it requires `ds` to have a column that gives image paths in order to perform image
+        processing (with the column name being specified by `image_column`). The tokenization should be done before
         training in this case.
         """
 
@@ -804,7 +804,7 @@ def blockwise_data_loader(
             _ds = ds.select(selected_indices)
 
             _ds = _ds.map(
-                feature_extraction_fn,
+                image_processing_fn,
                 batched=True,
                 num_proc=data_args.preprocessing_num_workers,
                 remove_columns=[image_column],
@@ -813,7 +813,7 @@ def blockwise_data_loader(
                 keep_in_memory=keep_in_memory,
                 # The images are already checked either in `.filter()` or in `preprocess_fn()`
                 fn_kwargs={"check_image": False},
-                desc=f"Running feature extraction on {split} dataset".replace("  ", " "),
+                desc=f"Running image processing on {split} dataset".replace("  ", " "),
             )
             _ds = _ds.with_format("numpy")
 
examples/pytorch/contrastive-image-text/README.md (7 additions, 7 deletions)

@@ -52,24 +52,24 @@ ds = datasets.load_dataset("ydshieh/coco_dataset_script", "2017", data_dir=COCO_

 ### Create a model from a vision encoder model and a text decoder model
 Next, we create a [VisionTextDualEncoderModel](https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder#visiontextdualencoder).
-The `VisionTextDualEncoderModel` class let's you load any vision and text encoder model to create a dual encoder.
+The `VisionTextDualEncoderModel` class lets you load any vision and text encoder model to create a dual encoder.
 Here is an example of how to load the model using pre-trained vision and text models.
 
 ```python3
 from transformers import (
-    VisionTextDualEncoderModel,
-    VisionTextDualEncoderProcessor,
-    AutoTokenizer,
-    AutoFeatureExtractor
+    VisionTextDualEncoderModel,
+    VisionTextDualEncoderProcessor,
+    AutoTokenizer,
+    AutoImageProcessor
 )
 
 model = VisionTextDualEncoderModel.from_vision_text_pretrained(
     "openai/clip-vit-base-patch32", "roberta-base"
 )
 
 tokenizer = AutoTokenizer.from_pretrained("roberta-base")
-feat_ext = AutoFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
-processor = VisionTextDualEncoderProcessor(feat_ext, tokenizer)
+image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
 
 # save the model and processor
 model.save_pretrained("clip-roberta")
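As a sanity check of the saved artifacts (an assumed follow-up, not part of the README diff), both the model and the combined processor reload from the same directory:

```python
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor

# The combined processor bundles the image processor and the tokenizer.
model = VisionTextDualEncoderModel.from_pretrained("clip-roberta")
processor = VisionTextDualEncoderProcessor.from_pretrained("clip-roberta")
```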
examples/pytorch/contrastive-image-text/run_clip.py (7 additions, 7 deletions)

@@ -38,7 +38,7 @@

 import transformers
 from transformers import (
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModel,
     AutoTokenizer,
     HfArgumentParser,
@@ -74,7 +74,7 @@ class ModelArguments:
     tokenizer_name: Optional[str] = field(
         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     cache_dir: Optional[str] = field(
         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
     )
@@ -308,7 +308,7 @@ def main():
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
 
-    # 5. Load pretrained model, tokenizer, and feature extractor
+    # 5. Load pretrained model, tokenizer, and image processor
     if model_args.tokenizer_name:
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
@@ -323,9 +323,9 @@ def main():
             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
         )
 
-    # Load feature_extractor, in this script we only use this to get the mean and std for normalization.
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    # Load image_processor, in this script we only use this to get the mean and std for normalization.
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
@@ -386,7 +386,7 @@ def _freeze_params(module):
     # 7. Preprocessing the datasets.
     # Initialize torchvision transforms and jit it for faster processing.
     image_transformations = Transform(
-        config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
+        config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
     )
     image_transformations = torch.jit.script(image_transformations)
 
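For context, the `Transform` module being jit-scripted here is roughly the following. This is a sketch: the real class lives in `run_clip.py`, and the image size and CLIP mean/std values shown are illustrative defaults, not read from the diff:

```python
import torch
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize

class Transform(torch.nn.Module):
    """Resize, crop, and normalize an image tensor using the image processor's statistics."""

    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size]),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            return self.transforms(x)

# Every step is a TorchScript-compatible nn.Module, so the whole pipeline can be scripted.
image_transformations = torch.jit.script(
    Transform(224, [0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])
)
```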
@@ -38,7 +38,7 @@
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForImageClassification,
     HfArgumentParser,
     Trainer,
@@ -141,7 +141,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -283,19 +283,19 @@ def compute_metrics(p):
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
 
     # Define torchvision transforms to be applied to each image.
-    if "shortest_edge" in feature_extractor.size:
-        size = feature_extractor.size["shortest_edge"]
+    if "shortest_edge" in image_processor.size:
+        size = image_processor.size["shortest_edge"]
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
-    normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        size = (image_processor.size["height"], image_processor.size["width"])
+    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
     _train_transforms = Compose(
         [
             RandomResizedCrop(size),
@@ -352,7 +352,7 @@ def val_transforms(example_batch):
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=collate_fn,
     )
 
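The `shortest_edge` branch in the hunk above reflects the image-processor convention that `size` is a dict whose keys vary by model family. A minimal sketch of the same branch (checkpoint name illustrative):

```python
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

# ViT-style processors expose {"height": ..., "width": ...}; others (e.g. some
# CLIP/ConvNeXT processors) expose {"shortest_edge": ...} instead.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])
print(size)  # -> (224, 224) for this checkpoint
```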