23 changes: 16 additions & 7 deletions src/transformers/dynamic_module_utils.py
@@ -395,8 +395,8 @@ def custom_object_save(obj, folder, config=None):
"this code in a separate module so we can include it in the saved folder and make it easier to share via "
"the Hub."
)
# Add object class to the config auto_map
if config is not None:

def _set_auto_map_in_config(_config):

Collaborator Author:
We need to apply that code to one or several configs, so putting it in an internal function.

module_name = obj.__class__.__module__
last_module = module_name.split(".")[-1]
full_name = f"{last_module}.{obj.__class__.__name__}"
@@ -418,12 +418,21 @@ def custom_object_save(obj, folder, config=None):

full_name = (slow_tokenizer_class, fast_tokenizer_class)

if isinstance(config, dict):
config["auto_map"] = full_name
elif getattr(config, "auto_map", None) is not None:
config.auto_map[obj._auto_class] = full_name
if isinstance(_config, dict):
auto_map = _config.get("auto_map", {})
auto_map[obj._auto_class] = full_name
_config["auto_map"] = auto_map
elif getattr(_config, "auto_map", None) is not None:
_config.auto_map[obj._auto_class] = full_name
else:
config.auto_map = {obj._auto_class: full_name}
_config.auto_map = {obj._auto_class: full_name}

# Add object class to the config auto_map
if isinstance(config, (list, tuple)):
for cfg in config:
_set_auto_map_in_config(cfg)
elif config is not None:
_set_auto_map_in_config(config)

# Copy module file to the output folder.
object_file = sys.modules[obj.__module__].__file__
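The net effect of this hunk is that `custom_object_save` can now update the `auto_map` of one or several configs at once, and dict-style configs (such as tokenizer `init_kwargs`) are handled alongside `PretrainedConfig`-like objects. A minimal standalone sketch of that dispatch logic, using a made-up `DummyConfig` stand-in (illustrative only, not the transformers implementation):

```python
# Illustrative sketch: a config may be a plain dict or an object carrying an
# `auto_map` attribute, and several of them may need the same entry.
class DummyConfig:
    """Stand-in for a PretrainedConfig-like object."""


def set_auto_map(config, auto_class, full_name):
    if isinstance(config, dict):
        # Dict-style configs (e.g. tokenizer init_kwargs) keep auto_map as a key.
        auto_map = config.get("auto_map", {})
        auto_map[auto_class] = full_name
        config["auto_map"] = auto_map
    elif getattr(config, "auto_map", None) is not None:
        # Object-style configs that already have a map get a new entry.
        config.auto_map[auto_class] = full_name
    else:
        # Otherwise the map is created from scratch.
        config.auto_map = {auto_class: full_name}


configs = [{"name_or_path": "my-model"}, DummyConfig()]
for cfg in configs:
    set_auto_map(cfg, "AutoProcessor", "custom_processing.MyProcessor")
```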
101 changes: 70 additions & 31 deletions src/transformers/models/auto/processing_auto.py
@@ -20,19 +20,22 @@

# Build the list of all feature extractors
from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module
from ...feature_extraction_utils import FeatureExtractionMixin
from ...file_utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo
from ...tokenization_utils import TOKENIZER_CONFIG_FILE
from ...utils import logging
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
CONFIG_MAPPING_NAMES,
AutoConfig,
config_class_to_model_type,
model_type_to_module_name,
replace_list_option_in_docstrings,
)


logger = logging.get_logger(__name__)

PROCESSOR_MAPPING_NAMES = OrderedDict(
[
("clip", "CLIPProcessor"),
@@ -120,6 +123,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
`kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
should only be set to `True` for repositories you trust and in which you have read the code, as it will
execute code present on the Hub on your local machine.
kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
@@ -143,10 +150,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
>>> processor = AutoProcessor.from_pretrained("./test/saved_model/")
```"""
config = kwargs.pop("config", None)
trust_remote_code = kwargs.pop("trust_remote_code", False)
kwargs["_from_auto"] = True

processor_class = None
processor_auto_map = None

# First, let's see if we have a preprocessor config.
# Filter the kwargs for `get_file_from_repo``.
# Filter the kwargs for `get_file_from_repo`.
get_file_from_repo_kwargs = {
key: kwargs[key] for key in inspect.signature(get_file_from_repo).parameters.keys() if key in kwargs
}
@@ -156,35 +167,63 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
)
if preprocessor_config_file is not None:
config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
if "processor_class" in config_dict:
processor_class = processor_class_from_name(config_dict["processor_class"])
return processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)

# Next, let's check whether the processor class is saved in a tokenizer
# Let's start by checking whether the processor class is saved in a feature extractor
tokenizer_config_file = get_file_from_repo(
pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, **get_file_from_repo_kwargs
)
if tokenizer_config_file is not None:
with open(tokenizer_config_file, encoding="utf-8") as reader:
config_dict = json.load(reader)

if "processor_class" in config_dict:
processor_class = processor_class_from_name(config_dict["processor_class"])
return processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)

# Otherwise, load config, if it can be loaded.
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

model_type = config_class_to_model_type(type(config).__name__)

if getattr(config, "processor_class", None) is not None:
processor_class = processor_class_from_name(config.processor_class)
return processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)

model_type = config_class_to_model_type(type(config).__name__)
if model_type is not None:
processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]

if processor_class is None:
# Next, let's check whether the processor class is saved in a tokenizer
tokenizer_config_file = get_file_from_repo(
pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, **get_file_from_repo_kwargs
)
if tokenizer_config_file is not None:
with open(tokenizer_config_file, encoding="utf-8") as reader:
config_dict = json.load(reader)

processor_class = config_dict.get("processor_class", None)
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]

if processor_class is None:
# Otherwise, load config, if it can be loaded.
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)

# And check if the config contains the processor class.
processor_class = getattr(config, "processor_class", None)
if hasattr(config, "auto_map") and "AutoProcessor" in config.auto_map:
processor_auto_map = config.auto_map["AutoProcessor"]

if processor_class is not None:
# If we have custom code for the processor, we get the proper class.
if processor_auto_map is not None:
if not trust_remote_code:
raise ValueError(
f"Loading {pretrained_model_name_or_path} requires you to execute the feature extractor file "
"in that repo on your local machine. Make sure you have read the code there to avoid "
"malicious use, then set the option `trust_remote_code=True` to remove this error."
)
if kwargs.get("revision", None) is None:
logger.warning(
"Explicitly passing a `revision` is encouraged when loading a feature extractor with custom "
"code to ensure no malicious code has been contributed in a newer revision."
)

module_file, class_name = processor_auto_map.split(".")
processor_class = get_class_from_dynamic_module(
pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
)
else:
processor_class = processor_class_from_name(processor_class)

return processor_class.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)

# Last try: we use the PROCESSOR_MAPPING.
if type(config) in PROCESSOR_MAPPING:
return PROCESSOR_MAPPING[type(config)].from_pretrained(pretrained_model_name_or_path, **kwargs)

raise ValueError(
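With the `trust_remote_code` plumbing above, a processor whose class lives in the repo's own module is only loaded when the caller opts in, and pinning a `revision` is encouraged. A hedged usage sketch; the repository name below is hypothetical and would need a custom processing module referenced in its `auto_map`:

```python
from transformers import AutoProcessor

# Hypothetical Hub repository used purely for illustration.
repo = "my-username/my-custom-processor"

# Without trust_remote_code=True this call would raise the ValueError added above.
processor = AutoProcessor.from_pretrained(
    repo,
    trust_remote_code=True,
    revision="main",  # pinning a revision is encouraged when running custom code
)
```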
8 changes: 7 additions & 1 deletion src/transformers/models/auto/tokenization_auto.py
@@ -469,7 +469,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
# Next, let's try to use the tokenizer_config file to get the tokenizer class.
tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
config_tokenizer_class = tokenizer_config.get("tokenizer_class")
tokenizer_auto_map = tokenizer_config.get("auto_map")
tokenizer_auto_map = None
if "auto_map" in tokenizer_config:
if isinstance(tokenizer_config["auto_map"], (tuple, list)):
# Legacy format for dynamic tokenizers
tokenizer_auto_map = tokenizer_config["auto_map"]
else:
tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
Comment on lines +472 to +478
Member:
Nice, thanks for going the extra mile to be backwards-compatible.


# If that did not work, let's try to use the config.
if config_tokenizer_class is None:
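The backwards-compatible branch accepts both the legacy `auto_map` layout (a bare slow/fast pair) and the new layout keyed by auto class. A small sketch of the two `tokenizer_config.json` shapes it handles; the class references are made up for illustration:

```python
# Legacy format: the (slow, fast) pair is stored directly under "auto_map".
legacy_config = {
    "tokenizer_class": "MyTokenizer",
    "auto_map": ["custom_tokenization.MyTokenizer", "custom_tokenization_fast.MyTokenizerFast"],
}

# New format: a dict keyed by the auto class name.
new_config = {
    "tokenizer_class": "MyTokenizer",
    "auto_map": {"AutoTokenizer": ["custom_tokenization.MyTokenizer", None]},
}


def get_tokenizer_auto_map(tokenizer_config):
    """Mirror of the dispatch above: return the mapping for AutoTokenizer, if any."""
    auto_map = tokenizer_config.get("auto_map")
    if auto_map is None:
        return None
    if isinstance(auto_map, (tuple, list)):  # legacy format for dynamic tokenizers
        return auto_map
    return auto_map.get("AutoTokenizer", None)


assert get_tokenizer_auto_map(legacy_config) == legacy_config["auto_map"]
assert get_tokenizer_auto_map(new_config) == ["custom_tokenization.MyTokenizer", None]
```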
48 changes: 47 additions & 1 deletion src/transformers/processing_utils.py
@@ -17,10 +17,14 @@
"""

import importlib.util
import os
from pathlib import Path

from .dynamic_module_utils import custom_object_save
from .tokenization_utils_base import PreTrainedTokenizerBase

# Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
spec = importlib.util.spec_from_file_location(
"transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent]
)
@@ -42,6 +46,7 @@ class ProcessorMixin:
# Names need to be attr_class for attr in attributes
feature_extractor_class = None
tokenizer_class = None
_auto_class = None

# args have to match the attributes class attribute
def __init__(self, *args, **kwargs):
@@ -101,6 +106,14 @@ def save_pretrained(self, save_directory):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
os.makedirs(save_directory, exist_ok=True)
# If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
# loaded from the Hub.
if self._auto_class is not None:
attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs]
custom_object_save(self, save_directory, config=configs)

for attribute_name in self.attributes:
attribute = getattr(self, attribute_name)
# Include the processor class in the attribute config so this processor can then be reloaded with the
Expand All @@ -109,6 +122,13 @@ def save_pretrained(self, save_directory):
attribute._set_processor_class(self.__class__.__name__)
attribute.save_pretrained(save_directory)

if self._auto_class is not None:
# We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
for attribute_name in self.attributes:
attribute = getattr(self, attribute_name)
if isinstance(attribute, PreTrainedTokenizerBase):
del attribute.init_kwargs["auto_map"]

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
@@ -142,6 +162,32 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(*args)

@classmethod
def register_for_auto_class(cls, auto_class="AutoProcessor"):
"""
Register this class with a given auto class. This should only be used for custom processors as the ones
in the library are already mapped with `AutoProcessor`.

<Tip warning={true}>

This API is experimental and may have some slight breaking changes in the next releases.

</Tip>

Args:
auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
The auto class to register this new processor with.
"""
if not isinstance(auto_class, str):
auto_class = auto_class.__name__

import transformers.models.auto as auto_module

if not hasattr(auto_module, auto_class):
raise ValueError(f"{auto_class} is not a valid auto class.")

cls._auto_class = auto_class

@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
args = []
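Taken together, `register_for_auto_class` and the `save_pretrained` changes let a custom processor ship its defining module next to the saved configs and advertise itself through `auto_map`. A rough usage sketch, assuming the class lives in its own module (say `custom_processing.py`) rather than `__main__`; the class name, attribute layout, and checkpoint are illustrative, not part of this PR:

```python
# custom_processing.py -- hypothetical custom processor for illustration.
from transformers import AutoTokenizer
from transformers.processing_utils import ProcessorMixin


class MyProcessor(ProcessorMixin):
    attributes = ["tokenizer"]
    tokenizer_class = "BertTokenizerFast"

    def __call__(self, text, **kwargs):
        return self.tokenizer(text, **kwargs)


# Register so that custom_object_save() records an "AutoProcessor" entry in
# auto_map when the processor is saved.
MyProcessor.register_for_auto_class("AutoProcessor")

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    processor = MyProcessor(tokenizer=tokenizer)
    # Copies custom_processing.py into the folder and writes auto_map into the
    # saved tokenizer config, so the processor can later be reloaded with
    # AutoProcessor.from_pretrained(..., trust_remote_code=True).
    processor.save_pretrained("./my-custom-processor")
```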
4 changes: 1 addition & 3 deletions tests/test_feature_extraction_common.py
@@ -40,8 +40,6 @@
if is_vision_available():
from PIL import Image

SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures")

Collaborator Author:

Duplicate (see below), that's why I'm removing it.



SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures")

@@ -119,7 +117,7 @@ def test_init_without_params(self):


@is_staging_test
class ConfigPushToHubTester(unittest.TestCase):
class FeatureExtractorPushToHubTester(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls._token = login(username=USER, password=PASS)