From 3239683196c511cc480e5ef50d2ca1fc46b8a0f4 Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Fri, 8 Dec 2023 12:37:41 +0800
Subject: [PATCH 01/27] try fix

---
 paddlenlp/generation/configuration_utils.py   |  34 ++-
 paddlenlp/transformers/__init__.py            |   1 +
 paddlenlp/transformers/aistudio_utils.py      |  27 ++-
 paddlenlp/transformers/auto/configuration.py  |  34 +--
 .../transformers/auto/image_processing.py     | 225 ++++++++++++++++++
 paddlenlp/transformers/auto/modeling.py       |  42 ++--
 paddlenlp/transformers/auto/processing.py     |  82 +++++--
 paddlenlp/transformers/auto/tokenizer.py      |  17 +-
 paddlenlp/transformers/configuration_utils.py |  43 +++-
 .../transformers/feature_extraction_utils.py  |  43 ++--
 .../transformers/image_processing_utils.py    |  41 ++--
 paddlenlp/transformers/model_utils.py         |  23 +-
 .../speecht5/feature_extraction.py            |   2 +
 paddlenlp/transformers/tokenizer_utils.py     |  20 +-
 .../transformers/tokenizer_utils_base.py      |  32 ++-
 paddlenlp/transformers/utils.py               |  47 ++--
 16 files changed, 555 insertions(+), 158 deletions(-)
 create mode 100644 paddlenlp/transformers/auto/image_processing.py

diff --git a/paddlenlp/generation/configuration_utils.py b/paddlenlp/generation/configuration_utils.py
index 8f92f2949159..cdf8e4216f10 100644
--- a/paddlenlp/generation/configuration_utils.py
+++ b/paddlenlp/generation/configuration_utils.py
@@ -27,6 +27,7 @@
 from paddlenlp.transformers.utils import resolve_cache_dir
 from paddlenlp.utils.log import logger
 
+from ..transformers.aistudio_utils import aistudio_download
 from ..utils import GENERATION_CONFIG_NAME
 from ..utils.downloader import (
     COMMUNITY_MODEL_PREFIX,
@@ -336,6 +337,7 @@ def from_pretrained(
         cls,
         pretrained_model_name_or_path: Union[str, os.PathLike],
         from_hf_hub: bool = False,
+        from_aistudio: bool = False,
         config_file_name: Optional[Union[str, os.PathLike]] = None,
         cache_dir: Optional[Union[str, os.PathLike]] = None,
         force_download: bool = False,
@@ -404,12 +406,11 @@ def from_pretrained(
         ```"""
         config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME
 
-        subfolder = kwargs.pop("subfolder", None)
+        subfolder = kwargs.pop("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
 
-        config_path = os.path.join(pretrained_model_name_or_path, config_file_name)
-        config_path = str(config_path)
-
-        cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
 
         # 1. get the configuration file from local file, eg: /cache/path/model_config.json
         if os.path.isfile(pretrained_model_name_or_path):
@@ -418,24 +419,37 @@ def from_pretrained(
         # 2. get the configuration file from url, eg: https://ip/path/to/model_config.json
         elif is_url(pretrained_model_name_or_path):
             resolved_config_file = get_path_from_url_with_filelock(
-                pretrained_model_name_or_path, cache_dir, check_exist=not force_download
+                pretrained_model_name_or_path,
+                cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder),
+                check_exist=not force_download,
             )
         # 3. get the configuration file from local dir with default name, eg: /local/path
         elif os.path.isdir(pretrained_model_name_or_path):
-            configuration_file = os.path.join(pretrained_model_name_or_path, GENERATION_CONFIG_NAME)
+            configuration_file = os.path.join(pretrained_model_name_or_path, subfolder, config_file_name)
             if os.path.exists(configuration_file):
                 resolved_config_file = configuration_file
             else:
                 # try to detect old-school config file
                 raise FileNotFoundError("please make sure there is `generation_config.json` under the dir")
-
-        # 4. get the configuration file from HF hub
+        # 4. get the configuration file from aistudio
+        elif from_aistudio:
+            resolved_config_file = aistudio_download(
+                repo_id=pretrained_model_name_or_path,
+                filename=config_file_name,
+                cache_dir=cache_dir,
+                subfolder=subfolder,
+            )
+        # 5. get the configuration file from HF hub
         elif from_hf_hub:
             resolved_config_file = resolve_hf_generation_config_path(
                 repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder
             )
         else:
-            community_url = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, GENERATION_CONFIG_NAME])
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, config_file_name]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+            community_url = "/".join(url_list)
             if url_file_exists(community_url):
                 resolved_config_file = get_path_from_url_with_filelock(
                     community_url, cache_dir, check_exist=not force_download
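A worked example of what the community branch above now computes, using a hypothetical community model "org/demo" with subfolder="fp16" (COMMUNITY_MODEL_PREFIX is the BOS community base URL; MODEL_HOME is the default local model cache):

    import os
    from paddlenlp.utils.downloader import COMMUNITY_MODEL_PREFIX

    subfolder = "fp16"
    url_list = [COMMUNITY_MODEL_PREFIX, "org/demo", "generation_config.json"]
    if subfolder != "":
        url_list.insert(2, subfolder)
    community_url = "/".join(url_list)
    # -> "<COMMUNITY_MODEL_PREFIX>/org/demo/fp16/generation_config.json"

    # and the downloaded file is cached under <cache_dir>/org/demo/fp16/
    cache_dir = os.path.join("<MODEL_HOME>", "org/demo", subfolder)

The same url_list/cache_dir pattern recurs in every loader touched by this patch.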
diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
index 1546cb16d3dd..0c7baf9595b6 100644
--- a/paddlenlp/transformers/__init__.py
+++ b/paddlenlp/transformers/__init__.py
@@ -209,6 +209,7 @@
 from .auto.modeling import *
 from .auto.tokenizer import *
 from .auto.processing import *
+from .auto.image_processing import *
 from .auto.configuration import *
 from .codegen.modeling import *
 from .codegen.tokenizer import *
diff --git a/paddlenlp/transformers/aistudio_utils.py b/paddlenlp/transformers/aistudio_utils.py
index 5aed6e05dd17..6c1756ed3362 100644
--- a/paddlenlp/transformers/aistudio_utils.py
+++ b/paddlenlp/transformers/aistudio_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 from aistudio_sdk.hub import download
 
 
@@ -23,11 +25,32 @@ class EntryNotFoundError(Exception):
     pass
 
 
-def aistudio_download(repo_id: str, filename: str):
-    # TODO: add arguments such as cache_dir, revision, etc.
+def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -> str:
+    if subfolder is not None and subfolder != "":
+        weights_name = "/".join([subfolder, weights_name])
+    return weights_name
+
+
+def aistudio_download(
+    repo_id: str,
+    filename: str = None,
+    cache_dir: Optional[str] = None,
+    subfolder: Optional[str] = "",
+    revision: Optional[str] = None,
+    **kwargs,
+):
+    if revision is None:
+        revision = "master"
+    filename = _add_subfolder(filename, subfolder)
+    download_kwargs = {}
+    if revision is not None:
+        download_kwargs["revision"] = revision
+    if cache_dir is not None:
+        download_kwargs["cache_dir"] = cache_dir
     res = download(
         repo_id=repo_id,
         filename=filename,
+        **download_kwargs,
     )
     if "path" in res:
         return res["path"]
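The rewritten wrapper folds the subfolder into the filename before delegating to aistudio_sdk.hub.download, and only forwards revision/cache_dir when they are set. A minimal sketch of the intended call (the repo id and subfolder here are hypothetical):

    from paddlenlp.transformers.aistudio_utils import aistudio_download

    # fetches "fp16/config.json" from the repo at the default "master"
    # revision, caching under ./aistudio_cache
    path = aistudio_download(
        repo_id="org/demo",
        filename="config.json",
        subfolder="fp16",
        cache_dir="./aistudio_cache",
    )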
diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py
index d75e7842c445..182a82514558 100644
--- a/paddlenlp/transformers/auto/configuration.py
+++ b/paddlenlp/transformers/auto/configuration.py
@@ -159,12 +159,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
             config = AutoConfig.from_pretrained("bert-base-uncased")
             config.save_pretrained('./bert-base-uncased')
         """
-        subfolder = kwargs.get("subfolder", None)
-        from_aistudio = kwargs.get("from_aistudio", False)
-        from_hf_hub = kwargs.get("from_hf_hub", False)
-        cache_dir = resolve_cache_dir(
-            pretrained_model_name_or_path, from_hf_hub=from_hf_hub, cache_dir=kwargs.pop("cache_dir", None)
-        )
+        subfolder = kwargs.pop("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
+        from_aistudio = kwargs.pop("from_aistudio", False)
+        from_hf_hub = kwargs.pop("from_hf_hub", False)
+        cache_dir = kwargs.pop("cache_dir", None)
+        cache_dir = resolve_cache_dir(from_hf_hub=from_hf_hub, from_aistudio=from_aistudio, cache_dir=cache_dir)
 
         if not cls.name2class:
             cls.name2class = {}
@@ -182,10 +183,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
 
         # From local dir path
         elif os.path.isdir(pretrained_model_name_or_path):
-            config_file = os.path.join(pretrained_model_name_or_path, cls.config_file)
+            config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_file)
             if not os.path.exists(config_file):
                 # try to load legacy config file
-                legacy_config_file = os.path.join(pretrained_model_name_or_path, cls.legacy_config_file)
+                legacy_config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.legacy_config_file)
                 if not os.path.exists(legacy_config_file):
                     raise ValueError(
                         f"config file<{cls.config_file}> or legacy config file<{cls.legacy_config_file}> not found"
@@ -203,6 +204,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
                 file = aistudio_download(
                     repo_id=pretrained_model_name_or_path,
                     filename=cls.config_file,
+                    subfolder=subfolder,
+                    cache_dir=cache_dir,
                 )
                 return cls.from_pretrained(os.path.dirname(file))
             elif from_hf_hub:
@@ -219,15 +222,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
 
         # Assuming from community-contributed pretrained models
         else:
-            # support subfolder
-            if subfolder is not None:
-                pretrained_model_name_or_path = os.path.join(pretrained_model_name_or_path, subfolder)
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.config_file]
+            legacy_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_config_file]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+                legacy_url_list.insert(2, subfolder)
+            community_config_path = "/".join(url_list)
+            legacy_community_config_path = "/".join(legacy_url_list)
 
-            community_config_path = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.config_file])
             if not url_file_exists(community_config_path):
-                legacy_community_config_path = "/".join(
-                    [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_config_file]
-                )
                 if not url_file_exists(legacy_community_config_path):
                     raise RuntimeError(
                         f"Can't load Config for '{pretrained_model_name_or_path}'.\n"
diff --git a/paddlenlp/transformers/auto/image_processing.py b/paddlenlp/transformers/auto/image_processing.py
new file mode 100644
index 000000000000..c22d2ff16a50
--- /dev/null
+++ b/paddlenlp/transformers/auto/image_processing.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import io
+import json
+import os
+from collections import OrderedDict
+
+from huggingface_hub import hf_hub_download
+
+from ... import __version__
+from ...utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock
+from ...utils.import_utils import import_module
+from ...utils.log import logger
+from ..aistudio_utils import aistudio_download
+from ..utils import resolve_cache_dir
+
+__all__ = [
+    "AutoImageProcessor",
+]
+
+IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
+    [
+        ("ChineseCLIPImageProcessor", "chineseclip"),
+        ("CLIPImageProcessor", "clip"),
+        ("ErnieViLImageProcessor", "ernie_vil"),
+        ("ViTImageProcessor", "clipseg"),
+    ]
+)
+
+
+def get_configurations():
+    MAPPING_NAMES = OrderedDict()
+    for key, class_name in IMAGE_PROCESSOR_MAPPING_NAMES.items():
+        import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.image_processing")
+        processor_name = getattr(import_class, key)
+        name = tuple(processor_name.pretrained_init_configuration.keys())
+        if MAPPING_NAMES.get(name, None) is None:
+            MAPPING_NAMES[name] = []
+        MAPPING_NAMES[name].append(processor_name)
+    return MAPPING_NAMES
+
+
+class AutoImageProcessor:
+    """
+    AutoClass can help you automatically retrieve the relevant model given the provided
+    pretrained weights/vocabulary.
+    AutoImageProcessor is a generic processor class that will be instantiated as one of the
+    base processor classes when created with the AutoImageProcessor.from_pretrained() classmethod.
+    """
+
+    MAPPING_NAMES = get_configurations()
+    _processor_mapping = MAPPING_NAMES
+    _name_mapping = IMAGE_PROCESSOR_MAPPING_NAMES
+    image_processor_config_file = "preprocessor_config.json"
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`"
+        )
+
+    @classmethod
+    def _get_image_processor_class_from_config(cls, pretrained_model_name_or_path, config_file_path):
+        with io.open(config_file_path, encoding="utf-8") as f:
+            init_kwargs = json.load(f)
+        # class name corresponds to this configuration
+        init_class = init_kwargs.pop("init_class", None)
+        if init_class is None:
+            init_class = init_kwargs.pop("image_processor_type", None)
+
+        if init_class:
+            # replace old name with new name
+            init_class = init_class.replace("FeatureExtractor", "ImageProcessor")
+            try:
+                class_name = cls._name_mapping[init_class]
+                import_class = import_module(f"paddlenlp.transformers.{class_name}.image_processing")
+                processor_class = getattr(import_class, init_class)
+                return processor_class
+            except Exception:
+                init_class = None
+
+        # If no `init_class`, we use pattern recognition to recognize the processor class.
+        if init_class is None:
+            logger.info("We use pattern recognition to recognize the processor class.")
+            for key, pattern in cls._name_mapping.items():
+                if pattern in pretrained_model_name_or_path.lower():
+                    init_class = key
+                    class_name = cls._name_mapping[init_class]
+                    import_class = import_module(f"paddlenlp.transformers.{class_name}.image_processing")
+                    processor_class = getattr(import_class, init_class)
+                    break
+            return processor_class
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        """
+        Creates an instance of `AutoImageProcessor`. Related resources are loaded by
+        specifying name of a built-in pretrained model, or a community-contributed
+        pretrained model, or a local file directory path.
+
+        Args:
+            pretrained_model_name_or_path (str): Name of pretrained model or dir path
+                to load from. The string can be:
+
+                - Name of a built-in pretrained model
+                - Name of a community-contributed pretrained model.
+                - Local directory path which contains processor related resources
+                  and the image processor config file ("preprocessor_config.json").
+            *args (tuple): positional arguments for model `__init__`. If provided,
+                use these as positional argument values for processor initialization.
+            **kwargs (dict): keyword arguments for model `__init__`. If provided,
+                use these to update pre-defined keyword argument values for processor
+                initialization.
+
+        Returns:
+            An instance of the image processor class matched from the configuration.
+
+        Example:
+            .. code-block::
+
+                from paddlenlp.transformers import AutoImageProcessor
+                processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+                processor.save_pretrained('clip_processor')
+        """
+        cache_dir = kwargs.get("cache_dir", None)
+        subfolder = kwargs.get("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
+        from_aistudio = kwargs.get("from_aistudio", False)
+        from_hf_hub = kwargs.get("from_hf_hub", False)
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
+        kwargs["subfolder"] = subfolder
+        kwargs["cache_dir"] = cache_dir
+
+        all_processor_names = []
+        for names, processor_class in cls._processor_mapping.items():
+            for name in names:
+                all_processor_names.append(name)
+
+        # From AI Studio or HF Hub
+        if from_aistudio or from_hf_hub:
+            if from_aistudio:
+                config_file = aistudio_download(
+                    repo_id=pretrained_model_name_or_path,
+                    filename=cls.image_processor_config_file,
+                    cache_dir=cache_dir,
+                    subfolder=subfolder,
+                )
+            else:
+                config_file = hf_hub_download(
+                    repo_id=pretrained_model_name_or_path,
+                    filename=cls.image_processor_config_file,
+                    subfolder=subfolder,
+                    cache_dir=cache_dir,
+                    library_name="PaddleNLP",
+                    library_version=__version__,
+                )
+            if os.path.exists(config_file):
+                processor_class = cls._get_image_processor_class_from_config(
+                    pretrained_model_name_or_path,
+                    config_file,
+                )
+                logger.info(f"We are using {processor_class} to load '{pretrained_model_name_or_path}'.")
+                return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        # From built-in pretrained models
+        elif pretrained_model_name_or_path in all_processor_names:
+            for names, processor_classes in cls._processor_mapping.items():
+                for pattern in names:
+                    if pattern == pretrained_model_name_or_path:
+                        actual_processor_class = processor_classes[0]
+                        logger.info(
+                            "We are using %s to load '%s'." % (actual_processor_class, pretrained_model_name_or_path)
+                        )
+                        return actual_processor_class.from_pretrained(
+                            pretrained_model_name_or_path, *model_args, **kwargs
+                        )
+        # From local dir path
+        elif os.path.isdir(pretrained_model_name_or_path):
+            config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.image_processor_config_file)
+            if os.path.exists(config_file):
+                processor_class = cls._get_image_processor_class_from_config(
+                    pretrained_model_name_or_path, config_file
+                )
+                logger.info("We are using %s to load '%s'." % (processor_class, pretrained_model_name_or_path))
+                return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        # Assuming from community-contributed pretrained models
+        else:
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.image_processor_config_file]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+            community_config_path = "/".join(url_list)
+
+            try:
+                resolved_vocab_file = get_path_from_url_with_filelock(community_config_path, cache_dir)
+            except RuntimeError as err:
+                logger.error(err)
+                raise RuntimeError(
+                    f"Can't load processor for '{pretrained_model_name_or_path}'.\n"
+                    f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
+                    "- a correct model-identifier of built-in pretrained models,\n"
+                    "- or a correct model-identifier of community-contributed pretrained models,\n"
+                    "- or the correct path to a directory containing relevant processor files.\n"
+                )
+
+            if os.path.exists(resolved_vocab_file):
+                processor_class = cls._get_image_processor_class_from_config(
+                    pretrained_model_name_or_path, resolved_vocab_file
+                )
+                logger.info("We are using %s to load '%s'." % (processor_class, pretrained_model_name_or_path))
+                return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
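Resolution order in the new class mirrors AutoProcessor: an explicit hub source first (AI Studio or HF Hub), then built-in names, then a local directory, and finally the community server. A short sketch of the intended entry points (the AI Studio repo name and subfolder are hypothetical):

    from paddlenlp.transformers import AutoImageProcessor

    # resolved through preprocessor_config.json on the community server
    processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # hypothetical AI Studio repo keeping processor files under a subfolder
    processor = AutoImageProcessor.from_pretrained(
        "org/demo", from_aistudio=True, subfolder="image_processor"
    )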
diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py
index dc23a9aee29d..69d64653458f 100644
--- a/paddlenlp/transformers/auto/modeling.py
+++ b/paddlenlp/transformers/auto/modeling.py
@@ -278,15 +278,23 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args,
         cache_dir = kwargs.get("cache_dir", None)
         from_aistudio = kwargs.get("from_aistudio", False)
         from_hf_hub = kwargs.get("from_hf_hub", False)
-        subfolder = kwargs.get("subfolder", None)
-        cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
-
+        subfolder = kwargs.get("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
+        kwargs["cache_dir"] = cache_dir
+        kwargs["subfolder"] = subfolder
         all_model_names = []
         for pretrained_model_names, model_name in cls._pretrained_model_dict.items():
             for name in pretrained_model_names:
                 all_model_names.append(name)
         if from_aistudio:
-            config_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=cls.model_config_file)
+            config_file = aistudio_download(
+                repo_id=pretrained_model_name_or_path,
+                filename=cls.model_config_file,
+                subfolder=subfolder,
+                cache_dir=cache_dir,
+            )
             if os.path.exists(config_file):
                 model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file)
                 logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.")
@@ -294,7 +302,9 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args,
             else:
                 logger.warning(f"{config_file} is not a valid path to a model config file")
         elif from_hf_hub:
-            if hf_file_exists(repo_id=pretrained_model_name_or_path, filename=cls.model_config_file):
+            if hf_file_exists(
+                repo_id=pretrained_model_name_or_path, filename=cls.model_config_file, subfolder=subfolder
+            ):
                 config_file = hf_hub_download(
                     repo_id=pretrained_model_name_or_path,
                     filename=cls.model_config_file,
@@ -303,7 +313,9 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args,
                     library_name="PaddleNLP",
                     library_version=__version__,
                 )
-            elif hf_file_exists(repo_id=pretrained_model_name_or_path, filename=cls.legacy_model_config_file):
+            elif hf_file_exists(
+                repo_id=pretrained_model_name_or_path, filename=cls.legacy_model_config_file, subfolder=subfolder
+            ):
                 logger.info("Standard config do not exist, loading from legacy config")
                 config_file = hf_hub_download(
                     repo_id=pretrained_model_name_or_path,
@@ -352,8 +364,8 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args,
             return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         # From local dir path
         elif os.path.isdir(pretrained_model_name_or_path):
-            config_file = os.path.join(pretrained_model_name_or_path, cls.model_config_file)
-            legacy_config_file = os.path.join(pretrained_model_name_or_path, cls.legacy_model_config_file)
+            config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.model_config_file)
+            legacy_config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.legacy_model_config_file)
             if os.path.exists(config_file):
                 model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file)
                 logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.")
@@ -367,12 +379,14 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args,
                 logger.warning(f"{config_file} is not a valid path to a model config file")
         # Assuming from community-contributed pretrained models
         else:
-            standard_community_url = "/".join(
-                [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.model_config_file]
-            )
-            legacy_community_url = "/".join(
-                [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_model_config_file]
-            )
+            standard_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.model_config_file]
+            legacy_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_model_config_file]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                standard_url_list.insert(2, subfolder)
+                legacy_url_list.insert(2, subfolder)
+            standard_community_url = "/".join(standard_url_list)
+            legacy_community_url = "/".join(legacy_url_list)
             try:
                 if url_file_exists(standard_community_url):
                     resolved_vocab_file = get_path_from_url_with_filelock(standard_community_url, cache_dir)
diff --git a/paddlenlp/transformers/auto/processing.py b/paddlenlp/transformers/auto/processing.py
index 870773b1b338..dc26cc89587a 100644
--- a/paddlenlp/transformers/auto/processing.py
+++ b/paddlenlp/transformers/auto/processing.py
@@ -19,9 +19,13 @@
 import os
 from collections import OrderedDict
 
+from huggingface_hub import hf_hub_download
+
+from ... import __version__
 from ...utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock
 from ...utils.import_utils import import_module
 from ...utils.log import logger
+from ..aistudio_utils import aistudio_download
 from ..utils import resolve_cache_dir
 
 __all__ = [
@@ -79,14 +83,28 @@ def _get_processor_class_from_config(cls, pretrained_model_name_or_path, config_
         init_class = init_kwargs.pop("init_class", None)
         if init_class is None:
             init_class = init_kwargs.pop("processor_class", None)
+        if init_class is None:
+            init_class = init_kwargs.pop("image_processor_type", None)
+            # replace old name with new name
+            if init_class is not None and init_class.endswith("ImageProcessor"):
+                init_class = init_class.replace("ImageProcessor", "Processor")
+        if init_class is None:
+            init_class = init_kwargs.pop("feature_extractor_type", None)
+            # replace old name with new name
+            if init_class is not None and init_class.endswith("FeatureExtractor"):
+                init_class = init_class.replace("FeatureExtractor", "Processor")
 
         if init_class:
-            class_name = cls._name_mapping[init_class]
-            import_class = import_module(f"paddlenlp.transformers.{class_name}.processing")
-            processor_class = getattr(import_class, init_class)
-            return processor_class
+            try:
+                class_name = cls._name_mapping[init_class]
+                import_class = import_module(f"paddlenlp.transformers.{class_name}.processing")
+                processor_class = getattr(import_class, init_class)
+                return processor_class
+            except Exception:
+                init_class = None
+
         # If no `init_class`, we use pattern recognition to recognize the processor class.
-        else:
+        if init_class is None:
             logger.info("We use pattern recognition to recognize the processor class.")
             for key, pattern in cls._name_mapping.items():
                 if pattern in pretrained_model_name_or_path.lower():
@@ -128,17 +146,48 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
                 processor.save_pretrained('clip_processor')
         """
-        cache_dir = resolve_cache_dir(
-            pretrained_model_name_or_path=pretrained_model_name_or_path,
-            from_hf_hub=False,  # TODO: from_hf_hub not supported yet
-            cache_dir=kwargs.pop("cache_dir", None),
-        )
+        cache_dir = kwargs.get("cache_dir", None)
+        subfolder = kwargs.get("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
+        from_aistudio = kwargs.get("from_aistudio", False)
+        from_hf_hub = kwargs.get("from_hf_hub", False)
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
+        kwargs["subfolder"] = subfolder
+        kwargs["cache_dir"] = cache_dir
+
         all_processor_names = []
         for names, processor_class in cls._processor_mapping.items():
             for name in names:
                 all_processor_names.append(name)
+
+        # From AI Studio or HF Hub
+        if from_aistudio or from_hf_hub:
+            if from_aistudio:
+                config_file = aistudio_download(
+                    repo_id=pretrained_model_name_or_path,
+                    filename=cls.processor_config_file,
+                    cache_dir=cache_dir,
+                    subfolder=subfolder,
+                )
+            else:
+                config_file = hf_hub_download(
+                    repo_id=pretrained_model_name_or_path,
+                    filename=cls.processor_config_file,
+                    subfolder=subfolder,
+                    cache_dir=cache_dir,
+                    library_name="PaddleNLP",
+                    library_version=__version__,
+                )
+            if os.path.exists(config_file):
+                processor_class = cls._get_processor_class_from_config(
+                    pretrained_model_name_or_path,
+                    config_file,
+                )
+                logger.info(f"We are using {processor_class} to load '{pretrained_model_name_or_path}'.")
+                return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         # From built-in pretrained models
-        if pretrained_model_name_or_path in all_processor_names:
+        elif pretrained_model_name_or_path in all_processor_names:
             for names, processor_classes in cls._processor_mapping.items():
                 for pattern in names:
                     if pattern == pretrained_model_name_or_path:
@@ -151,16 +200,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                         )
         # From local dir path
         elif os.path.isdir(pretrained_model_name_or_path):
-            config_file = os.path.join(pretrained_model_name_or_path, cls.processor_config_file)
+            config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.processor_config_file)
             if os.path.exists(config_file):
                 processor_class = cls._get_processor_class_from_config(pretrained_model_name_or_path, config_file)
                 logger.info("We are using %s to load '%s'." % (processor_class, pretrained_model_name_or_path))
                 return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         # Assuming from community-contributed pretrained models
         else:
-            community_config_path = "/".join(
-                [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.processor_config_file]
-            )
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.processor_config_file]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+            community_config_path = "/".join(url_list)
+
             try:
                 resolved_vocab_file = get_path_from_url_with_filelock(community_config_path, cache_dir)
             except RuntimeError as err:
diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index 7d4728d1d01c..cb49be64e3a4 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -265,9 +265,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         use_fast = kwargs.pop("use_fast", False)
         cache_dir = kwargs.get("cache_dir", None)
         subfolder = kwargs.get("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
         from_aistudio = kwargs.get("from_aistudio", False)
         from_hf_hub = kwargs.get("from_hf_hub", False)
-        cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
 
         if "use_faster" in kwargs:
             use_fast = kwargs.pop("use_faster", False)
@@ -281,7 +283,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         if from_aistudio or from_hf_hub:
             if from_aistudio:
                 config_file = aistudio_download(
-                    repo_id=pretrained_model_name_or_path, filename=cls.tokenizer_config_file
+                    repo_id=pretrained_model_name_or_path,
+                    filename=cls.tokenizer_config_file,
+                    cache_dir=cache_dir,
+                    subfolder=subfolder,
                 )
             else:
                 config_file = hf_hub_download(
@@ -347,9 +352,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     raise FileNotFoundError(f"{config_file} is not found under '{pretrained_model_name_or_path}'")
         # Assuming from community-contributed pretrained models
         else:
-            community_config_path = "/".join(
-                [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.tokenizer_config_file]
-            )
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.tokenizer_config_file]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+            community_config_path = "/".join(url_list)
             try:
                 resolved_vocab_file = get_path_from_url_with_filelock(community_config_path, cache_dir)
             except RuntimeError as err:
diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py
index 9d5fe5091fe7..bbbaddc44404 100644
--- a/paddlenlp/transformers/configuration_utils.py
+++ b/paddlenlp/transformers/configuration_utils.py
@@ -702,14 +702,20 @@ def get_config_dict(
         original_kwargs = copy.deepcopy(kwargs)
         cache_dir = kwargs.pop("cache_dir", None)
         from_hf_hub = kwargs.get("from_hf_hub", False)
-        cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
+        from_aistudio = kwargs.get("from_aistudio", False)
+        subfolder = kwargs.get("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
+
+        cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
+        kwargs["cache_dir"] = cache_dir
 
         # Get config dict associated with the base config file
-        config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, cache_dir=cache_dir, **kwargs)
+        config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         # That config file may point us toward another config file to use.
         if "configuration_files" in config_dict:
-            original_kwargs["cache_dir"] = cache_dir
+            original_kwargs["cache_dir"] = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
             configuration_file = get_configuration_file(config_dict["configuration_files"])
             config_dict, kwargs = cls._get_config_dict(
                 pretrained_model_name_or_path, _configuration_file=configuration_file, **original_kwargs
@@ -724,7 +730,9 @@ def _get_config_dict(
         cache_dir = kwargs.pop("cache_dir", None)
         from_hf_hub = kwargs.pop("from_hf_hub", False)
         from_aistudio = kwargs.pop("from_aistudio", False)
-        subfolder = kwargs.pop("subfolder", None)
+        subfolder = kwargs.pop("subfolder", "")
+        if subfolder is None:
+            subfolder = ""
         force_download = kwargs.pop("force_download", False)
 
         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
@@ -745,17 +753,19 @@ def _get_config_dict(
         # 2. get the configuration file from url, eg: https://ip/path/to/model_config.json
         elif is_url(pretrained_model_name_or_path):
             resolved_config_file = get_path_from_url_with_filelock(
-                pretrained_model_name_or_path, cache_dir, check_exist=not force_download
+                pretrained_model_name_or_path,
+                os.path.join(cache_dir, pretrained_model_name_or_path, subfolder),
+                check_exist=not force_download,
             )
         # 3. get the configuration file from local dir with default name, eg: /local/path
         elif os.path.isdir(pretrained_model_name_or_path):
             configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME)
-            configuration_file = os.path.join(pretrained_model_name_or_path, configuration_file)
+            configuration_file = os.path.join(pretrained_model_name_or_path, subfolder, configuration_file)
             if os.path.exists(configuration_file):
                 resolved_config_file = configuration_file
             else:
                 # try to detect old-school config file
-                configuration_file = os.path.join(pretrained_model_name_or_path, LEGACY_CONFIG_NAME)
+                configuration_file = os.path.join(pretrained_model_name_or_path, subfolder, LEGACY_CONFIG_NAME)
                 if os.path.exists(configuration_file):
                     resolved_config_file = configuration_file
                 else:
@@ -763,20 +773,27 @@ def _get_config_dict(
                         "please make sure there is `model_config.json` under the dir, or you can pass the `_configuration_file` "
                         "param into `from_pretarined` method to specific the configuration file name"
                     )  # 4. load it as the community resource file
-
-        # 4. get the configuration file from HF hub
+        # 4. get the configuration file from aistudio
+        elif from_aistudio:
+            resolved_config_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=CONFIG_NAME)
+        # 5. get the configuration file from HF HUB
         elif from_hf_hub:
             resolved_config_file = resolve_hf_config_path(
                 repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder
             )
-        elif from_aistudio:
-            resolved_config_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=CONFIG_NAME)
         else:
-            community_url = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, CONFIG_NAME])
+            url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, CONFIG_NAME]
+            legacy_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, LEGACY_CONFIG_NAME]
+            cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
+            if subfolder != "":
+                url_list.insert(2, subfolder)
+                legacy_url_list.insert(2, subfolder)
+            community_url = "/".join(url_list)
+
             if url_file_exists(community_url):
                 return cls._get_config_dict(community_url, cache_dir=cache_dir, **kwargs)
-            community_url = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, LEGACY_CONFIG_NAME])
+            community_url = "/".join(legacy_url_list)
             if url_file_exists(community_url):
                 return cls._get_config_dict(community_url, cache_dir=cache_dir, **kwargs)
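_get_config_dict now threads subfolder through every branch and keys the cache layout as <cache_dir>/<model name>/<subfolder>. A sketch of the call surface this enables (the model name and subfolder are illustrative):

    from paddlenlp.transformers import AutoConfig

    # looks for <COMMUNITY_MODEL_PREFIX>/org/demo/fp16/config.json and
    # caches the result under <MODEL_HOME>/org/demo/fp16/
    config = AutoConfig.from_pretrained("org/demo", subfolder="fp16")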
""" + pretrained_init_configuration = {} + pretrained_feature_extractor_file = [] _auto_class = None @@ -245,20 +248,29 @@ def get_feature_extractor_dict( """ cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) - subfolder = kwargs.pop("subfolder", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir) + pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): - if subfolder is None: - resolved_feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME) - else: - resolved_feature_extractor_file = os.path.join( - pretrained_model_name_or_path, subfolder, FEATURE_EXTRACTOR_NAME - ) + resolved_feature_extractor_file = os.path.join( + pretrained_model_name_or_path, subfolder, FEATURE_EXTRACTOR_NAME + ) elif os.path.isfile(pretrained_model_name_or_path): resolved_feature_extractor_file = pretrained_model_name_or_path is_local = True + elif from_aistudio: + feature_extractor_file = FEATURE_EXTRACTOR_NAME + resolved_feature_extractor_file = aistudio_download( + repo_id=pretrained_model_name_or_path, + filename=feature_extractor_file, + cache_dir=cache_dir, + subfolder=subfolder, + ) elif from_hf_hub: feature_extractor_file = FEATURE_EXTRACTOR_NAME resolved_feature_extractor_file = hf_hub_download( @@ -275,16 +287,11 @@ def get_feature_extractor_dict( feature_extractor_file = cls.pretrained_feature_extractor_file[pretrained_model_name_or_path] else: # Assuming from community-contributed pretrained models - if subfolder is None: - feature_extractor_file = "/".join( - [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME] - ) - else: - feature_extractor_file = "/".join( - [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, subfolder, FEATURE_EXTRACTOR_NAME] - ) - # update cache_dir - cache_dir = os.path.join(cache_dir, subfolder) + url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME] + cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) + if subfolder != "": + url_list.insert(2, subfolder) + feature_extractor_file = "/".join(url_list) try: resolved_feature_extractor_file = get_path_from_url_with_filelock(feature_extractor_file, cache_dir) except EnvironmentError: diff --git a/paddlenlp/transformers/image_processing_utils.py b/paddlenlp/transformers/image_processing_utils.py index 8dfcfa30b841..5f8c6c5c5798 100644 --- a/paddlenlp/transformers/image_processing_utils.py +++ b/paddlenlp/transformers/image_processing_utils.py @@ -35,6 +35,7 @@ from .. import __version__ from ..utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock from ..utils.log import logger +from .aistudio_utils import aistudio_download from .feature_extraction_utils import BatchFeature as BaseBatchFeature from .utils import resolve_cache_dir @@ -62,6 +63,7 @@ class ImageProcessingMixin(object): extractors. 
""" + pretrained_init_configuration = {} _auto_class = None def __init__(self, **kwargs): @@ -317,21 +319,29 @@ def get_image_processor_dict( """ cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) - subfolder = kwargs.pop("subfolder", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir) pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): - if subfolder is None: - resolved_image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) - else: - resolved_image_processor_file = os.path.join( - pretrained_model_name_or_path, subfolder, IMAGE_PROCESSOR_NAME - ) + resolved_image_processor_file = os.path.join( + pretrained_model_name_or_path, subfolder, IMAGE_PROCESSOR_NAME + ) elif os.path.isfile(pretrained_model_name_or_path): resolved_image_processor_file = pretrained_model_name_or_path is_local = True + elif from_aistudio: + image_processor_file = IMAGE_PROCESSOR_NAME + resolved_image_processor_file = aistudio_download( + repo_id=pretrained_model_name_or_path, + filename=image_processor_file, + cache_dir=cache_dir, + subfolder=subfolder, + ) elif from_hf_hub: image_processor_file = IMAGE_PROCESSOR_NAME resolved_image_processor_file = hf_hub_download( @@ -344,16 +354,11 @@ def get_image_processor_dict( ) else: # Assuming from community-contributed pretrained models - if subfolder is None: - image_processor_file = "/".join( - [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME] - ) - else: - image_processor_file = "/".join( - [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, subfolder, IMAGE_PROCESSOR_NAME] - ) - # update cache_dir - cache_dir = os.path.join(cache_dir, subfolder) + url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME] + cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) + if subfolder != "": + url_list.insert(2, subfolder) + image_processor_file = "/".join(url_list) try: # Load from local folder or from cache or download from model Hub and cache resolved_image_processor_file = get_path_from_url_with_filelock(image_processor_file, cache_dir) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 0c1b9fb413ee..28c6df983da1 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -1410,7 +1410,7 @@ def _resolve_model_file_path( from_hf_hub: bool = False, from_aistudio: bool = False, cache_dir: str | None = None, - subfolder: Optional[str] = None, + subfolder: Optional[str] = "", config: PretrainedConfig = None, convert_from_torch: bool = False, use_safetensors: bool | None = None, @@ -2024,9 +2024,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): force_download = kwargs.get("force_download", False) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) dtype = kwargs.pop("dtype", None) - from_hf_hub = kwargs.get("from_hf_hub", False) - from_aistudio = kwargs.get("from_aistudio", False) - subfolder = kwargs.get("subfolder", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder 
= kwargs.pop("subfolder", None) + if subfolder is None: + subfolder = "" variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) @@ -2049,20 +2051,25 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): if convert_from_torch is None: convert_from_torch = False - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir) # 1. get the PretrainedConfig to init model if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + subfolder=subfolder, return_unused_kwargs=True, **kwargs, ) if "from_aistudio" in model_kwargs: model_kwargs.pop("from_aistudio") - if not os.path.exists(os.path.join(cache_dir, CONFIG_NAME)): - config.save_pretrained(cache_dir) + + if not from_hf_hub and not from_aistudio: + if not os.path.exists(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder, CONFIG_NAME)): + config.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) # refine options for config convert_from_torch = cls.support_conversion(config) and convert_from_torch @@ -2208,6 +2215,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): pretrained_model_name_or_path, cache_dir=cache_dir, force_download=force_download, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, subfolder=subfolder, **kwargs, ) diff --git a/paddlenlp/transformers/speecht5/feature_extraction.py b/paddlenlp/transformers/speecht5/feature_extraction.py index 4ecd4a306a3e..7272743f9c10 100644 --- a/paddlenlp/transformers/speecht5/feature_extraction.py +++ b/paddlenlp/transformers/speecht5/feature_extraction.py @@ -25,6 +25,8 @@ from ..feature_extraction_utils import BatchFeature from ..tokenizer_utils_base import PaddingStrategy +__all__ = ["SpeechT5FeatureExtractor"] + class SpeechT5FeatureExtractor(SequenceFeatureExtractor): r""" diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py index 5d66d99fecee..3e08223183d6 100644 --- a/paddlenlp/transformers/tokenizer_utils.py +++ b/paddlenlp/transformers/tokenizer_utils.py @@ -676,16 +676,26 @@ def encode_chat_inputs(self, conversations: List[List[str, str]]): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): - cache_dir = kwargs.get("cache_dir", None) - from_hf_hub = kwargs.get("from_hf_hub", None) - + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + + cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir) + kwargs["subfolder"] = subfolder + kwargs["cache_dir"] = cache_dir + kwargs["from_hf_hub"] = from_hf_hub + kwargs["from_aistudio"] = from_aistudio tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) # load chat-template pretrained_model_name_or_path = str(pretrained_model_name_or_path) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) - chat_template_file = os.path.join(cache_dir, CHAT_TEMPLATE_CONFIG_NAME) + chat_template_file = os.path.join( + cache_dir, 
pretrained_model_name_or_path, subfolder, CHAT_TEMPLATE_CONFIG_NAME + ) if not os.path.exists(chat_template_file): return tokenizer diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 81ee0e5513a4..cb43ee1a7379 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1408,7 +1408,7 @@ def get_vocab(self) -> Dict[str, int]: raise NotImplementedError() @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False, subfolder=None, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ Creates an instance of `PretrainedTokenizer`. Related resources are loaded by specifying name of a built-in pretrained model, or a community-contributed @@ -1451,8 +1451,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False pretrained_model_name_or_path = str(pretrained_model_name_or_path) cache_dir = kwargs.pop("cache_dir", None) - from_aistudio = kwargs.pop("from_aistudio", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + + cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir) vocab_files = {} init_configuration = {} @@ -1471,6 +1476,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False # Deep copy to avoid modifiying the class attributes vocab_files = copy.deepcopy(cls.resource_files_names) vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file + # From built-in pretrained models elif pretrained_model_name_or_path in cls.pretrained_init_configuration: for file_id, map_list in cls.pretrained_resource_files_map.items(): @@ -1484,13 +1490,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False if os.path.isfile(full_file_name): vocab_files[file_id] = full_file_name else: + url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path] + if subfolder != "": + url_list.insert(2, subfolder) # Assuming from community-contributed pretrained models for file_id, file_name in vocab_files_target.items(): - full_file_name = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, file_name]) + full_file_name = "/".join(url_list + [file_name]) vocab_files[file_id] = full_file_name - vocab_files["tokenizer_config_file"] = "/".join( - [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.tokenizer_config_file] - ) + + vocab_files["tokenizer_config_file"] = "/".join(url_list + [cls.tokenizer_config_file]) resolved_vocab_files = {} for file_id, file_path in vocab_files.items(): @@ -1501,6 +1509,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False resolved_vocab_files[file_id] = aistudio_download( repo_id=pretrained_model_name_or_path, filename=file_path, + cache_dir=cache_dir, + subfolder=subfolder, ) elif from_hf_hub: resolved_vocab_files[file_id] = hf_hub_download( @@ -1512,7 +1522,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False library_version=__version__, ) else: - path = os.path.join(cache_dir, file_path.split("/")[-1]) + path = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder, file_path.split("/")[-1]) if os.path.exists(path): logger.info("Already cached %s" % path) 
resolved_vocab_files[file_id] = path @@ -1528,7 +1538,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, from_hf_hub=False logger.warning(f"file<{file_path}> not exist") resolved_vocab_files[file_id] = None continue - resolved_vocab_files[file_id] = get_path_from_url_with_filelock(file_path, cache_dir) + resolved_vocab_files[file_id] = get_path_from_url_with_filelock( + file_path, os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) + ) except RuntimeError as err: if file_id not in cls.resource_files_names: resolved_vocab_files[file_id] = None @@ -1651,7 +1663,7 @@ def convert_added_tokens(obj): ) # save all of related things into default root dir if pretrained_model_name_or_path in cls.pretrained_init_configuration: - tokenizer.save_pretrained(cache_dir) + tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) return tokenizer diff --git a/paddlenlp/transformers/utils.py b/paddlenlp/transformers/utils.py index ecd6f77b790f..ce8d3c0f8b7e 100644 --- a/paddlenlp/transformers/utils.py +++ b/paddlenlp/transformers/utils.py @@ -287,31 +287,20 @@ def param_in_func(func, param_field: str) -> bool: return param_field in result[0] -def resolve_cache_dir(pretrained_model_name_or_path: str, from_hf_hub: bool, cache_dir: Optional[str] = None) -> str: +def resolve_cache_dir(from_hf_hub: bool, from_aistudio: bool, cache_dir: Optional[str] = None) -> str: """resolve cache dir for PretrainedModel and PretrainedConfig Args: - pretrained_model_name_or_path (str): the name or path of pretrained model from_hf_hub (bool): if load from huggingface hub cache_dir (str): cache_dir for models """ - if os.path.isdir(pretrained_model_name_or_path): - return pretrained_model_name_or_path - - # hf hub library takes care of appending the model name so we don't append the model name + if cache_dir is not None: + return cache_dir + if from_aistudio: + return None if from_hf_hub: - if cache_dir is not None: - return cache_dir - else: - return HF_CACHE_HOME - else: - if cache_dir is not None: - # since model_clas.from_pretrained calls config_clas.from_pretrained, the model_name may get appended twice - if cache_dir.endswith(pretrained_model_name_or_path): - return cache_dir - else: - return os.path.join(cache_dir, pretrained_model_name_or_path) - return os.path.join(MODEL_HOME, pretrained_model_name_or_path) + return HF_CACHE_HOME + return MODEL_HOME def find_transformer_model_type(model_class: Type) -> str: @@ -411,9 +400,10 @@ def paddlenlp_hub_download( cache_dir: Union[str, Path, None] = None, local_dir: Union[str, Path, None] = None, ) -> str: - + if subfolder is None: + subfolder = "" # check in cache_dir - weight_file_path = os.path.join(cache_dir, filename) + weight_file_path = os.path.join(cache_dir, repo_id, subfolder, filename) if os.path.exists(weight_file_path): logger.info(f"Already cached {weight_file_path}") @@ -448,13 +438,18 @@ def paddlenlp_hub_download( return None # find in community repo - community_model_file_path = "/".join([COMMUNITY_MODEL_PREFIX, repo_id, filename]) + url_list = [COMMUNITY_MODEL_PREFIX, repo_id, filename] + if subfolder != "": + url_list.insert(2, subfolder) + community_model_file_path = "/".join(url_list) assert is_url(community_model_file_path) # check wether the target file exist in the comunity bos server if url_file_exists(community_model_file_path): logger.info(f"Downloading {community_model_file_path}") - weight_file_path = get_path_from_url_with_filelock(community_model_file_path, cache_dir) + 
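Note the contract change in resolve_cache_dir: it no longer receives the model name and no longer appends it, so every caller above now joins pretrained_model_name_or_path and subfolder onto the returned root itself. A sketch of the new behavior (assuming MODEL_HOME and HF_CACHE_HOME come from paddlenlp's env defaults):

    from paddlenlp.transformers.utils import resolve_cache_dir

    resolve_cache_dir(from_hf_hub=False, from_aistudio=False, cache_dir=None)      # -> MODEL_HOME
    resolve_cache_dir(from_hf_hub=True,  from_aistudio=False, cache_dir=None)      # -> HF_CACHE_HOME
    resolve_cache_dir(from_hf_hub=False, from_aistudio=True,  cache_dir=None)      # -> None (aistudio_sdk default)
    resolve_cache_dir(from_hf_hub=False, from_aistudio=False, cache_dir="./ckpt")  # -> "./ckpt"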
From f81af43e6772081edfd001f2e39b966cddcc78e0 Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Fri, 8 Dec 2023 17:30:31 +0800
Subject: [PATCH 02/27] fix hf download bug ...

---
 paddlenlp/transformers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlenlp/transformers/utils.py b/paddlenlp/transformers/utils.py
index ce8d3c0f8b7e..87523087db9c 100644
--- a/paddlenlp/transformers/utils.py
+++ b/paddlenlp/transformers/utils.py
@@ -582,7 +582,7 @@ def cached_file_for_hf_hub(
         download_check(path_or_repo_id, full_filename, addition="from_hf_hub")
         resolved_file = hf_hub_download(
             repo_id=path_or_repo_id,
-            filename=full_filename,
+            filename=filename,
             cache_dir=cache_dir,
             subfolder=subfolder,
             library_name="PaddleNLP",
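This one-line fix matters because hf_hub_download joins subfolder and filename internally; passing the pre-joined full_filename alongside subfolder applied the prefix twice. A sketch of the difference (repo id and file names hypothetical):

    # before: filename="fp16/model_config.json", subfolder="fp16"
    #         -> hub looks up "fp16/fp16/model_config.json" and fails
    # after:  filename="model_config.json", subfolder="fp16"
    #         -> hub looks up "fp16/model_config.json" as intended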
get the configuration file from local dir with default name, eg: /local/path elif os.path.isdir(pretrained_model_name_or_path): configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME) configuration_file = os.path.join(pretrained_model_name_or_path, subfolder, configuration_file) @@ -773,10 +765,10 @@ def _get_config_dict( "please make sure there is `model_config.json` under the dir, or you can pass the `_configuration_file` " "param into `from_pretarined` method to specific the configuration file name" ) # 4. load it as the community resource file - # 4. get the configuration file from aistudio + # 3. get the configuration file from aistudio elif from_aistudio: resolved_config_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=CONFIG_NAME) - # 5. get the configuration file from HF HUB + # 4. get the configuration file from HF HUB elif from_hf_hub: resolved_config_file = resolve_hf_config_path( repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder @@ -789,15 +781,22 @@ def _get_config_dict( url_list.insert(2, subfolder) legacy_url_list.insert(2, subfolder) community_url = "/".join(url_list) + legacy_community_url = "/".join(legacy_url_list) if url_file_exists(community_url): - return cls._get_config_dict(community_url, cache_dir=cache_dir, **kwargs) - - community_url = "/".join(legacy_url_list) - if url_file_exists(community_url): - return cls._get_config_dict(community_url, cache_dir=cache_dir, **kwargs) - - raise FileNotFoundError(f"configuration file<{CONFIG_NAME}> or <{LEGACY_CONFIG_NAME}> not found") + resolved_config_file = get_path_from_url_with_filelock( + community_url, + cache_dir, + check_exist=not force_download, + ) + elif url_file_exists(legacy_community_url): + resolved_config_file = get_path_from_url_with_filelock( + legacy_community_url, + cache_dir, + check_exist=not force_download, + ) + else: + raise FileNotFoundError(f"configuration file<{CONFIG_NAME}> or <{LEGACY_CONFIG_NAME}> not found") try: logger.info(f"Loading configuration file {resolved_config_file}") From d12720a6529ac7c87e468640861f2da24e471093 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Mon, 11 Dec 2023 14:08:55 +0800 Subject: [PATCH 04/27] fix --- paddlenlp/transformers/configuration_utils.py | 7 ++++++- paddlenlp/transformers/utils.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py index 2d328df453cd..1af6506c2d21 100644 --- a/paddlenlp/transformers/configuration_utils.py +++ b/paddlenlp/transformers/configuration_utils.py @@ -767,7 +767,12 @@ def _get_config_dict( ) # 4. load it as the community resource file # 3. get the configuration file from aistudio elif from_aistudio: - resolved_config_file = aistudio_download(repo_id=pretrained_model_name_or_path, filename=CONFIG_NAME) + resolved_config_file = aistudio_download( + repo_id=pretrained_model_name_or_path, + filename=CONFIG_NAME, + subfolder=subfolder, + cache_dir=cache_dir, + ) # 4. 
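
Both the standard and legacy branches above splice the subfolder between the
repo id and the file name before probing BOS. The URL construction on its own
(the prefix value is an assumption; the insert-at-index-2 pattern is from the
hunk):

    COMMUNITY_MODEL_PREFIX = "https://bj.bcebos.com/paddlenlp/models/community"  # assumed

    def community_url(repo_id, filename, subfolder=""):
        parts = [COMMUNITY_MODEL_PREFIX, repo_id, filename]
        if subfolder != "":
            parts.insert(2, subfolder)  # .../<repo_id>/<subfolder>/<filename>
        return "/".join(parts)

    assert community_url("org/repo", "config.json", "t5-small").endswith(
        "/org/repo/t5-small/config.json"
    )
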
get the configuration file from HF HUB elif from_hf_hub: resolved_config_file = resolve_hf_config_path( diff --git a/paddlenlp/transformers/utils.py b/paddlenlp/transformers/utils.py index 87523087db9c..49a8a9d532c1 100644 --- a/paddlenlp/transformers/utils.py +++ b/paddlenlp/transformers/utils.py @@ -517,7 +517,9 @@ def cached_file( if from_aistudio: try: - resolved_file = aistudio_download(repo_id=path_or_repo_id, filename=filename, cache_dir=cache_dir) + resolved_file = aistudio_download( + repo_id=path_or_repo_id, filename=filename, subfolder=subfolder, cache_dir=cache_dir + ) except: resolved_file = None else: @@ -658,7 +660,12 @@ def get_checkpoint_shard_files( for shard_filename in tqdm.tqdm(shard_filenames, desc="Downloading shards", disable=not show_progress_bar): try: if from_aistudio: - cached_filename = aistudio_download(repo_id=pretrained_model_name_or_path, filename=shard_filename) + cached_filename = aistudio_download( + repo_id=pretrained_model_name_or_path, + filename=shard_filename, + subfolder=subfolder, + cache_dir=cache_dir, + ) else: cached_filename = paddlenlp_hub_download( pretrained_model_name_or_path, From ba3778dbd78397b767538888ae44b864ab0b5c8d Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Mon, 11 Dec 2023 15:25:45 +0800 Subject: [PATCH 05/27] add subfolder --- paddlenlp/transformers/tokenizer_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index cb43ee1a7379..cef59afa9781 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1486,7 +1486,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): elif os.path.isdir(pretrained_model_name_or_path): vocab_files_target["tokenizer_config_file"] = cls.tokenizer_config_file for file_id, file_name in vocab_files_target.items(): - full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) if os.path.isfile(full_file_name): vocab_files[file_id] = full_file_name else: From 5d7b26925af6d11e83d693e1a0378571cea75297 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 12 Dec 2023 22:22:37 +0800 Subject: [PATCH 06/27] update --- paddlenlp/transformers/model_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 28c6df983da1..a630b917061b 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -2138,9 +2138,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # try to get the name-mapping info logger.info( f"Starting to convert pytorch weight file<{resolved_archive_file}> to " - f"paddle weight file<{os.path.join(cache_dir, PADDLE_WEIGHTS_NAME)}> ..." + f"paddle weight file<{os.path.join(cache_dir, pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_NAME)}> ..." 
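
Patch 04 threads subfolder and cache_dir through every aistudio_download call,
including the sharded-checkpoint loop. How the subfolder folds into the
requested remote file name is an assumption of this sketch; the real mapping
lives in paddlenlp/transformers/aistudio_utils.py:

    from typing import Optional

    def aistudio_filename(filename: str, subfolder: Optional[str] = None) -> str:
        # Assumed convention: "<subfolder>/<filename>" when a subfolder is set.
        return f"{subfolder}/{filename}" if subfolder else filename

    assert aistudio_filename("config.json") == "config.json"
    assert aistudio_filename("config.json", "t5-small") == "t5-small/config.json"
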
+ ) + state_dict = cls.convert( + resolved_archive_file, + config, + cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder), ) - state_dict = cls.convert(resolved_archive_file, config, cache_dir) else: raise ValueError(f"Unexpected file: {resolved_archive_file} for weight conversion.") else: From bdac7b25f1aad97f29a595bbb860841688f59f9a Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Wed, 13 Dec 2023 16:19:55 +0800 Subject: [PATCH 07/27] =?UTF-8?q?=E4=BC=98=E5=85=88=E7=BA=A7,=E5=85=88?= =?UTF-8?q?=E6=9C=AC=E5=9C=B0,=E5=86=8Dbuiltin,=E5=86=8Daistudio,=E5=86=8D?= =?UTF-8?q?hf=20hub,=E5=86=8Dbos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../transformers/auto/image_processing.py | 44 +++++----- paddlenlp/transformers/auto/modeling.py | 82 +++++++++---------- paddlenlp/transformers/auto/processing.py | 41 +++++----- paddlenlp/transformers/auto/tokenizer.py | 46 +++++------ 4 files changed, 107 insertions(+), 106 deletions(-) diff --git a/paddlenlp/transformers/auto/image_processing.py b/paddlenlp/transformers/auto/image_processing.py index c22d2ff16a50..88283ce17180 100644 --- a/paddlenlp/transformers/auto/image_processing.py +++ b/paddlenlp/transformers/auto/image_processing.py @@ -151,8 +151,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): for name in names: all_processor_names.append(name) + # From local dir path + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.image_processor_config_file) + if os.path.exists(config_file): + processor_class = cls._get_image_processor_class_from_config( + pretrained_model_name_or_path, config_file + ) + logger.info("We are using %s to load '%s'." % (processor_class, pretrained_model_name_or_path)) + return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + # From built-in pretrained models + elif pretrained_model_name_or_path in all_processor_names: + for names, processor_classes in cls._processor_mapping.items(): + for pattern in names: + if pattern == pretrained_model_name_or_path: + actual_processor_class = processor_classes[0] + logger.info( + "We are using %s to load '%s'." % (actual_processor_class, pretrained_model_name_or_path) + ) + return actual_processor_class.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) # From AI Studio or HF Hub - if from_aistudio or from_hf_hub: + elif from_aistudio or from_hf_hub: if from_aistudio: config_file = aistudio_download( repo_id=pretrained_model_name_or_path, @@ -176,27 +197,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) logger.info(f"We are using {processor_class} to load '{pretrained_model_name_or_path}'.") return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - # From built-in pretrained models - elif pretrained_model_name_or_path in all_processor_names: - for names, processor_classes in cls._processor_mapping.items(): - for pattern in names: - if pattern == pretrained_model_name_or_path: - actual_processor_class = processor_classes[0] - logger.info( - "We are using %s to load '%s'." 
% (actual_processor_class, pretrained_model_name_or_path) - ) - return actual_processor_class.from_pretrained( - pretrained_model_name_or_path, *model_args, **kwargs - ) - # From local dir path - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.image_processor_config_file) - if os.path.exists(config_file): - processor_class = cls._get_image_processor_class_from_config( - pretrained_model_name_or_path, config_file - ) - logger.info("We are using %s to load '%s'." % (processor_class, pretrained_model_name_or_path)) - return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) # Assuming from community-contributed pretrained models else: url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.image_processor_config_file] diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 69d64653458f..24e63e8e5fe3 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -288,45 +288,17 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args, for pretrained_model_names, model_name in cls._pretrained_model_dict.items(): for name in pretrained_model_names: all_model_names.append(name) - if from_aistudio: - config_file = aistudio_download( - repo_id=pretrained_model_name_or_path, - filename=cls.model_config_file, - subfolder=subfolder, - cache_dir=cache_dir, - ) + # From local dir path + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.model_config_file) + legacy_config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.legacy_model_config_file) if os.path.exists(config_file): model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file) logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.") return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - else: - logger.warning(f"{config_file} is not a valid path to a model config file") - elif from_hf_hub: - if hf_file_exists( - repo_id=pretrained_model_name_or_path, filename=cls.model_config_file, subfolder=subfolder - ): - config_file = hf_hub_download( - repo_id=pretrained_model_name_or_path, - filename=cls.model_config_file, - subfolder=subfolder, - cache_dir=cache_dir, - library_name="PaddleNLP", - library_version=__version__, - ) - elif hf_file_exists( - repo_id=pretrained_model_name_or_path, filename=cls.legacy_model_config_file, subfolder=subfolder - ): + elif os.path.exists(legacy_config_file): logger.info("Standard config do not exist, loading from legacy config") - config_file = hf_hub_download( - repo_id=pretrained_model_name_or_path, - filename=cls.legacy_model_config_file, - subfolder=subfolder, - cache_dir=cache_dir, - library_name="PaddleNLP", - library_version=__version__, - ) - if os.path.exists(config_file): - model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file) + model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, legacy_config_file) logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.") return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) else: @@ -362,22 +334,50 @@ def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args, ) logger.info(f"We are using {model_class} to load 
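
Patch 07's commit message (MIME-encoded Chinese in the header above) states the
new resolution priority: local directory first, then built-in model names, then
AI Studio, then the HF Hub, with the BOS community repo as the final fallback.
The reordered branches reduce to this decision function (a sketch, not the
library code):

    import os

    def pick_source(name_or_path, builtin_names, from_aistudio=False, from_hf_hub=False):
        if os.path.isdir(name_or_path):
            return "local"
        if name_or_path in builtin_names:
            return "builtin"
        if from_aistudio:
            return "aistudio"
        if from_hf_hub:
            return "hf_hub"
        return "bos_community"

    assert pick_source("bert-base-uncased", {"bert-base-uncased"}) == "builtin"
    assert pick_source("org/repo", set(), from_aistudio=True) == "aistudio"
    assert pick_source("org/repo", set()) == "bos_community"
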
'{pretrained_model_name_or_path}'.") return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - # From local dir path - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.model_config_file) - legacy_config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.legacy_model_config_file) + # Assuming from community-contributed pretrained models + elif from_aistudio: + config_file = aistudio_download( + repo_id=pretrained_model_name_or_path, + filename=cls.model_config_file, + subfolder=subfolder, + cache_dir=cache_dir, + ) if os.path.exists(config_file): model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file) logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.") return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif os.path.exists(legacy_config_file): + else: + logger.warning(f"{config_file} is not a valid path to a model config file") + elif from_hf_hub: + if hf_file_exists( + repo_id=pretrained_model_name_or_path, filename=cls.model_config_file, subfolder=subfolder + ): + config_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=cls.model_config_file, + subfolder=subfolder, + cache_dir=cache_dir, + library_name="PaddleNLP", + library_version=__version__, + ) + elif hf_file_exists( + repo_id=pretrained_model_name_or_path, filename=cls.legacy_model_config_file, subfolder=subfolder + ): logger.info("Standard config do not exist, loading from legacy config") - model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, legacy_config_file) + config_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=cls.legacy_model_config_file, + subfolder=subfolder, + cache_dir=cache_dir, + library_name="PaddleNLP", + library_version=__version__, + ) + if os.path.exists(config_file): + model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file) logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.") return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) else: logger.warning(f"{config_file} is not a valid path to a model config file") - # Assuming from community-contributed pretrained models else: standard_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.model_config_file] legacy_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_model_config_file] diff --git a/paddlenlp/transformers/auto/processing.py b/paddlenlp/transformers/auto/processing.py index dc26cc89587a..15cf28f9474d 100644 --- a/paddlenlp/transformers/auto/processing.py +++ b/paddlenlp/transformers/auto/processing.py @@ -161,8 +161,28 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): for name in names: all_processor_names.append(name) + # From local dir path + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.processor_config_file) + if os.path.exists(config_file): + processor_class = cls._get_processor_class_from_config(pretrained_model_name_or_path, config_file) + logger.info("We are using %s to load '%s'." 
% (processor_class, pretrained_model_name_or_path)) + return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + # From built-in pretrained models + elif pretrained_model_name_or_path in all_processor_names: + for names, processor_classes in cls._processor_mapping.items(): + for pattern in names: + if pattern == pretrained_model_name_or_path: + actual_processor_class = processor_classes[0] + logger.info( + "We are using %s to load '%s'." % (actual_processor_class, pretrained_model_name_or_path) + ) + return actual_processor_class.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + # From AI Studio or HF Hub - if from_aistudio or from_hf_hub: + elif from_aistudio or from_hf_hub: if from_aistudio: config_file = aistudio_download( repo_id=pretrained_model_name_or_path, @@ -186,25 +206,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) logger.info(f"We are using {processor_class} to load '{pretrained_model_name_or_path}'.") return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - # From built-in pretrained models - elif pretrained_model_name_or_path in all_processor_names: - for names, processor_classes in cls._processor_mapping.items(): - for pattern in names: - if pattern == pretrained_model_name_or_path: - actual_processor_class = processor_classes[0] - logger.info( - "We are using %s to load '%s'." % (actual_processor_class, pretrained_model_name_or_path) - ) - return actual_processor_class.from_pretrained( - pretrained_model_name_or_path, *model_args, **kwargs - ) - # From local dir path - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.processor_config_file) - if os.path.exists(config_file): - processor_class = cls._get_processor_class_from_config(pretrained_model_name_or_path, config_file) - logger.info("We are using %s to load '%s'." 
% (processor_class, pretrained_model_name_or_path)) - return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) # Assuming from community-contributed pretrained models else: url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.processor_config_file] diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index cb49be64e3a4..a20e763d26c6 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -279,30 +279,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): for names, tokenizer_class in cls._tokenizer_mapping.items(): for name in names: all_tokenizer_names.append(name) - # From AI Studio or HF Hub - if from_aistudio or from_hf_hub: - if from_aistudio: - config_file = aistudio_download( - repo_id=pretrained_model_name_or_path, - filename=cls.tokenizer_config_file, - cache_dir=cache_dir, - subfolder=subfolder, - ) - else: - config_file = hf_hub_download( - repo_id=pretrained_model_name_or_path, - filename=cls.tokenizer_config_file, - subfolder=subfolder, - cache_dir=cache_dir, - library_name="PaddleNLP", - library_version=__version__, - ) + # From local dir path + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, cls.tokenizer_config_file) if os.path.exists(config_file): tokenizer_class = cls._get_tokenizer_class_from_config( pretrained_model_name_or_path, config_file, use_fast ) logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + raise FileNotFoundError(f"{config_file} is not found under '{pretrained_model_name_or_path}'") # From built-in pretrained models elif pretrained_model_name_or_path in all_tokenizer_names: for names, tokenizer_classes in cls._tokenizer_mapping.items(): @@ -339,17 +326,30 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): return actual_tokenizer_class.from_pretrained( pretrained_model_name_or_path, *model_args, **kwargs ) - # From local dir path - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, cls.tokenizer_config_file) + # From AI Studio or HF Hub + elif from_aistudio or from_hf_hub: + if from_aistudio: + config_file = aistudio_download( + repo_id=pretrained_model_name_or_path, + filename=cls.tokenizer_config_file, + cache_dir=cache_dir, + subfolder=subfolder, + ) + else: + config_file = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=cls.tokenizer_config_file, + subfolder=subfolder, + cache_dir=cache_dir, + library_name="PaddleNLP", + library_version=__version__, + ) if os.path.exists(config_file): tokenizer_class = cls._get_tokenizer_class_from_config( pretrained_model_name_or_path, config_file, use_fast ) logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - else: - raise FileNotFoundError(f"{config_file} is not found under '{pretrained_model_name_or_path}'") # Assuming from community-contributed pretrained models else: url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.tokenizer_config_file] From 0cbf287c2e34f9352652492841148dfc5682d741 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 19 Dec 2023 17:48:46 +0800 
Subject: [PATCH 08/27] Update the chat-template file lookup path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddlenlp/transformers/tokenizer_utils.py      | 9 +++------
 paddlenlp/transformers/tokenizer_utils_base.py | 6 ++++++
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py
index 3e08223183d6..5c13716ff882 100644
--- a/paddlenlp/transformers/tokenizer_utils.py
+++ b/paddlenlp/transformers/tokenizer_utils.py
@@ -688,14 +688,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         kwargs["cache_dir"] = cache_dir
         kwargs["from_hf_hub"] = from_hf_hub
         kwargs["from_aistudio"] = from_aistudio
-        tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        kwargs["return_tokenizer_file_dir"] = True
+        tokenizer, tokenizer_config_file_dir = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)

         # load chat-template
-        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
-
-        chat_template_file = os.path.join(
-            cache_dir, pretrained_model_name_or_path, subfolder, CHAT_TEMPLATE_CONFIG_NAME
-        )
+        chat_template_file = os.path.join(tokenizer_config_file_dir, CHAT_TEMPLATE_CONFIG_NAME)
         if not os.path.exists(chat_template_file):
             return tokenizer

diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py
index cef59afa9781..d5f2dd0d5777 100644
--- a/paddlenlp/transformers/tokenizer_utils_base.py
+++ b/paddlenlp/transformers/tokenizer_utils_base.py
@@ -1454,6 +1454,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         from_hf_hub = kwargs.pop("from_hf_hub", False)
         from_aistudio = kwargs.pop("from_aistudio", False)
         subfolder = kwargs.pop("subfolder", "")
+        return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False)
+
         if subfolder is None:
             subfolder = ""

@@ -1558,7 +1560,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         # Did we saved some inputs and kwargs to reload ?
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + tokenizer_config_file_dir = None if tokenizer_config_file is not None: + tokenizer_config_file_dir = os.path.dirname(tokenizer_config_file) with io.open(tokenizer_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) else: @@ -1665,6 +1669,8 @@ def convert_added_tokens(obj): if pretrained_model_name_or_path in cls.pretrained_init_configuration: tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) + if return_tokenizer_file_dir: + return tokenizer, tokenizer_config_file_dir return tokenizer def save_pretrained(self, save_directory, filename_prefix: Optional[str] = None, **kwargs): From 6cc9c7edf93f7c20a6c45eb1be6a39c24f7c6620 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 19 Dec 2023 21:20:37 +0800 Subject: [PATCH 09/27] update --- paddlenlp/transformers/tokenizer_utils_base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index d5f2dd0d5777..4e862beab607 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1555,14 +1555,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): "- or a correct model-identifier of community-contributed pretrained models,\n" "- or the correct path to a directory containing relevant tokenizer files.\n" ) - + tokenizer_config_file_dir_list = set() + for k, v in resolved_vocab_files.items(): + tokenizer_config_file_dir_list.add(os.path.dirname(v)) + assert len(tokenizer_config_file_dir_list) <= 1, "All tokenizer files should be in the same directory." # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? 
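
After patches 08 and 09, the chat template is looked up next to whichever
tokenizer_config.json was actually resolved, instead of recomputing a cache
path from cache_dir, repo id, and subfolder. The lookup is a plain dirname
join; "chat_template.json" as the value of CHAT_TEMPLATE_CONFIG_NAME is an
assumption of this sketch:

    import os

    CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json"  # assumed value

    def chat_template_path(resolved_tokenizer_config: str) -> str:
        return os.path.join(os.path.dirname(resolved_tokenizer_config), CHAT_TEMPLATE_CONFIG_NAME)

    assert chat_template_path("/cache/org/repo/sub/tokenizer_config.json") == (
        "/cache/org/repo/sub/chat_template.json"
    )
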
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) - tokenizer_config_file_dir = None if tokenizer_config_file is not None: - tokenizer_config_file_dir = os.path.dirname(tokenizer_config_file) with io.open(tokenizer_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) else: @@ -1670,7 +1671,7 @@ def convert_added_tokens(obj): tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) if return_tokenizer_file_dir: - return tokenizer, tokenizer_config_file_dir + return tokenizer, tokenizer_config_file_dir_list[0] return tokenizer def save_pretrained(self, save_directory, filename_prefix: Optional[str] = None, **kwargs): From 91ab9b8365c6b5fd5fe1c070390ef65f06f3afa3 Mon Sep 17 00:00:00 2001 From: CrazyBoyM Date: Tue, 19 Dec 2023 13:32:47 +0000 Subject: [PATCH 10/27] fix subfolder && add tests --- paddlenlp/transformers/auto/configuration.py | 2 +- paddlenlp/transformers/auto/tokenizer.py | 2 +- paddlenlp/transformers/configuration_utils.py | 1 - paddlenlp/transformers/tokenizer_utils.py | 10 +- .../transformers/tokenizer_utils_base.py | 6 + .../load_subfolder/test_config.py | 87 +++++++++++++ .../load_subfolder/test_image_processor.py | 57 +++++++++ .../transformers/load_subfolder/test_model.py | 102 +++++++++++++++ .../load_subfolder/test_processor.py | 53 ++++++++ .../load_subfolder/test_tokenizer.py | 120 ++++++++++++++++++ 10 files changed, 431 insertions(+), 9 deletions(-) create mode 100644 tests/transformers/load_subfolder/test_config.py create mode 100644 tests/transformers/load_subfolder/test_image_processor.py create mode 100644 tests/transformers/load_subfolder/test_model.py create mode 100644 tests/transformers/load_subfolder/test_processor.py create mode 100644 tests/transformers/load_subfolder/test_tokenizer.py diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 182a82514558..510dc1e02a32 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -159,7 +159,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar config = AutoConfig.from_pretrained("bert-base-uncased") config.save_pretrained('./bert-base-uncased') """ - subfolder = kwargs.pop("subfolder", "") + subfolder = kwargs.get("subfolder", "") if subfolder is None: subfolder = "" from_aistudio = kwargs.pop("from_aistudio", False) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index a20e763d26c6..0d0b7b93e281 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -281,7 +281,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): all_tokenizer_names.append(name) # From local dir path if os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, cls.tokenizer_config_file) + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.tokenizer_config_file) if os.path.exists(config_file): tokenizer_class = cls._get_tokenizer_class_from_config( pretrained_model_name_or_path, config_file, use_fast diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py index 1af6506c2d21..fd2da84a9777 100644 --- a/paddlenlp/transformers/configuration_utils.py +++ b/paddlenlp/transformers/configuration_utils.py @@ 
-787,7 +787,6 @@ def _get_config_dict( legacy_url_list.insert(2, subfolder) community_url = "/".join(url_list) legacy_community_url = "/".join(legacy_url_list) - if url_file_exists(community_url): resolved_config_file = get_path_from_url_with_filelock( community_url, diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py index 3e08223183d6..41422fabe971 100644 --- a/paddlenlp/transformers/tokenizer_utils.py +++ b/paddlenlp/transformers/tokenizer_utils.py @@ -688,14 +688,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): kwargs["cache_dir"] = cache_dir kwargs["from_hf_hub"] = from_hf_hub kwargs["from_aistudio"] = from_aistudio - tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + kwargs["return_tokenizer_file_dir"] = True + tokenizer, tokenizer_config_file_dir = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) # load chat-template - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - - chat_template_file = os.path.join( - cache_dir, pretrained_model_name_or_path, subfolder, CHAT_TEMPLATE_CONFIG_NAME - ) + # breakpoint() + chat_template_file = os.path.join(tokenizer_config_file_dir, CHAT_TEMPLATE_CONFIG_NAME) if not os.path.exists(chat_template_file): return tokenizer diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index cef59afa9781..d5f2dd0d5777 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1454,6 +1454,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): from_hf_hub = kwargs.pop("from_hf_hub", False) from_aistudio = kwargs.pop("from_aistudio", False) subfolder = kwargs.pop("subfolder", "") + return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False) + if subfolder is None: subfolder = "" @@ -1558,7 +1560,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Did we saved some inputs and kwargs to reload ? has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + tokenizer_config_file_dir = None if tokenizer_config_file is not None: + tokenizer_config_file_dir = os.path.dirname(tokenizer_config_file) with io.open(tokenizer_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) else: @@ -1665,6 +1669,8 @@ def convert_added_tokens(obj): if pretrained_model_name_or_path in cls.pretrained_init_configuration: tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) + if return_tokenizer_file_dir: + return tokenizer, tokenizer_config_file_dir return tokenizer def save_pretrained(self, save_directory, filename_prefix: Optional[str] = None, **kwargs): diff --git a/tests/transformers/load_subfolder/test_config.py b/tests/transformers/load_subfolder/test_config.py new file mode 100644 index 000000000000..bc5f150cd182 --- /dev/null +++ b/tests/transformers/load_subfolder/test_config.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddlenlp.transformers import AutoConfig, BertConfig, CLIPConfig, T5Config +from paddlenlp.utils.log import logger + + +class ConfigLoadTester(unittest.TestCase): + def test_bert_config_load(self): + logger.info("Download Bert Config from PaddleNLP BOS") + bert_config = BertConfig.from_pretrained("bert-base-uncased", from_hf_hub=False) + bert_config = AutoConfig.from_pretrained("bert-base-uncased", from_hf_hub=False) + + logger.info("Download config from local") + bert_config.save_pretrained("./paddlenlp-test-config/bert-base-uncased") + bert_config = BertConfig.from_pretrained("./paddlenlp-test-config/bert-base-uncased") + bert_config = AutoConfig.from_pretrained("./paddlenlp-test-config/bert-base-uncased") + logger.info("Download config from local with subfolder") + bert_config = BertConfig.from_pretrained("./paddlenlp-test-config", subfolder="bert-base-uncased") + bert_config = AutoConfig.from_pretrained("./paddlenlp-test-config", subfolder="bert-base-uncased") + + logger.info("Download Bert Config from PaddleNLP BOS with subfolder") + bert_config = BertConfig.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + ) + bert_config = AutoConfig.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + ) + + logger.info("Download Bert Config from aistudio") + bert_config = BertConfig.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + bert_config = AutoConfig.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + + def test_clip_config_load(self): + logger.info("Download CLIP Config from PaddleNLP BOS") + clip_config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + clip_config = AutoConfig.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + + logger.info("Download CLIP Config from local") + clip_config.save_pretrained("./paddlenlp-test-config/clip-vit-base-patch32") + clip_config = CLIPConfig.from_pretrained("./paddlenlp-test-config/clip-vit-base-patch32") + clip_config = AutoConfig.from_pretrained("./paddlenlp-test-config/clip-vit-base-patch32") + logger.info("Download CLIP Config from local with subfolder") + clip_config = CLIPConfig.from_pretrained("./paddlenlp-test-config", subfolder="clip-vit-base-patch32") + clip_config = AutoConfig.from_pretrained("./paddlenlp-test-config", subfolder="clip-vit-base-patch32") + + logger.info("Download CLIP Config from PaddleNLP BOS with subfolder") + clip_config = CLIPConfig.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + clip_config = AutoConfig.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + + logger.info("Download CLIP Config from aistudio") + clip_config = CLIPConfig.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + clip_config = AutoConfig.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + + def test_t5_config_load(self): + logger.info("Download T5 Config from PaddleNLP BOS") + 
t5_config = T5Config.from_pretrained("t5-small", from_hf_hub=False) + t5_config = AutoConfig.from_pretrained("t5-small", from_hf_hub=False) + + logger.info("Download T5 Config from PaddleNLP BOS with subfolder") + t5_config = T5Config.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False) + t5_config = AutoConfig.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False) + logger.info("Download T5 Config from local") + t5_config.save_pretrained("./paddlenlp-test-config/t5-small") + t5_config = T5Config.from_pretrained("./paddlenlp-test-config/t5-small") + t5_config = AutoConfig.from_pretrained("./paddlenlp-test-config/t5-small") + + logger.info("Download T5 Config from aistudio") + t5_config = T5Config.from_pretrained("aistudio/t5-small", from_aistudio=True) + t5_config = AutoConfig.from_pretrained("aistudio/t5-small", from_aistudio=True) diff --git a/tests/transformers/load_subfolder/test_image_processor.py b/tests/transformers/load_subfolder/test_image_processor.py new file mode 100644 index 000000000000..a909015e804d --- /dev/null +++ b/tests/transformers/load_subfolder/test_image_processor.py @@ -0,0 +1,57 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
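
The config tests boil down to one pattern: the same checkpoint reachable
through a local directory, the BOS community repo, or AI Studio, with
subfolder naming the nested folder in each case. Condensed usage (the repo ids
are this PR's test fixtures, not official releases; the remote calls need
network access):

    from paddlenlp.transformers import AutoConfig

    # local dir written earlier by save_pretrained
    cfg = AutoConfig.from_pretrained("./paddlenlp-test-config", subfolder="t5-small")
    # BOS community repo
    cfg = AutoConfig.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small")
    # AI Studio
    cfg = AutoConfig.from_pretrained("aistudio/t5-small", from_aistudio=True)
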
+ +import unittest + +from paddlenlp.transformers import AutoImageProcessor, CLIPImageProcessor +from paddlenlp.utils.log import logger + + +class ImageProcessorLoadTester(unittest.TestCase): + def test_clip_load(self): + logger.info("Download model from PaddleNLP BOS") + clip_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + clip_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + + logger.info("Download model from local") + clip_processor.save_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_processor = CLIPImageProcessor.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_processor = AutoImageProcessor.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_processor = CLIPImageProcessor.from_pretrained( + "./paddlenlp-test-model/", subfolder="clip-vit-base-patch32" + ) + clip_processor = AutoImageProcessor.from_pretrained( + "./paddlenlp-test-model/", subfolder="clip-vit-base-patch32" + ) + + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_processor = CLIPImageProcessor.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + clip_processor = AutoImageProcessor.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + clip_processor = CLIPImageProcessor.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + clip_processor = AutoImageProcessor.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + clip_processor = CLIPImageProcessor.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) + clip_processor = AutoImageProcessor.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py new file mode 100644 index 000000000000..98b440d5249f --- /dev/null +++ b/tests/transformers/load_subfolder/test_model.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from paddlenlp.transformers import AutoModel, BertModel, CLIPModel, T5Model +from paddlenlp.utils.log import logger + + +class ModelLoadTester(unittest.TestCase): + def test_bert_load(self): + logger.info("Download model from PaddleNLP BOS") + bert_model = BertModel.from_pretrained("bert-base-uncased", from_hf_hub=False) + bert_model = AutoModel.from_pretrained("bert-base-uncased", from_hf_hub=False) + + logger.info("Download model from PaddleNLP BOS with subfolder") + bert_model = BertModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + ) + bert_model = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + bert_model = BertModel.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + bert_model = AutoModel.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + bert_model = BertModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="bert-base-uncased", from_aistudio=True + ) + bert_model = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="bert-base-uncased", from_aistudio=True + ) + + logger.info("Download model from local") + bert_model.save_pretrained("./paddlenlp-test-model/bert-base-uncased") + bert_model = BertModel.from_pretrained("./paddlenlp-test-model/", subfolder="bert-base-uncased") + bert_model = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="bert-base-uncased") + + def test_clip_load(self): + logger.info("Download model from PaddleNLP BOS") + clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + clip_model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_model = CLIPModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + clip_model = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + clip_model = CLIPModel.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + clip_model = AutoModel.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + clip_model = CLIPModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) + clip_model = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) + + logger.info("Download model from local") + clip_model.save_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_model = CLIPModel.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + clip_model = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + + def test_t5_load(self): + logger.info("Download model from PaddleNLP BOS") + t5_model = T5Model.from_pretrained("t5-small", from_hf_hub=False) + t5_model = AutoModel.from_pretrained("t5-small", from_hf_hub=False) + + logger.info("Download model from PaddleNLP BOS with subfolder") + t5_model = T5Model.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False) + t5_model = 
AutoModel.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False) + + logger.info("Download model from aistudio") + t5_model = T5Model.from_pretrained("aistudio/t5-small", from_aistudio=True) + t5_model = AutoModel.from_pretrained("aistudio/t5-small", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + t5_model = T5Model.from_pretrained("aistudio/paddlenlp-test-model", subfolder="t5-small", from_aistudio=True) + t5_model = AutoModel.from_pretrained("aistudio/paddlenlp-test-model", subfolder="t5-small", from_aistudio=True) + + logger.info("Download model from local") + t5_model.save_pretrained("./paddlenlp-test-model/t5-small") + t5_model = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small") + t5_model = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small") diff --git a/tests/transformers/load_subfolder/test_processor.py b/tests/transformers/load_subfolder/test_processor.py new file mode 100644 index 000000000000..537f0bb48c2f --- /dev/null +++ b/tests/transformers/load_subfolder/test_processor.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddlenlp.transformers import AutoProcessor, CLIPProcessor +from paddlenlp.utils.log import logger + + +class ProcessorLoadTester(unittest.TestCase): + def test_clip_load(self): + logger.info("Download model from PaddleNLP BOS") + clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + + logger.info("Download model from local") + clip_processor.save_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_processor = CLIPProcessor.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_processor = AutoProcessor.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_processor = CLIPProcessor.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + clip_processor = AutoProcessor.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_processor = CLIPProcessor.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + clip_processor = AutoProcessor.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + clip_processor = CLIPProcessor.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + clip_processor = AutoProcessor.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + clip_processor = CLIPProcessor.from_pretrained( + 
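
The local cases in these tests all rely on one round-trip: save_pretrained
writes a self-contained folder, and that folder loads back either by its full
path or as parent directory plus subfolder. Mirroring the T5 test above (the
first call downloads the checkpoint):

    from paddlenlp.transformers import T5Model

    model = T5Model.from_pretrained("t5-small")
    model.save_pretrained("./paddlenlp-test-model/t5-small")

    same_a = T5Model.from_pretrained("./paddlenlp-test-model/t5-small")
    same_b = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small")
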
"aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) + clip_processor = AutoProcessor.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) diff --git a/tests/transformers/load_subfolder/test_tokenizer.py b/tests/transformers/load_subfolder/test_tokenizer.py new file mode 100644 index 000000000000..de44b2baf701 --- /dev/null +++ b/tests/transformers/load_subfolder/test_tokenizer.py @@ -0,0 +1,120 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddlenlp.transformers import ( + AutoTokenizer, + BertTokenizer, + CLIPTokenizer, + T5Tokenizer, +) +from paddlenlp.utils.log import logger + + +class TokenizerLoadTester(unittest.TestCase): + def test_bert_load(self): + logger.info("Download model from PaddleNLP BOS") + bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", from_hf_hub=False) + bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", from_hf_hub=False) + + logger.info("Download model from local") + bert_tokenizer.save_pretrained("./paddlenlp-test-model/bert-base-uncased") + bert_tokenizer = BertTokenizer.from_pretrained("./paddlenlp-test-model/bert-base-uncased") + bert_tokenizer = AutoTokenizer.from_pretrained("./paddlenlp-test-model/bert-base-uncased") + bert_tokenizer = BertTokenizer.from_pretrained("./paddlenlp-test-model/", subfolder="bert-base-uncased") + bert_tokenizer = AutoTokenizer.from_pretrained("./paddlenlp-test-model/", subfolder="bert-base-uncased") + + logger.info("Download model from PaddleNLP BOS with subfolder") + bert_tokenizer = BertTokenizer.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + ) + bert_tokenizer = AutoTokenizer.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + bert_tokenizer = BertTokenizer.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + bert_tokenizer = AutoTokenizer.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + bert_tokenizer = BertTokenizer.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="bert-base-uncased", from_aistudio=True + ) + bert_tokenizer = AutoTokenizer.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="bert-base-uncased", from_aistudio=True + ) + + def test_clip_load(self): + logger.info("Download model from PaddleNLP BOS") + clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + clip_tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + + logger.info("Download model from local") + clip_tokenizer.save_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_tokenizer = CLIPTokenizer.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + 
clip_tokenizer = AutoTokenizer.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") + clip_tokenizer = CLIPTokenizer.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + clip_tokenizer = AutoTokenizer.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_tokenizer = CLIPTokenizer.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + clip_tokenizer = AutoTokenizer.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + clip_tokenizer = CLIPTokenizer.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + clip_tokenizer = AutoTokenizer.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + + logger.info("Download model from aistudio with subfolder") + clip_tokenizer = CLIPTokenizer.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) + clip_tokenizer = AutoTokenizer.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + ) + + def test_t5_load(self): + logger.info("Download model from PaddleNLP BOS") + t5_tokenizer = T5Tokenizer.from_pretrained("t5-small", from_hf_hub=False) + t5_tokenizer = AutoTokenizer.from_pretrained("t5-small", from_hf_hub=False) + + logger.info("Download model from local") + t5_tokenizer.save_pretrained("./paddlenlp-test-model/t5-small") + t5_tokenizer = T5Tokenizer.from_pretrained("./paddlenlp-test-model/t5-small") + t5_tokenizer = AutoTokenizer.from_pretrained("./paddlenlp-test-model/t5-small") + t5_tokenizer = T5Tokenizer.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small") + t5_tokenizer = AutoTokenizer.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small") + + logger.info("Download model from PaddleNLP BOS with subfolder") + t5_tokenizer = T5Tokenizer.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False + ) + t5_tokenizer = AutoTokenizer.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False + ) + + logger.info("Download model from aistudio") + t5_tokenizer = T5Tokenizer.from_pretrained("aistudio/t5-small", from_aistudio=True) + t5_tokenizer = AutoTokenizer.from_pretrained("aistudio/t5-small", from_aistudio=True) + + t5_tokenizer = T5Tokenizer.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="t5-small", from_aistudio=True + ) + t5_tokenizer = AutoTokenizer.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="t5-small", from_aistudio=True + ) From 259fa80ea1977e7d2280ff415e10c8e297feb950 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 19 Dec 2023 21:47:23 +0800 Subject: [PATCH 11/27] fix --- paddlenlp/transformers/tokenizer_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 4e862beab607..aea363094f68 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1671,7 +1671,7 @@ def convert_added_tokens(obj): tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) if return_tokenizer_file_dir: - return tokenizer, tokenizer_config_file_dir_list[0] + return tokenizer, 
list(tokenizer_config_file_dir_list)[0]
         return tokenizer

     def save_pretrained(self, save_directory, filename_prefix: Optional[str] = None, **kwargs):

From 1f808a9640a604bb619926b0d5535ccac8bfaa2a Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Tue, 19 Dec 2023 21:55:32 +0800
Subject: [PATCH 12/27] update

---
 paddlenlp/transformers/tokenizer_utils_base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py
index aea363094f68..9596da645f6d 100644
--- a/paddlenlp/transformers/tokenizer_utils_base.py
+++ b/paddlenlp/transformers/tokenizer_utils_base.py
@@ -1557,8 +1557,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         )
         tokenizer_config_file_dir_list = set()
         for k, v in resolved_vocab_files.items():
-            tokenizer_config_file_dir_list.add(os.path.dirname(v))
-        assert len(tokenizer_config_file_dir_list) <= 1, "All tokenizer files should be in the same directory."
+            if v is not None and os.path.isfile(v):
+                tokenizer_config_file_dir_list.add(os.path.dirname(v))
+        assert len(tokenizer_config_file_dir_list) == 1, "All tokenizer files should be in the same directory."
         # Prepare tokenizer initialization kwargs
         # Did we saved some inputs and kwargs to reload ?
         has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None

From 26dd597590e94e4a4939ecf81c2925a3abda0244 Mon Sep 17 00:00:00 2001
From: CrazyBoyM
Date: Tue, 19 Dec 2023 15:13:49 +0000
Subject: [PATCH 13/27] fix tokenizer_config_file_dir_list

---
 paddlenlp/transformers/configuration_utils.py  | 1 +
 paddlenlp/transformers/tokenizer_utils_base.py | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py
index fd2da84a9777..1af6506c2d21 100644
--- a/paddlenlp/transformers/configuration_utils.py
+++ b/paddlenlp/transformers/configuration_utils.py
@@ -787,6 +787,7 @@ def _get_config_dict(
             legacy_url_list.insert(2, subfolder)
             community_url = "/".join(url_list)
             legacy_community_url = "/".join(legacy_url_list)
+
             if url_file_exists(community_url):
                 resolved_config_file = get_path_from_url_with_filelock(
                     community_url,

diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py
index 9596da645f6d..3df310373e22 100644
--- a/paddlenlp/transformers/tokenizer_utils_base.py
+++ b/paddlenlp/transformers/tokenizer_utils_base.py
@@ -1559,7 +1559,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         for k, v in resolved_vocab_files.items():
             if v is not None and os.path.isfile(v):
                 tokenizer_config_file_dir_list.add(os.path.dirname(v))
-        assert len(tokenizer_config_file_dir_list) == 1, "All tokenizer files should be in the same directory."
+        tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list)
+        # TODO: check this
+        assert len(tokenizer_config_file_dir_list) > 0, "At least one tokenizer file should be resolved."
         # Prepare tokenizer initialization kwargs
         # Did we saved some inputs and kwargs to reload ?
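
The directory check that patches 11 through 13 iterate on collects the parent
directory of every resolved file; patch 13 relaxes the guarantee from "exactly
one directory" to "at least one", leaving the stricter check as a TODO. The
intent in plain Python (the os.path.isfile filter is dropped because these
paths are illustrative):

    import os

    resolved_vocab_files = {
        "vocab_file": "/cache/org/repo/sub/vocab.txt",
        "tokenizer_config_file": "/cache/org/repo/sub/tokenizer_config.json",
        "added_tokens_file": None,  # optional files may be absent
    }
    dirs = {os.path.dirname(v) for v in resolved_vocab_files.values() if v is not None}
    assert len(dirs) >= 1  # what the relaxed assert enforces
    assert len(dirs) == 1  # the property the chat-template lookup relies on
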
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None From 899a45bb5f7098da0420abc725d58bc08bd54e13 Mon Sep 17 00:00:00 2001 From: CrazyBoyM Date: Thu, 21 Dec 2023 17:56:04 +0000 Subject: [PATCH 14/27] subfolder test --- .../transformers/load_subfolder/test_model.py | 424 ++++++++++++++++-- 1 file changed, 382 insertions(+), 42 deletions(-) diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py index 98b440d5249f..0718fbf54b94 100644 --- a/tests/transformers/load_subfolder/test_model.py +++ b/tests/transformers/load_subfolder/test_model.py @@ -12,91 +12,431 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import tempfile import unittest -from paddlenlp.transformers import AutoModel, BertModel, CLIPModel, T5Model +import pytest + +from paddlenlp.transformers import AutoModel, BertModel, CLIPTextModel, T5Model from paddlenlp.utils.log import logger class ModelLoadTester(unittest.TestCase): + @pytest.mark.skip + def test_config_diff(self, config_1, config_2): + config_1 = config_1.to_dict() + config_2 = config_2.to_dict() + config_1.pop("architectures", None) + config_2.pop("architectures", None) + assert config_1 == config_2, "config not equal" + + @pytest.mark.skip + def test_cache_dir( + self, model_cls, repo_id="", subfolder=None, use_safetensors=False, from_aistudio=False, from_hf_hub=False + ): + with tempfile.TemporaryDirectory() as cache_dir: + model_cls.from_pretrained( + repo_id, + subfolder=subfolder, + cache_dir=cache_dir, + use_safetensors=use_safetensors, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + file_list = [] + for root, dirs, files in os.walk(cache_dir): + for file in files: + file_list.append(file) + assert len(file_list) > 0, "cache_dir is empty" + assert "config.json" in file_list, "config.json not in cache_dir" + if use_safetensors: + assert any(".safetensors" in f for f in file_list), "*.safetensors not in cache_dir" + else: + assert any(".pdparams" in f for f in file_list), "*.pdparams not in cache_dir" + def test_bert_load(self): + # BOS + logger.info("Download model from PaddleNLP BOS") + bert_model_bos = BertModel.from_pretrained("baicai/tiny-bert-2", from_hf_hub=False) + bert_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-bert-2", from_hf_hub=False) + self.test_config_diff(bert_model_bos.config, bert_model_bos_auto.config) + + logger.info("Download model from PaddleNLP BOS with subfolder") + bert_model_bos_sub = BertModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False + ) + self.test_config_diff(bert_model_bos.config, bert_model_bos_sub.config) + + bert_model_bos_sub_auto = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False + ) + self.test_config_diff(bert_model_bos_sub.config, bert_model_bos_sub_auto.config) + + # aistudio + logger.info("Download model from aistudio") + bert_model_aistudio = BertModel.from_pretrained("aistudio/tiny-bert", from_aistudio=True) + self.test_config_diff(bert_model_bos.config, bert_model_aistudio.config) + bert_model_aistudio_auto = AutoModel.from_pretrained("aistudio/tiny-bert", from_aistudio=True) + self.test_config_diff(bert_model_aistudio.config, bert_model_aistudio_auto.config) + + logger.info("Download model from aistudio with subfolder") + bert_model_aistudio_sub = BertModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-bert", 
from_aistudio=True + ) + self.test_config_diff(bert_model_aistudio.config, bert_model_aistudio_sub.config) + bert_model_aistudio_sub_auto = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True + ) + self.test_config_diff(bert_model_aistudio_sub.config, bert_model_aistudio_sub_auto.config) + + # local + logger.info("Download model from local") + bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert") + bert_model_local = BertModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-bert") + self.test_config_diff(bert_model_bos.config, bert_model_local.config) + bert_model_local_auto = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-bert") + self.test_config_diff(bert_model_local.config, bert_model_local_auto.config) + + logger.info("Test cache_dir") + # BOS + self.test_cache_dir(BertModel, "baicai/tiny-bert-2", from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-bert-2", from_hf_hub=False) + self.test_cache_dir(BertModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False) + + # aistudio + self.test_cache_dir(BertModel, "aistudio/tiny-bert", from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-bert", from_aistudio=True) + self.test_cache_dir(BertModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True) + + def test_bert_load_safe(self): + # BOS logger.info("Download model from PaddleNLP BOS") - bert_model = BertModel.from_pretrained("bert-base-uncased", from_hf_hub=False) - bert_model = AutoModel.from_pretrained("bert-base-uncased", from_hf_hub=False) + bert_model_bos = BertModel.from_pretrained("baicai/tiny-bert-2", use_safetensors=True, from_hf_hub=False) + bert_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-bert-2", use_safetensors=True, from_hf_hub=False) + self.test_config_diff(bert_model_bos.config, bert_model_bos_auto.config) logger.info("Download model from PaddleNLP BOS with subfolder") - bert_model = BertModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + bert_model_bos_sub = BertModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_hf_hub=False ) - bert_model = AutoModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False + self.test_config_diff(bert_model_bos.config, bert_model_bos_sub.config) + + bert_model_bos_sub_auto = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_hf_hub=False ) + self.test_config_diff(bert_model_bos_sub.config, bert_model_bos_sub_auto.config) + # aistudio logger.info("Download model from aistudio") - bert_model = BertModel.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) - bert_model = AutoModel.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True) + bert_model_aistudio = BertModel.from_pretrained("aistudio/tiny-bert", use_safetensors=True, from_aistudio=True) + self.test_config_diff(bert_model_bos.config, bert_model_aistudio.config) + bert_model_aistudio_auto = AutoModel.from_pretrained( + "aistudio/tiny-bert", use_safetensors=True, from_aistudio=True + ) + self.test_config_diff(bert_model_aistudio.config, 
bert_model_aistudio_auto.config) logger.info("Download model from aistudio with subfolder") - bert_model = BertModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="bert-base-uncased", from_aistudio=True + bert_model_aistudio_sub = BertModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_aistudio=True ) - bert_model = AutoModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="bert-base-uncased", from_aistudio=True + self.test_config_diff(bert_model_aistudio.config, bert_model_aistudio_sub.config) + bert_model_aistudio_sub_auto = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_aistudio=True ) + self.test_config_diff(bert_model_aistudio_sub.config, bert_model_aistudio_sub_auto.config) + # local logger.info("Download model from local") - bert_model.save_pretrained("./paddlenlp-test-model/bert-base-uncased") - bert_model = BertModel.from_pretrained("./paddlenlp-test-model/", subfolder="bert-base-uncased") - bert_model = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="bert-base-uncased") + bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert", safe_serialization=True) + bert_model_local = BertModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=True + ) + self.test_config_diff(bert_model_bos.config, bert_model_local.config) + bert_model_local_auto = AutoModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=True + ) + self.test_config_diff(bert_model_local.config, bert_model_local_auto.config) + + logger.info("Test cache_dir") + # BOS + self.test_cache_dir(BertModel, "baicai/tiny-bert-2", use_safetensors=True, from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-bert-2", use_safetensors=True, from_hf_hub=False) + self.test_cache_dir( + BertModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_hf_hub=False + ) + self.test_cache_dir( + AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_hf_hub=False + ) + + # aistudio + self.test_cache_dir(BertModel, "aistudio/tiny-bert", use_safetensors=True, from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-bert", use_safetensors=True, from_aistudio=True) + self.test_cache_dir( + BertModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_aistudio=True + ) + self.test_cache_dir( + AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_aistudio=True + ) def test_clip_load(self): + # BOS + logger.info("Download model from PaddleNLP BOS") + clip_model_bos = CLIPTextModel.from_pretrained("baicai/tiny-clip", from_hf_hub=False) + clip_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-clip", from_hf_hub=False) + self.test_config_diff(clip_model_bos.config, clip_model_bos_auto.config) + + logger.info("Download model from PaddleNLP BOS with subfolder") + clip_model_bos_sub = CLIPTextModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False + ) + self.test_config_diff(clip_model_bos.config, clip_model_bos_sub.config) + + clip_model_bos_sub_auto = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False + ) + self.test_config_diff(clip_model_bos_sub.config, clip_model_bos_sub_auto.config) + + # aistudio + logger.info("Download model from aistudio") + 
clip_model_aistudio = CLIPTextModel.from_pretrained("aistudio/tiny-clip", from_aistudio=True) + self.test_config_diff(clip_model_bos.config, clip_model_aistudio.config) + clip_model_aistudio_auto = AutoModel.from_pretrained("aistudio/tiny-clip", from_aistudio=True) + self.test_config_diff(clip_model_aistudio.config, clip_model_aistudio_auto.config) + + logger.info("Download model from aistudio with subfolder") + clip_model_aistudio_sub = CLIPTextModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True + ) + self.test_config_diff(clip_model_aistudio.config, clip_model_aistudio_sub.config) + clip_model_aistudio_sub_auto = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True + ) + self.test_config_diff(clip_model_aistudio_sub.config, clip_model_aistudio_sub_auto.config) + + # local + logger.info("Download model from local") + clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip") + clip_model_local = CLIPTextModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-clip") + self.test_config_diff(clip_model_bos.config, clip_model_local.config) + clip_model_local_auto = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-clip") + self.test_config_diff(clip_model_local.config, clip_model_local_auto.config) + + logger.info("Test cache_dir") + # BOS + self.test_cache_dir(CLIPTextModel, "baicai/tiny-clip", from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-clip", from_hf_hub=False) + self.test_cache_dir(CLIPTextModel, "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False) + + # aistudio + self.test_cache_dir(CLIPTextModel, "aistudio/tiny-clip", from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-clip", from_aistudio=True) + self.test_cache_dir(CLIPTextModel, "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True) + + def test_clip_load_safe(self): + # BOS logger.info("Download model from PaddleNLP BOS") - clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) - clip_model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False) + clip_model_bos = CLIPTextModel.from_pretrained("baicai/tiny-clip", use_safetensors=True, from_hf_hub=False) + clip_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-clip", use_safetensors=True, from_hf_hub=False) + self.test_config_diff(clip_model_bos.config, clip_model_bos_auto.config) logger.info("Download model from PaddleNLP BOS with subfolder") - clip_model = CLIPModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + clip_model_bos_sub = CLIPTextModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_hf_hub=False ) - clip_model = AutoModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False + self.test_config_diff(clip_model_bos.config, clip_model_bos_sub.config) + + clip_model_bos_sub_auto = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_hf_hub=False ) + self.test_config_diff(clip_model_bos_sub.config, clip_model_bos_sub_auto.config) + # aistudio logger.info("Download model from 
aistudio") - clip_model = CLIPModel.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) - clip_model = AutoModel.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True) + clip_model_aistudio = CLIPTextModel.from_pretrained( + "aistudio/tiny-clip", use_safetensors=True, from_aistudio=True + ) + self.test_config_diff(clip_model_bos.config, clip_model_aistudio.config) + clip_model_aistudio_auto = AutoModel.from_pretrained( + "aistudio/tiny-clip", use_safetensors=True, from_aistudio=True + ) + self.test_config_diff(clip_model_aistudio.config, clip_model_aistudio_auto.config) logger.info("Download model from aistudio with subfolder") - clip_model = CLIPModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + clip_model_aistudio_sub = CLIPTextModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_aistudio=True ) - clip_model = AutoModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True + self.test_config_diff(clip_model_aistudio.config, clip_model_aistudio_sub.config) + clip_model_aistudio_sub_auto = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_aistudio=True ) + self.test_config_diff(clip_model_aistudio_sub.config, clip_model_aistudio_sub_auto.config) + # local logger.info("Download model from local") - clip_model.save_pretrained("./paddlenlp-test-model/clip-vit-base-patch32") - clip_model = CLIPModel.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") - clip_model = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="clip-vit-base-patch32") + clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip", safe_serialization=True) + clip_model_local = CLIPTextModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=True + ) + self.test_config_diff(clip_model_bos.config, clip_model_local.config) + clip_model_local_auto = AutoModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=True + ) + self.test_config_diff(clip_model_local.config, clip_model_local_auto.config) + + logger.info("Test cache_dir") + # BOS + self.test_cache_dir(CLIPTextModel, "baicai/tiny-clip", use_safetensors=True, from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-clip", use_safetensors=True, from_hf_hub=False) + self.test_cache_dir( + CLIPTextModel, + "baicai/paddlenlp-test-model", + subfolder="tiny-clip", + use_safetensors=True, + from_hf_hub=False, + ) + self.test_cache_dir( + AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_hf_hub=False + ) + + # aistudio + self.test_cache_dir(CLIPTextModel, "aistudio/tiny-clip", use_safetensors=True, from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-clip", use_safetensors=True, from_aistudio=True) + self.test_cache_dir( + CLIPTextModel, + "aistudio/paddlenlp-test-model", + subfolder="tiny-clip", + use_safetensors=True, + from_aistudio=True, + ) + self.test_cache_dir( + AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_aistudio=True + ) def test_t5_load(self): + # BOS + logger.info("Download model from PaddleNLP BOS") + t5_model_bos = T5Model.from_pretrained("baicai/tiny-t5", from_hf_hub=False) + t5_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-t5", from_hf_hub=False) + 
self.test_config_diff(t5_model_bos.config, t5_model_bos_auto.config) + + logger.info("Download model from PaddleNLP BOS with subfolder") + t5_model_bos_sub = T5Model.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False + ) + self.test_config_diff(t5_model_bos.config, t5_model_bos_sub.config) + + t5_model_bos_sub_auto = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False + ) + self.test_config_diff(t5_model_bos_sub.config, t5_model_bos_sub_auto.config) + + # aistudio + logger.info("Download model from aistudio") + t5_model_aistudio = T5Model.from_pretrained("aistudio/tiny-t5", from_aistudio=True) + self.test_config_diff(t5_model_bos.config, t5_model_aistudio.config) + t5_model_aistudio_auto = AutoModel.from_pretrained("aistudio/tiny-t5", from_aistudio=True) + self.test_config_diff(t5_model_aistudio.config, t5_model_aistudio_auto.config) + + logger.info("Download model from aistudio with subfolder") + t5_model_aistudio_sub = T5Model.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True + ) + self.test_config_diff(t5_model_aistudio.config, t5_model_aistudio_sub.config) + t5_model_aistudio_sub_auto = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True + ) + self.test_config_diff(t5_model_aistudio_sub.config, t5_model_aistudio_sub_auto.config) + + # local + logger.info("Download model from local") + t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5") + t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5") + self.test_config_diff(t5_model_bos.config, t5_model_local.config) + t5_model_local_auto = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5") + self.test_config_diff(t5_model_local.config, t5_model_local_auto.config) + + logger.info("Test cache_dir") + # BOS + self.test_cache_dir(T5Model, "baicai/tiny-t5", from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-t5", from_hf_hub=False) + self.test_cache_dir(T5Model, "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False) + + # aistudio + self.test_cache_dir(T5Model, "aistudio/tiny-t5", from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-t5", from_aistudio=True) + self.test_cache_dir(T5Model, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True) + + def test_t5_load_safe(self): + # BOS logger.info("Download model from PaddleNLP BOS") - t5_model = T5Model.from_pretrained("t5-small", from_hf_hub=False) - t5_model = AutoModel.from_pretrained("t5-small", from_hf_hub=False) + t5_model_bos = T5Model.from_pretrained("baicai/tiny-t5", use_safetensors=True, from_hf_hub=False) + t5_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-t5", use_safetensors=True, from_hf_hub=False) + self.test_config_diff(t5_model_bos.config, t5_model_bos_auto.config) logger.info("Download model from PaddleNLP BOS with subfolder") - t5_model = T5Model.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False) - t5_model = AutoModel.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False) + t5_model_bos_sub = T5Model.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, 
from_hf_hub=False + ) + self.test_config_diff(t5_model_bos.config, t5_model_bos_sub.config) + + t5_model_bos_sub_auto = AutoModel.from_pretrained( + "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_hf_hub=False + ) + self.test_config_diff(t5_model_bos_sub.config, t5_model_bos_sub_auto.config) + # aistudio logger.info("Download model from aistudio") - t5_model = T5Model.from_pretrained("aistudio/t5-small", from_aistudio=True) - t5_model = AutoModel.from_pretrained("aistudio/t5-small", from_aistudio=True) + t5_model_aistudio = T5Model.from_pretrained("aistudio/tiny-t5", use_safetensors=True, from_aistudio=True) + self.test_config_diff(t5_model_bos.config, t5_model_aistudio.config) + t5_model_aistudio_auto = AutoModel.from_pretrained( + "aistudio/tiny-t5", use_safetensors=True, from_aistudio=True + ) + self.test_config_diff(t5_model_aistudio.config, t5_model_aistudio_auto.config) logger.info("Download model from aistudio with subfolder") - t5_model = T5Model.from_pretrained("aistudio/paddlenlp-test-model", subfolder="t5-small", from_aistudio=True) - t5_model = AutoModel.from_pretrained("aistudio/paddlenlp-test-model", subfolder="t5-small", from_aistudio=True) + t5_model_aistudio_sub = T5Model.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_aistudio=True + ) + self.test_config_diff(t5_model_aistudio.config, t5_model_aistudio_sub.config) + t5_model_aistudio_sub_auto = AutoModel.from_pretrained( + "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_aistudio=True + ) + self.test_config_diff(t5_model_aistudio_sub.config, t5_model_aistudio_sub_auto.config) + # local logger.info("Download model from local") - t5_model.save_pretrained("./paddlenlp-test-model/t5-small") - t5_model = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small") - t5_model = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="t5-small") + t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5", safe_serialization=True) + t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=True) + self.test_config_diff(t5_model_bos.config, t5_model_local.config) + t5_model_local_auto = AutoModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=True + ) + self.test_config_diff(t5_model_local.config, t5_model_local_auto.config) + + logger.info("Test cache_dir") + # BOS + self.test_cache_dir(T5Model, "baicai/tiny-t5", use_safetensors=True, from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-t5", use_safetensors=True, from_hf_hub=False) + self.test_cache_dir( + T5Model, "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_hf_hub=False + ) + self.test_cache_dir( + AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_hf_hub=False + ) + + # aistudio + self.test_cache_dir(T5Model, "aistudio/tiny-t5", use_safetensors=True, from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-t5", use_safetensors=True, from_aistudio=True) + self.test_cache_dir( + T5Model, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_aistudio=True + ) + self.test_cache_dir( + AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_aistudio=True + ) From 121fcda0fcb601ddf411049c1f158b54578db7ef Mon Sep 17 00:00:00 2001 From: CrazyBoyM Date: Mon, 25 Dec 2023 16:20:49 +0000 Subject: [PATCH 15/27] fix 
from_pretrained() loading of HF sharded models

---
 paddlenlp/transformers/conversion_utils.py    |   1 -
 paddlenlp/transformers/model_utils.py         | 264 ++++-----
 paddlenlp/transformers/utils.py               |   8 +
 .../transformers/load_subfolder/test_model.py | 519 ++++++++++++++++--
 4 files changed, 605 insertions(+), 187 deletions(-)

diff --git a/paddlenlp/transformers/conversion_utils.py b/paddlenlp/transformers/conversion_utils.py
index 054239e91e1b..c883c1f87a14 100644
--- a/paddlenlp/transformers/conversion_utils.py
+++ b/paddlenlp/transformers/conversion_utils.py
@@ -1012,7 +1012,6 @@ def convert(cls, weight_file: str, config: PretrainedConfig, cache_dir: str) ->
         """
         # FIXME(wj-Mcat): add compatibility with downstream models
         name_mappings = cls._get_name_mappings(config)
-
         if weight_file.endswith(".index.json"):
             if ".safetensors." in weight_file:
                 files = [file for file in os.listdir(os.path.dirname(weight_file)) if file.startswith("model-")]
diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py
index a630b917061b..48abe2e610b6 100644
--- a/paddlenlp/transformers/model_utils.py
+++ b/paddlenlp/transformers/model_utils.py
@@ -354,7 +354,9 @@ def load_state_dict(
     return state_dict
 
 
-def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conversion: bool, subfolder=None):
+def resolve_weight_file_from_hf_hub(
+    repo_id: str, cache_dir: str, support_conversion: bool, subfolder=None, use_safetensors=False
+):
     """find the suitable weight file name
 
     Args:
@@ -363,30 +365,40 @@ def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conver
         support_conversion (bool): whether support converting pytorch weight file to paddle weight file
         subfolder (str, optional) An optional value corresponding to a folder inside the repo.
     """
-    is_local = os.path.isdir(repo_id)
-    if not is_local:
-        if hf_file_exists(repo_id, PADDLE_WEIGHTS_NAME, subfolder=subfolder):
+    is_sharded = False
+    if use_safetensors:
+        # SAFE WEIGHTS
+        if hf_file_exists(repo_id, SAFE_WEIGHTS_INDEX_NAME, subfolder=subfolder):
+            file_name = SAFE_WEIGHTS_INDEX_NAME
+            is_sharded = True
+        elif hf_file_exists(repo_id, SAFE_WEIGHTS_NAME, subfolder=subfolder):
+            file_name = SAFE_WEIGHTS_NAME
+        else:
+            # fail fast here, otherwise `file_name` would be unbound below
+            raise EntryNotFoundError(
+                message=f"can not find the safetensors weight file from: https://huggingface.co/{repo_id}",
+                response=None,
+            )
+    else:
+        # RAW WEIGHTS
+        if hf_file_exists(repo_id, PADDLE_WEIGHTS_INDEX_NAME, subfolder=subfolder):
+            file_name = PADDLE_WEIGHTS_INDEX_NAME
+            is_sharded = True
+        elif hf_file_exists(repo_id, PYTORCH_WEIGHTS_INDEX_NAME, subfolder=subfolder):
+            file_name = PYTORCH_WEIGHTS_INDEX_NAME
+            is_sharded = True
+        elif hf_file_exists(repo_id, PADDLE_WEIGHTS_NAME, subfolder=subfolder):
             file_name = PADDLE_WEIGHTS_NAME
-            assert (
-                support_conversion is False
-            ), "Please call set convert_from_torch for paddle weights on huggingface hub, eg. Model.from_pretrained(model_name, from_hf_hub=True, convert_from_torch=False)"
         elif hf_file_exists(repo_id, PYTORCH_WEIGHTS_NAME, subfolder=subfolder):
-            if not support_conversion:
-                raise EntryNotFoundError(
-                    f"can not download `{PADDLE_WEIGHTS_NAME} from https://huggingface.co/{repo_id}` "
-                    "and current model doesn't support conversion from pytorch weight file to paddle weight file"
-                )
             file_name = PYTORCH_WEIGHTS_NAME
+
         else:
             raise EntryNotFoundError(
                 message=f"can not find the paddle/pytorch weight file from: https://huggingface.co/{repo_id}",
                 response=None,
            )
-    else:
-        # for local file, we use support_conversion to select paddle or torch weight.
- file_name = PYTORCH_WEIGHTS_NAME if support_conversion else PADDLE_WEIGHTS_NAME - file_name_list = [SAFE_WEIGHTS_NAME] + [file_name] + [PYTORCH_WEIGHTS_INDEX_NAME] + [SAFE_WEIGHTS_INDEX_NAME] + file_name_list = [file_name] resolved_file = None for fn in file_name_list: resolved_file = cached_file_for_hf_hub( @@ -402,7 +408,7 @@ def resolve_weight_file_from_hf_hub(repo_id: str, cache_dir: str, support_conver f"'https://huggingface.co/{repo_id}' for available files." ) - return resolved_file + return resolved_file, is_sharded def register_base_model(cls): @@ -1441,16 +1447,6 @@ def _resolve_model_file_path( is_sharded = False sharded_metadata = None - # -1. when it's from HF - if from_hf_hub or convert_from_torch: - resolved_archive_file = resolve_weight_file_from_hf_hub( - pretrained_model_name_or_path, - cache_dir=cache_dir, - support_conversion=convert_from_torch, - subfolder=subfolder, - ) - return resolved_archive_file, sharded_metadata, is_sharded - if pretrained_model_name_or_path is not None: # the following code use a lot of os.path.join, hence setting subfolder to empty str if None if subfolder is None: @@ -1554,105 +1550,118 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v filename = pretrained_model_name_or_path resolved_archive_file = get_path_from_url_with_filelock(pretrained_model_name_or_path) else: - # set correct filename - if use_safetensors is not False: - filename = _add_variant(SAFE_WEIGHTS_NAME, variant) - else: - filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - - try: - # Load from URL or cache if already cached - cached_file_kwargs = dict( + # -1. when it's from HF + if from_hf_hub: + resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub( + pretrained_model_name_or_path, cache_dir=cache_dir, + support_conversion=convert_from_torch, subfolder=subfolder, - from_aistudio=from_aistudio, - _raise_exceptions_for_missing_entries=False, + use_safetensors=use_safetensors, ) - resolved_archive_file = None - if pretrained_model_name_or_path in cls.pretrained_init_configuration: - # fetch the weight url from the `pretrained_resource_files_map` - resource_file_url = cls.pretrained_resource_files_map["model_state"][ - pretrained_model_name_or_path - ] - resolved_archive_file = cached_file( - resource_file_url, _add_variant(PADDLE_WEIGHTS_NAME, variant), **cached_file_kwargs - ) - - if resolved_archive_file is None: - resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs - ) + else: + # set correct filename + if use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) else: - # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None - # result when internet is up, the repo and revision exist, but the file does not. - if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): - # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
- resolved_archive_file = cached_file( - pretrained_model_name_or_path, - _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), - **cached_file_kwargs, + try: + # Load from URL or cache if already cached + cached_file_kwargs = dict( + cache_dir=cache_dir, + subfolder=subfolder, + from_aistudio=from_aistudio, + _raise_exceptions_for_missing_entries=False, ) - if resolved_archive_file is not None: - is_sharded = True - elif use_safetensors: - raise EnvironmentError( - f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`." + resolved_archive_file = None + if pretrained_model_name_or_path in cls.pretrained_init_configuration: + # fetch the weight url from the `pretrained_resource_files_map` + resource_file_url = cls.pretrained_resource_files_map["model_state"][ + pretrained_model_name_or_path + ] + resolved_archive_file = cached_file( + resource_file_url, _add_variant(PADDLE_WEIGHTS_NAME, variant), **cached_file_kwargs + ) + + if resolved_archive_file is None: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs ) + else: - # This repo has no safetensors file of any kind, we switch to PyTorch. + # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs + pretrained_model_name_or_path, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, ) - if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant): - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - resolved_archive_file = cached_file( - pretrained_model_name_or_path, - _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), - **cached_file_kwargs, - ) - # raise ValueError(resolved_archive_file) - if resolved_archive_file is not None: - is_sharded = True - if resolved_archive_file is None: - # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error - # message. + if resolved_archive_file is not None: + is_sharded = True + elif use_safetensors: + raise EnvironmentError( + f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`." + ) + else: + # This repo has no safetensors file of any kind, we switch to PyTorch. + filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) + if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
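+                        # i.e. probe for the sharded index file (PADDLE_WEIGHTS_INDEX_NAME); if it
+                        # resolves, `is_sharded` is switched on below and the shards are fetched later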
+ resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + # raise ValueError(resolved_archive_file) + if resolved_archive_file is not None: + is_sharded = True + if resolved_archive_file is None: + # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error + # message. + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}." + ) + except Exception as e: + logger.info(e) + # For any other exception, we throw a generic error. raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named" - f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}." + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://paddlenlp.bj.bcebos.com'" ) - except Exception as e: - logger.info(e) - # For any other exception, we throw a generic error. - raise EnvironmentError( - f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" - " from 'https://paddlenlp.bj.bcebos.com'" - ) if is_local: logger.info(f"Loading weights file {archive_file}") resolved_archive_file = archive_file else: - logger.info(f"Loading weights file {filename} from cache at {resolved_archive_file}") + logger.info(f"Loading weights file from cache at {resolved_archive_file}") else: resolved_archive_file = None # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + resolved_sharded_files = None if is_sharded: # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. - resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + resolved_sharded_files, sharded_metadata = get_checkpoint_shard_files( pretrained_model_name_or_path, resolved_archive_file, from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, cache_dir=cache_dir, subfolder=subfolder, ) - return resolved_archive_file, sharded_metadata, is_sharded + return resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded @classmethod def _load_pretrained_model( @@ -1660,7 +1669,7 @@ def _load_pretrained_model( model: PretrainedModel, state_dict: Dict[str, Tensor], loaded_keys: List[str], - resolved_archive_file, + resolved_archive_file: Union[str, List], pretrained_model_name_or_path, config=None, ignore_mismatched_sizes=False, @@ -2113,7 +2122,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): use_keep_in_fp32_modules = False # resolve model_weight file - resolved_archive_file, sharded_metadata, is_sharded = cls._resolve_model_file_path( + resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded = cls._resolve_model_file_path( pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder, @@ -2125,41 +2134,40 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): variant=variant, ) - # load pt weights early so that we know which dtype to init the model under + if convert_from_torch and state_dict is None: + if ( + resolved_archive_file.endswith(PYTORCH_WEIGHTS_NAME) + or resolved_archive_file.endswith(PYTORCH_WEIGHTS_INDEX_NAME) + or resolved_archive_file.endswith(SAFE_WEIGHTS_NAME) + or resolved_archive_file.endswith(SAFE_WEIGHTS_INDEX_NAME) + ): + # try to get the name-mapping info + logger.info( + f"Starting to convert pytorch weight file<{resolved_archive_file}> to 
" + f"paddle weight file<{os.path.join(cache_dir, pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_NAME)}> ..." + ) + state_dict = cls.convert( + resolved_archive_file, + config, + cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder), + ) + else: + raise ValueError(f"Unexpected file: {resolved_archive_file} for weight conversion.") + # load pt weights early so that we know which dtype to init the model under + if not is_sharded and state_dict is None: - # Time to load the checkpoint - if convert_from_torch: - if ( - resolved_archive_file.endswith(PYTORCH_WEIGHTS_NAME) - or resolved_archive_file.endswith(PYTORCH_WEIGHTS_INDEX_NAME) - or resolved_archive_file.endswith(SAFE_WEIGHTS_NAME) - or resolved_archive_file.endswith(SAFE_WEIGHTS_INDEX_NAME) - ): - # try to get the name-mapping info - logger.info( - f"Starting to convert pytorch weight file<{resolved_archive_file}> to " - f"paddle weight file<{os.path.join(cache_dir, pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_NAME)}> ..." - ) - state_dict = cls.convert( - resolved_archive_file, - config, - cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder), - ) - else: - raise ValueError(f"Unexpected file: {resolved_archive_file} for weight conversion.") + # 4. loading non-sharded ckpt from the state dict + if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model_state.pdparams"): + state_dict = cls.convert_tensor_parallel(resolved_archive_file, config) + elif config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model.safetensors"): + with safe_open(resolved_archive_file, framework="np", device="cpu") as f: + loaded_keys = f.keys() + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys) + state_dict = load_state_dict(resolved_archive_file, tp_actions) else: - # 4. 
loading non-sharded ckpt from the state dict - if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model_state.pdparams"): - state_dict = cls.convert_tensor_parallel(resolved_archive_file, config) - elif config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model.safetensors"): - with safe_open(resolved_archive_file, framework="np", device="cpu") as f: - loaded_keys = f.keys() - tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys) - state_dict = load_state_dict(resolved_archive_file, tp_actions) - else: - state_dict = load_state_dict(resolved_archive_file) + state_dict = load_state_dict(resolved_archive_file) - logger.info("Loaded weights file from disk, setting weights to model.") + logger.info("Loaded weights file from disk, setting weights to model.") # Check if `_keep_in_fp32_modules` is not None use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and dtype == "float16" @@ -2202,7 +2210,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): model=model, state_dict=state_dict, loaded_keys=loaded_state_dict_keys, - resolved_archive_file=resolved_archive_file, + resolved_archive_file=resolved_sharded_files if is_sharded else resolved_archive_file, pretrained_model_name_or_path=pretrained_model_name_or_path, config=config, ignore_mismatched_sizes=ignore_mismatched_sizes, diff --git a/paddlenlp/transformers/utils.py b/paddlenlp/transformers/utils.py index 49a8a9d532c1..9b1afe235afd 100644 --- a/paddlenlp/transformers/utils.py +++ b/paddlenlp/transformers/utils.py @@ -612,6 +612,7 @@ def get_checkpoint_shard_files( cache_dir=None, subfolder="", from_aistudio=False, + from_hf_hub=False, ): """ For a given model: @@ -666,6 +667,13 @@ def get_checkpoint_shard_files( subfolder=subfolder, cache_dir=cache_dir, ) + elif from_hf_hub: + cached_filename = hf_hub_download( + repo_id=pretrained_model_name_or_path, + filename=shard_filename, + subfolder=subfolder, + cache_dir=cache_dir, + ) else: cached_filename = paddlenlp_hub_download( pretrained_model_name_or_path, diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py index 0718fbf54b94..285d46e8d402 100644 --- a/tests/transformers/load_subfolder/test_model.py +++ b/tests/transformers/load_subfolder/test_model.py @@ -53,63 +53,146 @@ def test_cache_dir( if use_safetensors: assert any(".safetensors" in f for f in file_list), "*.safetensors not in cache_dir" else: - assert any(".pdparams" in f for f in file_list), "*.pdparams not in cache_dir" + if from_hf_hub: + assert any(".bin" in f for f in file_list), "*.bin not in cache_dir" + else: + assert any(".pdparams" in f for f in file_list), "*.pdparams not in cache_dir" def test_bert_load(self): # BOS logger.info("Download model from PaddleNLP BOS") - bert_model_bos = BertModel.from_pretrained("baicai/tiny-bert-2", from_hf_hub=False) - bert_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-bert-2", from_hf_hub=False) + bert_model_bos = BertModel.from_pretrained("baicai/tiny-bert-2", use_safetensors=False, from_hf_hub=False) + bert_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-bert-2", use_safetensors=False, from_hf_hub=False) self.test_config_diff(bert_model_bos.config, bert_model_bos_auto.config) logger.info("Download model from PaddleNLP BOS with subfolder") bert_model_bos_sub = BertModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False + "baicai/paddlenlp-test-model", subfolder="tiny-bert", 
use_safetensors=False, from_hf_hub=False ) self.test_config_diff(bert_model_bos.config, bert_model_bos_sub.config) bert_model_bos_sub_auto = AutoModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False + "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=False, from_hf_hub=False ) self.test_config_diff(bert_model_bos_sub.config, bert_model_bos_sub_auto.config) # aistudio logger.info("Download model from aistudio") - bert_model_aistudio = BertModel.from_pretrained("aistudio/tiny-bert", from_aistudio=True) + bert_model_aistudio = BertModel.from_pretrained( + "aistudio/tiny-bert", use_safetensors=False, from_aistudio=True + ) self.test_config_diff(bert_model_bos.config, bert_model_aistudio.config) - bert_model_aistudio_auto = AutoModel.from_pretrained("aistudio/tiny-bert", from_aistudio=True) + bert_model_aistudio_auto = AutoModel.from_pretrained( + "aistudio/tiny-bert", use_safetensors=False, from_aistudio=True + ) self.test_config_diff(bert_model_aistudio.config, bert_model_aistudio_auto.config) + # hf + logger.info("Download model from hf") + bert_model_hf = BertModel.from_pretrained("Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=False) + bert_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=False) + self.test_config_diff(bert_model_hf.config, bert_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + bert_model_hf_sub = BertModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(bert_model_hf.config, bert_model_hf_sub.config) + bert_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(bert_model_hf_sub.config, bert_model_hf_sub_auto.config) + bert_model_hf = BertModel.from_pretrained("Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=False) + self.test_config_diff(bert_model_hf.config, bert_model_hf.config) + bert_model_hf_auto = AutoModel.from_pretrained( + "Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(bert_model_hf.config, bert_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + bert_model_hf_sub = BertModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(bert_model_hf.config, bert_model_hf_sub.config) + bert_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(bert_model_hf_sub.config, bert_model_hf_sub_auto.config) + logger.info("Download model from aistudio with subfolder") bert_model_aistudio_sub = BertModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True + "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=False, from_aistudio=True ) self.test_config_diff(bert_model_aistudio.config, bert_model_aistudio_sub.config) bert_model_aistudio_sub_auto = AutoModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True + "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=False, from_aistudio=True ) self.test_config_diff(bert_model_aistudio_sub.config, bert_model_aistudio_sub_auto.config) # local 
logger.info("Download model from local") - bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert") - bert_model_local = BertModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-bert") + bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert", safe_serialization=True) + bert_model_local = BertModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=False + ) self.test_config_diff(bert_model_bos.config, bert_model_local.config) - bert_model_local_auto = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-bert") + bert_model_local_auto = AutoModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=False + ) self.test_config_diff(bert_model_local.config, bert_model_local_auto.config) logger.info("Test cache_dir") # BOS - self.test_cache_dir(BertModel, "baicai/tiny-bert-2", from_hf_hub=False) - self.test_cache_dir(AutoModel, "baicai/tiny-bert-2", from_hf_hub=False) - self.test_cache_dir(BertModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False) - self.test_cache_dir(AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=False) + self.test_cache_dir(BertModel, "baicai/tiny-bert-2", use_safetensors=False, from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-bert-2", use_safetensors=False, from_hf_hub=False) + self.test_cache_dir( + BertModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=False, from_hf_hub=False + ) + self.test_cache_dir( + AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=False, from_hf_hub=False + ) # aistudio - self.test_cache_dir(BertModel, "aistudio/tiny-bert", from_aistudio=True) - self.test_cache_dir(AutoModel, "aistudio/tiny-bert", from_aistudio=True) - self.test_cache_dir(BertModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True) - self.test_cache_dir(AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", from_aistudio=True) + self.test_cache_dir(BertModel, "aistudio/tiny-bert", use_safetensors=False, from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-bert", use_safetensors=False, from_aistudio=True) + self.test_cache_dir( + BertModel, + "aistudio/paddlenlp-test-model", + subfolder="tiny-bert", + use_safetensors=False, + from_aistudio=True, + ) + self.test_cache_dir( + AutoModel, + "aistudio/paddlenlp-test-model", + subfolder="tiny-bert", + use_safetensors=False, + from_aistudio=True, + ) + + # hf + self.test_cache_dir(BertModel, "Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir(AutoModel, "Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir( + BertModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=False + ) + self.test_cache_dir( + AutoModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=False + ) + self.test_cache_dir(BertModel, "Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir(AutoModel, "Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir( + BertModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-bert-one", + from_hf_hub=True, + use_safetensors=False, + ) + self.test_cache_dir( + AutoModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-bert-one", + from_hf_hub=True, + use_safetensors=False, + ) def test_bert_load_safe(self): # BOS @@ 
-138,6 +221,36 @@ def test_bert_load_safe(self): ) self.test_config_diff(bert_model_aistudio.config, bert_model_aistudio_auto.config) + # hf + logger.info("Download model from hf") + bert_model_hf = BertModel.from_pretrained("Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=True) + bert_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=True) + self.test_config_diff(bert_model_hf.config, bert_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + bert_model_hf_sub = BertModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(bert_model_hf.config, bert_model_hf_sub.config) + bert_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(bert_model_hf_sub.config, bert_model_hf_sub_auto.config) + bert_model_hf = BertModel.from_pretrained("Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=True) + self.test_config_diff(bert_model_hf.config, bert_model_hf.config) + bert_model_hf_auto = AutoModel.from_pretrained( + "Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(bert_model_hf.config, bert_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + bert_model_hf_sub = BertModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(bert_model_hf.config, bert_model_hf_sub.config) + bert_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-bert-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(bert_model_hf_sub.config, bert_model_hf_sub_auto.config) + logger.info("Download model from aistudio with subfolder") bert_model_aistudio_sub = BertModel.from_pretrained( "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_aistudio=True @@ -181,61 +294,177 @@ def test_bert_load_safe(self): AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-bert", use_safetensors=True, from_aistudio=True ) + # hf + self.test_cache_dir(BertModel, "Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir(AutoModel, "Baicai003/tiny-bert", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir( + BertModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=True + ) + self.test_cache_dir( + AutoModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-bert", from_hf_hub=True, use_safetensors=True + ) + self.test_cache_dir(BertModel, "Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir(AutoModel, "Baicai003/tiny-bert-one", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir( + BertModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-bert-one", + from_hf_hub=True, + use_safetensors=True, + ) + self.test_cache_dir( + AutoModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-bert-one", + from_hf_hub=True, + use_safetensors=True, + ) + def test_clip_load(self): # BOS logger.info("Download model from PaddleNLP BOS") - clip_model_bos = CLIPTextModel.from_pretrained("baicai/tiny-clip", from_hf_hub=False) - clip_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-clip", from_hf_hub=False) + clip_model_bos = 
CLIPTextModel.from_pretrained("baicai/tiny-clip", use_safetensors=False, from_hf_hub=False) + clip_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-clip", use_safetensors=False, from_hf_hub=False) self.test_config_diff(clip_model_bos.config, clip_model_bos_auto.config) logger.info("Download model from PaddleNLP BOS with subfolder") clip_model_bos_sub = CLIPTextModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False + "baicai/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=False, from_hf_hub=False ) self.test_config_diff(clip_model_bos.config, clip_model_bos_sub.config) clip_model_bos_sub_auto = AutoModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False + "baicai/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=False, from_hf_hub=False ) self.test_config_diff(clip_model_bos_sub.config, clip_model_bos_sub_auto.config) # aistudio logger.info("Download model from aistudio") - clip_model_aistudio = CLIPTextModel.from_pretrained("aistudio/tiny-clip", from_aistudio=True) + clip_model_aistudio = CLIPTextModel.from_pretrained( + "aistudio/tiny-clip", use_safetensors=False, from_aistudio=True + ) self.test_config_diff(clip_model_bos.config, clip_model_aistudio.config) - clip_model_aistudio_auto = AutoModel.from_pretrained("aistudio/tiny-clip", from_aistudio=True) + clip_model_aistudio_auto = AutoModel.from_pretrained( + "aistudio/tiny-clip", use_safetensors=False, from_aistudio=True + ) self.test_config_diff(clip_model_aistudio.config, clip_model_aistudio_auto.config) logger.info("Download model from aistudio with subfolder") clip_model_aistudio_sub = CLIPTextModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True + "aistudio/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=False, from_aistudio=True ) self.test_config_diff(clip_model_aistudio.config, clip_model_aistudio_sub.config) clip_model_aistudio_sub_auto = AutoModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True + "aistudio/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=False, from_aistudio=True ) self.test_config_diff(clip_model_aistudio_sub.config, clip_model_aistudio_sub_auto.config) + # hf + logger.info("Download model from hf") + clip_model_hf = CLIPTextModel.from_pretrained("Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=False) + clip_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=False) + self.test_config_diff(clip_model_hf.config, clip_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + clip_model_hf_sub = CLIPTextModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf_sub.config) + clip_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(clip_model_hf_sub.config, clip_model_hf_sub_auto.config) + clip_model_hf = CLIPTextModel.from_pretrained( + "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf.config) + clip_model_hf_auto = AutoModel.from_pretrained( + "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(clip_model_hf.config, 
clip_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + clip_model_hf_sub = CLIPTextModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf_sub.config) + clip_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(clip_model_hf_sub.config, clip_model_hf_sub_auto.config) + # local logger.info("Download model from local") - clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip") - clip_model_local = CLIPTextModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-clip") + clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip", safe_serialization=True) + clip_model_local = CLIPTextModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=False + ) self.test_config_diff(clip_model_bos.config, clip_model_local.config) - clip_model_local_auto = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-clip") + clip_model_local_auto = AutoModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=False + ) self.test_config_diff(clip_model_local.config, clip_model_local_auto.config) logger.info("Test cache_dir") # BOS - self.test_cache_dir(CLIPTextModel, "baicai/tiny-clip", from_hf_hub=False) - self.test_cache_dir(AutoModel, "baicai/tiny-clip", from_hf_hub=False) - self.test_cache_dir(CLIPTextModel, "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False) - self.test_cache_dir(AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=False) + self.test_cache_dir(CLIPTextModel, "baicai/tiny-clip", use_safetensors=False, from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-clip", use_safetensors=False, from_hf_hub=False) + self.test_cache_dir( + CLIPTextModel, + "baicai/paddlenlp-test-model", + subfolder="tiny-clip", + use_safetensors=False, + from_hf_hub=False, + ) + self.test_cache_dir( + AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=False, from_hf_hub=False + ) # aistudio - self.test_cache_dir(CLIPTextModel, "aistudio/tiny-clip", from_aistudio=True) - self.test_cache_dir(AutoModel, "aistudio/tiny-clip", from_aistudio=True) - self.test_cache_dir(CLIPTextModel, "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True) - self.test_cache_dir(AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-clip", from_aistudio=True) + self.test_cache_dir(CLIPTextModel, "aistudio/tiny-clip", use_safetensors=False, from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-clip", use_safetensors=False, from_aistudio=True) + self.test_cache_dir( + CLIPTextModel, + "aistudio/paddlenlp-test-model", + subfolder="tiny-clip", + use_safetensors=False, + from_aistudio=True, + ) + self.test_cache_dir( + AutoModel, + "aistudio/paddlenlp-test-model", + subfolder="tiny-clip", + use_safetensors=False, + from_aistudio=True, + ) + + # hf + self.test_cache_dir(CLIPTextModel, "Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir(AutoModel, "Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir( + CLIPTextModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-clip", + from_hf_hub=True, + use_safetensors=False, + ) + self.test_cache_dir( + AutoModel, 
"Baicai003/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=True, use_safetensors=False + ) + self.test_cache_dir(CLIPTextModel, "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir(AutoModel, "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir( + CLIPTextModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-clip-one", + from_hf_hub=True, + use_safetensors=False, + ) + self.test_cache_dir( + AutoModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-clip-one", + from_hf_hub=True, + use_safetensors=False, + ) def test_clip_load_safe(self): # BOS @@ -276,6 +505,38 @@ def test_clip_load_safe(self): ) self.test_config_diff(clip_model_aistudio_sub.config, clip_model_aistudio_sub_auto.config) + # hf + logger.info("Download model from hf") + clip_model_hf = CLIPTextModel.from_pretrained("Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=True) + clip_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=True) + self.test_config_diff(clip_model_hf.config, clip_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + clip_model_hf_sub = CLIPTextModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf_sub.config) + clip_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(clip_model_hf_sub.config, clip_model_hf_sub_auto.config) + clip_model_hf = CLIPTextModel.from_pretrained( + "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf.config) + clip_model_hf_auto = AutoModel.from_pretrained( + "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + clip_model_hf_sub = CLIPTextModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(clip_model_hf.config, clip_model_hf_sub.config) + clip_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-clip-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(clip_model_hf_sub.config, clip_model_hf_sub_auto.config) + # local logger.info("Download model from local") clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip", safe_serialization=True) @@ -317,61 +578,153 @@ def test_clip_load_safe(self): AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-clip", use_safetensors=True, from_aistudio=True ) + # hf + self.test_cache_dir(CLIPTextModel, "Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir(AutoModel, "Baicai003/tiny-clip", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir( + CLIPTextModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-clip", + from_hf_hub=True, + use_safetensors=True, + ) + self.test_cache_dir( + AutoModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-clip", from_hf_hub=True, use_safetensors=True + ) + self.test_cache_dir(CLIPTextModel, "Baicai003/tiny-clip-one", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir(AutoModel, "Baicai003/tiny-clip-one", from_hf_hub=True, 
use_safetensors=True) + self.test_cache_dir( + CLIPTextModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-clip-one", + from_hf_hub=True, + use_safetensors=True, + ) + self.test_cache_dir( + AutoModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-clip-one", + from_hf_hub=True, + use_safetensors=True, + ) + def test_t5_load(self): # BOS logger.info("Download model from PaddleNLP BOS") - t5_model_bos = T5Model.from_pretrained("baicai/tiny-t5", from_hf_hub=False) - t5_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-t5", from_hf_hub=False) + t5_model_bos = T5Model.from_pretrained("baicai/tiny-t5", use_safetensors=False, from_hf_hub=False) + t5_model_bos_auto = AutoModel.from_pretrained("baicai/tiny-t5", use_safetensors=False, from_hf_hub=False) self.test_config_diff(t5_model_bos.config, t5_model_bos_auto.config) logger.info("Download model from PaddleNLP BOS with subfolder") t5_model_bos_sub = T5Model.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False + "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_hf_hub=False ) self.test_config_diff(t5_model_bos.config, t5_model_bos_sub.config) t5_model_bos_sub_auto = AutoModel.from_pretrained( - "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False + "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_hf_hub=False ) self.test_config_diff(t5_model_bos_sub.config, t5_model_bos_sub_auto.config) # aistudio logger.info("Download model from aistudio") - t5_model_aistudio = T5Model.from_pretrained("aistudio/tiny-t5", from_aistudio=True) + t5_model_aistudio = T5Model.from_pretrained("aistudio/tiny-t5", use_safetensors=False, from_aistudio=True) self.test_config_diff(t5_model_bos.config, t5_model_aistudio.config) - t5_model_aistudio_auto = AutoModel.from_pretrained("aistudio/tiny-t5", from_aistudio=True) + t5_model_aistudio_auto = AutoModel.from_pretrained( + "aistudio/tiny-t5", use_safetensors=False, from_aistudio=True + ) self.test_config_diff(t5_model_aistudio.config, t5_model_aistudio_auto.config) logger.info("Download model from aistudio with subfolder") t5_model_aistudio_sub = T5Model.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True + "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_aistudio=True ) self.test_config_diff(t5_model_aistudio.config, t5_model_aistudio_sub.config) t5_model_aistudio_sub_auto = AutoModel.from_pretrained( - "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True + "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_aistudio=True ) self.test_config_diff(t5_model_aistudio_sub.config, t5_model_aistudio_sub_auto.config) + # hf + logger.info("Download model from hf") + t5_model_hf = T5Model.from_pretrained("Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=False) + t5_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=False) + self.test_config_diff(t5_model_hf.config, t5_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + t5_model_hf_sub = T5Model.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(t5_model_hf.config, t5_model_hf_sub.config) + t5_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=False + ) + 
self.test_config_diff(t5_model_hf_sub.config, t5_model_hf_sub_auto.config) + t5_model_hf = T5Model.from_pretrained("Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=False) + self.test_config_diff(t5_model_hf.config, t5_model_hf.config) + t5_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=False) + self.test_config_diff(t5_model_hf.config, t5_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + t5_model_hf_sub = T5Model.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(t5_model_hf.config, t5_model_hf_sub.config) + t5_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5-one", from_hf_hub=True, use_safetensors=False + ) + self.test_config_diff(t5_model_hf_sub.config, t5_model_hf_sub_auto.config) + # local logger.info("Download model from local") - t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5") - t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5") + t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5", safe_serialization=True) + t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=False) self.test_config_diff(t5_model_bos.config, t5_model_local.config) - t5_model_local_auto = AutoModel.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5") + t5_model_local_auto = AutoModel.from_pretrained( + "./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=False + ) self.test_config_diff(t5_model_local.config, t5_model_local_auto.config) logger.info("Test cache_dir") # BOS - self.test_cache_dir(T5Model, "baicai/tiny-t5", from_hf_hub=False) - self.test_cache_dir(AutoModel, "baicai/tiny-t5", from_hf_hub=False) - self.test_cache_dir(T5Model, "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False) - self.test_cache_dir(AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=False) + self.test_cache_dir(T5Model, "baicai/tiny-t5", use_safetensors=False, from_hf_hub=False) + self.test_cache_dir(AutoModel, "baicai/tiny-t5", use_safetensors=False, from_hf_hub=False) + self.test_cache_dir( + T5Model, "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_hf_hub=False + ) + self.test_cache_dir( + AutoModel, "baicai/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_hf_hub=False + ) # aistudio - self.test_cache_dir(T5Model, "aistudio/tiny-t5", from_aistudio=True) - self.test_cache_dir(AutoModel, "aistudio/tiny-t5", from_aistudio=True) - self.test_cache_dir(T5Model, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True) - self.test_cache_dir(AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", from_aistudio=True) + self.test_cache_dir(T5Model, "aistudio/tiny-t5", use_safetensors=False, from_aistudio=True) + self.test_cache_dir(AutoModel, "aistudio/tiny-t5", use_safetensors=False, from_aistudio=True) + self.test_cache_dir( + T5Model, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_aistudio=True + ) + self.test_cache_dir( + AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=False, from_aistudio=True + ) + + # hf + self.test_cache_dir(T5Model, "Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir(AutoModel, "Baicai003/tiny-t5", from_hf_hub=True, 
use_safetensors=False) + self.test_cache_dir( + T5Model, "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=False + ) + self.test_cache_dir( + AutoModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=False + ) + self.test_cache_dir(T5Model, "Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir(AutoModel, "Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=False) + self.test_cache_dir( + T5Model, "Baicai003/paddlenlp-test-model", subfolder="tiny-t5-one", from_hf_hub=True, use_safetensors=False + ) + self.test_cache_dir( + AutoModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-t5-one", + from_hf_hub=True, + use_safetensors=False, + ) def test_t5_load_safe(self): # BOS @@ -410,6 +763,34 @@ def test_t5_load_safe(self): ) self.test_config_diff(t5_model_aistudio_sub.config, t5_model_aistudio_sub_auto.config) + # hf + logger.info("Download model from hf") + t5_model_hf = T5Model.from_pretrained("Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=True) + t5_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=True) + self.test_config_diff(t5_model_hf.config, t5_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + t5_model_hf_sub = T5Model.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(t5_model_hf.config, t5_model_hf_sub.config) + t5_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(t5_model_hf_sub.config, t5_model_hf_sub_auto.config) + t5_model_hf = T5Model.from_pretrained("Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=True) + self.test_config_diff(t5_model_hf.config, t5_model_hf.config) + t5_model_hf_auto = AutoModel.from_pretrained("Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=True) + self.test_config_diff(t5_model_hf.config, t5_model_hf_auto.config) + logger.info("Download model from hf with subfolder") + t5_model_hf_sub = T5Model.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(t5_model_hf.config, t5_model_hf_sub.config) + t5_model_hf_sub_auto = AutoModel.from_pretrained( + "Baicai003/paddlenlp-test-model", subfolder="tiny-t5-one", from_hf_hub=True, use_safetensors=True + ) + self.test_config_diff(t5_model_hf_sub.config, t5_model_hf_sub_auto.config) + # local logger.info("Download model from local") t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5", safe_serialization=True) @@ -440,3 +821,25 @@ def test_t5_load_safe(self): self.test_cache_dir( AutoModel, "aistudio/paddlenlp-test-model", subfolder="tiny-t5", use_safetensors=True, from_aistudio=True ) + + # hf + self.test_cache_dir(T5Model, "Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir(AutoModel, "Baicai003/tiny-t5", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir( + T5Model, "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=True + ) + self.test_cache_dir( + AutoModel, "Baicai003/paddlenlp-test-model", subfolder="tiny-t5", from_hf_hub=True, use_safetensors=True + ) + self.test_cache_dir(T5Model, "Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir(AutoModel, 
"Baicai003/tiny-t5-one", from_hf_hub=True, use_safetensors=True) + self.test_cache_dir( + T5Model, "Baicai003/paddlenlp-test-model", subfolder="tiny-t5-one", from_hf_hub=True, use_safetensors=True + ) + self.test_cache_dir( + AutoModel, + "Baicai003/paddlenlp-test-model", + subfolder="tiny-t5-one", + from_hf_hub=True, + use_safetensors=True, + ) From 2f76ee341ae7891fdc3ad084b27b2079341613d1 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 26 Dec 2023 10:10:02 +0800 Subject: [PATCH 16/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddlenlp/transformers/model_utils.py | 57 ++++++++++++++++----------- paddlenlp/utils/serialization.py | 36 +++++++++++------ 2 files changed, 58 insertions(+), 35 deletions(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 48abe2e610b6..c0ef8ec372b2 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -24,8 +24,7 @@ import warnings from contextlib import contextmanager from functools import partial - -# from pathlib import Path +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import aistudio_sdk @@ -324,7 +323,7 @@ def load_state_dict( # Check format of the archive with safe_open(checkpoint_file, framework="np") as f: metadata = f.metadata() - if metadata.get("format") not in ["pd", "np"]: + if metadata.get("format", "np") not in ["pd", "np"]: raise OSError( f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " "you save your model with the `save_pretrained` method." @@ -355,14 +354,14 @@ def load_state_dict( def resolve_weight_file_from_hf_hub( - repo_id: str, cache_dir: str, support_conversion: bool, subfolder=None, use_safetensors=False + repo_id: str, cache_dir: str, convert_from_torch: bool, subfolder=None, use_safetensors=False ): """find the suitable weight file name Args: repo_id (str): repo name of huggingface hub cache_dir (str): cache dir for hf - support_conversion (bool): whether support converting pytorch weight file to paddle weight file + convert_from_torch (bool): whether support converting pytorch weight file to paddle weight file subfolder (str, optional) An optional value corresponding to a folder inside the repo. 
""" is_sharded = False @@ -373,24 +372,35 @@ def resolve_weight_file_from_hf_hub( is_sharded = True elif hf_file_exists(repo_id, SAFE_WEIGHTS_NAME, subfolder=subfolder): file_name = SAFE_WEIGHTS_NAME - else: - # RAW WEIGHTS - if hf_file_exists(repo_id, PADDLE_WEIGHTS_INDEX_NAME, subfolder=subfolder): - file_name = PADDLE_WEIGHTS_INDEX_NAME - is_sharded = True - elif hf_file_exists(repo_id, PYTORCH_WEIGHTS_INDEX_NAME, subfolder=subfolder): - file_name = PYTORCH_WEIGHTS_INDEX_NAME - is_sharded = True - elif hf_file_exists(repo_id, PADDLE_WEIGHTS_NAME, subfolder=subfolder): - file_name = PADDLE_WEIGHTS_NAME - elif hf_file_exists(repo_id, PYTORCH_WEIGHTS_NAME, subfolder=subfolder): - file_name = PYTORCH_WEIGHTS_NAME - else: raise EntryNotFoundError( - message=f"can not find the paddle/pytorch weight file from: https://huggingface.co/{repo_id}", + message=f"can not find the safetensors weight file from: https://huggingface.co/{repo_id}", response=None, ) + else: + if convert_from_torch: + # TORCH WEIGHTS + if hf_file_exists(repo_id, PYTORCH_WEIGHTS_INDEX_NAME, subfolder=subfolder): + file_name = PYTORCH_WEIGHTS_INDEX_NAME + is_sharded = True + elif hf_file_exists(repo_id, PYTORCH_WEIGHTS_NAME, subfolder=subfolder): + file_name = PYTORCH_WEIGHTS_NAME + else: + raise EntryNotFoundError( + message=f"can not find the pytorch weight file from: https://huggingface.co/{repo_id}", + response=None, + ) + else: + if hf_file_exists(repo_id, PADDLE_WEIGHTS_INDEX_NAME, subfolder=subfolder): + file_name = PADDLE_WEIGHTS_INDEX_NAME + is_sharded = True + elif hf_file_exists(repo_id, PADDLE_WEIGHTS_NAME, subfolder=subfolder): + file_name = PADDLE_WEIGHTS_NAME + else: + raise EntryNotFoundError( + message=f"can not find the paddle weight file from: https://huggingface.co/{repo_id}", + response=None, + ) file_name_list = [file_name] resolved_file = None @@ -574,10 +584,11 @@ def shard_checkpoint( # Otherwise, let's build the index weight_map = {} shards = {} + weights_name_suffix = Path(weights_name).suffix for idx, shard in enumerate(sharded_state_dicts): - shard_file = weights_name.replace(".pdparams", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.pdparams") - shard_file = shard_file.replace( - ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors" + # replace `suffix` -> `-00001-of-00002suffix` + shard_file = weights_name.replace( + weights_name_suffix, f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}{weights_name_suffix}" ) shards[shard_file] = shard for key in shard.keys(): @@ -1555,7 +1566,7 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub( pretrained_model_name_or_path, cache_dir=cache_dir, - support_conversion=convert_from_torch, + convert_from_torch=convert_from_torch, subfolder=subfolder, use_safetensors=use_safetensors, ) diff --git a/paddlenlp/utils/serialization.py b/paddlenlp/utils/serialization.py index 48d837bf49da..9c9289577024 100644 --- a/paddlenlp/utils/serialization.py +++ b/paddlenlp/utils/serialization.py @@ -25,6 +25,7 @@ from safetensors import safe_open from paddlenlp.utils.env import PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME +from paddlenlp.utils.import_utils import is_torch_available MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -220,16 +221,27 @@ def persistent_load(saved_id): torch_zip.close() elif path.endswith(SAFE_WEIGHTS_NAME) or os.path.split(path)[-1].startswith("model-"): # Check format of the archive - with safe_open(path, framework="pt") as f: - metadata = 
f.metadata()
-            if metadata.get("format") not in ["pt"]:
-                raise OSError(
-                    f"You have open the `convert_from_torch` flag but the safetensors archive passed at {path} does not contain the 'pt' metadata."
-                )
-        state_dict = {}
-        with safe_open(path, framework="pt") as f:
-            for key in f.keys():
-                weight = f.get_tensor(key)
-                state_dict[key] = weight.numpy()
-
+        if is_torch_available():
+            with safe_open(path, framework="pt") as f:
+                metadata = f.metadata()
+                if metadata.get("format", "pt") not in ["pt"]:
+                    raise OSError(
+                        f"You have enabled the `convert_from_torch` flag, but the safetensors archive passed at {path} does not contain the 'pt' metadata."
+                    )
+            state_dict = {}
+            with safe_open(path, framework="pt") as f:
+                for key in f.keys():
+                    weight = f.get_tensor(key)
+                    state_dict[key] = weight.numpy()
+        else:
+            with safe_open(path, framework="np") as f:
+                metadata = f.metadata()
+                if metadata.get("format", "pt") not in ["pt", "np"]:
+                    raise OSError(
+                        f"You have enabled the `convert_from_torch` flag, but the safetensors archive passed at {path} does not contain the 'pt' or 'np' metadata."
+                    )
+            state_dict = {}
+            with safe_open(path, framework="np") as f:
+                for key in f.keys():
+                    state_dict[key] = f.get_tensor(key)
     return state_dict

From a8ca961e2c241923bea84fb99295be14bbde4333 Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Tue, 26 Dec 2023 10:34:59 +0800
Subject: [PATCH 17/27] update use_safetensors

---
 tests/transformers/load_subfolder/test_model.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py
index 285d46e8d402..34647b283c63 100644
--- a/tests/transformers/load_subfolder/test_model.py
+++ b/tests/transformers/load_subfolder/test_model.py
@@ -131,11 +131,11 @@ def test_bert_load(self):
         logger.info("Download model from local")
         bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert", safe_serialization=True)
         bert_model_local = BertModel.from_pretrained(
-            "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=False
+            "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=True
         )
         self.test_config_diff(bert_model_bos.config, bert_model_local.config)
         bert_model_local_auto = AutoModel.from_pretrained(
-            "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=False
+            "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=True
         )
         self.test_config_diff(bert_model_local.config, bert_model_local_auto.config)
 
@@ -395,11 +395,11 @@ def test_clip_load(self):
         logger.info("Download model from local")
         clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip", safe_serialization=True)
         clip_model_local = CLIPTextModel.from_pretrained(
-            "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=False
+            "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=True
         )
         self.test_config_diff(clip_model_bos.config, clip_model_local.config)
         clip_model_local_auto = AutoModel.from_pretrained(
-            "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=False
+            "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=True
         )
         self.test_config_diff(clip_model_local.config, clip_model_local_auto.config)
 
@@ -676,10 +676,10 @@ def test_t5_load(self):
         # local
         logger.info("Download model from local")
         t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5", safe_serialization=True)
-        t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=False)
+        
t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=True) self.test_config_diff(t5_model_bos.config, t5_model_local.config) t5_model_local_auto = AutoModel.from_pretrained( - "./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=False + "./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=True ) self.test_config_diff(t5_model_local.config, t5_model_local_auto.config) From 4a31701da86fe1512547179c1172e77dcf28ec76 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 26 Dec 2023 10:37:06 +0800 Subject: [PATCH 18/27] update --- .../transformers/load_subfolder/test_model.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py index 34647b283c63..6fbdb53caa7d 100644 --- a/tests/transformers/load_subfolder/test_model.py +++ b/tests/transformers/load_subfolder/test_model.py @@ -129,13 +129,13 @@ def test_bert_load(self): # local logger.info("Download model from local") - bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert", safe_serialization=True) + bert_model_bos.save_pretrained("./paddlenlp-test-model/tiny-bert", safe_serialization=False) bert_model_local = BertModel.from_pretrained( - "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=True + "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=False ) self.test_config_diff(bert_model_bos.config, bert_model_local.config) bert_model_local_auto = AutoModel.from_pretrained( - "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=True + "./paddlenlp-test-model/", subfolder="tiny-bert", use_safetensors=False ) self.test_config_diff(bert_model_local.config, bert_model_local_auto.config) @@ -393,13 +393,13 @@ def test_clip_load(self): # local logger.info("Download model from local") - clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip", safe_serialization=True) + clip_model_bos.save_pretrained("./paddlenlp-test-model/tiny-clip", safe_serialization=False) clip_model_local = CLIPTextModel.from_pretrained( - "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=True + "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=False ) self.test_config_diff(clip_model_bos.config, clip_model_local.config) clip_model_local_auto = AutoModel.from_pretrained( - "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=True + "./paddlenlp-test-model/", subfolder="tiny-clip", use_safetensors=False ) self.test_config_diff(clip_model_local.config, clip_model_local_auto.config) @@ -675,11 +675,11 @@ def test_t5_load(self): # local logger.info("Download model from local") - t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5", safe_serialization=True) - t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=True) + t5_model_bos.save_pretrained("./paddlenlp-test-model/tiny-t5", safe_serialization=False) + t5_model_local = T5Model.from_pretrained("./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=False) self.test_config_diff(t5_model_bos.config, t5_model_local.config) t5_model_local_auto = AutoModel.from_pretrained( - "./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=True + "./paddlenlp-test-model/", subfolder="tiny-t5", use_safetensors=False ) self.test_config_diff(t5_model_local.config, t5_model_local_auto.config) From 84dec4e0ec2d38a8596e818094ef148d7ca8dbb7 Mon Sep 17 
00:00:00 2001
From: CrazyBoyM
Date: Wed, 27 Dec 2023 12:29:33 +0000
Subject: [PATCH 19/27] fix resolve_weight_file_from_hf_hub

---
 paddlenlp/transformers/model_utils.py | 214 +++++++++++++-------------
 1 file changed, 103 insertions(+), 111 deletions(-)

diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py
index 68b1bf2021cd..046382e432c1 100644
--- a/paddlenlp/transformers/model_utils.py
+++ b/paddlenlp/transformers/model_utils.py
@@ -48,7 +48,7 @@
 from paddle.utils.download import is_url as is_remote_url
 from tqdm.auto import tqdm
 
-from paddlenlp.utils.downloader import get_path_from_url_with_filelock, hf_file_exists
+from paddlenlp.utils.downloader import get_path_from_url_with_filelock
 from paddlenlp.utils.env import (
     CONFIG_NAME,
     LEGACY_CONFIG_NAME,
@@ -366,50 +366,28 @@ def resolve_weight_file_from_hf_hub(
     subfolder (str, optional) An optional value corresponding to a folder inside the repo.
     """
     is_sharded = False
+
     if use_safetensors:
-        # SAFE WEIGHTS
-        if hf_file_exists(repo_id, SAFE_WEIGHTS_INDEX_NAME, subfolder=subfolder):
-            file_name = SAFE_WEIGHTS_INDEX_NAME
-            is_sharded = True
-        elif hf_file_exists(repo_id, SAFE_WEIGHTS_NAME, subfolder=subfolder):
-            file_name = SAFE_WEIGHTS_NAME
-        else:
-            raise EntryNotFoundError(
-                message=f"can not find the safetensors weight file from: https://huggingface.co/{repo_id}",
-                response=None,
-            )
+        file_name_list = [
+            SAFE_WEIGHTS_INDEX_NAME,
+            SAFE_WEIGHTS_NAME,
+        ]
     else:
-        if convert_from_torch:
-            # TORCH WEIGHTS
-            if hf_file_exists(repo_id, PYTORCH_WEIGHTS_INDEX_NAME, subfolder=subfolder):
-                file_name = PYTORCH_WEIGHTS_INDEX_NAME
-                is_sharded = True
-            elif hf_file_exists(repo_id, PYTORCH_WEIGHTS_NAME, subfolder=subfolder):
-                file_name = PYTORCH_WEIGHTS_NAME
-            else:
-                raise EntryNotFoundError(
-                    message=f"can not find the pytorch weight file from: https://huggingface.co/{repo_id}",
-                    response=None,
-                )
-        else:
-            if hf_file_exists(repo_id, PADDLE_WEIGHTS_INDEX_NAME, subfolder=subfolder):
-                file_name = PADDLE_WEIGHTS_INDEX_NAME
-                is_sharded = True
-            elif hf_file_exists(repo_id, PADDLE_WEIGHTS_NAME, subfolder=subfolder):
-                file_name = PADDLE_WEIGHTS_NAME
-            else:
-                raise EntryNotFoundError(
-                    message=f"can not find the paddle weight file from: https://huggingface.co/{repo_id}",
-                    response=None,
-                )
-
-    file_name_list = [file_name]
+        file_name_list = [
+            PYTORCH_WEIGHTS_INDEX_NAME,
+            PADDLE_WEIGHTS_INDEX_NAME,
+            PYTORCH_WEIGHTS_NAME,
+            PADDLE_WEIGHTS_NAME,
+            SAFE_WEIGHTS_NAME,  # (NOTE,lxl): kept as a fallback for rare edge cases
+        ]
     resolved_file = None
     for fn in file_name_list:
         resolved_file = cached_file_for_hf_hub(
             repo_id, fn, cache_dir, subfolder, _raise_exceptions_for_missing_entries=False
         )
         if resolved_file is not None:
+            if resolved_file.endswith(".json"):
+                is_sharded = True
             break
 
     if resolved_file is None:
@@ -1458,6 +1436,30 @@ def _resolve_model_file_path(
     is_sharded = False
     sharded_metadata = None
 
+    # -1. when it's from HF
+    if from_hf_hub or convert_from_torch:
+        resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub(
+            pretrained_model_name_or_path,
+            cache_dir=cache_dir,
+            convert_from_torch=convert_from_torch,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+        )
+        # We'll need to download and cache each checkpoint shard if the checkpoint is sharded.
+        resolved_sharded_files = None
+        if is_sharded:
+            # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case.
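The first-match resolution that PATCH 19 introduces is easy to check in isolation. Below is a minimal sketch of the same idea, where `lookup` stands in for `cached_file_for_hf_hub` and literal file names stand in for the `*_WEIGHTS_*` constants (both stand-ins are assumptions for illustration, not the PaddleNLP API):

import os
from typing import Callable, List, Optional, Tuple

# Stand-ins for PYTORCH_WEIGHTS_INDEX_NAME, PADDLE_WEIGHTS_NAME, etc.
CANDIDATES: List[str] = [
    "pytorch_model.bin.index.json",
    "model_state.pdparams.index.json",
    "pytorch_model.bin",
    "model_state.pdparams",
    "model.safetensors",  # fallback for repos that only ship safetensors
]

def resolve_first_match(
    candidates: List[str], lookup: Callable[[str], Optional[str]]
) -> Tuple[Optional[str], bool]:
    # The first candidate that resolves wins; a *.json hit is a shard
    # index, so the checkpoint is sharded.
    for name in candidates:
        path = lookup(name)
        if path is not None:
            return path, path.endswith(".json")
    return None, False

# A local-directory lookup standing in for the hub download:
resolved, is_sharded = resolve_first_match(
    CANDIDATES, lambda name: name if os.path.exists(name) else None
)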
+ resolved_sharded_files, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + cache_dir=cache_dir, + subfolder=subfolder, + ) + + return resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded + if pretrained_model_name_or_path is not None: # the following code use a lot of os.path.join, hence setting subfolder to empty str if None if subfolder is None: @@ -1561,95 +1563,85 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v filename = pretrained_model_name_or_path resolved_archive_file = get_path_from_url_with_filelock(pretrained_model_name_or_path) else: - # -1. when it's from HF - if from_hf_hub: - resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub( - pretrained_model_name_or_path, + + # set correct filename + if use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) + else: + filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) + + try: + # Load from URL or cache if already cached + cached_file_kwargs = dict( cache_dir=cache_dir, - convert_from_torch=convert_from_torch, subfolder=subfolder, - use_safetensors=use_safetensors, + from_aistudio=from_aistudio, + _raise_exceptions_for_missing_entries=False, ) - else: + resolved_archive_file = None + if pretrained_model_name_or_path in cls.pretrained_init_configuration: + # fetch the weight url from the `pretrained_resource_files_map` + resource_file_url = cls.pretrained_resource_files_map["model_state"][ + pretrained_model_name_or_path + ] + resolved_archive_file = cached_file( + resource_file_url, _add_variant(PADDLE_WEIGHTS_NAME, variant), **cached_file_kwargs + ) + + if resolved_archive_file is None: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) - # set correct filename - if use_safetensors is not False: - filename = _add_variant(SAFE_WEIGHTS_NAME, variant) else: + # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - try: - # Load from URL or cache if already cached - cached_file_kwargs = dict( - cache_dir=cache_dir, - subfolder=subfolder, - from_aistudio=from_aistudio, - _raise_exceptions_for_missing_entries=False, + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
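`_add_variant` itself is never shown in this diff; the call sites above only make sense if it weaves the variant in before the file extension. A sketch of that assumed behavior (not the verbatim PaddleNLP helper):

from typing import Optional

def add_variant(weights_name: str, variant: Optional[str] = None) -> str:
    # "model.safetensors" + "fp16" -> "model.fp16.safetensors"
    if variant is not None:
        parts = weights_name.split(".")
        weights_name = ".".join(parts[:-1] + [variant] + parts[-1:])
    return weights_name

assert add_variant("model.safetensors", "fp16") == "model.fp16.safetensors"
assert add_variant("model_state.pdparams") == "model_state.pdparams"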
+ resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, ) - resolved_archive_file = None - if pretrained_model_name_or_path in cls.pretrained_init_configuration: - # fetch the weight url from the `pretrained_resource_files_map` - resource_file_url = cls.pretrained_resource_files_map["model_state"][ - pretrained_model_name_or_path - ] - resolved_archive_file = cached_file( - resource_file_url, _add_variant(PADDLE_WEIGHTS_NAME, variant), **cached_file_kwargs - ) - - if resolved_archive_file is None: - resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs + if resolved_archive_file is not None: + is_sharded = True + elif use_safetensors: + raise EnvironmentError( + f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`." ) - else: - # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams + # This repo has no safetensors file of any kind, we switch to PyTorch. filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - - # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None - # result when internet is up, the repo and revision exist, but the file does not. - if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - resolved_archive_file = cached_file( - pretrained_model_name_or_path, - _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), - **cached_file_kwargs, - ) - if resolved_archive_file is not None: - is_sharded = True - elif use_safetensors: - raise EnvironmentError( - f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`." - ) - else: - # This repo has no safetensors file of any kind, we switch to PyTorch. - filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs - ) - if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant): - # Maybe the checkpoint is sharded, we try to grab the index name in this case. resolved_archive_file = cached_file( - pretrained_model_name_or_path, - _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), - **cached_file_kwargs, - ) - # raise ValueError(resolved_archive_file) - if resolved_archive_file is not None: - is_sharded = True - if resolved_archive_file is None: - # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error - # message. - raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named" - f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}." + pretrained_model_name_or_path, filename, **cached_file_kwargs ) - except Exception as e: - logger.info(e) - # For any other exception, we throw a generic error. + if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
+                    resolved_archive_file = cached_file(
+                        pretrained_model_name_or_path,
+                        _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant),
+                        **cached_file_kwargs,
+                    )
+                    # raise ValueError(resolved_archive_file)
+                    if resolved_archive_file is not None:
+                        is_sharded = True
+                if resolved_archive_file is None:
+                    # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error
+                    # message.
+                    raise EnvironmentError(
+                        f"{pretrained_model_name_or_path} does not appear to have a file named"
+                        f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}."
+                    )
+            except Exception as e:
+                logger.info(e)
+                # For any other exception, we throw a generic error.
+                raise EnvironmentError(
+                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
+                    " from 'https://paddlenlp.bj.bcebos.com'"
+                )
 
     if is_local:
         logger.info(f"Loading weights file {archive_file}")

From a1a21c3793bc35f9aecd16ac518cc43bcc68049e Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Thu, 28 Dec 2023 11:55:33 +0800
Subject: [PATCH 20/27] update the old BOS download method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddlenlp/transformers/model_utils.py |  8 +++++---
 paddlenlp/transformers/utils.py       | 19 ++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py
index 046382e432c1..9bf8deb4ef3c 100644
--- a/paddlenlp/transformers/model_utils.py
+++ b/paddlenlp/transformers/model_utils.py
@@ -1556,7 +1556,7 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v
                     f" {pretrained_model_name_or_path}." 
) # pretrained_model_name_or_path is file - elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)): + elif os.path.isfile(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path is_local = True elif is_remote_url(pretrained_model_name_or_path): @@ -1585,14 +1585,16 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v pretrained_model_name_or_path ] resolved_archive_file = cached_file( - resource_file_url, _add_variant(PADDLE_WEIGHTS_NAME, variant), **cached_file_kwargs + resource_file_url, + _add_variant(PADDLE_WEIGHTS_NAME, variant), + pretrained_model_name_or_path=pretrained_model_name_or_path, + **cached_file_kwargs, ) if resolved_archive_file is None: resolved_archive_file = cached_file( pretrained_model_name_or_path, filename, **cached_file_kwargs ) - else: # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) diff --git a/paddlenlp/transformers/utils.py b/paddlenlp/transformers/utils.py index 9b1afe235afd..aacfc3f5b682 100644 --- a/paddlenlp/transformers/utils.py +++ b/paddlenlp/transformers/utils.py @@ -398,12 +398,17 @@ def paddlenlp_hub_download( *, subfolder: Optional[str] = None, cache_dir: Union[str, Path, None] = None, - local_dir: Union[str, Path, None] = None, + pretrained_model_name_or_path: str = None, ) -> str: if subfolder is None: subfolder = "" + if pretrained_model_name_or_path is not None and is_url(repo_id): + cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) + else: + cache_dir = os.path.join(cache_dir, repo_id, subfolder) + # check in cache_dir - weight_file_path = os.path.join(cache_dir, repo_id, subfolder, filename) + weight_file_path = os.path.join(cache_dir, filename) if os.path.exists(weight_file_path): logger.info(f"Already cached {weight_file_path}") @@ -447,9 +452,7 @@ def paddlenlp_hub_download( # check wether the target file exist in the comunity bos server if url_file_exists(community_model_file_path): logger.info(f"Downloading {community_model_file_path}") - weight_file_path = get_path_from_url_with_filelock( - community_model_file_path, os.path.join(cache_dir, repo_id, subfolder) - ) + weight_file_path = get_path_from_url_with_filelock(community_model_file_path, cache_dir) # # check the downloaded weight file and registered weight file name download_check(community_model_file_path, "paddlenlp_hub_download") return weight_file_path @@ -469,6 +472,7 @@ def cached_file( from_aistudio: bool = False, _raise_exceptions_for_missing_entries: bool = True, _raise_exceptions_for_connection_errors: bool = True, + pretrained_model_name_or_path=None, ) -> str: """ Tries to locate a file in a local folder and repo, downloads and cache it if necessary. 
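The cache layout that PATCH 20 settles on is the crux of these hunks: downloaded files land under <cache_dir>/<repo or model name>/<subfolder>. A simplified sketch of the path construction (`build_cache_path` is an invented name for illustration; `model_name` mirrors the new `pretrained_model_name_or_path` argument):

import os
from typing import Optional

def build_cache_path(
    cache_dir: str,
    repo_id: str,
    filename: str,
    subfolder: str = "",
    model_name: Optional[str] = None,
) -> str:
    # URL repo ids cache under the model name instead of the raw URL;
    # everything else caches under <cache_dir>/<repo_id>/<subfolder>.
    is_url = repo_id.startswith(("http://", "https://"))
    base = model_name if (model_name is not None and is_url) else repo_id
    return os.path.join(cache_dir, base, subfolder, filename)

print(build_cache_path(".cache", "baicai/paddlenlp-test-model", "model_state.pdparams", subfolder="tiny-bert"))
# .cache/baicai/paddlenlp-test-model/tiny-bert/model_state.pdparams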
@@ -523,8 +527,8 @@ def cached_file(
         except:
             resolved_file = None
     else:
-        if cache_dir is None:
-            cache_dir = os.path.join(MODEL_HOME, ".cache")
+        # if cache_dir is None:
+        #     cache_dir = os.path.join(MODEL_HOME, ".cache")
         try:
             # Load from URL or cache if already cached
             resolved_file = paddlenlp_hub_download(
@@ -533,6 +537,7 @@ def cached_file(
                 subfolder=None if len(subfolder) == 0 else subfolder,
                 # revision=revision,
                 cache_dir=cache_dir,
+                pretrained_model_name_or_path=pretrained_model_name_or_path,
             )
         except HTTPError as err:
             # First we try to see if we have a cached version (not up to date):

From 66d26d1ef6613c5d0bb9a8c2e1ed7da7f33e4c8d Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Thu, 28 Dec 2023 14:55:10 +0800
Subject: [PATCH 21/27] update download from hf hub

---
 paddlenlp/transformers/model_utils.py | 178 ++++++++++++--------------
 1 file changed, 84 insertions(+), 94 deletions(-)

diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py
index 9bf8deb4ef3c..37253fd47a4e 100644
--- a/paddlenlp/transformers/model_utils.py
+++ b/paddlenlp/transformers/model_utils.py
@@ -48,7 +48,7 @@
 from paddle.utils.download import is_url as is_remote_url
 from tqdm.auto import tqdm
 
-from paddlenlp.utils.downloader import get_path_from_url_with_filelock
+from paddlenlp.utils.downloader import get_path_from_url_with_filelock, hf_file_exists
 from paddlenlp.utils.env import (
     CONFIG_NAME,
     LEGACY_CONFIG_NAME,
@@ -324,14 +324,17 @@ def load_state_dict(
     # Check format of the archive
     with safe_open(checkpoint_file, framework="np") as f:
         metadata = f.metadata()
+        if metadata is None:
+            metadata = {"format": "np"}
+
         if metadata.get("format", "np") not in ["pd", "np"]:
             raise OSError(
                 f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
                 "you save your model with the `save_pretrained` method."
             )
-        if metadata["format"] == "pd":
+        if metadata.get("format", "np") == "pd":
             raise ValueError("Currently unsupport paddle weights file, use numpy instead.")
-        if metadata["format"] == "np":
+        if metadata.get("format", "np") == "np":
             state_dict = {}
             with safe_open(checkpoint_file, framework="np") as f:
                 for key in f.keys():
@@ -382,6 +385,8 @@ def resolve_weight_file_from_hf_hub(
     ]
     resolved_file = None
     for fn in file_name_list:
+        if not hf_file_exists(repo_id, filename=fn, subfolder=subfolder):
+            continue
         resolved_file = cached_file_for_hf_hub(
             repo_id, fn, cache_dir, subfolder, _raise_exceptions_for_missing_entries=False
         )
@@ -1436,6 +1441,30 @@ def _resolve_model_file_path(
     is_sharded = False
     sharded_metadata = None
 
+    # -1. when it's from HF
+    if from_hf_hub or convert_from_torch:
+        resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub(
+            pretrained_model_name_or_path,
+            cache_dir=cache_dir,
+            convert_from_torch=convert_from_torch,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+        )
+        # We'll need to download and cache each checkpoint shard if the checkpoint is sharded.
+        resolved_sharded_files = None
+        if is_sharded:
+            # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case.
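The `if metadata is None` guard added in PATCH 21 matters because safetensors archives written without explicit metadata return None from `f.metadata()`. A small round-trip that exercises the same default (the file name and tensor are arbitrary):

import numpy as np
from safetensors import safe_open
from safetensors.numpy import save_file

# Write a tiny archive with no explicit metadata...
save_file({"w": np.zeros((2, 2), dtype="float32")}, "tiny.safetensors")

# ...then read it back the way load_state_dict does, treating missing
# metadata as the numpy format.
with safe_open("tiny.safetensors", framework="np") as f:
    metadata = f.metadata()
    if metadata is None:
        metadata = {"format": "np"}
    assert metadata.get("format", "np") in ("pd", "np")
    state_dict = {k: f.get_tensor(k) for k in f.keys()}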
- resolved_sharded_files, sharded_metadata = get_checkpoint_shard_files( - pretrained_model_name_or_path, - resolved_archive_file, - from_aistudio=from_aistudio, - from_hf_hub=from_hf_hub, - cache_dir=cache_dir, - subfolder=subfolder, - ) - - return resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded - if pretrained_model_name_or_path is not None: # the following code use a lot of os.path.join, hence setting subfolder to empty str if None if subfolder is None: @@ -1563,87 +1544,96 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v filename = pretrained_model_name_or_path resolved_archive_file = get_path_from_url_with_filelock(pretrained_model_name_or_path) else: - - # set correct filename - if use_safetensors is not False: - filename = _add_variant(SAFE_WEIGHTS_NAME, variant) - else: - filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - - try: - # Load from URL or cache if already cached - cached_file_kwargs = dict( + # -1. when it's from HF + if from_hf_hub: + resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub( + pretrained_model_name_or_path, cache_dir=cache_dir, + convert_from_torch=convert_from_torch, subfolder=subfolder, - from_aistudio=from_aistudio, - _raise_exceptions_for_missing_entries=False, + use_safetensors=use_safetensors, ) - resolved_archive_file = None - if pretrained_model_name_or_path in cls.pretrained_init_configuration: - # fetch the weight url from the `pretrained_resource_files_map` - resource_file_url = cls.pretrained_resource_files_map["model_state"][ - pretrained_model_name_or_path - ] - resolved_archive_file = cached_file( - resource_file_url, - _add_variant(PADDLE_WEIGHTS_NAME, variant), - pretrained_model_name_or_path=pretrained_model_name_or_path, - **cached_file_kwargs, - ) - - if resolved_archive_file is None: - resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs - ) + else: + # set correct filename + if use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) else: - # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None - # result when internet is up, the repo and revision exist, but the file does not. - if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - resolved_archive_file = cached_file( - pretrained_model_name_or_path, - _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), - **cached_file_kwargs, + try: + # Load from URL or cache if already cached + cached_file_kwargs = dict( + cache_dir=cache_dir, + subfolder=subfolder, + from_aistudio=from_aistudio, + _raise_exceptions_for_missing_entries=False, ) - if resolved_archive_file is not None: - is_sharded = True - elif use_safetensors: - raise EnvironmentError( - f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`." 
+ resolved_archive_file = None + if pretrained_model_name_or_path in cls.pretrained_init_configuration: + # fetch the weight url from the `pretrained_resource_files_map` + resource_file_url = cls.pretrained_resource_files_map["model_state"][ + pretrained_model_name_or_path + ] + resolved_archive_file = cached_file( + resource_file_url, + _add_variant(PADDLE_WEIGHTS_NAME, variant), + pretrained_model_name_or_path=pretrained_model_name_or_path, + **cached_file_kwargs, + ) + + if resolved_archive_file is None: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs ) else: - # This repo has no safetensors file of any kind, we switch to PyTorch. + # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. resolved_archive_file = cached_file( - pretrained_model_name_or_path, filename, **cached_file_kwargs + pretrained_model_name_or_path, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, ) - if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant): - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - resolved_archive_file = cached_file( - pretrained_model_name_or_path, - _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), - **cached_file_kwargs, - ) - # raise ValueError(resolved_archive_file) - if resolved_archive_file is not None: - is_sharded = True - if resolved_archive_file is None: - # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error - # message. + if resolved_archive_file is not None: + is_sharded = True + elif use_safetensors: + raise EnvironmentError( + f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`." + ) + else: + # This repo has no safetensors file of any kind, we switch to PyTorch. + filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) + if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + # raise ValueError(resolved_archive_file) + if resolved_archive_file is not None: + is_sharded = True + if resolved_archive_file is None: + # Otherwise, maybe there is a TF or Flax model file. We try those to give a helpful error + # message. + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}." + ) + except Exception as e: + logger.info(e) + # For any other exception, we throw a generic error. 
raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named" - f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}." + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://paddlenlp.bj.bcebos.com'" ) - except Exception as e: - logger.info(e) - # For any other exception, we throw a generic error. - raise EnvironmentError( - f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" - " from 'https://paddlenlp.bj.bcebos.com'" - ) if is_local: logger.info(f"Loading weights file {archive_file}") From 041c56c96b01b29839149bdb349d2b5b7b75b4e7 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Thu, 28 Dec 2023 15:24:04 +0800 Subject: [PATCH 22/27] update logging --- paddlenlp/transformers/tokenizer_utils_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 3df310373e22..2c3ac240114b 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1530,7 +1530,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): resolved_vocab_files[file_id] = path else: - logger.info("Downloading %s and saved to %s" % (file_path, cache_dir)) + logger.info( + "Downloading %s and saved to %s" + % (file_path, os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) + ) try: if not url_file_exists(file_path): # skip warning for chat-template config file From 96b5916cbf455becf8ba41bda6cd5d5d12f2f303 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Thu, 28 Dec 2023 17:07:18 +0800 Subject: [PATCH 23/27] update --- paddlenlp/transformers/model_utils.py | 171 ++++++++++++++------------ 1 file changed, 92 insertions(+), 79 deletions(-) diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index 37253fd47a4e..926f5ffc19b7 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -48,7 +48,7 @@ from paddle.utils.download import is_url as is_remote_url from tqdm.auto import tqdm -from paddlenlp.utils.downloader import get_path_from_url_with_filelock, hf_file_exists +from paddlenlp.utils.downloader import get_path_from_url_with_filelock from paddlenlp.utils.env import ( CONFIG_NAME, LEGACY_CONFIG_NAME, @@ -385,8 +385,6 @@ def resolve_weight_file_from_hf_hub( ] resolved_file = None for fn in file_name_list: - if not hf_file_exists(repo_id, filename=fn, subfolder=subfolder): - continue resolved_file = cached_file_for_hf_hub( repo_id, fn, cache_dir, subfolder, _raise_exceptions_for_missing_entries=False ) @@ -1441,6 +1439,30 @@ def _resolve_model_file_path( is_sharded = False sharded_metadata = None + # -1. when it's from HF + if from_hf_hub or convert_from_torch: + resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub( + pretrained_model_name_or_path, + cache_dir=cache_dir, + convert_from_torch=convert_from_torch, + subfolder=subfolder, + use_safetensors=use_safetensors, + ) + # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + resolved_sharded_files = None + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
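`get_checkpoint_shard_files` is called here but is not part of the diff; what it consumes is just the shard index JSON. A sketch of extracting the shard list from such an index (the `weight_map` layout follows the usual sharded-checkpoint convention and is an assumption, not quoted from PaddleNLP):

import json
from typing import List

def shard_files_from_index(index_path: str) -> List[str]:
    # weight_map maps each parameter name to the shard file that stores
    # it; de-duplicate while preserving first-seen order.
    with open(index_path) as f:
        index = json.load(f)
    return list(dict.fromkeys(index["weight_map"].values()))

# An index like
#   {"metadata": {"total_size": 100},
#    "weight_map": {"linear.weight": "model_state-00001-of-00002.pdparams",
#                   "linear.bias": "model_state-00002-of-00002.pdparams"}}
# yields both shard names, each of which is downloaded exactly once.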
+ resolved_sharded_files, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + cache_dir=cache_dir, + subfolder=subfolder, + ) + + return resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded + if pretrained_model_name_or_path is not None: # the following code use a lot of os.path.join, hence setting subfolder to empty str if None if subfolder is None: @@ -1544,96 +1566,87 @@ def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, v filename = pretrained_model_name_or_path resolved_archive_file = get_path_from_url_with_filelock(pretrained_model_name_or_path) else: - # -1. when it's from HF - if from_hf_hub: - resolved_archive_file, is_sharded = resolve_weight_file_from_hf_hub( - pretrained_model_name_or_path, + + # set correct filename + if use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) + else: + filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) + + try: + # Load from URL or cache if already cached + cached_file_kwargs = dict( cache_dir=cache_dir, - convert_from_torch=convert_from_torch, subfolder=subfolder, - use_safetensors=use_safetensors, + from_aistudio=from_aistudio, + _raise_exceptions_for_missing_entries=False, ) - else: - # set correct filename - if use_safetensors is not False: - filename = _add_variant(SAFE_WEIGHTS_NAME, variant) + resolved_archive_file = None + if pretrained_model_name_or_path in cls.pretrained_init_configuration: + # fetch the weight url from the `pretrained_resource_files_map` + resource_file_url = cls.pretrained_resource_files_map["model_state"][ + pretrained_model_name_or_path + ] + resolved_archive_file = cached_file( + resource_file_url, + _add_variant(PADDLE_WEIGHTS_NAME, variant), + pretrained_model_name_or_path=pretrained_model_name_or_path, + **cached_file_kwargs, + ) + + if resolved_archive_file is None: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) else: + # xxx.pdparams in pretrained_resource_files_map renamed model_state.pdparams filename = _add_variant(PADDLE_WEIGHTS_NAME, variant) - try: - # Load from URL or cache if already cached - cached_file_kwargs = dict( - cache_dir=cache_dir, - subfolder=subfolder, - from_aistudio=from_aistudio, - _raise_exceptions_for_missing_entries=False, + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
+                            resolved_archive_file = cached_file(
+                                pretrained_model_name_or_path,
+                                _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant),
+                                **cached_file_kwargs,
                             )
                         if resolved_archive_file is not None:
                             is_sharded = True
                         elif use_safetensors:
                             raise EnvironmentError(
                                 f"{pretrained_model_name_or_path} does not appear to have a file named {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`."
                             )
                         else:
-                                # This repo has no safetensors file of any kind, we fall back to the Paddle weights.
+                            # This repo has no safetensors file of any kind, we fall back to the Paddle weights.
                             filename = _add_variant(PADDLE_WEIGHTS_NAME, variant)
                             resolved_archive_file = cached_file(
                                 pretrained_model_name_or_path, filename, **cached_file_kwargs
                             )
                         if resolved_archive_file is None and filename == _add_variant(PADDLE_WEIGHTS_NAME, variant):
                             # Maybe the checkpoint is sharded, we try to grab the index name in this case.
+                        resolved_archive_file = cached_file(
+                            pretrained_model_name_or_path,
+                            _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant),
+                            **cached_file_kwargs,
+                        )
+                        if resolved_archive_file is not None:
+                            is_sharded = True
+                    if resolved_archive_file is None:
+                        # Otherwise no usable weight file was found, so we raise a helpful error
+                        # message.
                         raise EnvironmentError(
                             f"{pretrained_model_name_or_path} does not appear to have a file named"
                             f" {_add_variant(PADDLE_WEIGHTS_NAME, variant)}."
                         )
-                    except Exception as e:
-                        logger.info(e)
-                        # For any other exception, we throw a generic error.
+                except Exception as e:
+                    logger.info(e)
+                    # For any other exception, we throw a generic error.
                     raise EnvironmentError(
                         f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
                         " from 'https://paddlenlp.bj.bcebos.com', please check that the model name is spelled"
                         " correctly and that your network connection is available."
                     )

 if is_local:
     logger.info(f"Loading weights file {archive_file}")

From c7443cc03bba7f8bca0a51b36dba6eb6caf4d087 Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Thu, 28 Dec 2023 18:16:21 +0800
Subject: [PATCH 24/27] disable proxy

---
 tests/transformers/load_subfolder/test_config.py    | 4 ++++
 tests/transformers/load_subfolder/test_model.py     | 4 ++++
 tests/transformers/load_subfolder/test_processor.py | 5 +++++
 tests/transformers/load_subfolder/test_tokenizer.py | 5 +++++
 4 files changed, 18 insertions(+)

diff --git a/tests/transformers/load_subfolder/test_config.py b/tests/transformers/load_subfolder/test_config.py
index bc5f150cd182..016096d6ffa2 100644
--- a/tests/transformers/load_subfolder/test_config.py
+++ b/tests/transformers/load_subfolder/test_config.py
@@ -11,7 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os

+os.environ[
+    "no_proxy"
+] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com"
 import unittest

 from paddlenlp.transformers import AutoConfig, BertConfig, CLIPConfig, T5Config

diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py
index 6fbdb53caa7d..b54547a0f96a 100644
--- a/tests/transformers/load_subfolder/test_model.py
+++ b/tests/transformers/load_subfolder/test_model.py
@@ -13,6 +13,10 @@
 # limitations under the License.

 import os
+
+os.environ[
+    "no_proxy"
+] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com"
 import tempfile
 import unittest

diff --git a/tests/transformers/load_subfolder/test_processor.py b/tests/transformers/load_subfolder/test_processor.py
index 537f0bb48c2f..f0a781ff615e 100644
--- a/tests/transformers/load_subfolder/test_processor.py
+++ b/tests/transformers/load_subfolder/test_processor.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os + +os.environ[ + "no_proxy" +] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import AutoProcessor, CLIPProcessor diff --git a/tests/transformers/load_subfolder/test_tokenizer.py b/tests/transformers/load_subfolder/test_tokenizer.py index de44b2baf701..e46321e35328 100644 --- a/tests/transformers/load_subfolder/test_tokenizer.py +++ b/tests/transformers/load_subfolder/test_tokenizer.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + +os.environ[ + "no_proxy" +] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import ( From 9730a76b044718b7d9615c8ff6c6c056d546d3f2 Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Thu, 28 Dec 2023 18:24:32 +0800 Subject: [PATCH 25/27] update --- tests/transformers/load_subfolder/test_image_processor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/transformers/load_subfolder/test_image_processor.py b/tests/transformers/load_subfolder/test_image_processor.py index a909015e804d..ed3825fd374a 100644 --- a/tests/transformers/load_subfolder/test_image_processor.py +++ b/tests/transformers/load_subfolder/test_image_processor.py @@ -11,7 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +os.environ[ + "no_proxy" +] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import AutoImageProcessor, CLIPImageProcessor From d58741c6b4dea8c23575c1d3f24ce8a78ce5eefa Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Thu, 28 Dec 2023 19:02:43 +0800 Subject: [PATCH 26/27] update --- tests/transformers/load_subfolder/test_config.py | 5 ----- tests/transformers/load_subfolder/test_image_processor.py | 4 ---- tests/transformers/load_subfolder/test_model.py | 4 ---- tests/transformers/load_subfolder/test_processor.py | 4 ---- tests/transformers/load_subfolder/test_tokenizer.py | 4 ---- 5 files changed, 21 deletions(-) diff --git a/tests/transformers/load_subfolder/test_config.py b/tests/transformers/load_subfolder/test_config.py index 016096d6ffa2..1e7c1f687af8 100644 --- a/tests/transformers/load_subfolder/test_config.py +++ b/tests/transformers/load_subfolder/test_config.py @@ -11,11 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os - -os.environ[ - "no_proxy" -] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import AutoConfig, BertConfig, CLIPConfig, T5Config diff --git a/tests/transformers/load_subfolder/test_image_processor.py b/tests/transformers/load_subfolder/test_image_processor.py index ed3825fd374a..a909015e804d 100644 --- a/tests/transformers/load_subfolder/test_image_processor.py +++ b/tests/transformers/load_subfolder/test_image_processor.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os -os.environ[ - "no_proxy" -] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import AutoImageProcessor, CLIPImageProcessor diff --git a/tests/transformers/load_subfolder/test_model.py b/tests/transformers/load_subfolder/test_model.py index b54547a0f96a..6fbdb53caa7d 100644 --- a/tests/transformers/load_subfolder/test_model.py +++ b/tests/transformers/load_subfolder/test_model.py @@ -13,10 +13,6 @@ # limitations under the License. import os - -os.environ[ - "no_proxy" -] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import tempfile import unittest diff --git a/tests/transformers/load_subfolder/test_processor.py b/tests/transformers/load_subfolder/test_processor.py index f0a781ff615e..ac4af8859c1d 100644 --- a/tests/transformers/load_subfolder/test_processor.py +++ b/tests/transformers/load_subfolder/test_processor.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -os.environ[ - "no_proxy" -] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import AutoProcessor, CLIPProcessor diff --git a/tests/transformers/load_subfolder/test_tokenizer.py b/tests/transformers/load_subfolder/test_tokenizer.py index e46321e35328..2508b326ca6f 100644 --- a/tests/transformers/load_subfolder/test_tokenizer.py +++ b/tests/transformers/load_subfolder/test_tokenizer.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -os.environ[ - "no_proxy" -] = "baidu.com,127.0.0.1,0.0.0.0,localhost,bcebos.com,pip.baidu-int.com,mirrors.baidubce.com,repo.baidubce.com,repo.bcm.baidubce.com,pypi.tuna.tsinghua.edu.cn,aistudio.baidu.com" import unittest from paddlenlp.transformers import ( From 075fb0c2a45c55ffb1136e488d764385edbc1b5b Mon Sep 17 00:00:00 2001 From: yujun <573009727@qq.com> Date: Tue, 2 Jan 2024 11:59:34 +0800 Subject: [PATCH 27/27] fix image process --- paddlenlp/transformers/auto/image_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/auto/image_processing.py b/paddlenlp/transformers/auto/image_processing.py index 88283ce17180..7ee0c04b4fe5 100644 --- a/paddlenlp/transformers/auto/image_processing.py +++ b/paddlenlp/transformers/auto/image_processing.py @@ -80,7 +80,7 @@ def _get_image_processor_class_from_config(cls, pretrained_model_name_or_path, c # class name corresponds to this configuration init_class = init_kwargs.pop("init_class", None) if init_class is None: - init_class = init_kwargs.pop("image_processor_type", None) + init_class = init_kwargs.pop("image_processor_type", init_kwargs.pop("feature_extractor_type", None)) if init_class: # replace old name to new name
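
Postscript on the final hunk above: legacy `preprocessor_config.json` files written before image processors were split off from feature extractors typically carry only a `feature_extractor_type` key, so the lookup now tries the modern `image_processor_type` key first and falls back to the legacy one. The sketch below is a minimal standalone illustration of that fallback; the helper name is hypothetical, not PaddleNLP API:

    from typing import Optional

    def resolve_image_processor_class_name(init_kwargs: dict) -> Optional[str]:
        # Mirrors the lookup in the hunk above: an explicit "init_class" wins,
        # then the modern key, then the legacy feature-extractor key.
        init_class = init_kwargs.pop("init_class", None)
        if init_class is None:
            # The inner pop runs unconditionally, so a stale "feature_extractor_type"
            # entry is consumed even when "image_processor_type" is present.
            init_class = init_kwargs.pop(
                "image_processor_type",
                init_kwargs.pop("feature_extractor_type", None),
            )
        return init_class

    # A legacy CLIP-style config still resolves to a usable class name:
    legacy_kwargs = {"feature_extractor_type": "CLIPFeatureExtractor", "do_resize": True}
    print(resolve_image_processor_class_name(legacy_kwargs))  # CLIPFeatureExtractor

A side effect worth noting: because both keys are popped, neither lingers in the kwargs that are later forwarded to the resolved class's constructor.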