Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DONE] aistudio, hf hub, bos update download #7608

Merged
merged 36 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3239683
try fix
JunnYu Dec 8, 2023
f81af43
fix hf download bug ...
JunnYu Dec 8, 2023
865b6dd
update config download bug
JunnYu Dec 9, 2023
d12720a
fix
JunnYu Dec 11, 2023
ba3778d
add subfolder
JunnYu Dec 11, 2023
5d7b269
update
JunnYu Dec 12, 2023
bdac7b2
优先级,先本地,再builtin,再aistudio,再hf hub,再bos
JunnYu Dec 13, 2023
0cbf287
更新chattemplate文件检索路径
JunnYu Dec 19, 2023
6cc9c7e
update
JunnYu Dec 19, 2023
91ab9b8
fix subfolder && add tests
CrazyBoyM Dec 19, 2023
7a1b4ff
merge
CrazyBoyM Dec 19, 2023
259fa80
fix
JunnYu Dec 19, 2023
058663e
Merge branch 'JunnYu:support_subfolder' into support_subfolder
CrazyBoyM Dec 19, 2023
1f808a9
update
JunnYu Dec 19, 2023
df08811
Merge branch 'JunnYu:support_subfolder' into support_subfolder
CrazyBoyM Dec 19, 2023
26dd597
fix tokenizer_config_file_dir_list
CrazyBoyM Dec 19, 2023
899a45b
subfolder test
CrazyBoyM Dec 21, 2023
121fcda
fix from_pretrained() load hf sharded model
CrazyBoyM Dec 25, 2023
a376387
Merge pull request #9 from CrazyBoyM/support_subfolder
JunnYu Dec 26, 2023
2f76ee3
更新逻辑
JunnYu Dec 26, 2023
3990fc7
Merge branch 'develop' into support_subfolder
JunnYu Dec 26, 2023
a8ca961
update use_safetensors
JunnYu Dec 26, 2023
4a31701
update
JunnYu Dec 26, 2023
046f20d
Merge branch 'PaddlePaddle:develop' into support_subfolder
JunnYu Dec 27, 2023
84dec4e
fix resolve_weight_file_from_hf_hub
CrazyBoyM Dec 27, 2023
a1ccabf
Merge pull request #10 from CrazyBoyM/support_subfolder
JunnYu Dec 27, 2023
a1a21c3
更新bos旧的下载方式
JunnYu Dec 28, 2023
48c8a72
Merge branch 'PaddlePaddle:develop' into support_subfolder
JunnYu Dec 28, 2023
66d26d1
update download from hf hub
JunnYu Dec 28, 2023
041c56c
update logging
JunnYu Dec 28, 2023
96b5916
update
JunnYu Dec 28, 2023
c7443cc
关闭代理
JunnYu Dec 28, 2023
9730a76
update
JunnYu Dec 28, 2023
d58741c
update
JunnYu Dec 28, 2023
075fb0c
fix image process
JunnYu Jan 2, 2024
76f9fd4
Merge branch 'develop' into support_subfolder
JunnYu Jan 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions paddlenlp/generation/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from paddlenlp.transformers.utils import resolve_cache_dir
from paddlenlp.utils.log import logger

from ..transformers.aistudio_utils import aistudio_download
from ..utils import GENERATION_CONFIG_NAME
from ..utils.downloader import (
COMMUNITY_MODEL_PREFIX,
Expand Down Expand Up @@ -336,6 +337,7 @@ def from_pretrained(
cls,
pretrained_model_name_or_path: Union[str, os.PathLike],
from_hf_hub: bool = False,
from_aistudio: bool = False,
config_file_name: Optional[Union[str, os.PathLike]] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
Expand Down Expand Up @@ -404,12 +406,11 @@ def from_pretrained(
```"""
config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME

subfolder = kwargs.pop("subfolder", None)
subfolder = kwargs.pop("subfolder", "")
if subfolder is None:
subfolder = ""

config_path = os.path.join(pretrained_model_name_or_path, config_file_name)
config_path = str(config_path)

cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir)
cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)

# 1. get the configuration file from local file, eg: /cache/path/model_config.json
if os.path.isfile(pretrained_model_name_or_path):
Expand All @@ -418,24 +419,37 @@ def from_pretrained(
# 2. get the configuration file from url, eg: https://ip/path/to/model_config.json
elif is_url(pretrained_model_name_or_path):
resolved_config_file = get_path_from_url_with_filelock(
pretrained_model_name_or_path, cache_dir, check_exist=not force_download
pretrained_model_name_or_path,
cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder),
check_exist=not force_download,
)
# 3. get the configuration file from local dir with default name, eg: /local/path
elif os.path.isdir(pretrained_model_name_or_path):
configuration_file = os.path.join(pretrained_model_name_or_path, GENERATION_CONFIG_NAME)
configuration_file = os.path.join(pretrained_model_name_or_path, subfolder, config_file_name)
if os.path.exists(configuration_file):
resolved_config_file = configuration_file
else:
# try to detect old-school config file
raise FileNotFoundError("please make sure there is `generation_config.json` under the dir")

# 4. get the configuration file from HF hub
# 4. get the configuration file from aistudio
elif from_aistudio:
resolved_config_file = aistudio_download(
repo_id=pretrained_model_name_or_path,
filename=config_file_name,
cache_dir=cache_dir,
subfolder=subfolder,
)
# 5. get the configuration file from HF hub
elif from_hf_hub:
resolved_config_file = resolve_hf_generation_config_path(
repo_id=pretrained_model_name_or_path, cache_dir=cache_dir, subfolder=subfolder
)
else:
community_url = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, GENERATION_CONFIG_NAME])
url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, config_file_name]
cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
if subfolder != "":
url_list.insert(2, subfolder)
community_url = "/".join(url_list)
if url_file_exists(community_url):
resolved_config_file = get_path_from_url_with_filelock(
community_url, cache_dir, check_exist=not force_download
Expand Down
1 change: 1 addition & 0 deletions paddlenlp/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@
from .auto.modeling import *
from .auto.tokenizer import *
from .auto.processing import *
from .auto.image_processing import *
from .auto.configuration import *
from .codegen.modeling import *
from .codegen.tokenizer import *
Expand Down
27 changes: 25 additions & 2 deletions paddlenlp/transformers/aistudio_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from aistudio_sdk.hub import download


Expand All @@ -23,11 +25,32 @@ class EntryNotFoundError(Exception):
pass


def aistudio_download(repo_id: str, filename: str):
# TODO: add arguments such as cache_dir, revision, etc.
def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -> str:
if subfolder is not None and subfolder != "":
weights_name = "/".join([subfolder, weights_name])
return weights_name


def aistudio_download(
repo_id: str,
filename: str = None,
cache_dir: Optional[str] = None,
subfolder: Optional[str] = "",
revision: Optional[str] = None,
**kwargs,
):
if revision is None:
revision = "master"
filename = _add_subfolder(filename, subfolder)
download_kwargs = {}
if revision is not None:
download_kwargs["revision"] = revision
if cache_dir is not None:
download_kwargs["cache_dir"] = cache_dir
res = download(
repo_id=repo_id,
filename=filename,
**download_kwargs,
)
if "path" in res:
return res["path"]
Expand Down
34 changes: 19 additions & 15 deletions paddlenlp/transformers/auto/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
config = AutoConfig.from_pretrained("bert-base-uncased")
config.save_pretrained('./bert-base-uncased')
"""
subfolder = kwargs.get("subfolder", None)
from_aistudio = kwargs.get("from_aistudio", False)
from_hf_hub = kwargs.get("from_hf_hub", False)
cache_dir = resolve_cache_dir(
pretrained_model_name_or_path, from_hf_hub=from_hf_hub, cache_dir=kwargs.pop("cache_dir", None)
)
subfolder = kwargs.get("subfolder", "")
if subfolder is None:
subfolder = ""
from_aistudio = kwargs.pop("from_aistudio", False)
from_hf_hub = kwargs.pop("from_hf_hub", False)
cache_dir = kwargs.pop("cache_dir", None)
cache_dir = resolve_cache_dir(from_hf_hub=from_hf_hub, from_aistudio=from_aistudio, cache_dir=cache_dir)

if not cls.name2class:
cls.name2class = {}
Expand All @@ -182,10 +183,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar

# From local dir path
elif os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, cls.config_file)
config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_file)
if not os.path.exists(config_file):
# try to load legacy config file
legacy_config_file = os.path.join(pretrained_model_name_or_path, cls.legacy_config_file)
legacy_config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.legacy_config_file)
if not os.path.exists(legacy_config_file):
raise ValueError(
f"config file<{cls.config_file}> or legacy config file<{cls.legacy_config_file}> not found"
Expand All @@ -203,6 +204,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
file = aistudio_download(
repo_id=pretrained_model_name_or_path,
filename=cls.config_file,
subfolder=subfolder,
cache_dir=cache_dir,
)
return cls.from_pretrained(os.path.dirname(file))
elif from_hf_hub:
Expand All @@ -219,15 +222,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar

# Assuming from community-contributed pretrained models
else:
# support subfolder
if subfolder is not None:
pretrained_model_name_or_path = os.path.join(pretrained_model_name_or_path, subfolder)
url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.config_file]
legacy_url_list = [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_config_file]
cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
if subfolder != "":
url_list.insert(2, subfolder)
legacy_url_list.insert(2, subfolder)
community_config_path = "/".join(url_list)
legacy_community_config_path = "/".join(legacy_url_list)

community_config_path = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.config_file])
if not url_file_exists(community_config_path):
legacy_community_config_path = "/".join(
[COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_config_file]
)
if not url_file_exists(legacy_community_config_path):
raise RuntimeError(
f"Can't load Config for '{pretrained_model_name_or_path}'.\n"
Expand Down
Loading
Loading