From ecedd8de5270057cd5f66c0a3e2f5cc9efb26801 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 9 Aug 2024 13:04:19 +0800
Subject: [PATCH 1/3] fix gguf config vocab size

---
 src/transformers/modeling_gguf_pytorch_utils.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 0b1621b7bf2d..900c80b45d58 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -130,6 +130,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
         if gguf_key in reader_keys:
             logger.info(f"Some keys were not parsed and added into account {gguf_key} | {value}")
 
+    # retrieve config vocab_size from tokenizer, refer to issue-#32526
+    if "vocab_size" not in parsed_parameters["config"]:
+        tokenizer_parameters = parsed_parameters["tokenizer"]
+        if "tokens" in tokenizer_parameters:
+            parsed_parameters["config"]["vocab_size"] = len(parsed_parameters["tokenizer"]["tokens"])
+        else:
+            logger.warning(
+                "Can't find a way to retrieve missing config vocab_size from tokenizer parameters. "
+                "This will use default value from model config class and cause unexpected behavior."
+            )
+
     if return_tensors:
         tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture]
 

From a493464c7915df7823e462e9eee14db394f47b15 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 9 Aug 2024 14:15:14 +0800
Subject: [PATCH 2/3] minor fix

---
 src/transformers/modeling_gguf_pytorch_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 900c80b45d58..8850220d0bc9 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -134,7 +134,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
     if "vocab_size" not in parsed_parameters["config"]:
         tokenizer_parameters = parsed_parameters["tokenizer"]
         if "tokens" in tokenizer_parameters:
-            parsed_parameters["config"]["vocab_size"] = len(parsed_parameters["tokenizer"]["tokens"])
+            parsed_parameters["config"]["vocab_size"] = len(tokenizer_parameters["tokens"])
         else:
             logger.warning(
                 "Can't find a way to retrieve missing config vocab_size from tokenizer parameters. "

From 2ed6f2952045a61f31395e1f841ee257c67890c9 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 16 Aug 2024 20:37:15 +0800
Subject: [PATCH 3/3] link issue

---
 src/transformers/modeling_gguf_pytorch_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index 8850220d0bc9..52b1068e003f 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -130,7 +130,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
         if gguf_key in reader_keys:
             logger.info(f"Some keys were not parsed and added into account {gguf_key} | {value}")
 
-    # retrieve config vocab_size from tokenizer, refer to issue-#32526
+    # retrieve config vocab_size from tokenizer
+    # Please refer to https://github.com/huggingface/transformers/issues/32526 for more details
     if "vocab_size" not in parsed_parameters["config"]:
         tokenizer_parameters = parsed_parameters["tokenizer"]
         if "tokens" in tokenizer_parameters:
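
For reference, here is a standalone sketch of the fallback this patch series introduces. The parsed_parameters dict below is a made-up stand-in for the structure that load_gguf_checkpoint builds from GGUF metadata (a "config" dict and a "tokenizer" dict); the token list is illustrative only. When "vocab_size" is missing from the config section, it is derived from the length of the tokenizer's token list.

import logging

logger = logging.getLogger(__name__)

# Stand-in for the metadata load_gguf_checkpoint parses out of a GGUF file.
parsed_parameters = {
    "config": {},  # "vocab_size" missing, the situation issue #32526 describes
    "tokenizer": {"tokens": ["<unk>", "<s>", "</s>", "hello", "world"]},
}

if "vocab_size" not in parsed_parameters["config"]:
    tokenizer_parameters = parsed_parameters["tokenizer"]
    if "tokens" in tokenizer_parameters:
        # The length of the tokenizer's token list is the vocabulary size.
        parsed_parameters["config"]["vocab_size"] = len(tokenizer_parameters["tokens"])
    else:
        logger.warning(
            "Can't find a way to retrieve missing config vocab_size from tokenizer parameters. "
            "This will use default value from model config class and cause unexpected behavior."
        )

print(parsed_parameters["config"]["vocab_size"])  # prints 5

Deriving vocab_size from the stored token list keeps the config consistent with the checkpoint's actual embedding size, instead of silently falling back to the model config class default.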