@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -2771,6 +2771,124 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Sometimes T5 and Flan-T5 based models contain an "encoder.embed_tokens.weight" or
+        # "decoder.embed_tokens.weight" tensor that is a duplicate of the "shared.weight" tensor.
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######


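The first hunk extends the block-count lookup with "num_layers" because Hugging Face T5 configs expose the layer count under that key rather than "num_hidden_layers" or "n_layer". A minimal sketch of why the extra key matters, using an illustrative t5-small-style hparams dict and a simplified stand-in for Model.find_hparam (both are assumptions for illustration, not code from the converter):

# Minimal sketch of the hparam lookup extended in the first hunk.
# The hparams dict is an illustrative T5-style config fragment, and
# find_hparam is a simplified stand-in for the converter's Model.find_hparam.
from typing import Any

hparams: dict[str, Any] = {
    "d_model": 512,
    "d_ff": 2048,
    "d_kv": 64,
    "num_heads": 8,
    "num_layers": 6,  # T5 configs store the layer count under this key
    "relative_attention_num_buckets": 32,
}

def find_hparam(keys: list[str]) -> Any:
    # return the value of the first key that is present, like the converter does
    for key in keys:
        if key in hparams:
            return hparams[key]
    raise KeyError(f"could not find any of: {keys}")

# without "num_layers" in the list this lookup would raise KeyError for T5
block_count = find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
print(block_count)  # 6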
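The comment in modify_tensors about duplicated embedding tensors can be checked directly against a checkpoint. A rough sketch, assuming torch and transformers are installed and using "t5-small" purely as an example model id; whether the duplicate keys appear at all depends on how a given checkpoint was saved:

# Rough sketch: check whether a T5 checkpoint carries encoder/decoder embedding
# tensors that merely duplicate "shared.weight" (the case the converter skips).
import torch
from transformers import T5ForConditionalGeneration

state_dict = T5ForConditionalGeneration.from_pretrained("t5-small").state_dict()
shared = state_dict["shared.weight"]

for name in ("encoder.embed_tokens.weight", "decoder.embed_tokens.weight"):
    if name in state_dict:
        print(name, "duplicates shared.weight:", torch.equal(state_dict[name], shared))
    else:
        print(name, "not present in this checkpoint")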