@@ -2143,6 +2143,9 @@ def set_vocab(self):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of unused raw tokens
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN
 
             tokens.append(text)
             scores.append(score)
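Pieces spelled like '[UNUSED_TOKEN_...]' are placeholder slots that the added-token passes further down may repurpose; marking them UNKNOWN keeps the later equality assert from firing when the slot is overwritten. A minimal self-contained sketch of that interplay (the enum only mirrors the shape of the converter's SentencePieceTokenTypes, and the piece value is illustrative, not read from a real model):

```python
from enum import IntEnum

class TokType(IntEnum):
    NORMAL = 1
    UNKNOWN = 2
    USER_DEFINED = 4

piece = "[UNUSED_TOKEN_145]"   # hypothetical placeholder slot
toktype = TokType.NORMAL
if piece.startswith('[UNUSED'):
    toktype = TokType.UNKNOWN
# The added-token passes below only assert text equality for slots that are
# NOT UNKNOWN, so this slot can later be rewritten to e.g. '<|im_end|>'.
assert toktype == TokType.UNKNOWN
```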
@@ -2158,6 +2161,47 @@ def set_vocab(self):
                     scores.append(-1000.0)
                     toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, token_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = token_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if token_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for token_data in added_tokens:
+                    token_id = int(token_data["id"])
+                    token = token_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if token_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
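The two new passes read the standard Hugging Face tokenizer metadata: tokenizer_config.json keys added tokens by string id under "added_tokens_decoder", while tokenizer.json carries an "added_tokens" list with explicit integer ids. A sketch of both shapes, with an illustrative id and flags (not taken from a specific checkpoint):

```python
import json

# tokenizer_config.json: string ids mapping to token records.
tokenizer_config_json = json.loads("""
{
  "added_tokens_decoder": {
    "92542": {"content": "<|im_end|>", "special": true}
  }
}
""")

# tokenizer.json: a list of records with explicit integer ids.
tokenizer_json = json.loads("""
{
  "added_tokens": [
    {"id": 92542, "content": "<|im_end|>", "special": true}
  ]
}
""")

for token_id, data in tokenizer_config_json["added_tokens_decoder"].items():
    print(int(token_id), data["content"], data.get("special"))
for data in tokenizer_json["added_tokens"]:
    print(data["id"], data["content"], data.get("special"))
```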
@@ -2167,28 +2211,16 @@ def set_vocab(self):
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
 
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
-
     def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
         if n_head_kv is not None and n_head != n_head_kv:
             n_head = n_head_kv
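The deleted _try_get_sft_eos helper probed the SentencePiece model with Encode and asserted that exactly one of the two spellings round-tripped; the new flow instead carries the id found while scanning the tokenizer configs. A condensed sketch of the resulting selection logic (the function name and ids are ours, for illustration only):

```python
from typing import Optional

def resolve_eos(old_eos: int, chat_eos_token_id: Optional[int]) -> int:
    # Prefer the '<|im_end|>' id when one was found in the tokenizer configs;
    # otherwise keep the eos id that gguf.SpecialVocab discovered.
    return chat_eos_token_id if chat_eos_token_id is not None else old_eos

assert resolve_eos(2, None) == 2         # base model: eos unchanged
assert resolve_eos(2, 92542) == 92542    # chat model: replaced (id illustrative)
```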
@@ -2207,6 +2239,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
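The new branch forwards linear RoPE scaling from the model's config.json (a "rope_scaling" object with a "type" and "factor") into the GGUF metadata. A small sketch of the fragment it matches, with illustrative values:

```python
# Illustrative hparams fragment; a real model's config.json may differ.
hparams = {"rope_scaling": {"type": "linear", "factor": 2.0}}

rope_scaling = hparams.get("rope_scaling")
if rope_scaling is not None and "factor" in rope_scaling:
    if rope_scaling.get("type") == "linear":
        # Mirrors the two gguf_writer calls above: scaling type, then factor.
        print("would write RopeScalingType.LINEAR, factor =", rope_scaling["factor"])
```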