@@ -511,6 +511,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
            res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
            # ref: https://huggingface.co/LumiOpen/Viking-7B
            res = "viking"
@@ -3187,6 +3190,190 @@ def write_tensors(self):
        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.hparams.get("_name_or_path").split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+

###### CONVERSION LOGIC ######

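For reference, the merges that `set_vocab()` writes are recovered from `tokenizer.mergeable_ranks` by `ChatGLMModel.bpe()`: re-running byte-pair merging on a token while capping `max_rank` at that token's own rank stops one merge short, leaving exactly the pieces whose concatenation produced it. A self-contained toy check of that idea (the ranks below are made up for illustration; real ranks come from the HF tokenizer):

```python
# Copy of the merge-recovery loop used by ChatGLMModel.bpe(), run on made-up ranks.
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        # stop before applying the merge whose rank reaches the token's own rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
# Capping max_rank at ranks[b"abc"] stops one step early and recovers the
# pair ("ab", "c"), which set_vocab() records as the merge string "ab c".
assert bpe(ranks, b"abc", max_rank=ranks[b"abc"]) == [b"ab", b"c"]
```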