@@ -487,6 +487,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
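
The new "chatglm-bpe" entry follows the existing pattern in get_vocab_base_pre(): the converter hashes how the model's tokenizer encodes a fixed probe text and matches the digest against a table of known pre-tokenizers. The snippet below is a minimal sketch of that mechanism, not the converter's actual code; the probe text here is a made-up stand-in (the real, much longer probe string lives in get_vocab_base_pre()), so the digest it prints will not equal the table entries above.

from hashlib import sha256
from transformers import AutoTokenizer

# Hypothetical probe text for illustration only; the converter uses its own test string.
chktxt = "Hello world \u00e9\u00e8 \U0001F600 123"

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
# Fingerprint the token ids produced for the probe text; this digest is what gets
# compared against entries such as "b6e8e151..." in the hunk above.
chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
print(chkhsh)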
@@ -3176,6 +3179,190 @@ def write_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer's Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # a score is only valid when the token id is below tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.hparams.get("_name_or_path").split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 ###### CONVERSION LOGIC ######
 
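
For readers unfamiliar with the merge-recovery trick used in set_vocab(), the sketch below exercises the same logic on a made-up rank table. The helpers are inlined copies of token_bytes_to_string() and bpe() from the class above so the snippet runs standalone, and toy_ranks is invented purely for illustration rather than taken from any real ChatGLM tokenizer.

from __future__ import annotations

from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


def token_bytes_to_string(b: bytes) -> str:
    # Map raw bytes onto the printable stand-in characters used by GPT-2 style BPE vocabularies.
    byte_encoder = bytes_to_unicode()
    return ''.join(byte_encoder[ord(char)] for char in b.decode('latin-1'))


def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
    # Re-run byte-level BPE on the token, stopping just before the merge whose rank
    # reaches max_rank; the surviving parts are the pair whose merge produced the token.
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts


# Toy rank table, invented for illustration only.
toy_ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}

merges = []
for token, rank in toy_ranks.items():
    if len(token) == 1:
        continue
    merges.append(' '.join(map(token_bytes_to_string, bpe(toy_ranks, token, max_rank=rank))))

print(merges)  # -> ['a b', 'ab c']

Each multi-byte token is re-split with BPE capped at its own rank, so what remains is exactly the pair that merged into it; joining the pair with a space yields the merges.txt-style entries that special_vocab.merges expects.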