@@ -487,6 +487,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -3175,6 +3178,190 @@ def write_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
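+    # ChatGLM3 ships a SentencePiece tokenizer, while GLM-4 ships a tiktoken-style
+    # BPE tokenizer; set_vocab() below dispatches between the two paths.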
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Per the reference tokenizer implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # a score is only defined for token ids below tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
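+    # BPE vocab entries and merges are stored in the printable GPT-2
+    # byte-to-unicode alphabet, so raw token bytes are remapped before writing.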
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
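+    # Replays byte-pair merges over `token`, applying only merges ranked below
+    # `max_rank`; with max_rank set to the token's own rank, this recovers the
+    # parts that were joined by the token's final merge.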
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
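+        # the tiktoken-style tokenizer exposes merge ranks rather than a merges
+        # list, so the merges are reconstructed from mergeable_ranks via bpe() above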
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
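+        # ids missing from the combined vocab are padding up to padded_vocab_size
+        # and are emitted as [PAD{i}] placeholder tokens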
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.hparams.get("_name_or_path").split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
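+        # ChatGLM uses partial RoPE: with a head dim of 128 in both chatglm3-6b
+        # and glm-4-9b, only 64 dimensions per head are rotated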
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
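+        # inv_freq is a deterministic RoPE buffer that the runtime recomputes,
+        # so there is no need to store it in the GGUF file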
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######
 
 