Commit a47d89d

youth123, xingxingqiao, and Umpire2018 authored and committed
llama : support glm3 and glm4 (ggml-org#8031)
* add chatglm3-6b model support (huggingface model: https://hf-mirror.com/THUDM/chatglm3-6b)
  Signed-off-by: XingXing Qiao <[email protected]>
* remove .rotary_pos_emb.inv_freq and unused code for the chatglm3 model
  Signed-off-by: XingXing Qiao <[email protected]>
* fix lint error
  Signed-off-by: XingXing Qiao <[email protected]>
* optimize convert-hf-to-gguf.py for chatglm models
  Signed-off-by: XingXing Qiao <[email protected]>
* support glm-4-9b-chat
  Signed-off-by: XingXing Qiao <[email protected]>
* fix eos tokens for glm4
* remove unused log
* add preprocessing for chatglm3 and chatglm4
* add eos_id_list to llama.cpp
* fix code style
* fix code style
* fix conflicts
* fix conflicts
* Revert "add eos_id_list to llama.cpp" (this reverts commit 3a4d579)
* set <|endoftext|> as eos and <|user|> as eot
* fix chat template bug
* add comment to glm prefix and suffix
* fix conflicts and add rope_ratio & ChatGLMForConditionalGeneration
* fix chat template bug
* fix code style
* fix conflicts
* modify the general name of the glm model
* fix conflicts
* remove prefix and suffix
* use the normal glm4 chat template & use LLM_FFN_SWIGLU in phi3
* fix: resolve Flake8 errors in `convert-hf-to-gguf.py`
  - fix E302 by adding two blank lines before top-level function definitions
  - replace print statements to fix NP100
  - fix E303 by ensuring only one blank line between lines of code
* fix rope ratio to solve incorrect answers
* fix by comments

---------

Signed-off-by: XingXing Qiao <[email protected]>
Co-authored-by: XingXing Qiao <[email protected]>
Co-authored-by: Umpire2018 <[email protected]>
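As context for the chat-template and eos/eot bullets above, here is a minimal editorial sketch (not part of the commit) of the glm3 prompt shape quoted in the converter comment further below; the helper name is hypothetical, and in practice llama.cpp applies this formatting through the chat template rather than a stored prefix/suffix:

    # Hypothetical helper showing the glm3 prompt shape quoted in the converter:
    #   "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
    def format_glm3_prompt(user_message: str) -> str:
        return "[gMASK]sop<|user|>\n" + user_message + "<|assistant|>"

    # Generation is then expected to stop on <|user|> (registered as EOT) or
    # <|endoftext|> (registered as EOS), per the bullets above.
    print(format_glm3_prompt("Explain GGUF in one sentence."))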
1 parent 1bf7194 commit a47d89d

File tree

6 files changed: +455 -25 lines changed


convert-hf-to-gguf.py

+187
@@ -511,6 +511,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -3187,6 +3190,190 @@ def write_tensors(self):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 
 
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.hparams.get("_name_or_path").split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######
 
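The least obvious piece of the new converter code is how set_vocab() turns the tokenizer's tiktoken-style mergeable_ranks into BPE merge rules. A small standalone editorial sketch with toy data (mirroring the bpe() helper above rather than importing it):

    from __future__ import annotations

    # Toy rank table; the real one comes from tokenizer.mergeable_ranks.
    ranks: dict[bytes, int] = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}

    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        # Same algorithm as ChatGLMModel.bpe above: repeatedly merge the
        # lowest-ranked adjacent pair whose rank is still below max_rank.
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
            min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = mergeable_ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx = i
                    min_rank = rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    # Splitting each multi-byte token with max_rank set to its own rank recovers
    # the pieces it was merged from; joining them gives one BPE merge rule.
    merges = [" ".join(p.decode() for p in bpe(ranks, tok, max_rank=rank))
              for tok, rank in ranks.items() if len(tok) > 1]
    print(merges)  # ['a b', 'ab c']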

gguf-py/gguf/constants.py

+17 -1
@@ -120,7 +120,6 @@ class Tokenizer:
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
         EOT_ID = "tokenizer.ggml.eot_token_id"
 
-
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -163,6 +162,7 @@ class MODEL_ARCH(IntEnum):
     OPENELM = auto()
     ARCTIC = auto()
     DEEPSEEK2 = auto()
+    CHATGLM = auto()
     BITNET = auto()
     T5 = auto()
     JAIS = auto()
@@ -289,6 +289,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.CHATGLM: "chatglm",
     MODEL_ARCH.BITNET: "bitnet",
     MODEL_ARCH.T5: "t5",
     MODEL_ARCH.JAIS: "jais",
@@ -924,6 +925,18 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.CHATGLM : [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.BITNET: [
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
@@ -1020,6 +1033,9 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.CHATGLM: [
+        MODEL_TENSOR.ROPE_FREQS,
+    ],
 }
 
 #
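A quick way to see what the new architecture entry implies is to list the GGUF tensor name templates it registers. This is an editorial sketch, assuming the gguf-py package from this tree is importable and that MODEL_TENSORS and TENSOR_NAMES (the dicts being extended above) are exposed on the gguf namespace:

    import gguf  # the gguf-py package from this repository

    # Expected output (roughly): token_embd, rope_freqs, output_norm, output,
    # blk.{bid}.attn_norm, blk.{bid}.attn_qkv, blk.{bid}.attn_output, ...
    for tensor in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.CHATGLM]:
        print(gguf.TENSOR_NAMES[tensor])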

gguf-py/gguf/tensor_mapping.py

+12 -2
@@ -24,6 +24,7 @@ class TensorNameMap:
             "backbone.embedding", # mamba
             "backbone.embeddings", # mamba-hf
             "transformer.in_out_embed", # Grok
+            "embedding.word_embeddings", # chatglm
             "transformer.token_embeddings", # openelm
             "shared", # t5
         ),
@@ -55,6 +56,7 @@ class TensorNameMap:
             "output", # llama-pth bloom internlm2
             "word_embeddings_for_head", # persimmon
             "lm_head.linear", # phi2
+            "output_layer", # chatglm
         ),
 
         # Output norm
@@ -71,12 +73,14 @@ class TensorNameMap:
             "model.norm_f", # mamba-qbert
             "backbone.norm_f", # mamba
             "transformer.rms_norm", # Grok
+            "encoder.final_layernorm", # chatglm
             "transformer.norm", # openelm
         ),
 
         # Rope frequencies
         MODEL_TENSOR.ROPE_FREQS: (
             "rope.freqs", # llama-pth
+            "rotary_pos_emb.inv_freq", # chatglm
         ),
     }
 
@@ -101,6 +105,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.norm", # mamba
             "transformer.decoder_layer.{bid}.rms_norm", # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
+            "encoder.layers.{bid}.input_layernorm", # chatglm
             "transformer.layers.{bid}.attn_norm", # openelm
         ),
 
@@ -124,6 +129,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mixer.Wqkv", # phi2
             "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
             "model.layers.{bid}.self_attn.qkv_proj", # phi3
+            "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
             "transformer.layers.{bid}.attn.qkv_proj", # openelm
         ),
 
@@ -135,7 +141,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.q_proj", # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj", # plamo
             "model.layers.{bid}.attention.wq", # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
         ),
 
         # Attention key
@@ -147,7 +153,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.k", # refact
             "model.layers.layers.{bid}.self_attn.k_proj", # plamo
             "model.layers.{bid}.attention.wk", # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
         ),
 
         # Attention value
@@ -182,6 +188,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.attn.out_proj", # nomic-bert
             "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
             "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
+            "encoder.layers.{bid}.self_attention.dense", # chatglm
             "transformer.layers.{bid}.attn.out_proj", # openelm
         ),
 
@@ -218,6 +225,7 @@ class TensorNameMap:
             "h.{bid}.ln_2", # gpt2
             "model.layers.{bid}.ffn_norm", # internlm2
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
+            "encoder.layers.{bid}.post_attention_layernorm", # chatglm
             "transformer.layers.{bid}.ffn_norm", # openelm
         ),
 
@@ -268,6 +276,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.c_fc", # starcoder2
             "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w3", # arctic
+            "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -337,6 +346,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn.proj_2", # openelm
             "model.layers.{bid}.residual_mlp.w2", # arctic
             "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
+            "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
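One way to sanity-check these additions is to resolve a few chatglm tensor names through gguf-py's TensorNameMap. An editorial sketch under stated assumptions: get_tensor_name_map and TensorNameMap.get_name exist in gguf-py, and the two-block model used here is hypothetical:

    import gguf

    # Build the HF-name -> GGUF-name map for a hypothetical 2-block chatglm model.
    name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CHATGLM, 2)

    for hf_name in ("embedding.word_embeddings",
                    "encoder.layers.0.self_attention.query_key_value",
                    "encoder.layers.1.mlp.dense_4h_to_h"):
        # get_name() strips the listed suffixes first, so names ending in
        # ".weight" or ".bias" resolve the same way during conversion.
        print(hf_name, "->", name_map.get_name(hf_name, try_suffixes=(".weight", ".bias")))
    # Expected (roughly): token_embd, blk.0.attn_qkv, blk.1.ffn_down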

0 commit comments
