@@ -194,6 +194,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
+    LLM_ARCH_CODESHELL,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -213,6 +214,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN,            "qwen"      },
     { LLM_ARCH_PHI2,            "phi2"      },
     { LLM_ARCH_PLAMO,           "plamo"     },
+    { LLM_ARCH_CODESHELL,       "codeshell" },
 };
 
 enum llm_kv {
@@ -600,6 +602,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_CODESHELL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -2877,6 +2899,14 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
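+        // CodeShell uses LayerNorm, so only the attention-norm epsilon is read here;
+        // the layer count selects the model type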
+        case LLM_ARCH_CODESHELL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -3784,6 +3814,42 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
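+            // CodeShell weights: the norms, the fused QKV / output projections and the MLP all carry bias tensors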
+            case LLM_ARCH_CODESHELL:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
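+                        // fused QKV projection: n_embd rows for Q plus 2*n_embd_gqa rows for the grouped K/V heads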
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -5965,6 +6031,117 @@ struct llm_build_context {
 
         return gf;
     }
+
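+    // graph for CodeShell: LayerNorm, fused-QKV self-attention with NeoX-style RoPE and grouped-query KV, and a GELU feed-forward block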
+    struct ggml_cgraph * build_codeshell() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
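+                // split the fused QKV result into Q, K and V views
+                // (per row: the first n_embd floats are Q, the next n_embd_gqa are K, the last n_embd_gqa are V)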
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(tmpq, "tmpq", il);
+                cb(tmpk, "tmpk", il);
+                cb(Vcur, "Vcur", il);
+
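+                // rotary position embeddings (NeoX style, rope mode 2) applied to the query and key heads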
+                struct ggml_tensor * Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
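+                // plain 2-layer MLP: up-projection with bias, GELU, down-projection with bias (no gate)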
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
@@ -6159,6 +6336,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gpt2();
             } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                result = llm.build_codeshell();
+            } break;
         default:
             GGML_ASSERT(false);
     }