@@ -2768,80 +2768,6 @@ def write_tensors(self):
27682768            if  len (experts ) >  0 :
27692769                raise  ValueError (f"Unprocessed experts: { experts }  )
27702770
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
    """Converter for JAIS checkpoints (GPT-2-like, SwiGLU + ALiBi)."""

    model_arch = gguf.MODEL_ARCH.JAIS

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Only SwiGLU activation and ALiBi position embeddings are supported.
        assert self.hparams["activation_function"] == "swiglu"
        assert self.hparams["position_embedding_type"] == "alibi"

        # Embeddings scale
        self.embeddings_scale = 1.0
        # note: For some JAIS flavors, output is tied to (same as) wte in original model
        self.output_is_wte = False
        if 'mup_embeddings_scale' in self.hparams:
            self.output_is_wte = True  # Hack (?)
            self.embeddings_scale = self.hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in self.hparams:
            self.embeddings_scale = self.hparams['embeddings_scale']
        else:
            assert False

        self.width_scale = 1.0
        if 'mup_output_alpha' in self.hparams:
            assert 'mup_width_scale' in self.hparams
            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
        elif 'width_scale' in self.hparams:
            self.width_scale = self.hparams['width_scale']
        else:
            assert False

    def set_vocab(self):
        # JAIS ships a GPT-2 style BPE vocabulary.
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
        # Map HF config keys onto the GGUF metadata fields.
        hp = self.hparams
        writer = self.gguf_writer
        writer.add_name(self.dir_model.name)
        writer.add_block_count(hp["n_layer"])
        writer.add_context_length(hp["n_positions"])
        writer.add_embedding_length(hp["n_embd"])
        writer.add_feed_forward_length(hp["n_inner"])
        writer.add_head_count(hp["n_head"])
        writer.add_layer_norm_eps(hp["layer_norm_epsilon"])
        writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Skip tensors that are not needed in the converted model.
        if name.endswith((".attn.bias", "relative_pe.slopes")):
            return []

        # Conv1D-style weights are stored transposed relative to Linear.
        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
            data_torch = data_torch.transpose(1, 0)

        new_name = self.map_tensor_name(name)

        out: list[tuple[str, Tensor]] = []
        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            out.append((new_name, data_torch * self.embeddings_scale))
            if self.output_is_wte:
                # Tied output head: emit a scaled copy of wte as OUTPUT.
                out.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
            assert not self.output_is_wte
            out.append((new_name, data_torch * self.width_scale))
        else:
            out.append((new_name, data_torch))

        return out
28452771@Model .register ("T5ForConditionalGeneration" ) 
28462772@Model .register ("T5WithLMHeadModel" ) 
28472773class  T5Model (Model ):
@@ -2959,6 +2885,78 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
29592885
29602886        return  [(self .map_tensor_name (name ), data_torch )]
29612887
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
    """Converter for JAIS checkpoints (GPT-2-like architecture with SwiGLU
    activation and ALiBi position embeddings).

    Handles the muP-scaled JAIS flavors, where the token embeddings are
    scaled and the output head may be tied to (a scaled copy of) wte.
    """

    model_arch = gguf.MODEL_ARCH.JAIS

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Validate supported configuration explicitly: a bare `assert` is
        # stripped under `python -O` and `assert False` gives no diagnostic.
        if self.hparams["activation_function"] != "swiglu":
            raise ValueError("JAIS conversion requires activation_function == 'swiglu'")
        if self.hparams["position_embedding_type"] != "alibi":
            raise ValueError("JAIS conversion requires position_embedding_type == 'alibi'")

        # Embeddings scale
        self.embeddings_scale = 1.0
        # note: For some JAIS flavors, output is tied to (same as) wte in original model
        self.output_is_wte = False
        if 'mup_embeddings_scale' in self.hparams:
            self.output_is_wte = True  # Hack (?)
            self.embeddings_scale = self.hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in self.hparams:
            self.embeddings_scale = self.hparams['embeddings_scale']
        else:
            raise ValueError("JAIS config must define 'mup_embeddings_scale' or 'embeddings_scale'")

        self.width_scale = 1.0
        if 'mup_output_alpha' in self.hparams:
            if 'mup_width_scale' not in self.hparams:
                raise ValueError("'mup_output_alpha' requires 'mup_width_scale' in the JAIS config")
            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
        elif 'width_scale' in self.hparams:
            self.width_scale = self.hparams['width_scale']
        else:
            raise ValueError("JAIS config must define 'mup_output_alpha' or 'width_scale'")

    def set_vocab(self):
        """Load the GPT-2 style BPE vocabulary shipped with JAIS."""
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
        """Write model hyperparameters from the HF config into GGUF metadata."""
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename, transpose, and scale one source tensor for GGUF output.

        Returns a (possibly empty) list of (gguf_name, tensor) pairs.
        """
        del bid  # unused

        tensors: list[tuple[str, Tensor]] = []

        # we don't need these
        if name.endswith((".attn.bias", "relative_pe.slopes")):
            return tensors

        # Conv1D-style weights are stored transposed relative to Linear.
        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
            data_torch = data_torch.transpose(1, 0)

        new_name = self.map_tensor_name(name)

        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            tensors.append((new_name, data_torch * self.embeddings_scale))
            if self.output_is_wte:
                # Tied output head: emit a scaled copy of wte as OUTPUT.
                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
            assert not self.output_is_wte  # internal invariant, not input validation
            tensors.append((new_name, data_torch * self.width_scale))
        else:
            tensors.append((new_name, data_torch))

        return tensors
29622960
29632961###### CONVERSION LOGIC ###### 
29642962
0 commit comments